diff --git a/.travis.yml b/.travis.yml index 3f323a8540..eee7674fe7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8 + osx_image: xcode10.1 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update @@ -160,6 +160,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos + osx_image: xcode8.3 env: - BTYPE="BINARY=32" diff --git a/CMakeLists.txt b/CMakeLists.txt index ac5dd93de6..969696179a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5) +set(OpenBLAS_PATCH_VERSION 6) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -42,6 +42,19 @@ endif() ####### +if(MSVC AND MSVC_STATIC_CRT) + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() +endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -62,10 +75,10 @@ endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) - list(APPEND SUBDIRS lapack) if(BUILD_RELAPACK) list(APPEND SUBDIRS relapack/src) endif() + list(APPEND SUBDIRS lapack) endif () # set which float types we want to build for @@ -134,7 +147,7 @@ endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release if(MSVC) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH}) endforeach() endif () -# Only build shared libs for MSVC -if (MSVC) - set(BUILD_SHARED_LIBS ON) -endif() - - # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) @@ -166,7 +173,7 @@ endif() # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) @@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) +if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) + if (NOT MSVC) + target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") + else() + target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") + endif() +endif() + if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") @@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NOFORTRAN) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) + set(F77BLAS_H 
${CMAKE_BINARY_DIR}/generated/f77blas.h)
   file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
   file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
   file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@@ -327,10 +342,11 @@ endif()
 if(NOT NO_CBLAS)
   message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
+  set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
   file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
   string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
-  file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
-  install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
+  file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
+  install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()
 if(NOT NO_LAPACKE)
diff --git a/Changelog.txt b/Changelog.txt
index 49b26873a8..8df35d5c3a 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,82 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.6
+29-Apr-2019
+
+common:
+ * the build tools now check that a given cpu TARGET is actually valid
+ * the build-time check of system features (c_check) has been made
+   less dependent on particular perl features (this should mainly
+   benefit building on Windows)
+ * several problems with the ReLAPACK integration were fixed,
+   including INTERFACE64 support and building a shared library
+ * building with CMAKE on BSD systems was improved
+ * a non-absolute SUM function was added based on the
+   existing optimized code for ASUM
+ * CBLAS interfaces to the IxMIN and IxMAX functions were added
+ * a name clash between LAPACKE and BOOST headers was resolved
+ * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
+   kernels
+ * a crash on thread (key) deletion with the USE_TLS=1 memory management
+   option was fixed
+ * restored several earlier fixes, in particular for OpenMP performance,
+   building on BSD, and calling fork on CYGWIN, which had inadvertently
+   been dropped in the 0.3.3 rewrite of the memory management code.
+
+x86_64:
+ * the AVX512 DGEMM kernel has been disabled again due to unsolved problems
+ * building with old versions of MSVC was fixed
+ * it is now possible to build a static library on Windows with CMAKE
+ * accessing environment variables on CYGWIN at run time was fixed
+ * the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+ * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
+ * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
+   with CMAKE as well
+ * building for DYNAMIC_ARCH with GENERIC as the default target is now supported
+ * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
+ * assembly bugs involving undeclared modification of input operands were fixed
+   in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
+   Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
+   test failures or segfaults when compiled with recent versions of gcc from 8 onward.
+ * a similar bug was fixed in the blas_quickdivide code used to split workloads
+   in most functions
+ * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
+ * fixed building on SkylakeX systems when either the compiler or the (emulated) operating
+   environment does not support AVX512
+ * improved GEMM performance on ZEN targets
+
+x86:
+ * build failures caused by the recently added checks for AVX512 were fixed
+ * an inline assembly bug involving undeclared modification of an input argument was
+   fixed in the blas_quickdivide code used to split workloads in most functions
+ * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
+
+MIPS32:
+ * a bug in the IMIN implementation made it return the result of IMAX
+
+POWER:
+ * single precision BLAS1/2 functions have received optimized POWER8 kernels
+ * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
+ * building on PPC970 systems under OSX Leopard or Tiger is now supported
+ * out-of-bounds memory accesses in the gemm_beta microkernels were fixed
+ * building a shared library on AIX is now supported for POWER6
+ * DYNAMIC_ARCH support has been added for POWER6 and newer
+
+ARMv7:
+ * corrected xDOT behaviour with zero INC_X or INC_Y
+ * a bug in the IMIN implementation made it return the result of IMAX
+
+ARMv8:
+ * added support for HiSilicon TSV110 cpus
+ * the CMAKE build system now recognizes 32bit userspace on 64bit hardware
+ * cross-compilation with CMAKE now works again
+ * a bug in the IMIN implementation made it return the result of IMAX
+ * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
+
+IBM Z:
+ * optimized microkernels for single precision BLAS1/2 functions have been added
+   for both Z13 and Z14
+
 ====================================================================
 Version 0.3.5
 31-Dec-2018
diff --git a/Makefile b/Makefile
index 21096f893c..273fde33ed 100644
--- a/Makefile
+++ b/Makefile
@@ -96,7 +96,7 @@ endif
	@echo
shared :
-ifndef NO_SHARED
+ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
	@$(MAKE) -C exports so
	@ln -fs $(LIBSONAME) $(LIBPREFIX).so
diff --git a/Makefile.arm64 b/Makefile.arm64
index cd16dbfaed..4d10ff6844 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
+
+ifeq ($(CORE), TSV110)
+CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
+FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
+endif
diff --git a/Makefile.install b/Makefile.install
index 069c96c6aa..fefecd98d5 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -58,14 +58,14 @@ ifndef NO_LAPACKE
endif
#for install static library
-ifndef NO_STATIC
+ifneq ($(NO_STATIC),1)
	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
	@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
-ifndef NO_SHARED
+ifneq ($(NO_SHARED),1)
	@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
	@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@@ -106,14 +106,14 @@ ifndef NO_LAPACKE
endif
#for install static library
-ifndef NO_STATIC
+ifneq ($(NO_STATIC),1)
	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -138,7 +138,7 @@ endif @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/Makefile.power b/Makefile.power index a49372ad73..195f1930f8 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,7 +9,15 @@ else USE_OPENMP = 1 endif - +ifeq ($(CORE), POWER9) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +endif +endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) diff --git a/Makefile.rule b/Makefile.rule index 3033455d32..21782a2b9d 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5 +VERSION = 0.3.6 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -48,6 +48,8 @@ VERSION = 0.3.5 # HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 +# Please note that AVX is not available on 32-bit. +# Setting BINARY=32 disables AVX/AVX2/AVX-512. # BINARY=64 # About threaded BLAS. It will be automatically detected if you don't @@ -57,7 +59,7 @@ VERSION = 0.3.5 # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. -# This flag is always set for POWER8. Don't modify the flag +# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # USE_OPENMP = 1 # The OpenMP scheduler to use - by default this is "static" and you @@ -68,36 +70,45 @@ VERSION = 0.3.5 # allow you to select the scheduler from the environment variable OMP_SCHEDULE # CCOMMON_OPT += -DOMP_SCHED=dynamic -# You can define maximum number of threads. Basically it should be -# less than actual number of cores. If you don't specify one, it's -# automatically detected by the the script. +# You can define the maximum number of threads. Basically it should be less +# than or equal to the number of CPU threads. If you don't specify one, it's +# automatically detected by the build system. +# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to +# restrict NUM_THREADS to the number of physical cores. By default, the automatic +# detection includes logical CPUs, thus allowing the use of SMT. +# Users may opt at runtime to use less than NUM_THREADS threads. 
+# +# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS +# value (eg. 32-256) if you expect your users to use that many threads. Due to the way +# some internal structures are allocated, using a large NUM_THREADS value has a RAM +# footprint penalty, even if users reduce the actual number of threads at runtime. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call -# OpenBLAS's calculation API from multi threads, please comment it in. -# This flag defines how many instances of OpenBLAS's calculation API can -# actually run in parallel. If more threads call OpenBLAS's calculation API, +# OpenBLAS's calculation API from multiple threads, please comment this in. +# This flag defines how many instances of OpenBLAS's calculation API can actually +# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# if you don't need to install the static library, please comment it in. +# If you don't need to install the static library, please comment this in. # NO_STATIC = 1 -# if you don't need generate the shared library, please comment it in. +# If you don't need to generate the shared library, please comment this in. # NO_SHARED = 1 -# If you don't need CBLAS interface, please comment it in. +# If you don't need the CBLAS interface, please comment this in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, -# please comment it in. +# If you only want the CBLAS interface without installing a Fortran compiler, +# please comment this in. # ONLY_CBLAS = 1 -# If you don't need LAPACK, please comment it in. -# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. +# If you don't need LAPACK, please comment this in. +# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 -# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. +# If you don't need LAPACKE (C Interface to LAPACK), please comment this in. # NO_LAPACKE = 1 # Build LAPACK Deprecated functions since LAPACK 3.6.0 @@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 -# If you want to use legacy threaded Level 3 implementation. +# If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses @@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1 # USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran -# compiler supports this. It's safe to keep comment it out if you -# are not sure(equivalent to "-i8" option). +# compilers support this. It's safe to keep this commented out if you +# are not sure. (This is equivalent to the "-i8" ifort option). # INTERFACE64 = 1 # Unfortunately most of kernel won't give us high quality buffer. @@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1 # but it will consume time. If you don't like it, you can disable one. NO_WARMUP = 1 -# If you want to disable CPU/Memory affinity on Linux. +# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. +# This feature is only implemented on Linux, and is always disabled on other platforms. +# Enabling affinity handling may improve performance, especially on NUMA systems, but +# it may conflict with certain applications that also try to manage affinity. 
+# This conflict can result in threads of the application calling OpenBLAS ending up locked +# to the same core(s) as OpenBLAS, possibly binding all threads to a single core. +# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing +# else modifies affinity settings. +# Note: enabling affinity has been known to cause problems with NumPy and R NO_AFFINITY = 1 -# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus +# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers @@ -180,7 +199,7 @@ NO_AFFINITY = 1 # been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 -# If you need santy check by comparing reference BLAS. It'll be very +# If you need sanity check by comparing results to reference BLAS. It'll be very # slow (Not implemented yet). # SANITY_CHECK = 1 diff --git a/Makefile.system b/Makefile.system index fb8e7ea419..a95d6190f3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -65,6 +65,7 @@ endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) +GETARCH_FLAGS += -DUSER_TARGET endif # Force fallbacks for 32bit @@ -94,6 +95,9 @@ endif ifeq ($(TARGET), ZEN) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), ARMV8) +GETARCH_FLAGS := -DFORCE_ARMV7 +endif endif @@ -151,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX endif ifeq ($(BINARY), 32) -GETARCH_FLAGS += -DNO_AVX +GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 +NO_AVX512 = 1 endif ifeq ($(NO_AVX2), 1) @@ -523,6 +528,12 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 endif +ifeq ($(ARCH), power) +DYNAMIC_CORE = POWER6 +DYNAMIC_CORE += POWER8 +DYNAMIC_CORE += POWER9 +endif + # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= diff --git a/Makefile.zarch b/Makefile.zarch index 9ec9dc79fc..47ea1eb717 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif +ifeq ($(CORE), Z14) +CCOMMON_OPT += -march=z14 -mzvector +FCOMMON_OPT += -march=z14 -mzvector +endif diff --git a/TargetList.txt b/TargetList.txt index 3d04a57cf3..6a57bf1af7 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -48,6 +48,7 @@ POWER5 POWER6 POWER7 POWER8 +POWER9 PPCG4 PPC970 PPC970MP @@ -90,7 +91,9 @@ CORTEXA73 FALKOR THUNDERX THUNDERX2T99 +TSV110 9.System Z: ZARCH_GENERIC Z13 +Z14 diff --git a/appveyor.yml b/appveyor.yml index 141d3a130c..44a616aaaf 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,9 +53,9 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. + - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . 
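Note (an editorial aside, not part of the patch): the reworded NUM_THREADS comments above stress that the compiled-in value is only an upper bound, and that users may run with fewer threads at runtime. A minimal sketch of those runtime controls, assuming the openblas_* extension functions that OpenBLAS declares in cblas.h:

/* Sketch: restricting OpenBLAS threading at runtime. The build-time
 * NUM_THREADS only fixes the maximum (and the buffer allocation);
 * the active thread count can be lowered per process. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    printf("build: %s\n", openblas_get_config());
    printf("processors detected: %d\n", openblas_get_num_procs());
    openblas_set_num_threads(4);   /* cap this process at 4 BLAS threads */
    printf("threads in use: %d\n", openblas_get_num_threads());
    return 0;
}

Setting the OPENBLAS_NUM_THREADS environment variable achieves the same effect without code changes.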
diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index ece727fb37..c6d541dcf2 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,29 +28,21 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) ev <- 0 z <- system.time(for (l in 1:loops) { ev <- eigen(A) }) - mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R index 75297dfb83..d7c3e81084 100755 --- a/benchmark/scripts/R/dgemm.R +++ b/benchmark/scripts/R/dgemm.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,26 +28,13 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) - B <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) + A <- matrix(runif(n * n), nrow = n) + B <- matrix(runif(n * n), nrow = n) C <- 1 z <- system.time(for (l in 1:loops) { @@ -54,11 +42,10 @@ while (n <= nto) { l <- l + 1 }) - mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index a3fb78da71..46301570bc 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,31 +28,22 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) - B <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) + B <- matrix(rnorm(n * n), nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) - mflops <- - (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (8.0 / 3 * n * n * n) * loops / 
(z[3] * 1e+06)
  st <- sprintf("%.0fx%.0f :", n, n)
  cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
  n <- n + nstep
-
 }
diff --git a/c_check b/c_check
index 9dc237bebc..d93b756d53 100644
--- a/c_check
+++ b/c_check
@@ -1,7 +1,7 @@
 #!/usr/bin/perl
-use File::Basename;
-use File::Temp qw(tempfile);
+#use File::Basename;
+# use File::Temp qw(tempfile);
 # Checking cross compile
 $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@@ -12,7 +12,7 @@
 $hostarch = "arm64" if ($hostarch eq "aarch64");
 $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
 $hostarch = "zarch" if ($hostarch eq "s390x");
-$tmpf = new File::Temp( UNLINK => 1 );
+#$tmpf = new File::Temp( UNLINK => 1 );
 $binary = $ENV{"BINARY"};
 $makefile = shift(@ARGV);
@@ -31,12 +31,25 @@ if ($?) {
 $cross_suffix = "";
-if (dirname($compiler_name) ne ".") {
-  $cross_suffix .= dirname($compiler_name) . "/";
-}
+eval "use File::Basename";
+if ($@){
+  warn "could not load PERL module File::Basename, emulating its functionality";
+  my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
+  if ($dirnam ne ".") {
+    $cross_suffix .= $dirnam . "/";
+  }
+  my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
+  if ($basnam =~ /([^\s]*-)(.*)/) {
+    $cross_suffix .= $1;
+  }
+} else {
+  if (dirname($compiler_name) ne ".") {
+    $cross_suffix .= dirname($compiler_name) . "/";
+  }
-if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
-  $cross_suffix .= $1;
+  if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
+    $cross_suffix .= $1;
+  }
 }
 $compiler = "";
@@ -171,20 +184,26 @@ if ($?) {
 $have_msa = 0;
 if (($architecture eq "mips") || ($architecture eq "mips64")) {
-  $code = '"addvi.b $w0, $w1, 1"';
-  $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
-  print $tmpf "#include <msa.h>\n\n";
-  print $tmpf "void main(void){ __asm__ volatile($code); }\n";
-
-  $args = "$msa_flags -o $tmpf.o -x c $tmpf";
-  my @cmd = ("$compiler_name $args");
-  system(@cmd) == 0;
-  if ($? != 0) {
-    $have_msa = 0;
+  eval "use File::Temp qw(tempfile)";
+  if ($@){
+    warn "could not load PERL module File::Temp, so could not check MSA compatibility";
  } else {
-    $have_msa = 1;
+    $tmpf = new File::Temp( UNLINK => 1 );
+    $code = '"addvi.b $w0, $w1, 1"';
+    $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
+    print $tmpf "#include <msa.h>\n\n";
+    print $tmpf "void main(void){ __asm__ volatile($code); }\n";
+
+    $args = "$msa_flags -o $tmpf.o -x c $tmpf";
+    my @cmd = ("$compiler_name $args");
+    system(@cmd) == 0;
+    if ($? != 0) {
+      $have_msa = 0;
+    } else {
+      $have_msa = 1;
+    }
+    unlink("$tmpf.o");
  }
-  unlink("$tmpf.o");
 }
 $architecture = x86 if ($data =~ /ARCH_X86/);
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);
 $no_avx512= 0;
 if (($architecture eq "x86") || ($architecture eq "x86_64")) {
-  $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
-  print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
-  $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
-  my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
-  system(@cmd) == 0;
-  if ($? != 0) {
-    $no_avx512 = 1;
-  } else {
+  eval "use File::Temp qw(tempfile)";
+  if ($@){
+    warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
     $no_avx512 = 0;
+  } else {
+#    $tmpf = new File::Temp( UNLINK => 1 );
+    ($fh,$tmpf) = tempfile( UNLINK => 1 );
+    $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
+    print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
+    $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
+    my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
+    system(@cmd) == 0;
+    if ($? != 0) {
+      $no_avx512 = 1;
+    } else {
+      $no_avx512 = 0;
+    }
+    unlink("tmpf.o");
  }
-  unlink("tmpf.o");
 }
 $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
diff --git a/cblas.h b/cblas.h
index d340a20371..1a87074d6b 100644
--- a/cblas.h
+++ b/cblas.h
@@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
 float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
+double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
 double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
 float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
@@ -88,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
+CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
diff --git a/cmake/arch.cmake
b/cmake/arch.cmake index 63fb86fa21..470ea2a8f3 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) endif () + if (DYNAMIC_LIST) + set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index fad84de519..0ed09e7763 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -107,6 +107,12 @@ macro(SetDefaultL1) set(DAXPBYKERNEL ../arm/axpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c) set(ZAXPBYKERNEL ../arm/zaxpby.c) + set(SSUMKERNEL sum.S) + set(DSUMKERNEL sum.S) + set(CSUMKERNEL zsum.S) + set(ZSUMKERNEL zsum.S) + set(QSUMKERNEL sum.S) + set(XSUMKERNEL zsum.S) endmacro () macro(SetDefaultL2) @@ -162,4 +168,4 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -endmacro () \ No newline at end of file +endmacro () diff --git a/cmake/os.cmake b/cmake/os.cmake index 1321ef6194..2d25e7aaae 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") + set(EXTRALIB "${EXTRALIB} -lm") + set(NO_EXPRECISION 1) +endif () + if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") set(EXTRALIB "${EXTRALIB} -lm") endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 6ed99e807b..a67c44bf5c 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -87,13 +87,18 @@ endif () # Cannot run getarch on target if we are cross-compiling if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) # Write to config as getarch would + if (DEFINED TARGET_CORE) + set(TCORE ${TARGET_CORE}) + else() + set(TCORE ${CORE}) + endif() # TODO: Set up defines that getarch sets up based on every other target # Perhaps this should be inside a different file as it grows larger file(APPEND ${TARGET_CONF_TEMP} - "#define ${CORE}\n" - "#define CHAR_CORENAME \"${CORE}\"\n") - if ("${CORE}" STREQUAL "ARMV7") + "#define ${TCORE}\n" + "#define CHAR_CORENAME \"${TCORE}\"\n") + if ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_LINESIZE\t32\n" @@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "ARMV8") + elseif ("${TCORE}" STREQUAL "ARMV8") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_LINESIZE\t64\n" @@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DTB_SIZE\t4096\n" "#define L2_ASSOCIATIVE\t32\n" "#define ARMV8\n") - set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) - elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_M 
4)
      set(ZGEMM_UNROLL_N 4)
-  elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73")
+      set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_CODE_SIZE\t49152\n"
       "#define L1_CODE_LINESIZE\t64\n"
@@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
      set(DGEMM_UNROLL_N 4)
      set(CGEMM_UNROLL_M 8)
      set(CGEMM_UNROLL_N 4)
-      set(ZGEMM_UNROLL_M 8)
+      set(ZGEMM_UNROLL_M 4)
      set(ZGEMM_UNROLL_N 4)
-  elseif ("${CORE}" STREQUAL "FALKOR")
+      set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "FALKOR")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_CODE_SIZE\t65536\n"
       "#define L1_CODE_LINESIZE\t64\n"
@@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
      set(DGEMM_UNROLL_N 4)
      set(CGEMM_UNROLL_M 8)
      set(CGEMM_UNROLL_N 4)
-      set(ZGEMM_UNROLL_M 8)
+      set(ZGEMM_UNROLL_M 4)
      set(ZGEMM_UNROLL_N 4)
-  elseif ("${CORE}" STREQUAL "THUNDERX")
+      set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "THUNDERX")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_CODE_SIZE\t32768\n"
       "#define L1_CODE_LINESIZE\t64\n"
@@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
      set(CGEMM_UNROLL_N 2)
      set(ZGEMM_UNROLL_M 2)
      set(ZGEMM_UNROLL_N 2)
-  elseif ("${CORE}" STREQUAL "THUNDERX2T99")
+      set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_CODE_SIZE\t32768\n"
       "#define L1_CODE_LINESIZE\t64\n"
@@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
       "#define L3_ASSOCIATIVE\t32\n"
       "#define DTB_DEFAULT_ENTRIES\t64\n"
       "#define DTB_SIZE\t4096\n"
-      "#define VULCAN\n")
+      "#define ARMV8\n")
      set(SGEMM_UNROLL_M 16)
      set(SGEMM_UNROLL_N 4)
      set(DGEMM_UNROLL_M 8)
@@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
      set(CGEMM_UNROLL_N 4)
      set(ZGEMM_UNROLL_M 4)
      set(ZGEMM_UNROLL_N 4)
+      set(SYMV_P 16)
   endif()
 # Or should this actually be NUM_CORES?
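For orientation (an editorial sketch, not part of the patch): the file(APPEND ${TARGET_CONF_TEMP} ...) calls in this file simply emit an ordinary C header fragment describing the cross-compiled core, while the set(...UNROLL...) values are CMake variables consumed elsewhere in the build. For TARGET=ARMV8 the defines visible in these hunks amount to roughly the following; lines elided by the hunk context are omitted:

/* Approximate TARGET_CONF fragment for an ARMV8 cross-build,
 * reconstructed only from the file(APPEND) strings shown above. */
#define ARMV8
#define CHAR_CORENAME "ARMV8"
#define L1_DATA_SIZE     32768
#define L1_DATA_LINESIZE 64
#define DTB_SIZE         4096
#define L2_ASSOCIATIVE   32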
diff --git a/cmake/system.cmake b/cmake/system.cmake
index a060d98cb0..7fda2adb92 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
   if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
     set(TARGET "BARCELONA")
   endif ()
+  if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
+    set(TARGET "ARMV7")
+  endif ()
 endif ()
 if (DEFINED TARGET)
@@ -184,6 +187,13 @@ if (DYNAMIC_ARCH)
   endif ()
 endif ()
+if (DYNAMIC_LIST)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
+  foreach(DCORE ${DYNAMIC_LIST})
+    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
+  endforeach ()
+endif ()
+
 if (NO_LAPACK)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
 #Disable LAPACK C interface
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 6b602c1b0f..94d3ba6437 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -39,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
-  set(X86_64 1)
+  if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+    set(X86_64 1)
+  else()
+    set(X86 1)
+  endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
   set(X86 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
   set(ARM 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
-  set(ARM64 1)
+  if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+    set(ARM64 1)
+  else()
+    set(ARM 1)
+  endif()
 endif()
 if (X86_64)
@@ -78,7 +86,7 @@ endif()
 if (X86_64 OR X86)
 file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
-execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
 if (NO_AVX512 EQUAL 1)
 set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
 endif()
diff --git a/common.h b/common.h
index 7fcd5e3163..0ac74bb20a 100644
--- a/common.h
+++ b/common.h
@@ -85,6 +85,8 @@ extern "C" {
 #if !defined(_MSC_VER)
 #include <complex.h>
+#elif _MSC_VER < 1900
+#define snprintf _snprintf
 #endif
 #include
@@ -348,6 +350,11 @@ typedef int blasint;
 #endif
 #endif
+#ifdef POWER9
+#ifndef YIELDING
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
+#endif
+#endif
 /*
 #ifdef PILEDRIVER
@@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 typedef char env_var_t[MAX_PATH];
 #define readenv(p, n) 0
 #else
-#ifdef OS_WINDOWS
+#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
 typedef char env_var_t[MAX_PATH];
 #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
 #else
diff --git a/common_c.h b/common_c.h
index ce0f2a5bdb..40ecf5b8b8 100644
--- a/common_c.h
+++ b/common_c.h
@@ -19,6 +19,7 @@
 #define CDOTC_K cdotc_k
 #define CNRM2_K cnrm2_k
 #define CSCAL_K cscal_k
+#define CSUM_K csum_k
 #define CSWAP_K cswap_k
 #define CROT_K csrot_k
@@ -249,6 +250,7 @@
 #define CDOTC_K gotoblas -> cdotc_k
 #define CNRM2_K gotoblas -> cnrm2_k
 #define CSCAL_K gotoblas -> cscal_k
+#define CSUM_K gotoblas -> csum_k
 #define CSWAP_K gotoblas -> cswap_k
 #define
CROT_K gotoblas -> csrot_k diff --git a/common_d.h b/common_d.h index ad99451867..94dc3eea88 100644 --- a/common_d.h +++ b/common_d.h @@ -19,6 +19,7 @@ #define DDOTC_K ddot_k #define DNRM2_K dnrm2_k #define DSCAL_K dscal_k +#define DSUM_K dsum_k #define DSWAP_K dswap_k #define DROT_K drot_k @@ -174,6 +175,7 @@ #define DDOTC_K gotoblas -> ddot_k #define DNRM2_K gotoblas -> dnrm2_k #define DSCAL_K gotoblas -> dscal_k +#define DSUM_K gotoblas -> dsum_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k diff --git a/common_interface.h b/common_interface.h index 15f69e02f1..c350ac8ec0 100644 --- a/common_interface.h +++ b/common_interface.h @@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); double BLASFUNC(dzasum)(blasint *, double *, blasint *); xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); +double BLASFUNC(dsum) (blasint *, double *, blasint *); +xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); +double BLASFUNC(dzsum)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); + blasint BLASFUNC(isamax)(blasint *, float *, blasint *); blasint BLASFUNC(idamax)(blasint *, double *, blasint *); blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); diff --git a/common_level1.h b/common_level1.h index 32ffd6f188..74cafb6dbb 100644 --- a/common_level1.h +++ b/common_level1.h @@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); double zasum_k (BLASLONG, double *, BLASLONG); xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); +float ssum_k (BLASLONG, float *, BLASLONG); +double dsum_k (BLASLONG, double *, BLASLONG); +xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); +float csum_k (BLASLONG, float *, BLASLONG); +double zsum_k (BLASLONG, double *, BLASLONG); +xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); + float samax_k (BLASLONG, float *, BLASLONG); double damax_k (BLASLONG, double *, BLASLONG); xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 15ba6f9db9..d2503aa65e 100644 --- a/common_macro.h +++ b/common_macro.h @@ -66,6 +66,7 @@ #define DOTC_K QDOTC_K #define NRM2_K QNRM2_K #define SCAL_K QSCAL_K +#define SUM_K QSUM_K #define SWAP_K QSWAP_K #define ROT_K QROT_K @@ -356,6 +357,7 @@ #define DOTC_K DDOTC_K #define NRM2_K DNRM2_K #define SCAL_K DSCAL_K +#define SUM_K DSUM_K #define SWAP_K DSWAP_K #define ROT_K DROT_K @@ -658,6 +660,7 @@ #define DOTC_K SDOTC_K #define NRM2_K SNRM2_K #define SCAL_K SSCAL_K +#define SUM_K SSUM_K #define SWAP_K SSWAP_K #define ROT_K SROT_K @@ -962,6 +965,7 @@ #define DOTC_K XDOTC_K #define NRM2_K XNRM2_K #define SCAL_K XSCAL_K +#define SUM_K XSUM_K #define SWAP_K XSWAP_K #define ROT_K XROT_K @@ -1363,6 +1367,7 @@ #define DOTC_K ZDOTC_K #define NRM2_K ZNRM2_K #define SCAL_K ZSCAL_K +#define SUM_K ZSUM_K #define SWAP_K ZSWAP_K #define ROT_K ZROT_K @@ -1785,6 +1790,7 @@ #define DOTC_K CDOTC_K #define NRM2_K CNRM2_K #define SCAL_K CSCAL_K +#define SUM_K CSUM_K #define SWAP_K CSWAP_K #define ROT_K CROT_K diff --git a/common_param.h b/common_param.h index 8f162c01f5..574d5e176d 100644 --- a/common_param.h +++ b/common_param.h @@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); + float (*ssum_k) (BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); 
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); + double (*dsum_k) (BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); @@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); @@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); + float (*csum_k) (BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); + double (*zsum_k) (BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_power.h b/common_power.h index e3a1a7aef4..889205c75c 100644 --- a/common_power.h +++ b/common_power.h @@ -39,7 +39,7 @@ #ifndef COMMON_POWER #define COMMON_POWER -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #else @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -598,9 +598,14 @@ REALNAME:;\ #ifndef __64BIT__ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl 
.REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .long .REALNAME, TOC[tc0], 0;\ .csect .text[PR],5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ @@ -611,9 +616,14 @@ _section_.text:;\ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .llong .REALNAME, TOC[tc0], 0;\ .csect .text[PR], 5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ @@ -802,7 +812,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) +#elif defined(POWER8) || defined(POWER9) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/common_q.h b/common_q.h index 30ad3727ad..b4ace3a628 100644 --- a/common_q.h +++ b/common_q.h @@ -19,6 +19,7 @@ #define QDOTC_K qdot_k #define QNRM2_K qnrm2_k #define QSCAL_K qscal_k +#define QSUM_K qsum_k #define QSWAP_K qswap_k #define QROT_K qrot_k @@ -161,6 +162,7 @@ #define QDOTC_K gotoblas -> qdot_k #define QNRM2_K gotoblas -> qnrm2_k #define QSCAL_K gotoblas -> qscal_k +#define QSUM_K gotoblas -> qsum_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K gotoblas -> qrot_k diff --git a/common_s.h b/common_s.h index 3c1600859e..23c432f7c5 100644 --- a/common_s.h +++ b/common_s.h @@ -12,6 +12,7 @@ #define ISMAX_K ismax_k #define ISMIN_K ismin_k #define SASUM_K sasum_k +#define SSUM_K ssum_k #define SAXPYU_K saxpy_k #define SAXPYC_K saxpy_k #define SCOPY_K scopy_k @@ -170,6 +171,7 @@ #define ISMAX_K gotoblas -> ismax_k #define ISMIN_K gotoblas -> ismin_k #define SASUM_K gotoblas -> sasum_k +#define SSUM_K gotoblas -> ssum_k #define SAXPYU_K gotoblas -> saxpy_k #define SAXPYC_K gotoblas -> saxpy_k #define SCOPY_K gotoblas -> scopy_k diff --git a/common_x.h b/common_x.h index 03b98db4f4..2ed525faa1 100644 --- a/common_x.h +++ b/common_x.h @@ -19,6 +19,7 @@ #define XDOTC_K xdotc_k #define XNRM2_K xnrm2_k #define XSCAL_K xscal_k +#define XSUM_K xsum_k #define XSWAP_K xswap_k #define XROT_K xqrot_k @@ -227,6 +228,7 @@ #define XDOTC_K gotoblas -> xdotc_k #define XNRM2_K gotoblas -> xnrm2_k #define XSCAL_K gotoblas -> xscal_k +#define XSUM_K gotoblas -> xsum_k #define XSWAP_K gotoblas -> xswap_k #define XROT_K gotoblas -> xqrot_k diff --git a/common_x86.h b/common_x86.h index 4f538c948e..3fdffe2a85 100644 --- a/common_x86.h +++ b/common_x86.h @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); return result; #endif diff --git a/common_x86_64.h b/common_x86_64.h index 62e138e342..718a81050b 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op)); + : "0" (op), "c"(0)); #endif } @@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); return result; } diff --git a/common_z.h b/common_z.h index b4f58bb0c8..f1e78dd088 100644 --- a/common_z.h +++ b/common_z.h @@ -19,6 +19,7 @@ #define ZDOTC_K zdotc_k #define ZNRM2_K znrm2_k #define ZSCAL_K zscal_k +#define ZSUM_K zsum_k #define ZSWAP_K zswap_k #define ZROT_K zdrot_k @@ -249,6 
+250,7 @@ #define ZDOTC_K gotoblas -> zdotc_k #define ZNRM2_K gotoblas -> znrm2_k #define ZSCAL_K gotoblas -> zscal_k +#define ZSUM_K gotoblas -> zsum_k #define ZSWAP_K gotoblas -> zswap_k #define ZROT_K gotoblas -> zdrot_k diff --git a/cpuid.h b/cpuid.h index a6bc211f3e..697f43133e 100644 --- a/cpuid.h +++ b/cpuid.h @@ -53,6 +53,7 @@ #define VENDOR_SIS 8 #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 +#define VENDOR_HYGON 11 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -116,6 +117,7 @@ #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 #define CORE_SKYLAKEX 28 +#define CORE_DHYANA 29 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -139,6 +141,7 @@ #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) #define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -214,5 +217,8 @@ typedef struct { #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 #define CPUTYPE_SKYLAKEX 52 +#define CPUTYPE_DHYANA 53 + +#define CPUTYPE_HYGON_UNKNOWN 54 #endif diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5077d7b11c..a5e731d747 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -39,6 +39,8 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +//Hisilicon +#define CPU_TSV110 9 static char *cpuname[] = { "UNKNOWN", @@ -49,7 +51,8 @@ static char *cpuname[] = { "CORTEXA73", "FALKOR", "THUNDERX", - "THUNDERX2T99" + "THUNDERX2T99", + "TSV110" }; static char *cpuname_lower[] = { @@ -61,7 +64,8 @@ static char *cpuname_lower[] = { "cortexa73", "falkor", "thunderx", - "thunderx2t99" + "thunderx2t99", + "tsv110" }; int get_feature(char *search) @@ -145,6 +149,9 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + // HiSilicon + else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) + return CPU_TSV110; } p = (char *) NULL ; @@ -286,6 +293,21 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; + + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } } diff --git a/cpuid_power.c b/cpuid_power.c index 82a3f4aace..d5ba6fb2ce 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -94,7 +94,7 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER8" + "POWER9" }; int detect(void){ @@ -124,7 +124,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -156,7 +156,7 @@ int detect(void){ if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; + if (!strncasecmp(p, 
"POWER9", 6)) return CPUTYPE_POWER9; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -180,7 +180,7 @@ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 - return CPUTYPE_POWER8; + return CPUTYPE_POWER9; break; case 0x4d: case 0x4b: // POWER8/8E diff --git a/cpuid_x86.c b/cpuid_x86.c index eb986b6b68..884d4b78ae 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ ("mov %%ebx, %%edi;" "cpuid;" "xchgl %%ebx, %%edi;" - : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); #else __asm__ __volatile__ - ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); #endif } @@ -211,6 +211,44 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & 32) != 32){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; @@ -233,6 +271,7 @@ int get_vendor(void){ if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; + if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -294,6 +333,8 @@ int get_cputype(int gettype){ if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX if (support_avx()) feature |= HAVE_AVX; + if (support_avx2()) feature |= HAVE_AVX2; + if (support_avx512()) feature |= HAVE_AVX512VL; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ } } - if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { + if ((get_vendor() == VENDOR_AMD) || + (get_vendor() == VENDOR_HYGON) || + (get_vendor() == VENDOR_CENTAUR)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1228,22 +1271,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: case 15: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 13: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -1252,33 +1291,27 @@ int get_cpuname(void){ switch (model) { case 5: case 6: - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: case 15: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; 
-#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 14: //Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: @@ -1292,80 +1325,66 @@ int get_cpuname(void){ switch (model) { case 6: //Broadwell - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 5: // Skylake X -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif case 14: // Skylake - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 7: // Xeon Phi Knights Landing - if(support_avx()) -#ifndef NO_AVX2 + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; case 12: // Apollo Lake + case 15: + // Denverton return CPUTYPE_NEHALEM; } break; case 6: switch (model) { case 6: // Cannon Lake -#ifndef NO_AVX512 - return CPUTYPE_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return CPUTYPE_HASWELL; -#else - return CPUTYPE_SANDYBRIDGE; -#endif + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; -#endif } break; case 9: - case 8: + case 8: switch (model) { - case 14: // Kaby Lake - if(support_avx()) -#ifndef NO_AVX2 + case 14: // Kaby Lake and refreshes + if(support_avx2()) return CPUTYPE_HASWELL; -#else + if(support_avx()) return CPUTYPE_SANDYBRIDGE; -#endif else return CPUTYPE_NEHALEM; } @@ -1469,6 +1488,26 @@ int get_cpuname(void){ return CPUTYPE_AMD_UNKNOWN; } + if (vendor == VENDOR_HYGON){ + switch (family) { + case 0xf: + switch (exfamily) { + case 9: + //Hygon Dhyana + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; + } + break; + } + return CPUTYPE_HYGON_UNKNOWN; + } + if (vendor == VENDOR_CYRIX){ switch (family) { case 0x4: @@ -1590,7 +1629,8 @@ static char *cpuname[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *lowercpuname[] = { @@ -1645,7 +1685,8 @@ static char *lowercpuname[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; static char *corename[] = { @@ -1677,7 +1718,8 @@ static char *corename[] = { "STEAMROLLER", "EXCAVATOR", "ZEN", - "SKYLAKEX" + "SKYLAKEX", + "DHYANA" }; static char *corename_lower[] = { @@ -1709,7 +1751,8 @@ static char *corename_lower[] = { "steamroller", "excavator", "zen", - "skylakex" + "skylakex", + "dhyana" }; @@ -2026,6 +2069,23 @@ int get_coretype(void){ } } + if (vendor == VENDOR_HYGON){ + if (family == 0xf){ + if (exfamily == 9) { + if(support_avx()) +#ifndef NO_AVX2 + return CORE_ZEN; +#else + return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CORE_BARCELONA; + } else { + return CORE_BARCELONA; + } + 
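A note on the pattern behind the new support_avx2()/support_avx512() helpers: CPUID leaf 7 is sub-leafed, so ECX must hold zero when the instruction executes — which is exactly what the added "c" (0) constraint in the cpuid() wrapper guarantees; previously ECX was whatever the compiler happened to leave there. A minimal, illustrative probe using the bit positions as Intel's SDM documents them (AVX2 is CPUID.(EAX=7,ECX=0):EBX bit 5; note that support_avx2() above keys on a different EBX bit, so treat this sketch as a reference, not an excerpt):

static int cpu_has_avx2(void) {      /* hypothetical helper, sketch only */
  int eax, ebx, ecx = 0, edx;
  cpuid(7, &eax, &ebx, &ecx, &edx);  /* leaf 7, sub-leaf 0: ECX zeroed */
  return (ebx >> 5) & 1;             /* EBX bit 5 = AVX2 per the Intel SDM */
}
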
} + } + if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: @@ -2112,6 +2172,8 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); + if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); + if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2180,6 +2242,8 @@ void get_sse(void){ if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); + if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); + if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); diff --git a/cpuid_zarch.c b/cpuid_zarch.c index e0d9221f31..896ed94f5d 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,9 +27,9 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", @@ -64,10 +64,8 @@ int detect(void) if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; - - /* detect z14, but fall back to z13 */ - if (strstr(p, "3906")) return CPU_Z13; - if (strstr(p, "3907")) return CPU_Z13; + if (strstr(p, "3906")) return CPU_Z14; + if (strstr(p, "3907")) return CPU_Z14; return CPU_GENERIC; } @@ -116,7 +114,14 @@ void get_cpuconfig(void) break; case CPU_Z14: printf("#define Z14\n"); + printf("#define L1_DATA_SIZE 131072\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L1_DATA_ASSOCIATIVE 8\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); break; } } diff --git a/ctest.c b/ctest.c index 0571e9e028..5e869b901f 100644 --- a/ctest.c +++ b/ctest.c @@ -113,7 +113,7 @@ ARCH_X86 ARCH_X86_64 #endif -#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) ARCH_POWER #endif diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 24b881a93b..43eeb40d25 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; diff --git a/driver/others/Makefile b/driver/others/Makefile index 
3dc2e7c1ba..d4b5c26d53 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) COMMONOBJS += dynamic_arm64.$(SUFFIX) else +ifeq ($(ARCH),power) +COMMONOBJS += dynamic_power.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) else +ifeq ($(ARCH),power) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bae344c593..0b38ee3658 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ + // Could also just use WaitForMultipleObjects WaitForSingleObject(blas_threads[i], 5); //INFINITE); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP TerminateThread(blas_threads[i],0); #endif + CloseHandle(blas_threads[i]); } + CloseHandle(pool.filled); + CloseHandle(pool.killed); + blas_server_avail = 0; } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1f67dc5215..045fc65b8c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define VENDOR_INTEL 1 #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -304,9 +305,49 @@ int support_avx(){ #endif } +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & 32) != 32){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ @@ -329,6 +370,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){ } //Intel Haswell if (model == 12 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 13) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){ case 4: //Intel Haswell if (model == 5 || model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 7 || model == 15) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -457,72 +514,86 @@ static gotoblas_t *get_coretype(void){ case 5: //Intel Broadwell if (model == 6) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
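Every Intel model from Haswell onward now walks the same runtime ladder instead of fixing the choice at compile time with NO_AVX2 ifdefs; condensed to a sketch (the Skylake X case additionally tries support_avx512() -> gotoblas_SKYLAKEX first):

if (support_avx2())
    return &gotoblas_HASWELL;             /* full AVX2 kernel set */
if (support_avx()) {
    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
    return &gotoblas_SANDYBRIDGE;         /* AVX present, AVX2 missing */
}
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;                 /* no usable AVX at all */
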
} } if (model == 5) { // Intel Skylake X -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; - else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } -#endif + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } //Intel Skylake if (model == 14) { - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Phi Knights Landing if (model == 7) { - if(support_avx()) + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; - else{ + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Apollo Lake - if (model == 12) { + //Apollo Lake or Denverton + if (model == 12 || model == 15) { return &gotoblas_NEHALEM; } return NULL; case 6: if (model == 6) { // Cannon Lake -#ifndef NO_AVX512 - return &gotoblas_SKYLAKEX; -#else - if(support_avx()) -#ifndef NO_AVX2 - return &gotoblas_HASWELL; -#else - return &gotoblas_SANDYBRIDGE; -#endif - else - return &gotoblas_NEHALEM; -#endif + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } } return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake - if(support_avx()) + if(support_avx2()) return &gotoblas_HASWELL; - else{ + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } @@ -535,7 +606,7 @@ static gotoblas_t *get_coretype(void){ } } - if (vendor == VENDOR_AMD){ + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ if (family <= 0xe) { // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon cpuid(0x80000000, &eax, &ebx, &ecx, &edx); @@ -615,6 +686,13 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
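support_avx512() needs the XGETBV read in addition to the CPUID bit because the CPU flag only says the instructions exist; XCR0 says whether the OS actually saves the wider register state on context switch. The 0xe0 mask covers XCR0 bits 5-7: the opmask registers, the upper halves of ZMM0-15, and ZMM16-31. The OS-side half of the test, as a sketch built on the same xgetbv() helper the patch calls:

int eax, edx;
xgetbv(0, &eax, &edx);           /* XCR0 -> eax (low 32 bits) / edx (high) */
if ((eax & 0xe0) == 0xe0) {
    /* full AVX-512 state is context-switched; SKYLAKEX kernels are safe */
}
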
+ } }else { return &gotoblas_BARCELONA; } diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c new file mode 100644 index 0000000000..0c4a87a5e3 --- /dev/null +++ b/driver/others/dynamic_power.c @@ -0,0 +1,102 @@ + +#include "common.h" + +extern gotoblas_t gotoblas_POWER6; +extern gotoblas_t gotoblas_POWER8; +extern gotoblas_t gotoblas_POWER9; + +extern void openblas_warning(int verbose, const char *msg); + +static char *corename[] = { + "unknown", + "POWER6", + "POWER8", + "POWER9" +}; + +#define NUM_CORETYPES 4 + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_POWER6) return corename[1]; + if (gotoblas == &gotoblas_POWER8) return corename[2]; + if (gotoblas == &gotoblas_POWER9) return corename[3]; + return corename[0]; +} + +static gotoblas_t *get_coretype(void) { + + if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) + return &gotoblas_POWER6; + if (__builtin_cpu_is("power8")) + return &gotoblas_POWER8; + if (__builtin_cpu_is("power9")) + return &gotoblas_POWER9; + return NULL; +} + +static gotoblas_t *force_coretype(char * coretype) { + + int i ; + int found = -1; + char message[128]; + + for ( i = 0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 1: return (&gotoblas_POWER6); + case 2: return (&gotoblas_POWER8); + case 3: return (&gotoblas_POWER9); + default: return NULL; + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); +} + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to POWER8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_POWER8; + } + + if (gotoblas && gotoblas -> init) { + strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); + exit(1); + } +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/memory.c b/driver/others/memory.c index 6f7a7db825..ac8545f350 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -198,45 +198,68 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -1073,11 +1096,6 @@ static volatile int memory_initialized = 0; } free(table); } -#if defined(OS_WINDOWS) - TlsFree(local_storage_key); -#else - pthread_key_delete(local_storage_key); -#endif } static void blas_memory_init(){ @@ -1295,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { free(map_address); } +#ifdef SMP +void blas_thread_memory_cleanup(void) { + blas_memory_cleanup((void*)get_memory_table()); +} +#endif + + void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); @@ -1304,7 +1329,7 @@ void blas_shutdown(void){ /* Only cleanupIf we were built for threading and TLS was initialized */ if (local_storage_key) #endif - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1491,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) { blas_shutdown(); +#if defined(SMP) +#if defined(OS_WINDOWS) + TlsFree(local_storage_key); +#else + pthread_key_delete(local_storage_key); +#endif +#endif + #ifdef PROFILE moncontrol (0); #endif @@ -1526,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser break; case DLL_THREAD_DETACH: #if defined(SMP) - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #endif break; case DLL_PROCESS_DETACH: @@ -1600,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) { #endif #else +/* USE_TLS / COMPILE_TLS not set */ + #include -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 @@ -1616,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) { #include #include -#ifndef OS_WINDOWS +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1636,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) { #include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) 
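The get_num_procs() rewrite above fixes two distinct problems: the old code handed an uninitialized cpu_set_t pointer to sched_getaffinity(), and a fixed-size cpu_set_t cannot describe affinity masks on systems with more than CPU_SETSIZE (1024) CPUs. The corrected control flow, reduced to a sketch of the glibc >= 2.7 path (the real code additionally clamps so the affinity count can only lower the sysconf value, never raise it):

int nums = sysconf(_SC_NPROCESSORS_CONF);
cpu_set_t set;
if (nums < CPU_SETSIZE) {                /* common case: mask fits on the stack */
    if (sched_getaffinity(0, sizeof(set), &set) == 0)
        nums = CPU_COUNT(&set);
} else {                                 /* huge machines: dynamically sized mask */
    cpu_set_t *setp = CPU_ALLOC(nums);
    size_t size = CPU_ALLOC_SIZE(nums);
    if (setp != NULL) {
        if (sched_getaffinity(0, size, setp) == 0)
            nums = CPU_COUNT_S(size, setp);
        CPU_FREE(setp);
    }
}
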
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -1675,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) { #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#else +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1701,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {}; int get_num_procs(void); #else int get_num_procs(void) { + static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -1753,7 +1816,7 @@ int get_num_procs(void) { return nums; } #endif - + #ifdef OS_HAIKU int get_num_procs(void) { static int nums = 0; @@ -1790,7 +1853,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -1867,7 +1930,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. 
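The fork-handler hunk below only widens the preprocessor condition so the hook is also installed on Cygwin, where fork() actually exists; the mechanism itself is a single pthread_atfork() prepare hook, shown here for context:

/* The prepare handler runs in the parent immediately before fork();
 * shutting the worker pool down there means the child never inherits
 * stale worker threads, and the pool is rebuilt on demand by the next
 * BLAS call. */
err = pthread_atfork((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
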
-#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -1880,7 +1943,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1888,11 +1951,11 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif - blas_goto_num = 0; + // blas_goto_num = 0; #ifndef USE_OPENMP blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -1904,7 +1967,7 @@ int blas_get_cpu_number(void){ #endif - blas_omp_num = 0; + // blas_omp_num = 0; blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; @@ -1912,7 +1975,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1999,11 +2062,15 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef OS_LINUX @@ -2145,14 +2212,18 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif - LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif } - UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -2520,7 +2591,7 @@ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; + int mypos = 0; #endif void *map_address; @@ -2551,6 +2622,11 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + +#if defined(USE_OPENMP) + if (!memory_initialized) { +#endif + LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { @@ -2586,6 +2662,9 @@ void *blas_memory_alloc(int procpos){ } UNLOCK_COMMAND(&alloc_lock); +#if defined(USE_OPENMP) + } +#endif #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2600,13 +2679,17 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && 
(memory[position].pos == mypos)) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -// blas_lock(&memory[position].lock); - +#else + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -// blas_unlock(&memory[position].lock); +#else + blas_unlock(&memory[position].lock); +#endif } position ++; @@ -2618,21 +2701,26 @@ void *blas_memory_alloc(int procpos){ position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif do { -/* if (!memory[position].used) { */ -/* blas_lock(&memory[position].lock);*/ - +#if defined(USE_OPENMP) + if (!memory[position].used) { + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; -/* blas_unlock(&memory[position].lock);*/ -/* } */ - +#if defined(USE_OPENMP) + blas_unlock(&memory[position].lock); + } +#endif position ++; } while (position < NUM_BUFFERS); - UNLOCK_COMMAND(&alloc_lock); - +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif goto error; allocation : @@ -2642,10 +2730,11 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ - +#else + blas_unlock(&memory[position].lock); +#endif if (!memory[position].addr) { do { #ifdef DEBUG @@ -2690,9 +2779,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif memory[position].addr = map_address; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -2746,8 +2839,9 @@ void blas_memory_free(void *free_area){ #endif position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); - +#endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; @@ -2761,7 +2855,9 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -2776,8 +2872,9 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#endif return; } diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index eca494dca3..81648fb7c3 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#if defined(_WIN32) && defined(_MSC_VER) -#if _MSC_VER < 1900 -#define snprintf _snprintf -#endif -#endif - static char* openblas_config_str="" "OpenBLAS " VERSION diff --git a/exports/Makefile b/exports/Makefile index 3a5f77db3d..b1348bd4ac 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -141,6 +141,14 @@ else $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif + +ifeq ($(F_COMPILER), INTEL) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else + ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -152,6 +160,7 @@ else -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +endif endif rm -f linktest diff --git a/exports/dllinit.c b/exports/dllinit.c index 02ff092e99..4a05c0e146 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -40,15 +40,25 @@ void gotoblas_init(void); void gotoblas_quit(void); +#if defined(SMP) && defined(USE_TLS) +void blas_thread_memory_cleanup(void); +#endif BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { - - if (reason == DLL_PROCESS_ATTACH) { - gotoblas_init(); - } - - if (reason == DLL_PROCESS_DETACH) { - gotoblas_quit(); + switch(reason) { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: +#if defined(SMP) && defined(USE_TLS) + blas_thread_memory_cleanup(); +#endif + break; } return TRUE; diff --git a/getarch.c b/getarch.c index 146f1f36fb..4d960356c2 100644 --- a/getarch.c +++ b/getarch.c @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#else +#define NO_AVX512 +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif +#endif #ifdef FORCE_ATOM #define FORCE @@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER8" #endif +#if defined(FORCE_POWER9) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER9" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER9 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power9" +#define CORENAME "POWER9" +#endif #ifdef FORCE_PPCG4 #define FORCE @@ -1046,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_TSV110 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "TSV110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTSV110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "tsv110" +#define CORENAME "TSV110" +#else +#endif + + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" @@ -1066,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z13" #endif +#ifdef FORCE_Z14 +#define FORCE +#define ARCHITECTURE "ZARCH" +#define SUBARCHITECTURE "Z14" +#define ARCHCONFIG "-DZ14 " \ + "-DDTB_DEFAULT_ENTRIES=64" +#define LIBNAME "z14" +#define CORENAME "Z14" +#endif + #ifndef FORCE +#ifdef USER_TARGET +#error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" +#endif + #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) #ifndef POWER diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 8b25344c01..f76d5c13f7 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES rotm.c rotmg.c # N.B. 
these do not have complex counterparts rot.c asum.c + sum.c ) # these will have 'z' prepended for the complex version @@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") + GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") @@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") + GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () diff --git a/interface/Makefile b/interface/Makefile index 20ec74e9ee..f0577796d5 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -25,7 +25,7 @@ SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ scopy.$(SUFFIX) sscal.$(SUFFIX) \ sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ - sasum.$(SUFFIX) snrm2.$(SUFFIX) \ + sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ @@ -51,7 +51,7 @@ DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \ ddot.$(SUFFIX) \ - dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ @@ -76,7 +76,7 @@ CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ - scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ scamax.$(SUFFIX) icamax.$(SUFFIX) \ scamin.$(SUFFIX) icamin.$(SUFFIX) \ csrot.$(SUFFIX) crotg.$(SUFFIX) \ @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ - dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ dzamax.$(SUFFIX) izamax.$(SUFFIX) \ dzamin.$(SUFFIX) izamin.$(SUFFIX) \ zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ @@ -146,7 +146,7 @@ QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ qdot.$(SUFFIX) \ - qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -168,7 +168,7 @@ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ - qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -203,7 
+203,7 @@ ifdef QUAD_PRECISION QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ - qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -224,7 +224,7 @@ QBLAS3OBJS = \ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ - qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -263,7 +263,8 @@ CSBLAS1OBJS = \ cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ - cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -280,7 +281,8 @@ CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ - cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -300,7 +302,8 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) + cblas_caxpby.$(SUFFIX) \ + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -326,7 +329,9 @@ CZBLAS1OBJS = \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) + cblas_zaxpby.$(SUFFIX) \ + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) + CZBLAS2OBJS = \ cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ @@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) +ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -1383,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : 
imax.c cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1395,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1402,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) diff --git a/interface/sum.c b/interface/sum.c new file mode 100644 index 0000000000..dfdcc5dcc5 --- /dev/null +++ b/interface/sum.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)SUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ +#endif + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = SUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/trmv.c b/interface/trmv.c index 7c40ae976f..2e52527a3c 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP -/* nthreads = num_cpu_avail(2); + nthreads = num_cpu_avail(2); -FIXME trmv_thread was found to be broken, see issue 1332 */ - nthreads = 1; - if (nthreads == 1) { #endif diff --git a/interface/trsm.c b/interface/trsm.c index 5c2750e791..715c83a1f3 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -81,6 +81,12 @@ #endif #endif +#ifndef COMPLEX +#define SMP_FACTOR 256 +#else +#define SMP_FACTOR 128 +#endif + static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, @@ -198,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, if (side < 0) info = 1; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); return; } @@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) +/* + if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; +*/ + if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) + args.nthreads = 1; else args.nthreads = num_cpu_avail(3); diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 0e16632e06..4c47e9e913 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; -/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ - nthreads = 1; - if(nthreads > 1) { buffer_size = n > 16 ? 
0 : n * 4 + 40; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 2a330df4e9..ad15b8f250 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index a8f9cf0974..970703230a 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -340,6 +340,32 @@ ifndef XSCALKERNEL XSCALKERNEL = zscal.S endif +### SUM ### + +ifndef SSUMKERNEL +SSUMKERNEL = sum.S +endif + +ifndef DSUMKERNEL +DSUMKERNEL = sum.S +endif + +ifndef CSUMKERNEL +CSUMKERNEL = zsum.S +endif + +ifndef ZSUMKERNEL +ZSUMKERNEL = zsum.S +endif + +ifndef QSUMKERNEL +QSUMKERNEL = sum.S +endif + +ifndef XSUMKERNEL +XSUMKERNEL = zsum.S +endif + ### SWAP ### ifndef SSWAPKERNEL @@ -453,7 +479,7 @@ endif SBLASOBJS += \ samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ - sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ + sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ saxpby_k$(TSUFFIX).$(SUFFIX) @@ -463,31 +489,32 @@ DBLASOBJS += \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ - daxpby_k$(TSUFFIX).$(SUFFIX) + daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ - qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) + qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ + qsum_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) 
csrot_k$(TSUFFIX).$(SUFFIX) \ - cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) + cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ - zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) + zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ - xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) + xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) ### AMAX ### @@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ - +### ASUM ### $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ @@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ +### SUM ### +$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +### AXPY ### $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 9258f216dd..f83def47b4 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif -ifeq ($(TARGET), GENERIC) +ifeq ($(CORE), GENERIC) USE_TRMM = 1 endif @@ -44,10 +44,18 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif +ifeq ($(CORE), POWER9) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif +ifeq ($(CORE), Z14) +USE_TRMM = 1 +endif + diff 
--git a/kernel/alpha/sum.S b/kernel/alpha/sum.S new file mode 100644 index 0000000000..3902817a70 --- /dev/null +++ b/kernel/alpha/sum.S @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fmov a2, t2 + ADD s3, t3, s3 + fmov a3, t3 + + ADD s0, t0, s0 + fmov a4, t0 + ADD s1, t1, s1 + fmov a5, t1 + ADD s2, t2, s2 + fmov a6, t2 + ADD s3, t3, s3 + fmov a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fmov a0, t0 + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/alpha/zsum.S b/kernel/alpha/zsum.S new file mode 100644 index 0000000000..1ad0eb137d --- /dev/null +++ b/kernel/alpha/zsum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addq INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fmov a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fmov a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fmov a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fmov a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fmov a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fmov a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fmov a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fmov a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fmov a2, t2 + ADD s3, t3, s3 + fmov a3, t3 + + ADD s0, t0, s0 + fmov a4, t0 + ADD s1, t1, s1 + fmov a5, t1 + ADD s2, t2, s2 + fmov a6, t2 + ADD s3, t3, s3 + fmov a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fmov a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fmov a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5 index 10808e2d93..e977dda3a0 100644 --- a/kernel/arm/KERNEL.ARMV5 +++ b/kernel/arm/KERNEL.ARMV5 @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = 
../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 960dae67b0..b773a5ba03 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SSUMKERNEL = sum_vfp.S +DSUMKERNEL = sum_vfp.S + SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index 598cba3871..ffc65226ed 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c new file mode 100644 index 0000000000..7b78ec61a4 --- /dev/null +++ b/kernel/arm/sum.c @@ -0,0 +1,51 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* trivial copy of asum.c with the ABS() removed * +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += x[i]; + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/arm/sum_vfp.S b/kernel/arm/sum_vfp.S new file mode 100644 index 0000000000..d33d99ed3e --- /dev/null +++ b/kernel/arm/sum_vfp.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 
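+ // note: in this complex double-precision variant, d0 collects the real parts and d1 the imaginary parts; keeping two independent accumulators breaks the serial FADD dependency chain, and the epilogue adds d0+d1 to form the scalar result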
+ vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + vldmia.f64 X!, { d4 - d5 } + vadd.f64 d0 , d0, d4 + vldmia.f64 X!, { d6 - d7 } + vadd.f64 d1 , d1, d5 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + + vldmia.f64 X!, { d4 } + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 -d5 } + vadd.f64 d0 , d0, d4 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + vldmia.f32 X!, { s4 - s5 } + vadd.f32 s0 , s0, s4 + vldmia.f32 X!, { s6 - s7 } + vadd.f32 s1 , s1, s5 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + + vldmia.f32 X!, { s4 } + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 -s5 } + vadd.f32 s0 , s0, s4 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 + vmov s1, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + 
vadd.f32 s0 , s0, s1 // set return value +#endif + +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/zsum.c b/kernel/arm/zsum.c new file mode 100644 index 0000000000..cd24f99957 --- /dev/null +++ b/kernel/arm/zsum.c @@ -0,0 +1,57 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* trivial copy of zasum.c with the ABS() removed * +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +#define CSUM1(x,i) x[i]+x[i+1] + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CSUM1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 new file mode 100644 index 0000000000..04d6940d7a --- /dev/null +++ b/kernel/arm64/KERNEL.TSV110 @@ -0,0 +1,175 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), 
$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + diff --git a/kernel/arm64/csum.S b/kernel/arm64/csum.S new file mode 100644 index 0000000000..90746bc392 --- /dev/null +++ b/kernel/arm64/csum.S @@ -0,0 +1,164 @@ +/******************************************************************************* +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2s}, [X], #8 + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] + add X, X, #64 + + PRFM PLDL1KEEP, [X, #1024] + + fadd v1.4s, v1.4s, v2.4s + fadd v3.4s, v3.4s, v4.4s + fadd v0.4s, v0.4s, v1.4s + fadd v0.4s, v0.4s, v3.4s +.endm + +.macro KERNEL_F8_FINALIZE + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 +.endm + +.macro KERNEL_S1 + ld1 {v1.2s}, [X], INC_X + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + fmov s1, SUMF + + cmp N, xzr + ble .Lcsum_kernel_L999 + cmp INC_X, xzr + ble .Lcsum_kernel_L999 + + cmp INC_X, #1 + bne .Lcsum_kernel_S_BEGIN + +.Lcsum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq .Lcsum_kernel_F1 + +.Lcsum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne .Lcsum_kernel_F8 + + KERNEL_F8_FINALIZE + +.Lcsum_kernel_F1: + + ands I, N, #7 + ble .Lcsum_kernel_L999 + +.Lcsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lcsum_kernel_F10 + +.Lcsum_kernel_L999: + ret + +.Lcsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lcsum_kernel_S1 + +.Lcsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lcsum_kernel_S4 + +.Lcsum_kernel_S1: + + ands I, N, #3 + ble .Lcsum_kernel_L999 + +.Lcsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lcsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/sum.S b/kernel/arm64/sum.S new file mode 100644 index 0000000000..16d0dc4e44 --- /dev/null +++ b/kernel/arm64/sum.S @@ -0,0 +1,186 @@ +/******************************************************************************* 
+Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] + fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] + fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] + PRFM PLDL1KEEP, [X, #1024] +#else // DOUBLE + ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] + add X, X, #64 + + PRFM PLDL1KEEP, [X, #1024] + + fadd v2.2d, v2.2d, v3.2d + fadd v4.2d, v4.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v4.2d +#endif +.endm + +.macro KERNEL_F8_FINALIZE +#if !defined(DOUBLE) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +#else + faddp SUMF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 +#if !defined(DOUBLE) + fmov s1, SUMF +#else + fmov d1, SUMF +#endif + + cmp N, xzr + ble .Lsum_kernel_L999 + cmp INC_X, xzr + ble .Lsum_kernel_L999 + + cmp INC_X, #1 + bne 
.Lsum_kernel_S_BEGIN + +.Lsum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq .Lsum_kernel_F1 + +.Lsum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne .Lsum_kernel_F8 + + KERNEL_F8_FINALIZE + +.Lsum_kernel_F1: + + ands I, N, #7 + ble .Lsum_kernel_L999 + +.Lsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lsum_kernel_F10 + +.Lsum_kernel_L999: + ret + +.Lsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lsum_kernel_S1 + +.Lsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lsum_kernel_S4 + +.Lsum_kernel_S1: + + ands I, N, #3 + ble .Lsum_kernel_L999 + +.Lsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/zsum.S b/kernel/arm64/zsum.S new file mode 100644 index 0000000000..67ea3cb4d0 --- /dev/null +++ b/kernel/arm64/zsum.S @@ -0,0 +1,158 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2d}, [X], #16 + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F4 + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 + + fadd v1.2d, v1.2d, v2.2d + fadd v3.2d, v3.2d, v4.2d + + fadd v0.2d, v0.2d, v1.2d + fadd v0.2d, v0.2d, v3.2d + + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F4_FINALIZE + faddp SUMF, v0.2d +.endm + +.macro INIT_S + lsl INC_X, INC_X, #4 +.endm + +.macro KERNEL_S1 + ld1 {v1.2d}, [X], INC_X + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + + cmp N, xzr + ble .Lzsum_kernel_L999 + cmp INC_X, xzr + ble .Lzsum_kernel_L999 + + cmp INC_X, #1 + bne .Lzsum_kernel_S_BEGIN + +.Lzsum_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq .Lzsum_kernel_F1 + +.Lzsum_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne .Lzsum_kernel_F4 + + KERNEL_F4_FINALIZE + +.Lzsum_kernel_F1: + + ands I, N, #3 + ble .Lzsum_kernel_L999 + +.Lzsum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne .Lzsum_kernel_F10 + +.Lzsum_kernel_L999: + ret + +.Lzsum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble .Lzsum_kernel_S1 + +.Lzsum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne .Lzsum_kernel_S4 + +.Lzsum_kernel_S1: + + ands I, N, #3 + ble .Lzsum_kernel_L999 + +.Lzsum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne .Lzsum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL index 10a7e61e25..870aac473e 100644 --- a/kernel/ia64/KERNEL +++ b/kernel/ia64/KERNEL @@ -60,6 +60,10 @@ CASUMKERNEL = asum.S ZASUMKERNEL = asum.S XASUMKERNEL = asum.S +CSUMKERNEL = sum.S +ZSUMKERNEL = sum.S +XSUMKERNEL = sum.S + CNRM2KERNEL = nrm2.S ZNRM2KERNEL = nrm2.S XNRM2KERNEL = nrm2.S diff --git a/kernel/ia64/sum.S b/kernel/ia64/sum.S new file mode 100644 index 0000000000..561d5d7715 --- /dev/null +++ b/kernel/ia64/sum.S @@ -0,0 +1,358 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2019, The OpenBLAS project */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define INCX16 r21 + +#define PR r30 +#define ARLC r31 + +#define N r32 +#define X r33 +#define INCX r34 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mmi + cmp.lt p0, p6 = r0, INCX + cmp.lt p0, p7 = r0, N + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + adds I = -1, I + mov f10 = f0 + mov PR = pr + } + { .mfi + cmp.eq p9, p0 = r0, J + mov f9 = f0 + tbit.z p0, p12 = N, 3 - COMPADD + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + cmp.ne p17, p0 = r0, r0 + mov ar.ec= 3 + } + { .mfi + cmp.ne p18, p0 = r0, r0 + mov f11 = f0 + shl INCX = INCX, BASE_SHIFT + COMPADD + } + ;; + { .mmi +#ifdef XDOUBLE + shladd INCX16 = INCX, (3 - COMPADD), r0 +#else + shladd INCX16 = INCX, (4 - COMPADD), r0 +#endif + cmp.ne p19, p0 = r0, r0 + mov ar.lc = I + } + { .mmb + cmp.gt p8 ,p0 = r0, I +#ifdef COMPLEX + adds INCX = - SIZE, INCX +#else + nop.m 0 +#endif + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X], STRIDE + } + { .mfb + (p19) FADD f8 = f8, f71 + } + ;; + { .mmf + (p16) LDFD f35 = [X], INCX + } + { .mfb + (p19) FADD f9 = f9, f74 + } + ;; + { .mmf + (p16) LDFD f38 = [X], STRIDE + } + { .mfb + (p19) FADD f10 = f10, f77 + } + ;; + { .mmf + (p16) LDFD f41 = [X], INCX + } + { .mfb + (p19) FADD f11 = f11, f80 + } + ;; + { .mmf + (p16) LDFD f44 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f34 + } + ;; + { .mmf + (p16) LDFD f47 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f37 + } + ;; + { .mmf + (p16) LDFD f50 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f40 + } + ;; + { .mmf + (p16) 
LDFD f53 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f43 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f56 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f46 + } + ;; + { .mmf + (p16) LDFD f59 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f49 + } + ;; + { .mmf + (p16) LDFD f62 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f52 + } + ;; + { .mmf + (p16) LDFD f65 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f55 + } + ;; + { .mmf + (p16) LDFD f68 = [X], STRIDE + } + { .mfb + (p18) FADD f8 = f8, f58 + } + ;; + { .mmf + (p16) LDFD f71 = [X], INCX + } + { .mfb + (p18) FADD f9 = f9, f61 + } + ;; + { .mmf + (p16) LDFD f74 = [X], STRIDE + } + { .mfb + (p18) FADD f10 = f10, f64 + } + ;; + { .mmf + (p16) LDFD f77 = [X], INCX + } + { .mfb + (p18) FADD f11 = f11, f67 + br.ctop.sptk.few .L52 + } + ;; + FADD f8 = f8, f71 + FADD f9 = f9, f74 + FADD f10 = f10, f77 + FADD f11 = f11, f80 + .align 32 + ;; +.L55: + (p12) LDFD f32 = [X], STRIDE + (p9) br.cond.dptk .L998 + ;; + (p12) LDFD f33 = [X], INCX + ;; + (p12) LDFD f34 = [X], STRIDE + ;; + (p12) LDFD f35 = [X], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + ;; + (p12) LDFD f36 = [X], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + ;; + (p12) LDFD f37 = [X], INCX +#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + ;; + (p12) LDFD f38 = [X], STRIDE + ;; + (p12) LDFD f39 = [X], INCX + ;; + (p13) LDFD f40 = [X], STRIDE + ;; + (p13) LDFD f41 = [X], INCX + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) FADD f8 = f8, f32 + ;; + (p13) LDFD f43 = [X], INCX + (p12) FADD f9 = f9, f33 + ;; + (p14) LDFD f44 = [X], STRIDE + (p12) FADD f10 = f10, f34 + ;; + (p14) LDFD f45 = [X], INCX + (p12) FADD f11 = f11, f35 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p12) FADD f8 = f8, f36 + ;; + (p12) FADD f9 = f9, f37 + (p12) FADD f10 = f10, f38 + (p12) FADD f11 = f11, f39 + ;; + (p13) FADD f8 = f8, f40 + (p13) FADD f9 = f9, f41 +#ifndef COMPLEX +#endif + (p13) FADD f10 = f10, f42 + ;; + (p13) FADD f11 = f11, f43 + (p14) FADD f8 = f8, f44 + (p14) FADD f9 = f9, f45 +#ifndef COMPLEX + (p15) FADD f10 = f10, f46 +#endif + ;; + .align 32 + +.L998: + { .mfi + FADD f8 = f8, f9 + mov ar.lc = ARLC + } + { .mmf + FADD f10 = f10, f11 + } + ;; + { .mii + mov pr = PR, -65474 + } + ;; + { .mfb + FADD f8 = f8, f10 + br.ret.sptk.many b0 + } + EPILOGUE diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 1ab1930698..9a6e06d673 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + ifdef HAVE_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c index d9b283d2d9..bf130613bf 100644 --- a/kernel/mips/imin.c +++ b/kernel/mips/imin.c @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; diff --git a/kernel/mips/sum.c b/kernel/mips/sum.c new file mode 100644 index 0000000000..8ce3812a19 --- /dev/null +++ b/kernel/mips/sum.c @@ -0,0 +1,47 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += x[i]; + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/mips/zsum.c b/kernel/mips/zsum.c new file mode 100644 index 0000000000..01f8ced7c7 --- /dev/null +++ b/kernel/mips/zsum.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#define CSUM1(x,i) x[i]+x[i+1] + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CSUM1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/mips64/sum.S b/kernel/mips64/sum.S new file mode 100644 index 0000000000..261630d49d --- /dev/null +++ b/kernel/mips64/sum.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + li TEMP, SIZE + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + MOV t1, a1 + LD a6, 5 * SIZE(X) + MOV t2, a2 + LD a7, 6 * SIZE(X) + MOV t3, a3 + + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L13 + LD a8, 7 * SIZE(X) + .align 3 + +.L12: + ADD s1, s1, t1 + LD a1, 8 * SIZE(X) + + MOV t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 9 * SIZE(X) + + MOV t2, a6 + NOP + + ADD s1, s1, t3 + LD a3, 10 * SIZE(X) + + MOV t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 11 * SIZE(X) + + MOV t4, a8 + daddiu X, X, 8 * SIZE + + ADD s1, s1, t1 + LD a5, 4 * SIZE(X) + + MOV t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 5 * SIZE(X) + + MOV t2, a2 + NOP + + ADD s1, s1, t3 + LD a7, 6 * SIZE(X) + + MOV t3, a3 + NOP + + ADD s2, s2, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + MOV t4, a4 + .align 3 + +.L13: + ADD s1, s1, t1 + daddiu X, X, 8 * SIZE + + MOV t1, a5 + NOP + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + MOV t1, a1 + + ADD s1, s1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + MOV t1, a1 + LD a7, 0 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + MOV t3, a3 + LD a8, 0 * SIZE(X) + + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + MOV t1, a5 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a2, 0 * SIZE(X) + + MOV t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + MOV t3, a7 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a4, 0 * SIZE(X) + + MOV t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + MOV t1, a1 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a6, 0 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + MOV t3, a3 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a8, 0 * SIZE(X) + + MOV t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + ADD s1, s1, t1 + MOV t1, a5 + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + MOV t1, a1 + daddu X, X, INCX + + bgtz I, .L26 + ADD s1, s1, t1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/zsum.S 
b/kernel/mips64/zsum.S new file mode 100644 index 0000000000..129b97900e --- /dev/null +++ b/kernel/mips64/zsum.S @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MOV t1, a1 + MOV t2, a2 + + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + MOV t3, a3 + MOV t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + MOV t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 1 * SIZE(X) + + MOV t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + MOV t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 1 * SIZE(X) + + MOV t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + MOV t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 1 * SIZE(X) + + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + MOV t3, a3 + LD a8, 1 * SIZE(X) + + ADD s2, s2, t4 + daddu X, X, INCX + + bgtz I, .L23 + MOV t4, a4 + .align 3 + +.L24: + ADD s1, s1, t1 + MOV t1, a5 + + ADD s2, s2, t2 + MOV t2, a6 + + ADD s1, s1, t3 + MOV t3, a7 + + ADD s2, s2, t4 + MOV t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + MOV t1, a1 + daddiu I, I, -1 + MOV t2, a2 + daddu X, X, INCX + + ADD s1, s1, t1 + bgtz I, .L26 + ADD s2, s2, t2 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 1aa0610785..43f004fbbb 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o 
-CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -#ISAMAXKERNEL = ../arm/iamax.c +ISAMAXKERNEL = isamax.c IDAMAXKERNEL = idamax.c -#ICAMAXKERNEL = ../arm/izamax.c -IZAMAXKERNEL = izamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c # -#ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = idamin.c -#ICAMINKERNEL = ../arm/izamin.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -#SAXPYKERNEL = ../arm/axpy.c +SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#CAXPYKERNEL = ../arm/zaxpy.c +CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -#CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -CROTKERNEL = zrot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c @@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c ZSWAPKERNEL = zswap.c # -#SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n.c -#CGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # -#SGEMVTKERNEL = ../arm/gemv_t.c +SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t.c -#CGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 new file mode 100644 index 0000000000..e166f252fc --- /dev/null +++ b/kernel/power/KERNEL.POWER9 @@ -0,0 +1,184 @@ +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = strmm_kernel_16x8_power8.S +DTRMMKERNEL = dgemm_kernel_power9.S +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S + +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S 
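+# (the DGEMM copy routines here are shared with the POWER8 target; only the
+#  compute kernel, dgemm_kernel_power9.S, is POWER9-specific)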
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = cgemm_tcopy_8_power8.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
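+# (the commented-out SSE3 3M kernels below look like x86 leftovers; the
+#  generic zgemm3mkernel_dump.c assignments at the end of this file are
+#  what actually take effect)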
+#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c +# +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/casum.c b/kernel/power/casum.c index d1108581d3..a9ece07685 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "casum_microk_power8.c" #endif diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c new file mode 100644 index 0000000000..4bdf13c34e --- /dev/null +++ b/kernel/power/caxpy.c @@ -0,0 +1,145 @@ +/* +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+
+#ifndef HAVE_ASM_KERNEL
+#include <altivec.h>
+static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
+{
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+	register __vector float valpha_r = {alpha_r, alpha_r, alpha_r, alpha_r};
+	register __vector float valpha_i = {-alpha_i, alpha_i, -alpha_i, alpha_i};
+
+#else
+	register __vector float valpha_r = {alpha_r, -alpha_r, alpha_r, -alpha_r};
+	register __vector float valpha_i = {alpha_i, alpha_i, alpha_i, alpha_i};
+#endif
+
+	__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+	register __vector float *vy = (__vector float *) y;
+	register __vector float *vx = (__vector float *) x;
+	BLASLONG i = 0;
+	for (; i < n/2; i += 8) {
+
+		register __vector float vy_0 = vy[i];
+		register __vector float vy_1 = vy[i + 1];
+		register __vector float vy_2 = vy[i + 2];
+		register __vector float vy_3 = vy[i + 3];
+		register __vector float vy_4 = vy[i + 4];
+		register __vector float vy_5 = vy[i + 5];
+		register __vector float vy_6 = vy[i + 6];
+		register __vector float vy_7 = vy[i + 7];
+		register __vector float vx_0 = vx[i];
+		register __vector float vx_1 = vx[i + 1];
+		register __vector float vx_2 = vx[i + 2];
+		register __vector float vx_3 = vx[i + 3];
+		register __vector float vx_4 = vx[i + 4];
+		register __vector float vx_5 = vx[i + 5];
+		register __vector float vx_6 = vx[i + 6];
+		register __vector float vx_7 = vx[i + 7];
+		vy_0 += vx_0*valpha_r;
+		vy_1 += vx_1*valpha_r;
+		vy_2 += vx_2*valpha_r;
+		vy_3 += vx_3*valpha_r;
+		vy_4 += vx_4*valpha_r;
+		vy_5 += vx_5*valpha_r;
+		vy_6 += vx_6*valpha_r;
+		vy_7 += vx_7*valpha_r;
+		vx_0 = vec_perm(vx_0, vx_0, swap_mask);
+		vx_1 = vec_perm(vx_1, vx_1, swap_mask);
+		vx_2 = vec_perm(vx_2, vx_2, swap_mask);
+		vx_3 = vec_perm(vx_3, vx_3, swap_mask);
+		vx_4 = vec_perm(vx_4, vx_4, swap_mask);
+		vx_5 = vec_perm(vx_5, vx_5, swap_mask);
+		vx_6 = vec_perm(vx_6, vx_6, swap_mask);
+		vx_7 = vec_perm(vx_7, vx_7, swap_mask);
+		vy_0 += vx_0*valpha_i;
+		vy_1 += vx_1*valpha_i;
+		vy_2 += vx_2*valpha_i;
+		vy_3 += vx_3*valpha_i;
+		vy_4 += vx_4*valpha_i;
+		vy_5 += vx_5*valpha_i;
+		vy_6 += vx_6*valpha_i;
+		vy_7 += vx_7*valpha_i;
+		vy[i] = vy_0;
+		vy[i + 1] = vy_1;
+		vy[i + 2] = vy_2;
+		vy[i + 3] = vy_3;
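+		/* second half of the store burst: each iteration writes back eight
+		   vectors, i.e. 16 complex (32 float) elements, which matches the
+		   n & -16 chunking done by the caller */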
+		vy[i + 4] = vy_4;
+		vy[i + 5] = vy_5;
+		vy[i + 6] = vy_6;
+		vy[i + 7] = vy_7;
+
+	}
+}
+#endif
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
+	BLASLONG i = 0;
+	BLASLONG ix = 0, iy = 0;
+	if (n <= 0) return (0);
+	if ((inc_x == 1) && (inc_y == 1)) {
+		BLASLONG n1 = n & -16;
+		if (n1) {
+			caxpy_kernel_16(n1, x, y, da_r, da_i);
+			ix = 2 * n1;
+		}
+		i = n1;
+		while (i < n) {
+#if !defined(CONJ)
+			y[ix]     += (da_r * x[ix]     - da_i * x[ix + 1]);
+			y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+#else
+			y[ix]     += (da_r * x[ix]     + da_i * x[ix + 1]);
+			y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+#endif
+			i++;
+			ix += 2;
+		}
+		return (0);
+
+	}
+	inc_x *= 2;
+	inc_y *= 2;
+	while (i < n) {
+#if !defined(CONJ)
+		y[iy]     += (da_r * x[ix]     - da_i * x[ix + 1]);
+		y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+#else
+		y[iy]     += (da_r * x[ix]     + da_i * x[ix + 1]);
+		y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+#endif
+		ix += inc_x;
+		iy += inc_y;
+		i++;
+	}
+	return (0);
+}
+
diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c
index ce7d674753..50df84cc50 100644
--- a/kernel/power/ccopy.c
+++ b/kernel/power/ccopy.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
-#if defined(POWER8)
+#if defined(POWER8) || defined(POWER9)
 #include "ccopy_microk_power8.c"
 #endif
diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c
new file mode 100644
index 0000000000..f86a33f228
--- /dev/null
+++ b/kernel/power/cdot.c
@@ -0,0 +1,164 @@
+/*Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "common.h"
+
+#ifndef HAVE_KERNEL_8
+#include <altivec.h>
+static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
+{
+	__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+	register __vector float *vy = (__vector float *) y;
+	register __vector float *vx = (__vector float *) x;
+	BLASLONG i = 0;
+	register __vector float vd_0  = { 0 };
+	register __vector float vd_1  = { 0 };
+	register __vector float vd_2  = { 0 };
+	register __vector float vd_3  = { 0 };
+	register __vector float vdd_0 = { 0 };
+	register __vector float vdd_1 = { 0 };
+	register __vector float vdd_2 = { 0 };
+	register __vector float vdd_3 = { 0 };
+	for (; i < n/2; i += 4) {
+
+		register __vector float vyy_0;
+		register __vector float vyy_1;
+		register __vector float vyy_2;
+		register __vector float vyy_3;
+
+		register __vector float vy_0 = vy[i];
+		register __vector float vy_1 = vy[i + 1];
+		register __vector float vy_2 = vy[i + 2];
+		register __vector float vy_3 = vy[i + 3];
+		register __vector float vx_0 = vx[i];
+		register __vector float vx_1 = vx[i + 1];
+		register __vector float vx_2 = vx[i + 2];
+		register __vector float vx_3 = vx[i + 3];
+		vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
+		vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
+		vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
+		vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
+
+		vd_0 += vx_0 * vy_0;
+		vd_1 += vx_1 * vy_1;
+		vd_2 += vx_2 * vy_2;
+		vd_3 += vx_3 * vy_3;
+
+		vdd_0 += vx_0 * vyy_0;
+		vdd_1 += vx_1 * vyy_1;
+		vdd_2 += vx_2 * vyy_2;
+		vdd_3 += vx_3 * vyy_3;
+
+	}
+	//aggregate
+	vd_0  = vd_0 + vd_1 + vd_2 + vd_3;
+	vdd_0 = vdd_0 + vdd_1 + vdd_2 + vdd_3;
+	//reverse and aggregate
+	vd_1  = vec_xxpermdi(vd_0, vd_0, 2);
+	vdd_1 = vec_xxpermdi(vdd_0, vdd_0, 2);
+	vd_2  = vd_0 + vd_1;
+	vdd_2 = vdd_0 + vdd_1;
+
+	dot[0] = vd_2[0];
+	dot[1] = vd_2[1];
+	dot[2] = vdd_2[0];
+	dot[3] = vdd_2[1];
+
+}
+#endif
+
+
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
+	BLASLONG i = 0;
+	BLASLONG ix = 0, iy = 0;
+	OPENBLAS_COMPLEX_FLOAT result;
+	FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
+
+	if (n <= 0) {
+		CREAL(result) = 0.0;
+		CIMAG(result) = 0.0;
+		return (result);
+
+	}
+
+	if ((inc_x == 1) && (inc_y == 1)) {
+
+		BLASLONG n1 = n & -8;
+		BLASLONG j = 0;
+
+		if (n1) {
+			cdot_kernel_8(n1, x, y, dot);
+			i = n1;
+			j = n1 << 1;
+		}
+
+		while (i < n) {
+
+			dot[0] += x[j]     * y[j];
+			dot[1] += x[j + 1] * y[j + 1];
+			dot[2] += x[j]     * y[j + 1];
+			dot[3] += x[j + 1] * y[j];
+
+			j += 2;
+			i++;
+
+		}
+
+	} else {
+		i = 0;
+		ix = 0;
+		iy = 0;
+		inc_x <<= 1;
+		inc_y <<= 1;
+		while (i < n) {
+
+			dot[0] += x[ix]     * y[iy];
+			dot[1] += x[ix + 1] * y[iy + 1];
+			dot[2] += x[ix]     * y[iy + 1];
+			dot[3] += x[ix + 1] * y[iy];
+
+			ix += inc_x;
+			iy += inc_y;
+			i++;
+
+		}
+	}
+
+#if !defined(CONJ)
+	CREAL(result) = dot[0] - dot[1];
+	CIMAG(result) = dot[2] + dot[3];
+#else
+	CREAL(result) = dot[0] + dot[1];
+	CIMAG(result) = dot[2] - dot[3];
+
+#endif
+
+	return (result);
+
+}
diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c
new file mode 100644
index 0000000000..cb01e196e4
--- /dev/null
+++ b/kernel/power/cgemv_n.c
@@ -0,0 +1,585 @@
+/***************************************************************************
+Copyright (c) 2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define NBMAX 1024
+
+
+static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+
+
+static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+
+	FLOAT *a0, *a1, *a2, *a3;
+	a0 = ap;
+	a1 = ap + lda;
+	a2 = a1 + lda;
+	a3 = a2 + lda;
+	__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+	register __vector float vx0_r = {x[0], x[0], x[0], x[0]};
+	register __vector float vx0_i = {-x[1], x[1], -x[1], x[1]};
+	register __vector float vx1_r = {x[2], x[2], x[2], x[2]};
+	register __vector float vx1_i = {-x[3], x[3], -x[3], x[3]};
+	register __vector float vx2_r = {x[4], x[4], x[4], x[4]};
+	register __vector float vx2_i = {-x[5], x[5], -x[5], x[5]};
+	register __vector float vx3_r = {x[6], x[6], x[6], x[6]};
+	register __vector float vx3_i = {-x[7], x[7], -x[7], x[7]};
+#else
+	register __vector float vx0_r = {x[0], -x[0], x[0], -x[0]};
+	register __vector float vx0_i = {x[1], x[1], x[1], x[1]};
+	register __vector float vx1_r = {x[2], -x[2], x[2], -x[2]};
+	register __vector float vx1_i = {x[3], x[3], x[3], x[3]};
+	register __vector float vx2_r = {x[4], -x[4], x[4], -x[4]};
+	register __vector float vx2_i = {x[5], x[5], x[5], x[5]};
+	register __vector float vx3_r = {x[6], -x[6], x[6], -x[6]};
+	register __vector float vx3_i = {x[7], x[7], x[7], x[7]};
+#endif
+	register __vector float *vy = (__vector float *) y;
+	register __vector float *vptr_a0 = (__vector float *) a0;
+	register __vector float *vptr_a1 = (__vector float *) a1;
+	register __vector float *vptr_a2 = (__vector float *) a2;
+	register __vector float *vptr_a3 = (__vector float *) a3;
+	BLASLONG i = 0;
+	for (; i < n / 2; i += 2) {
+		register __vector float vy_0 = vy[i];
+		register __vector float vy_1 = vy[i + 1];
+		register __vector float va0 = vptr_a0[i];
+		register __vector float va1 = vptr_a1[i];
+		register __vector float va2 = vptr_a2[i];
+		register __vector float va3 = vptr_a3[i];
+		register __vector float va0_1 = vptr_a0[i + 1];
+		register __vector float va1_1 =
vptr_a1[i + 1]; + register __vector float va2_1 = vptr_a2[i + 1]; + register __vector float va3_1 = vptr_a3[i + 1]; + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va1 = vptr_a1[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va1_1 = vptr_a1[i + 1]; + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vy = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + for (;i< n / 2; i+=2) { + register __vector float vy_0 = vy[i]; + register __vector float vy_1 = vy[i + 1]; + register __vector float va0 = vptr_a0[i]; + register __vector float va0_1 = vptr_a0[i + 1]; + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + } +} + + + 
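+/* Scalar sketch of the per-element arithmetic in the kernels above (for
+   clarity only, no-conjugate case): with a = (ar, ai) an element of the
+   matrix column and x = (xr, xi) the broadcast vector element,
+
+       y_r += ar * xr - ai * xi;
+       y_i += ai * xr + ar * xi;
+
+   vx0_r holds {xr, xr, xr, xr}, vx0_i holds {-xi, xi, -xi, xi}, and
+   swap_mask exchanges the real/imaginary words of the column vector, so
+   the two multiply-adds per vector form the full complex product. */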
+ +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i +static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; + vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1]; + vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; + vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1]; + vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * 
temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; + vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 
0.0,0.0,0.0}; + __vector float* va0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + + for (i = 0; i < n / 2; i+=2) { + register __vector float vx_0 = v_x[i]; + register __vector float vx_1 = v_x[i+1]; + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; + vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + 
x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || 
( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} + diff --git a/kernel/power/crot.c b/kernel/power/crot.c new file mode 100644 index 0000000000..959a9eda06 --- /dev/null +++ b/kernel/power/crot.c @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. %[temp_n], %[temp_n], -8 \n\t" + "ble 2f \n\t" + ".p2align 5 \n\t" + "1: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt 1b \n\t" + "2: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index da97c896e8..31e02fe5a4 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 73962c2f21..d0e060977c 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" #endif diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index df0572e8ee..f09611ff09 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "daxpy_microk_power8.c" #endif diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 059c0e5a94..27b39144ba 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dcopy_microk_power8.c" #endif diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index e43470e23d..f985df1c5a 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "ddot_microk_power8.c" #endif diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S new file mode 100644 index 0000000000..a1762dcf20 --- /dev/null +++ b/kernel/power/dgemm_kernel_power9.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S new file mode 100644 index 0000000000..251839d19e --- /dev/null +++ b/kernel/power/dgemm_logic_power9.S @@ -0,0 +1,1981 @@ +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS 
Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S new file mode 100644 index 0000000000..c4b8270b82 --- /dev/null +++ b/kernel/power/dgemm_macros_power9.S @@ -0,0 +1,3623 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + 
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, 
vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r 
+ xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 ,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + 
xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, 
vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, 
vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx 
vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 
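/* (annotation, not part of the upstream patch) The kernel macros form a
   software pipeline: *_I1 starts the accumulators with plain multiplies while
   prefetching the next A/B block into the alternate register set
   (vs8../vs28..), *_1 and *_2 ping-pong between the two sets so the loads for
   step k+1 overlap the FMAs for step k, and *_E2 (this macro) drains the
   pipeline without loading past the end of the panel. *_SUBI1/*_SUB1 are the
   unpipelined one-step forms used for the K-loop remainder. A hypothetical C
   model of the schedule:

       load(cur);                        // LOAD2x16_1
       load(next); acc  = cur * b;       // KERNEL2x16_I1
       while (steps_left) {
           load(cur);  acc += next * b;  // KERNEL2x16_2
           load(next); acc += cur  * b;  // KERNEL2x16_1
       }
       acc += next * b;                  // KERNEL2x16_E2, no loads
*/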
+ xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + 
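/* (annotation, not part of the upstream patch) A 16-row column slice of C is
   128 bytes, so SAVE2x16 walks it with two pointers, T1 = CO and T2 = CO + 64,
   then advances both by LDC to reach the second of the two columns; LDC is
   assumed here to have been scaled to bytes by the kernel prologue. As C
   pointer arithmetic:

       double *c0_lo = c_col;                              // T1
       double *c0_hi = c_col + 8;                          // T2
       double *c1_lo = (double *)((char *)c_col + ldc_bytes);
*/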
xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + 
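/* (annotation, not part of the upstream patch) In the N=2 kernels each K step
   is a rank-1 update of the tile handled here (the 2x8 tile, despite the
   copy-pasted N=4 banner above): lxvdsx splats a single b value across both
   vector lanes, lxvd2x loads pairs of a values, and each xvmaddadp computes
   two elements of:

       for (int j = 0; j < 2; j++)          // columns (B values)
           for (int i = 0; i < 8; i++)      // rows (A values)
               acc[j][i] += a[i] * b[j];
*/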
+ stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + 
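/* (annotation, not part of the upstream patch) The *_I1 and *_SUBI1 forms use
   xvmuldp instead of xvmaddadp so no separate pass is needed to zero the
   accumulators:

       acc  = a * b;      // first K step      (*_I1 / *_SUBI1)
       acc += a * b;      // every later step  (*_1 / *_2 / *_SUB1)
*/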
xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, 
o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + 
xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro 
KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + 
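/* (annotation, not part of the upstream patch) The M=1 edge case uses the
   scalar VSX forms of the same pattern (lxsdx / xsmaddadp / xsmuldp / stxsdx):
   one double per K step, with the identical GEMM-vs-TRMM alpha split. */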
stxsdx vs0, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*C_A;
+// ptrbb = bb + off*C_B;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba = ptrba + off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* byte offset of off values in B */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* byte offset of off values in A */
+ add \PTR_B, \B_VAL , T4 /* advance BO */
+ add \PTR_A, \PTR_A, T2 /* advance AO */
+ #endif
+.endm
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+INCR_A; // number of values in A
+// #else
+// temp = off+INCR_B; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B; // number of values in B */
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// temp = bk - off;
+// #ifdef LEFT
+// temp -= C_A; // number of values in A
+// #else
+// temp -= C_B; // number of values in B
+// #endif
+// ptrba += temp*C_A;
+// ptrbb += temp*C_B;
+// #endif
+
+// #ifdef LEFT
+// off += C_A; // number of values in A
+// #endif
+*/
+
+
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
+
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /*temp = bk - off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+ #ifdef LEFT
+ /*temp -= C_A; // number of values in A*/
+ addi \TEMP_BK,\TEMP_BK,-\C_A
+ #else
+ /*temp -= C_B; // number of values in B*/
+ addi \TEMP_BK,\TEMP_BK,-\C_B
+ #endif
+ /*ptrba += temp*C_A;
+ ptrbb += temp*C_B;*/
+ SHIFT_REG T4,\TEMP_BK,\C_A
+ SHIFT_REG T2,\TEMP_BK,\C_B
+ add \PTR_A, \PTR_A,T4 /*ptrba += temp*C_A*/
+ add \PTR_B, \PTR_B,T2 /*ptrbb += temp*C_B*/
+
+ #endif
+
+ #ifdef LEFT
+ /*off += C_A; // number of values in A*/
+ addi \OFF_VAL,\OFF_VAL,\C_A
+ #endif
+.endm
\ No newline at end of file
diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c
index 57f9f9e728..b458e11fcb 100644
--- a/kernel/power/dgemv_n.c
+++ b/kernel/power/dgemv_n.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if defined(POWER8)
+#if defined(POWER8) || defined(POWER9)
 #include "dgemv_n_microk_power8.c"
 #endif
 
diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c
index 3974ed62dd..b8589a1311 100644
--- a/kernel/power/dgemv_t.c
+++ b/kernel/power/dgemv_t.c
@@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
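/* (annotation on the TRMM pointer-refresh macros above, not part of the
   patch) SHIFT_REG turns an element count into a byte offset for double
   precision, shifting by log2(count * 8): SHIFT_VAL==16 maps to slwi by 7
   because 16 doubles are 128 bytes. The refresh macros implement the standard
   TRMM edge bookkeeping quoted in their C comments; gathered into one C
   sketch, with C_A/C_B the tile widths in A and B and off/bk the TRMM offset
   and panel depth:

       // REFRESH_POINTERS
       if ((LEFT && TRANSA) || (!LEFT && !TRANSA)) ptrbb = bb;
       else { ptrba += off * C_A; ptrbb = bb + off * C_B; }

       // REFRESH_TEMP_BK
       if ((LEFT && !TRANSA) || (!LEFT && TRANSA)) temp = bk - off;
       else if (LEFT) temp = off + C_A; else temp = off + C_B;

       // REFRESH_AFTER_SAVE
       if ((LEFT && TRANSA) || (!LEFT && !TRANSA)) {
           temp = bk - off - (LEFT ? C_A : C_B);
           ptrba += temp * C_A; ptrbb += temp * C_B;
       }
       if (LEFT) off += C_A;
*/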
#include "common.h" -#define NBMAX 8192 -#define PREFETCH 1 +#define NBMAX 1024 +//#define PREFETCH 1 #include #define HAVE_KERNEL4x8_ASM 1 diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 3e107486f6..baeb542051 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index f32dc4bad6..779a08e9ce 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index fd2dec9c49..52b7f50dad 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 62d7761ec7..7acc05b4df 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -129,7 +129,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c new file mode 100644 index 0000000000..06fc5d8ad7 --- /dev/null +++ b/kernel/power/icamax.c @@ -0,0 +1,328 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + 
}else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c new file mode 100644 index 0000000000..36432c9933 --- /dev/null +++ b/kernel/power/icamin.c @@ -0,0 +1,266 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
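/* (annotation, not part of the upstream patch) ciamax_kernel_32 above, and
   ciamin_kernel_32 below, keep four running extrema with their indices in
   vector lanes and finish with a scalar 4-way reduction. Ties are broken
   toward the smaller index so the routine returns the first occurrence, as
   BLAS requires. A hypothetical C helper modeling the max case:

       static unsigned int reduce4_max(const float v[4],
                                       const unsigned int idx[4],
                                       float *best) {
           float a = v[0], b = v[2];
           unsigned int ia = idx[0], ib = idx[2];
           if (v[1] == a)      ia = idx[1] < ia ? idx[1] : ia;
           else if (v[1] > a)  { a = v[1]; ia = idx[1]; }
           if (v[3] == b)      ib = idx[3] < ib ? idx[3] : ib;
           else if (v[3] > b)  { b = v[3]; ib = idx[3]; }
           if (a == b)         { *best = a; return ia < ib ? ia : ib; }
           if (b > a)          { *best = b; return ib; }
           *best = a; return ia;
       }
*/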
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index f4d1d1bdb4..7fe0f8a330 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -89,10 +89,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ".p2align 5 \n\t" "1: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + 
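/* (annotation, not part of the upstream patch) This hunk is the point of the
   idamin fix: xxsel picks the value and its index with the same comparison
   mask, and with xvcmpgedp the tie case selected the second operand, i.e. the
   later index, whereas BLAS i?amin must return the first occurrence. A C
   model of the selection:

       mask = (a >  b);        // gt: tie keeps a, the earlier index (correct)
       mask = (a >= b);        // ge: tie takes b, the later index (old bug)
       min  = mask ? b : a;    // same mask selects the matching index
*/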
"xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -103,8 +103,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -125,7 +125,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -139,7 +139,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -162,10 +162,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //<-----------jump here from first load "2: \n\t" - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -176,8 +176,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" "xxsel 0 ,0,1,2 \n\t" "xxsel 34,34,35,3 \n\t" @@ -194,7 +194,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" @@ -210,7 +210,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" "xxsel 38,38,33,2 \n\t" @@ -238,10 +238,10 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //============================================================================== - "xvcmpgedp 2,44,45 \n\t " - "xvcmpgedp 3,46,47 \n\t " - "xvcmpgedp 4,48,49 \n\t " - "xvcmpgedp 5,50,51 \n\t" + "xvcmpgtdp 2,44,45 \n\t " + "xvcmpgtdp 3,46,47 \n\t " + "xvcmpgtdp 4,48,49 \n\t " + "xvcmpgtdp 5,50,51 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -252,8 +252,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 35,42,43,5 \n\t" "xxsel 47,50,51,5 \n\t" - "xvcmpgedp 2,0, 1 \n\t" - "xvcmpgedp 3, 45,47 \n\t" + "xvcmpgtdp 2,0, 1 \n\t" + "xvcmpgtdp 3, 45,47 \n\t" "xxsel 32,32,33,2 \n\t" @@ -264,14 +264,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16} "vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8} //choose smaller from first and second part - "xvcmpgedp 4, 0,5 \n\t" + "xvcmpgtdp 4, 0,5 \n\t" "xxsel 3, 0,5,4 \n\t" "xxsel 33,32,34,4 \n\t" "vaddudm 1,1,5 \n\t" // get real index for first smaller //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) - "xvcmpgedp 2,39, 3 \n\t" + "xvcmpgtdp 2,39, 3 \n\t" "xxsel 39,39,3,2 \n\t" 
"xxsel 38,38,33,2 \n\t" @@ -284,7 +284,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c new file mode 100644 index 0000000000..bf1af78d6d --- /dev/null +++ b/kernel/power/isamax.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include <math.h> +#include <altivec.h> + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requires n > 0 and n % 64 == 0 + * @param n number of elements + * @param x pointer to the vector + * @param maxf (out) maximum absolute value (output only) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +}
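A scalar reference makes the contract of `siamax_kernel_64` and its `CNAME` wrapper easy to check: ISAMAX returns the 1-based index of the first element of largest absolute value. The sketch below is illustrative only and is not part of the patch:

    #include <math.h>

    /* reference ISAMAX sketch: 1-based index of the first element with the
       largest |x[i]|; 0 for invalid n or inc_x */
    static long isamax_ref(long n, const float *x, long inc_x) {
        if (n <= 0 || inc_x <= 0) return 0;
        long best = 0, ix = 0;
        float maxf = fabsf(x[0]);
        for (long i = 1; i < n; i++) {
            ix += inc_x;
            if (fabsf(x[ix]) > maxf) { /* strict '>' keeps the first occurrence */
                maxf = fabsf(x[ix]);
                best = i;
            }
        }
        return best + 1;
    }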
diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c new file mode 100644 index 0000000000..1c1f0ad788 --- /dev/null +++ b/kernel/power/isamin.c @@ -0,0 +1,288 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include <math.h> +#include <altivec.h> +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requires n > 0 and n % 64 == 0 + * @param n number of elements + * @param x pointer to the vector + * @param minf (out) minimum absolute value (output only) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +}
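Both new kernels document the requirement n % 64 == 0, and their `CNAME` wrappers peel off the largest such block with a mask before falling back to scalar code. The arithmetic, shown here only for illustration:

    BLASLONG n1 = n & -64;  /* -64 == ~63, so this clears the low six bits:     */
                            /* n = 1000 -> n1 = 960; elements 960..999 are then */
                            /* finished by the scalar while-loop                */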
diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 448247ffd5..1ffa3ba8b2 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -101,8 +101,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -114,7 +114,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -126,7 +126,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -166,8 +166,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" @@ -179,7 +179,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "lxvd2x 46, %[i32],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" @@ -191,7 +191,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" @@ -235,15 +235,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgedp 50,46,47 \n\t " - "xvcmpgedp 51,48,49 \n\t " + "xvcmpgtdp 50,46,47 \n\t " + "xvcmpgtdp 51,48,49 \n\t " "xxsel 32,40,41,50 \n\t" "xxsel 0,46,47,50 \n\t" "xxsel 33,42,43,51 \n\t" "xxsel 1,48,49,51 \n\t" - "xvcmpgedp 2,0,1 \n\t " + "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" "xxsel 3,0,1,2 \n\t" @@ -252,7 +252,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" //cmp with previous - "xvcmpgedp 4,39,3 \n\t " + "xvcmpgtdp 4,39,3 \n\t " "vaddudm 5,5,4 \n\t" "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -267,7 +267,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 "bc 14,24, 3f \n\t" - "xvcmpgedp 4,39, 40 \n\t" + "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index fb10b1d27e..5908347d3d 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" #endif diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c new file mode 100644 index 0000000000..393cdfadc4 --- /dev/null +++ b/kernel/power/saxpy.c @@ -0,0 +1,129 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + + + + +#ifndef HAVE_KERNEL_8 +#include + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG i = 0; + __vector float v_a = {alpha,alpha,alpha,alpha}; + __vector float * v_y=(__vector float *)y; + __vector float * v_x=(__vector float *)x; + + for(; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * 
x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c new file mode 100644 index 0000000000..9bc93ced67 --- /dev/null +++ b/kernel/power/sgemv_n_8.c @@ -0,0 +1,513 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float 
va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * 
x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c new file mode 100644 index 0000000000..96434a13f6 --- /dev/null +++ b/kernel/power/sgemv_t.c @@ -0,0 +1,480 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, 
BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for 
(j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c new file mode 100644 index 0000000000..5e9cd63ac3 --- /dev/null +++ 
b/kernel/power/sgemv_t_8.c @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float 
va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG 
i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8], *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + 
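+ // tail loop: the remaining n % 4 entries of y, one per iteration, each using the four A values at aj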
for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/srot.c b/kernel/power/srot.c index d2910ff875..6af813c161 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index bd5cdc43fe..4f3ba56980 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 932652b376..23d13280fb 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/sum.S b/kernel/power/sum.S new file mode 100644 index 0000000000..eda2c5f2c0 --- /dev/null +++ b/kernel/power/sum.S @@ -0,0 +1,446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + FADD f0, f0, f8 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index 0b6b87d46d..f61c62e75b 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index dd7ab6c3cc..f0f8c69108 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,19 +36,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
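/* Editor's note on the zaxpy.c hunk that follows: the scalar fallback kernel
   now receives the complex alpha as two by-value scalars (da_r, da_i) instead
   of unpacking them from an alpha pointer inside the kernel body.  Below is a
   minimal sketch of the non-conjugated fallback loop under the new signature;
   the name zaxpy_kernel_4_sketch and the lack of unrolling are illustrative
   only, not part of the patch. */
static void zaxpy_kernel_4_sketch(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r, FLOAT da_i)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    while (i < n) {
        /* y += (da_r + I*da_i) * x, one complex element per iteration */
        y[ix]     += da_r * x[ix]     - da_i * x[ix + 1];
        y[ix + 1] += da_r * x[ix + 1] + da_i * x[ix];
        ix += 2;
        i++;
    }
}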
#include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zaxpy_microk_power8.c" #endif #ifndef HAVE_KERNEL_4 -static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { BLASLONG register i = 0; BLASLONG register ix = 0; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; + while(i < n) diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index a7658f7ab9..b21d6ef15c 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index b83f832b13..fd36c7f448 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 43b72ca157..1f4c29210d 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -134,7 +134,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 8b250a7f1e..167b0a1586 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -389,20 +389,14 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { register __vector double va0_2 = vptr_a0[i + 2]; register __vector double va0_3 = vptr_a0[i + 3]; - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_xxpermdi(va0, va0, 2); - va0_1 = vec_xxpermdi(va0_1, va0_1, 2); - va0_2 = vec_xxpermdi(va0_2, va0_2, 2); - va0_3 = vec_xxpermdi(va0_3, va0_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; vy[i] = vy_0; vy[i + 1] = vy_1; diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 5722064946..20a0812dd2 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -59,11 +59,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA i = 0; n = n << 1; while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); + register __vector double vx_0 = *(__vector double*) (&x[i]); register __vector double vx_1 = *(__vector double*) (&x[i + 2]); register __vector double vx_2 = *(__vector double*) (&x[i + 4]); diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 14d677f249..a1b441d2c8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zsum.S b/kernel/power/zsum.S new file mode 100644 index 0000000000..8396012e8a --- /dev/null +++ b/kernel/power/zsum.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + FADD f0, f0, f8 + FADD f1, f1, f9 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fmr f16, f8 + fmr f17, f9 + fmr f18, f10 + fmr f19, f11 + + fmr f20, f12 + fmr f21, f13 + fmr f22, f14 + fmr f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fmr f16, f8 + FADD f1, f1, f17 + fmr f17, f9 + + FADD f2, f2, f18 + fmr f18, f10 + FADD f3, f3, f19 + fmr f19, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fmr f20, f12 + FADD f5, f5, f21 + fmr f21, f13 + + FADD f6, f6, f22 + fmr f22, f14 + FADD f7, f7, f23 + fmr f23, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fmr f16, f24 + FADD f1, f1, f17 + fmr f17, f25 + + FADD f2, f2, f18 + fmr f18, f26 + FADD f3, f3, f19 + fmr f19, f27 + + FADD f4, f4, f20 + fmr f20, f28 + FADD f5, f5, f21 + fmr f21, f29 + + FADD f6, f6, f22 + fmr f22, f30 + FADD f7, f7, f23 + fmr f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 5ec1eee2e1..1d8826f414 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
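/* Editor's note on the strided path of zsum.S above (label LL(100) onward):
   INCX arrives pre-scaled by ZBASE_SHIFT, and INCXM1 = INCX - SIZE, so each
   complex element is fetched with one non-updating indexed load (LFDX via
   INCXM1, real half) and one updating load (LFDUX via INCX, imaginary half);
   a single register update per element then advances X.  A scalar C sketch of
   what the unrolled assembly computes; zsum_sketch is a hypothetical helper,
   not part of the patch: */
static FLOAT zsum_sketch(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT sum_r = 0.0, sum_i = 0.0;
    BLASLONG i;
    for (i = 0; i < n; i++) {
        sum_r += x[0];       /* real part, added without taking fabs() */
        sum_i += x[1];       /* imaginary part */
        x += 2 * inc_x;      /* complex stride */
    }
    return sum_r + sum_i;    /* the final FADDs fold both partial sums */
}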
#include "common.h" -#if defined(POWER8) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" #endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 6d4028b0b2..8e8214e702 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = { samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, @@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = { damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, - dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, + dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, dger_kTS, dsymv_LTS, dsymv_UTS, @@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = { qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, - qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, + qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, @@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = { #endif camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, - cnrm2_kTS, casum_kTS, ccopy_kTS, + cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, cdotu_kTS, cdotc_kTS, csrot_kTS, caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, @@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = { #endif zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, - znrm2_kTS, zasum_kTS, zcopy_kTS, + znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS, zdotu_kTS, zdotc_kTS, zdrot_kTS, zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, @@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = { XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, - xnrm2_kTS, xasum_kTS, xcopy_kTS, + xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS, xdotu_kTS, xdotc_kTS, xqrot_kTS, xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, @@ -718,6 +718,27 @@ static void init_parameter(void) { } #else // defined(ARCH_ARM64) +#if defined(ARCH_POWER) +static void init_parameter(void) { + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +} +#else //POWER + #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1303,4 +1324,5 @@ static void init_parameter(void) { } +#endif //POWER #endif //defined(ARCH_ARM64) diff --git a/kernel/sparc/sum.S b/kernel/sparc/sum.S new file mode 100644 index 0000000000..f26abb85f9 --- /dev/null +++ b/kernel/sparc/sum.S @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, BASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + cmp INCX, SIZE + bne .LL50 + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 128 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FMOV a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FMOV a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FMOV a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FMOV a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FMOV a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, 
t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FMOV a1, t1 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD c2, t2, c2 + cmp I, 0 + FMOV a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FADD c2, t4, c2 + FMOV a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMOV a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + + FADD c2, t2, c2 + FMOV a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + FADD c2, t4, c2 + FMOV a8, t4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zsum.S b/kernel/sparc/zsum.S new file mode 100644 index 0000000000..bc167dc72a --- /dev/null +++ b/kernel/sparc/zsum.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, ZBASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FMOV a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FMOV a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FMOV a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FMOV a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FMOV a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FADD c2, t2, c2 + FMOV a1, t1 + FMOV a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, 
INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FMOV a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + cmp I, 0 + FMOV a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a3, t3 + LDF [X + 0 * SIZE], a3 + + FADD c2, t4, c2 + FMOV a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMOV a5, t1 + LDF [X + 0 * SIZE], a5 + + FADD c2, t2, c2 + FMOV a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FMOV a7, t3 + LDF [X + 0 * SIZE], a7 + + FADD c2, t4, c2 + FMOV a8, t4 + LDF [X + 1 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FMOV a1, t1 + FADD c2, t2, c2 + FMOV a2, t2 + + FADD c1, t3, c1 + FMOV a3, t3 + FADD c2, t4, c2 + FMOV a4, t4 + + FADD c1, t1, c1 + FMOV a5, t1 + FADD c2, t2, c2 + FMOV a6, t2 + + FADD c1, t3, c1 + FMOV a7, t3 + FADD c2, t4, c2 + FMOV a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + FMOV a1, t1 + FMOV a2, t2 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic index 672edb0696..0aac0ce996 100644 --- a/kernel/x86/KERNEL.generic +++ b/kernel/x86/KERNEL.generic @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/x86/sum.S b/kernel/x86/sum.S new file mode 100644 index 0000000000..b24f34c8be --- /dev/null +++ b/kernel/x86/sum.S @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + FLD (X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zsum.S b/kernel/x86/zsum.S new file mode 100644 index 0000000000..cd2ce61db6 --- /dev/null +++ b/kernel/x86/zsum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addl INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + FLD 1 * 
SIZE(X) + addl INCX, X + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index acc6356d60..5d0a300b5e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,7 +7,7 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c DGEMMITCOPY = dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index a23e59f3f8..7cb0cb836c 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 33bda09434..ca2209340c 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c index 00e2e6a42a..b605ea34c8 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index a798fd9779..72d37afed6 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 87370b0320..7ca7af0701 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git 
a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c index f587aa0366..1186559130 100644 --- a/kernel/x86_64/cdot_microk_bulldozer-2.c +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index fe195a63b2..8b9d6d104b 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c index 01816917d2..fe142c38f7 100644 --- a/kernel/x86_64/cdot_microk_sandy-2.c +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c index 76a3aa0eb0..7350b21c9f 100644 --- a/kernel/x86_64/cdot_microk_steamroller-2.c +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c index 3abffc4cfa..31451aa6cb 100644 --- a/kernel/x86_64/cscal_microk_bulldozer-2.c +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", 
"%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c index 0a4eb683c2..a04a4c4aba 100644 --- a/kernel/x86_64/cscal_microk_haswell-2.c +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", // "0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" - : - : - "r" (n), // 0 - "r" (x), // 1 + : + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c index 8346e17483..e8073d485e 100644 --- a/kernel/x86_64/cscal_microk_steamroller-2.c +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index 
8c520dcf10..9c1305b977 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index bbe8b95506..f3682e6d72 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 943d893af3..8feb9f26cd 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c index 95eb953b48..4b83124c7a 100644 --- a/kernel/x86_64/daxpy_microk_piledriver-2.c +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c index 85e038cef1..db9a45de81 100644 --- a/kernel/x86_64/daxpy_microk_sandy-2.c +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c index e40009037d..8e63fcc1db 100644 --- a/kernel/x86_64/daxpy_microk_steamroller-2.c +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 9756ee46a9..5590c5b177 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" - : - : - 
"r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 365737363b..dbb5487f70 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index fb5ec9bca0..e5e234e225 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index ac950885c6..cc4bcd90a2 100644 --- a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 160f956048..84493ec273 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 5ce20b5dee..27d5244ce2 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d2530e81e..6d33641e91 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 584a6c6b5b..da0fa2fff2 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ 
b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 530780bab7..466931b82f 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + "vbroadcastsd (%3), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" + 
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" "addq $8 , %0 \n\t" - "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y "jnz 1b \n\t" @@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a7478e3a8b..ed672a7579 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movsd %%xmm11,8(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movsd %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c index 2bf966a5f4..e8494500ff 100644 --- a/kernel/x86_64/dger_microk_sandy-2.c +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index ef9a0a6ba0..d0d7801fd4 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : + "+r" (n) // 0 : - "r" (n), // 0 "r" (x), // 1 "r" (x1), // 2 "r" (alpha), // 3 diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c 
b/kernel/x86_64/dscal_microk_bulldozer-2.c index de53b0bc4b..096662781e 100644 --- a/kernel/x86_64/dscal_microk_bulldozer-2.c +++ b/kernel/x86_64/dscal_microk_bulldozer-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index e732a27181..77ed59a4e3 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c index 8d855072b4..9982b8e587 100644 --- a/kernel/x86_64/dscal_microk_sandy-2.c +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index d84470cc44..bfa07b6d02 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 866782ee6f..6241879d5c 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index 38479f77af..a161dcd8b3 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c index b4e6ab3692..b205b10193 100644 --- a/kernel/x86_64/dsymv_L_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -114,8 +114,8 @@ static 
void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index d7166fe4b4..ae287b6d8c 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c index d83d20f8e6..4778f644a3 100644 --- a/kernel/x86_64/dsymv_U_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 1344c75f73..065182286a 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c index 1ef6fbafdc..d84e703bd5 100644 --- a/kernel/x86_64/dsymv_U_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index fcab8e2c78..9ab78fc8ea 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " cmpq $0, %0 \n\t" " je 4f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " addq $8, %1 \n\t" @@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .p2align 4 \n\t" "1: \n\t" - " vmovups (%2,%1,4), %%ymm4 \n\t" // read a + " vmovups (%8,%1,4), %%ymm4 \n\t" // read a " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" - " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 
\n\t" @@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 22f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" @@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" - " vmovups (%9), %%ymm0 \n\t" + " vmovups (%3), %%ymm0 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" @@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" - " vmovups 32(%9), %%ymm4 \n\t" + " vmovups 32(%3), %%ymm4 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" @@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "5: \n\t" // i = 0 - " addq $64, %9 \n\t" // b=b+8 + " addq $64, %3 \n\t" // b=b+8 " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups %%ymm8 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups %%ymm8 , (%2) \n\t" // write a " vmovups %%ymm8 , (%4) \n\t" // write c " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" @@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm9 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm9 , (%2) \n\t" // write a " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" @@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm10, (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm10, (%2) \n\t" // write a " vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" @@ -358,14 +358,14 @@ static void 
dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm11, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm11, (%2) \n\t" // write a " vmovups %%ymm11, (%5) \n\t" // write c " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" @@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm12, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm12, (%2) \n\t" // write a " vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" @@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm13, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm13, (%2) \n\t" // write a " vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" @@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm14, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm14, (%2) \n\t" // write a " vmovups %%ymm14, (%6) \n\t" // write c " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - " addq $32, %8 \n\t" // a=a+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb - " vmovups %%ymm15, (%8) \n\t" // write a + " vmovups %%ymm15, (%2) \n\t" // write a " vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c3), // 5 "r" (c6), // 6 "r" (ldc), // 7 - "r" (as), // 8 - "r" (bs) // 9 + "r" (a), // 8 + "r" (b) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c index 54df5b3594..35ed4cc013 100644 --- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c @@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " prefetcht0 384(%3,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), 
%%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " prefetcht0 384(%7,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vmovddup (%7), %%xmm1 \n\t" // read b - " vmovddup 8(%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm1 \n\t" // read b + " vmovddup 8(%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // 
write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $16 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 8 + " subq $16 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 8 - " vmovddup (%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S index 074562804c..e29520fa1b 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nano.S +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -135,7 +135,7 @@ #endif movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S index c4ef1f809a..1602c13c50 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -383,7 +383,7 @@ EMMS movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 3a743d64c7..7099ba4c6f 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 68f68ea3a9..88bbb695d4 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 204cf8bacf..5feea7f241 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - 
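
[Editor's note] The two gemm_kernel_*.S hunks above raise the constant subtracted from %rsp, before the "andq $-4096" alignment, from 128 to 256 bytes. That constant acts as guard space: after %rsp is rounded down to a 4 KiB boundary, at least this many bytes separate the end of the LOCAL_BUFFER_SIZE scratch area from the caller's frame, so stores that run slightly past the nominal buffer stay inside memory the kernel reserved. The same arithmetic restated in C (names and sizes illustrative):

    /* The prologue arithmetic in C.  GUARD corresponds to the constant
     * raised from 128 to 256; names and sizes are illustrative. */
    #include <stdint.h>

    #define LOCAL_BUFFER_SIZE (16UL * 1024)
    #define GUARD             256UL   /* was 128 before this change */

    static inline void *aligned_scratch(void *stack_pointer)
    {
        uintptr_t p = (uintptr_t)stack_pointer;

        p -= GUARD + LOCAL_BUFFER_SIZE;  /* subq $256 + LOCAL_BUFFER_SIZE, %rsp */
        p &= ~(uintptr_t)4095;           /* andq $-4096, %rsp                   */

        /* The scratch buffer occupies [p, p + LOCAL_BUFFER_SIZE); at least
         * GUARD bytes remain between its end and the original %rsp. */
        return (void *)p;
    }
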
"r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c index 0a6bef0466..0d448d5f88 100644 --- a/kernel/x86_64/saxpy_microk_sandy-2.c +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 36e61b0776..8958a33dcc 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index df367b61f1..91dc928d39 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 1a27177f58..5a715d0083 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ca13536f26..ae25d5a50b 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c index 6b8b2566ba..bf6a5f2871 100644 --- a/kernel/x86_64/sdot_microk_steamroller-2.c +++ b/kernel/x86_64/sdot_microk_steamroller-2.c @@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 65305ac59f..63697970fe 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ 
b/kernel/x86_64/sgemv_n_4.c @@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "3: \n\t" : + "+r" (i), // 0 + "+r" (n1) // 1 : - "r" (i), // 0 - "r" (n1), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 @@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (src), // 2 "r" (dest) // 3 : "cc", diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index 31001c7f3d..bbf06c84b5 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "vbroadcastss (%2), %%xmm12 \n\t" // x0 - "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 - "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 - "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 - "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 - "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 - "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 - "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + "vbroadcastss (%3), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 "vbroadcastss (%9), %%xmm8 \n\t" // alpha @@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" "addq $4 , %0 \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %2 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" - "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "subq $4 , %1 \n\t" - "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y "2: \n\t" @@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" - 
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y "addq $8 , %0 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "subq $8 , %1 \n\t" @@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" ".align 2 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" - "vfmaddps 
%%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" - - "prefetcht0 192(%4,%8,4) \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%5,%2,4) \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" - "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" + "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "addq $16, %0 \n\t" - "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y - "addq $16, %8 \n\t" - "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y + "addq $16, %2 \n\t" + "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), 
// 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 2c90f8aa99..93e1e26e8a 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); @@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz 3f \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), 
%%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" @@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" "addq $16, %0 \n\t" - "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" - "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" - "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "addq $16, %2 \n\t" + "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" - "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y "jnz 1b \n\t" @@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" 
(ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", @@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO } - #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz 2f \n\t" diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index 11a3e943b7..d21232bfaf 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "movss (%2), %%xmm12 \n\t" // x0 - "movss 4(%2), %%xmm13 \n\t" // x1 - "movss 8(%2), %%xmm14 \n\t" // x2 - "movss 12(%2), %%xmm15 \n\t" // x3 + "movss (%3), %%xmm12 \n\t" // x0 + "movss 4(%3), %%xmm13 \n\t" // x1 + "movss 8(%3), %%xmm14 \n\t" // x2 + "movss 12(%3), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" - "movss 16(%2), %%xmm0 \n\t" // x4 - "movss 20(%2), %%xmm1 \n\t" // x5 - "movss 24(%2), %%xmm2 \n\t" // x6 - "movss 28(%2), %%xmm3 \n\t" // x7 + "movss 16(%3), %%xmm0 \n\t" // x4 + "movss 20(%3), %%xmm1 \n\t" // x5 + "movss 24(%3), %%xmm2 \n\t" // x6 + "movss 28(%3), %%xmm3 \n\t" // x7 "shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t" @@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y ".p2align 1 \n\t" - "movups (%4,%0,4), %%xmm8 \n\t" - "movups (%5,%0,4), %%xmm9 \n\t" - "movups (%6,%0,4), %%xmm10 \n\t" - "movups (%7,%0,4), %%xmm11 \n\t" + "movups (%5,%0,4), %%xmm8 \n\t" + "movups (%6,%0,4), %%xmm9 \n\t" + "movups (%7,%0,4), %%xmm10 \n\t" + "movups (%8,%0,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" @@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "movups (%4,%8,4), %%xmm8 \n\t" - "movups (%5,%8,4), %%xmm9 \n\t" - "movups (%6,%8,4), %%xmm10 \n\t" - "movups (%7,%8,4), %%xmm11 \n\t" + "movups (%5,%2,4), %%xmm8 \n\t" + "movups (%6,%2,4), %%xmm9 \n\t" + "movups (%7,%2,4), %%xmm10 \n\t" + "movups (%8,%2,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" @@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addps %%xmm5 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "mulps %%xmm6 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm4 , %%xmm7 \n\t" - "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y + "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 
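
[Editor's note] One hunk earlier in this section (dgemv_n_microk_haswell-4.c) is a clobber-list fix rather than a constraint fix: it adds "%xmm0" to the registers declared clobbered. Any register the template writes without declaring it as an output or clobber may hold a live compiler value, and the omission stays latent until register pressure makes the compiler pick exactly that register. A small sketch (illustrative only; requires SSE3 for haddpd):

    /* Sketch of why clobbers matter: the template writes xmm0, so xmm0
     * must be listed, or the compiler may cache a live value there
     * across the asm statement.  Illustrative; needs SSE3 (haddpd). */
    static double sum2_sketch(const double *x)
    {
        double s;

        __asm__ __volatile__ (
            "movupd  (%1), %%xmm0     \n\t"   /* writes xmm0 ...       */
            "haddpd  %%xmm0, %%xmm0   \n\t"   /* x[0] + x[1]           */
            "movsd   %%xmm0, %0       \n\t"
            : "=m" (s)
            : "r" (x)
            : "%xmm0"                         /* ... so declare it     */
        );
        return s;
    }
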
: "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b35daa35b0..3fc46542b7 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha @@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" - "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" - "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" - "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" - "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" - "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" - "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" @@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4, %8 \n\t" + "addq $4, %2 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps 
(%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" @@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8, %8 \n\t" + "addq $8, %2 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" - "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" - "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" - "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "prefetcht0 192(%5,%2,4) \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" - "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" - "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" @@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - "vaddps (%3,%0,4), %%ymm4 , 
%%ymm4 \n\t" // 8 * y - "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" + "addq $16, %2 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 065e5b3852..86ecaf516e 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movss %%xmm11,4(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movss %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c index 79180b991e..14f13475b8 100644 --- a/kernel/x86_64/sger_microk_sandy-2.c +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index 9002228f32..602c3edf2d 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c index 69db008b66..fdfe4349a1 100644 --- a/kernel/x86_64/ssymv_L_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index c0fe5d6401..6bb9c02f6f 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" 
(from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c index 093ca8073c..0c78212e7d 100644 --- a/kernel/x86_64/ssymv_L_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 @@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index 8c01ab8069..4a4f4d68de 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c index a32e59b447..e6a09ccf88 100644 --- a/kernel/x86_64/ssymv_U_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index b8e6ee7326..c56ff3b15d 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c index e8650650cd..c4919a39a4 100644 --- a/kernel/x86_64/ssymv_U_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c index 1b8991c6cf..3cd215000b 100644 --- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c @@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ 
-171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) 
\n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , 
%%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " 
vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract 
bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) 
\n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c index 0623dddb0c..a4a62491cd 100644 --- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] " 
vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 
4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " 
vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // 
c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss 
%%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , 
%%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c index 4cc557d552..c11c84cec8 100644 --- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 0 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb - " vbroadcastss 4(%7), %%xmm1 \n\t" // read b + " vbroadcastss (%3), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm1 \n\t" // read b " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 
, %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" " \n\t" // i = 1 - " addq $8 , %7 \n\t" // b = b + 2 - " addq $64 , %6 \n\t" // a = a + 16 + " addq $8 , %3 \n\t" // b = b + 2 + " addq $64 , %2 \n\t" // a = a + 16 - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c index 73f6e8a956..326ca29761 100644 --- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c @@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vbroadcastss (%7), %%xmm1 \n\t" // read b - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm1 \n\t" // read b + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) 
\n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $8 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 16 + " subq $8 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 16 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm0 \n\t" // read bb " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S new file mode 100644 index 0000000000..d075eaa042 --- /dev/null +++ b/kernel/x86_64/sum.S @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + +.L21: + FLD (X) + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + FLD (X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 0e15761f79..15d3679717 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 30e8b19552..89d23daf32 100644 --- a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -111,10 +111,10 @@ static void 
zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 233af143ad..17b8b24f7c 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 728d092133..907b1ae009 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index 30a9552d60..db9a48cce8 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 11056a3c16..9f2fc2c1d9 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c index 87c5b03402..33415e26e5 100644 --- a/kernel/x86_64/zdot_microk_sandy-2.c +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -107,10 +107,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : 
"r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -199,10 +199,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c index 325f74ae30..87138fe9a0 100644 --- a/kernel/x86_64/zdot_microk_steamroller-2.c +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c index 03882d6b66..5e733ffdae 100644 --- a/kernel/x86_64/zscal_microk_bulldozer-2.c +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c index d9253c1ed5..8c8f5b75cb 100644 --- a/kernel/x86_64/zscal_microk_haswell-2.c +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - 
: "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c index 97b07add65..c9267ee0c3 100644 --- a/kernel/x86_64/zscal_microk_steamroller-2.c +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S new file mode 100644 index 0000000000..45e0ddff55 --- /dev/null +++ b/kernel/x86_64/zsum.S @@ -0,0 +1,180 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + FLD 5 * SIZE(X) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX, X + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index add628bfe1..b1ffd3c54d 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = damax_z13.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = zamax_z13.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c +DAMINKERNEL = damin_z13.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = zamin_z13.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +DMAXKERNEL = dmax_z13.c SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +DMINKERNEL = dmin_z13.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,16 +25,21 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = idmax.c 
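/* [Editor's note -- illustrative sketch.] The two x87 SUM kernels above
   (the real-valued one whose tail opens this section, and zsum.S) share
   one scheme: four independent partial sums are kept on the FPU stack so
   consecutive fadds do not depend on each other, a scalar loop (.L21/.L61)
   handles the remainder, and the .L998 block folds the four accumulators
   into the result.  Unlike the ASUM kernels, no absolute value is taken:
   these are plain signed sums.  The same pattern in C (the name
   sum_unrolled4 is hypothetical): */
static double sum_unrolled4(long n, const double *x) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  long i = 0;
  for (; i + 4 <= n; i += 4) { /* unrolled main loop: 4 independent chains */
    s0 += x[i];
    s1 += x[i + 1];
    s2 += x[i + 2];
    s3 += x[i + 3];
  }
  for (; i < n; i++) /* scalar tail */
    s0 += x[i];
  return (s0 + s1) + (s2 + s3); /* the .L998 fold */
}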
ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +IDMINKERNEL = idmin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c +SSUMKERNEL = ../arm/asum.c +DSUMKERNEL = dasum.c +CSUMKERNEL = ../arm/zasum.c +ZSUMKERNEL = zasum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 new file mode 100644 index 0000000000..971896c2d4 --- /dev/null +++ b/kernel/zarch/KERNEL.Z14 @@ -0,0 +1,151 @@ +SAMAXKERNEL = samax.c +DAMAXKERNEL = damax.c +CAMAXKERNEL = camax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = samin.c +DAMINKERNEL = damin.c +CAMINKERNEL = camin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = smax.c +DMAXKERNEL = dmax.c + +SMINKERNEL = smin.c +DMINKERNEL = dmin.c + +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = ismax.c +IDMAXKERNEL = idmax.c + +ISMINKERNEL = ismin.c +IDMINKERNEL = idmin.c + +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c + +SSUMKERNEL = ssum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = csum.c +ZSUMKERNEL = zsum.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +DSDOTKERNEL = dsdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = sgemv_n_4.c +DGEMVNKERNEL = dgemv_n_4.c +CGEMVNKERNEL = cgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_4.c + +SGEMVTKERNEL = sgemv_t_4.c +DGEMVTKERNEL = dgemv_t_4.c +CGEMVTKERNEL = cgemv_t_4.c +ZGEMVTKERNEL = zgemv_t_4.c + +STRMMKERNEL = strmm8x4V.S +DTRMMKERNEL = trmm8x4V.S +CTRMMKERNEL = ctrmm4x4V.S +ZTRMMKERNEL = ztrmm4x4V.S + +SGEMMKERNEL = strmm8x4V.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + + + +DGEMMKERNEL = gemm8x4V.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ctrmm4x4V.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ztrmm4x4V.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + + diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 848ee9b548..3bbeb9155d 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c new file mode 100644 index 0000000000..b10ca4752d --- /dev/null +++ b/kernel/zarch/camax.c @@ -0,0 +1,215 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG
i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = camax_kernel_32(n1, x); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c new file mode 100644 index 0000000000..40945fae81 --- /dev/null +++ b/kernel/zarch/camin.c @@ -0,0 +1,215 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
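/* [Editor's note -- illustrative reference, not the shipped code.] camax
   above (and camin, which follows) reduce over CABS1(x) = |Re(x)| + |Im(x)|,
   the usual BLAS modulus for complex data.  The vector kernel covers the
   largest multiple of 32 elements (n & -32) in the unit-stride case;
   scalar loops handle the tail and the strided case.  A unit-stride
   reference in plain C (the name camax_ref is hypothetical): */
#include <math.h>

static float camax_ref(long n, const float *x) { /* x holds n complex floats */
  if (n <= 0)
    return 0.0f;
  float maxf = fabsf(x[0]) + fabsf(x[1]); /* CABS1 of the first element */
  for (long i = 1; i < n; i++) {
    float v = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
    if (v > maxf)
      maxf = v;
  }
  return maxf;
}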
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG
i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = camin_kernel_32(n1, x); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); + + } else { + + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (minf); + } +} diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c new file mode 100644 index 0000000000..e28f2018c7 --- /dev/null +++ b/kernel/zarch/casum.c @@ -0,0 +1,155 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#define ABS fabsf + +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (sumf); + + if (inc_x == 1) { + + n1 = n & -32; + if (n1 > 0) { + + sumf = casum_kernel_32(n1, x); + i = n1; + ip = 2 * n1; + } + + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; + } + + } else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; + } + + } + return (sumf); +} diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c new file mode 100644 index 0000000000..14a124ae25 --- /dev/null +++ b/kernel/zarch/caxpy.c @@ -0,0 +1,166 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved.
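/* [Editor's note -- illustrative reference.] casum above is the complex
   "absolute sum": the sum over i of |Re(x_i)| + |Im(x_i)|.  The vector
   kernel consumes 32 complex elements (256 bytes) per iteration; the C
   tail finishes the remainder and the strided case.  Equivalent scalar
   version (the name casum_ref is hypothetical): */
#include <math.h>

static float casum_ref(long n, const float *x, long inc_x) {
  float sumf = 0.0f;
  if (n <= 0 || inc_x <= 0)
    return sumf;
  for (long i = 0; i < n; i++) /* stride counted in complex elements */
    sumf += fabsf(x[2 * i * inc_x]) + fabsf(x[2 * i * inc_x + 1]);
  return sumf;
}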
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( +#if !defined(CONJ) + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" +#else + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" +#endif + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "verllg %%v24,%%v8,32\n\t" + "verllg %%v25,%%v9,32\n\t" + "verllg %%v26,%%v10,32\n\t" + "verllg %%v27,%%v11,32\n\t" + "verllg %%v28,%%v16,32\n\t" + "verllg %%v29,%%v17,32\n\t" + "verllg %%v30,%%v18,32\n\t" + "verllg %%v31,%%v19,32\n\t" + "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst 
%%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + + } + return (0); + + } + + inc_x *= 2; + inc_y *= 2; + + while (i < n) { + +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + ix += inc_x; + iy += inc_y; + i++; + + } + return (0); + +} diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c new file mode 100644 index 0000000000..0a5e03992a --- /dev/null +++ b/kernel/zarch/ccopy.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
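/* [Editor's note -- illustrative reference.] The scalar tail of caxpy
   above spells out the complex update y += alpha * x; with CONJ defined
   the kernel computes y += alpha * conj(x) instead.  In C99 complex
   notation (the name caxpy_ref is hypothetical): */
#include <complex.h>

static void caxpy_ref(long n, float complex a, const float complex *x,
                      float complex *y) {
  for (long i = 0; i < n; i++)
#if !defined(CONJ)
    y[i] += a * x[i]; /* matches the !CONJ branch above */
#else
    y[i] += a * conjf(x[i]); /* matches the CONJ branch above */
#endif
}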
+*****************************************************************************/ + +#include "common.h" + +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + ccopy_kernel_32(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; + + } + + } else { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; + + } + + } + + return (0); +} diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c new file mode 100644 index 0000000000..d90f9c8712 --- /dev/null +++ b/kernel/zarch/cdot.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + cdot_kernel_16(n1, x, y, dot); + + i = n1; + BLASLONG j = i * 2; + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if 
!defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c new file mode 100644 index 0000000000..5c36bc3383 --- /dev/null +++ b/kernel/zarch/cgemv_n_4.c @@ -0,0 +1,752 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
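/* [Editor's note -- illustrative reference.] cdot above keeps four running
   sums so a single pass serves both flavours: dot[0] = sum of xr*yr,
   dot[1] = sum of xi*yi, dot[2] = sum of xr*yi, dot[3] = sum of xi*yr.
   The #if just above then combines them into x.y (cdotu) or conj(x).y
   (cdotc).  C99 reference (the name cdot_ref is hypothetical): */
#include <complex.h>

static float complex cdot_ref(long n, const float complex *x,
                              const float complex *y) {
  float complex d = 0.0f;
  for (long i = 0; i < n; i++)
#if !defined(CONJ)
    d += x[i] * y[i]; /* CREAL = dot[0] - dot[1], CIMAG = dot[2] + dot[3] */
#else
    d += conjf(x[i]) * y[i]; /* CREAL = dot[0] + dot[1], CIMAG = dot[2] - dot[3] */
#endif
  return d;
}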
+*****************************************************************************/
+
+#include "common.h"
+
+#define NBMAX 2048
+
+static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
+  register FLOAT *ap0 = ap[0];
+  register FLOAT *ap1 = ap[1];
+  register FLOAT *ap2 = ap[2];
+  register FLOAT *ap3 = ap[3];
+
+  __asm__("vlrepg %%v16,0(%[x])\n\t"
+          "vlrepg %%v17,8(%[x])\n\t"
+          "vlrepg %%v18,16(%[x])\n\t"
+          "vlrepg %%v19,24(%[x])\n\t"
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+          "vlef %%v20,4(%[x]),0\n\t"
+          "vlef %%v20,4(%[x]),2\n\t"
+          "vflcsb %%v20,%%v20\n\t"
+          "vlef %%v20,0(%[x]),1\n\t"
+          "vlef %%v20,0(%[x]),3\n\t"
+          "vlef %%v21,12(%[x]),0\n\t"
+          "vlef %%v21,12(%[x]),2\n\t"
+          "vflcsb %%v21,%%v21\n\t"
+          "vlef %%v21,8(%[x]),1\n\t"
+          "vlef %%v21,8(%[x]),3\n\t"
+          "vlef %%v22,20(%[x]),0\n\t"
+          "vlef %%v22,20(%[x]),2\n\t"
+          "vflcsb %%v22,%%v22\n\t"
+          "vlef %%v22,16(%[x]),1\n\t"
+          "vlef %%v22,16(%[x]),3\n\t"
+          "vlef %%v23,28(%[x]),0\n\t"
+          "vlef %%v23,28(%[x]),2\n\t"
+          "vflcsb %%v23,%%v23\n\t"
+          "vlef %%v23,24(%[x]),1\n\t"
+          "vlef %%v23,24(%[x]),3\n\t"
+#else
+          "vlef %%v20,0(%[x]),1\n\t"
+          "vlef %%v20,0(%[x]),3\n\t"
+          "vflcsb %%v20,%%v20\n\t"
+          "vlef %%v20,4(%[x]),0\n\t"
+          "vlef %%v20,4(%[x]),2\n\t"
+          "vlef %%v21,8(%[x]),1\n\t"
+          "vlef %%v21,8(%[x]),3\n\t"
+          "vflcsb %%v21,%%v21\n\t"
+          "vlef %%v21,12(%[x]),0\n\t"
+          "vlef %%v21,12(%[x]),2\n\t"
+          "vlef %%v22,16(%[x]),1\n\t"
+          "vlef %%v22,16(%[x]),3\n\t"
+          "vflcsb %%v22,%%v22\n\t"
+          "vlef %%v22,20(%[x]),0\n\t"
+          "vlef %%v22,20(%[x]),2\n\t"
+          "vlef %%v23,24(%[x]),1\n\t"
+          "vlef %%v23,24(%[x]),3\n\t"
+          "vflcsb %%v23,%%v23\n\t"
+          "vlef %%v23,28(%[x]),0\n\t"
+          "vlef %%v23,28(%[x]),2\n\t"
+#endif
+          "vleib %%v1,0,0\n\t"
+          "vleib %%v1,1,1\n\t"
+          "vleib %%v1,2,2\n\t"
+          "vleib %%v1,3,3\n\t"
+          "vleib %%v1,0,4\n\t"
+          "vleib %%v1,1,5\n\t"
+          "vleib %%v1,2,6\n\t"
+          "vleib %%v1,3,7\n\t"
+          "vleib %%v1,8,8\n\t"
+          "vleib %%v1,9,9\n\t"
+          "vleib %%v1,10,10\n\t"
+          "vleib %%v1,11,11\n\t"
+          "vleib %%v1,8,12\n\t"
+          "vleib %%v1,9,13\n\t"
+          "vleib %%v1,10,14\n\t"
+          "vleib %%v1,11,15\n\t"
+          "vleib %%v2,4,0\n\t"
+          "vleib %%v2,5,1\n\t"
+          "vleib %%v2,6,2\n\t"
+          "vleib %%v2,7,3\n\t"
+          "vleib %%v2,4,4\n\t"
+          "vleib %%v2,5,5\n\t"
+          "vleib %%v2,6,6\n\t"
+          "vleib %%v2,7,7\n\t"
+          "vleib %%v2,12,8\n\t"
+          "vleib %%v2,13,9\n\t"
+          "vleib %%v2,14,10\n\t"
+          "vleib %%v2,15,11\n\t"
+          "vleib %%v2,12,12\n\t"
+          "vleib %%v2,13,13\n\t"
+          "vleib %%v2,14,14\n\t"
+          "vleib %%v2,15,15\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],1\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[ap0])\n\t"
+          "pfd 1,1024(%%r1,%[ap1])\n\t"
+          "pfd 1,1024(%%r1,%[ap2])\n\t"
+          "pfd 1,1024(%%r1,%[ap3])\n\t"
+          "pfd 2,1024(%%r1,%[y])\n\t"
+          "vl %%v24,0(%%r1,%[ap0])\n\t"
+          "vperm %%v25,%%v24,%%v24,%%v2\n\t"
+          "vperm %%v24,%%v24,%%v24,%%v1\n\t"
+          "vl %%v26,0(%%r1,%[ap1])\n\t"
+          "vperm %%v27,%%v26,%%v26,%%v2\n\t"
+          "vperm %%v26,%%v26,%%v26,%%v1\n\t"
+          "vl %%v0,0(%%r1,%[y])\n\t"
+          "vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
+          "vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
+          "vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
+          "vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
+          "vl %%v28,0(%%r1,%[ap2])\n\t"
+          "vperm %%v29,%%v28,%%v28,%%v2\n\t"
+          "vperm %%v28,%%v28,%%v28,%%v1\n\t"
+          "vl %%v30,0(%%r1,%[ap3])\n\t"
+          "vperm %%v31,%%v30,%%v30,%%v2\n\t"
+          "vperm %%v30,%%v30,%%v30,%%v1\n\t"
+          "vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
+          "vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
+          "vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
+          "vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
+          "vst %%v0,0(%%r1,%[y])\n\t"
+          "agfi %%r1,16\n\t"
+          "brctg %[n],0b\n\t"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
+            "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
+            "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+            "v31");
+}
+
+static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
+  register FLOAT *ap0 = ap[0];
+  register FLOAT *ap1 = ap[1];
+
+  __asm__("vlrepg %%v16,0(%[x])\n\t"
+          "vlrepg %%v17,8(%[x])\n\t"
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+          "vlef %%v18,4(%[x]),0\n\t"
+          "vlef %%v18,4(%[x]),2\n\t"
+          "vflcsb %%v18,%%v18\n\t"
+          "vlef %%v18,0(%[x]),1\n\t"
+          "vlef %%v18,0(%[x]),3\n\t"
+          "vlef %%v19,12(%[x]),0\n\t"
+          "vlef %%v19,12(%[x]),2\n\t"
+          "vflcsb %%v19,%%v19\n\t"
+          "vlef %%v19,8(%[x]),1\n\t"
+          "vlef %%v19,8(%[x]),3\n\t"
+#else
+          "vlef %%v18,0(%[x]),1\n\t"
+          "vlef %%v18,0(%[x]),3\n\t"
+          "vflcsb %%v18,%%v18\n\t"
+          "vlef %%v18,4(%[x]),0\n\t"
+          "vlef %%v18,4(%[x]),2\n\t"
+          "vlef %%v19,8(%[x]),1\n\t"
+          "vlef %%v19,8(%[x]),3\n\t"
+          "vflcsb %%v19,%%v19\n\t"
+          "vlef %%v19,12(%[x]),0\n\t"
+          "vlef %%v19,12(%[x]),2\n\t"
+#endif
+          "vleib %%v1,0,0\n\t"
+          "vleib %%v1,1,1\n\t"
+          "vleib %%v1,2,2\n\t"
+          "vleib %%v1,3,3\n\t"
+          "vleib %%v1,0,4\n\t"
+          "vleib %%v1,1,5\n\t"
+          "vleib %%v1,2,6\n\t"
+          "vleib %%v1,3,7\n\t"
+          "vleib %%v1,8,8\n\t"
+          "vleib %%v1,9,9\n\t"
+          "vleib %%v1,10,10\n\t"
+          "vleib %%v1,11,11\n\t"
+          "vleib %%v1,8,12\n\t"
+          "vleib %%v1,9,13\n\t"
+          "vleib %%v1,10,14\n\t"
+          "vleib %%v1,11,15\n\t"
+          "vleib %%v2,4,0\n\t"
+          "vleib %%v2,5,1\n\t"
+          "vleib %%v2,6,2\n\t"
+          "vleib %%v2,7,3\n\t"
+          "vleib %%v2,4,4\n\t"
+          "vleib %%v2,5,5\n\t"
+          "vleib %%v2,6,6\n\t"
+          "vleib %%v2,7,7\n\t"
+          "vleib %%v2,12,8\n\t"
+          "vleib %%v2,13,9\n\t"
+          "vleib %%v2,14,10\n\t"
+          "vleib %%v2,15,11\n\t"
+          "vleib %%v2,12,12\n\t"
+          "vleib %%v2,13,13\n\t"
+          "vleib %%v2,14,14\n\t"
+          "vleib %%v2,15,15\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],1\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[ap0])\n\t"
+          "pfd 1,1024(%%r1,%[ap1])\n\t"
+          "pfd 2,1024(%%r1,%[y])\n\t"
+          "vl %%v20,0(%%r1,%[ap0])\n\t"
+          "vperm %%v21,%%v20,%%v20,%%v2\n\t"
+          "vperm %%v20,%%v20,%%v20,%%v1\n\t"
+          "vl %%v22,0(%%r1,%[ap1])\n\t"
+          "vperm %%v23,%%v22,%%v22,%%v2\n\t"
+          "vperm %%v22,%%v22,%%v22,%%v1\n\t"
+          "vl %%v0,0(%%r1,%[y])\n\t"
+          "vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
+          "vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
+          "vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
+          "vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
+          "vst %%v0,0(%%r1,%[y])\n\t"
+          "agfi %%r1,16\n\t"
+          "brctg %[n],0b\n\t"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
+            "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
+            "v21", "v22", "v23");
+}
+
+static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
+  __asm__("vlrepg %%v16,0(%[x])\n\t"
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+          "vlef %%v17,4(%[x]),0\n\t"
+          "vlef %%v17,4(%[x]),2\n\t"
+          "vflcsb %%v17,%%v17\n\t"
+          "vlef %%v17,0(%[x]),1\n\t"
+          "vlef %%v17,0(%[x]),3\n\t"
+#else
+          "vlef %%v17,0(%[x]),1\n\t"
+          "vlef %%v17,0(%[x]),3\n\t"
+          "vflcsb %%v17,%%v17\n\t"
+          "vlef %%v17,4(%[x]),0\n\t"
+          "vlef %%v17,4(%[x]),2\n\t"
+#endif
+          "vleib %%v1,0,0\n\t"
+          "vleib %%v1,1,1\n\t"
+          "vleib %%v1,2,2\n\t"
+          "vleib %%v1,3,3\n\t"
+          "vleib %%v1,0,4\n\t"
+          "vleib %%v1,1,5\n\t"
+          "vleib %%v1,2,6\n\t"
+          "vleib %%v1,3,7\n\t"
+          "vleib %%v1,8,8\n\t"
+          "vleib %%v1,9,9\n\t"
+          "vleib %%v1,10,10\n\t"
+          "vleib %%v1,11,11\n\t"
+          "vleib %%v1,8,12\n\t"
+          "vleib %%v1,9,13\n\t"
+          "vleib %%v1,10,14\n\t"
+          "vleib %%v1,11,15\n\t"
+          "vleib %%v2,4,0\n\t"
+          "vleib %%v2,5,1\n\t"
+          "vleib %%v2,6,2\n\t"
+          "vleib %%v2,7,3\n\t"
+          "vleib %%v2,4,4\n\t"
+          "vleib %%v2,5,5\n\t"
+          "vleib %%v2,6,6\n\t"
+          "vleib %%v2,7,7\n\t"
+          "vleib %%v2,12,8\n\t"
+          "vleib %%v2,13,9\n\t"
+          "vleib %%v2,14,10\n\t"
+          "vleib %%v2,15,11\n\t"
+          "vleib %%v2,12,12\n\t"
+          "vleib %%v2,13,13\n\t"
+          "vleib %%v2,14,14\n\t"
+          "vleib %%v2,15,15\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],1\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[ap])\n\t"
+          "pfd 2,1024(%%r1,%[y])\n\t"
+          "vl %%v18,0(%%r1,%[ap])\n\t"
+          "vperm %%v19,%%v18,%%v18,%%v2\n\t"
+          "vperm %%v18,%%v18,%%v18,%%v1\n\t"
+          "vl %%v0,0(%%r1,%[y])\n\t"
+          "vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
+          "vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
+          "vst %%v0,0(%%r1,%[y])\n\t"
+          "agfi %%r1,16\n\t"
+          "brctg %[n],0b\n\t"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
+            "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
+}
+
+static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
+                    FLOAT alpha_i) {
+  __asm__(
+#if !defined(XCONJ)
+          "vlrepf %%v0,%[alpha_r]\n\t"
+          "vlef %%v1,%[alpha_i],0\n\t"
+          "vlef %%v1,%[alpha_i],2\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,%[alpha_i],1\n\t"
+          "vlef %%v1,%[alpha_i],3\n\t"
+#else
+          "vlef %%v0,%[alpha_r],1\n\t"
+          "vlef %%v0,%[alpha_r],3\n\t"
+          "vflcsb %%v0,%%v0\n\t"
+          "vlef %%v0,%[alpha_r],0\n\t"
+          "vlef %%v0,%[alpha_r],2\n\t"
+          "vlrepf %%v1,%[alpha_i]\n\t"
+#endif
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],2\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[src])\n\t"
+          "pfd 2,1024(%%r1,%[dest])\n\t"
+          "vl %%v16,0(%%r1,%[src])\n\t"
+          "vl %%v17,16(%%r1,%[src])\n\t"
+          "vl %%v18,0(%%r1,%[dest])\n\t"
+          "vl %%v19,16(%%r1,%[dest])\n\t"
+          "verllg %%v20,%%v16,32\n\t"
+          "verllg %%v21,%%v17,32\n\t"
+          "vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
+          "vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
+          "vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
+          "vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
+          "vst %%v22,0(%%r1,%[dest])\n\t"
+          "vst %%v23,16(%%r1,%[dest])\n\t"
+          "agfi %%r1,32\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
+          : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
+            [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
+          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
+            "v22", "v23");
+}
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
+                  FLOAT alpha_r, FLOAT alpha_i) {
+  BLASLONG i;
+
+  if (inc_dest != 2) {
+
+    FLOAT temp_r;
+    FLOAT temp_i;
+    for (i = 0; i < n; i++) {
+#if !defined(XCONJ)
+      temp_r = alpha_r * src[0] - alpha_i * src[1];
+      temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+      temp_r = alpha_r * src[0] + alpha_i * src[1];
+      temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+      *dest += temp_r;
+      *(dest + 1) += temp_i;
+
+      src += 2;
+      dest += inc_dest;
+    }
+    return;
+  }
+
+  add_y_4(n, src, dest, alpha_r, alpha_i);
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+          FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+          BLASLONG inc_y, FLOAT *buffer) {
+  BLASLONG i;
+  FLOAT *a_ptr;
+  FLOAT *x_ptr;
+  FLOAT *y_ptr;
+  FLOAT *ap[4];
+  BLASLONG n1;
+  BLASLONG m1;
+  BLASLONG m2;
+  BLASLONG m3;
+  BLASLONG n2;
+  BLASLONG lda4;
+  FLOAT xbuffer[8], *ybuffer;
+
+  if (m < 1)
+    return (0);
+  if (n < 1)
+    return (0);
+
+  ybuffer = buffer;
+
+  inc_x *= 2;
+  inc_y *= 2;
+  lda *= 2;
+  lda4 = 4 * lda;
+
+  n1 = n / 4;
+  n2 = n % 4;
+
+  m3 = m % 4;
+  m1 = m - (m % 4);
+  m2 = (m % NBMAX) - (m % 4);
+
+  y_ptr = y;
+
+  BLASLONG NB = NBMAX;
+
+  while (NB == NBMAX) {
+
+    m1 -= NB;
+    if (m1 < 0) {
+      if (m2 == 0)
+        break;
+      NB = m2;
+    }
+
+    a_ptr = a;
+    ap[0] = a_ptr;
+    ap[1] = a_ptr + lda;
+    ap[2] = ap[1] + lda;
+    ap[3] = ap[2] + lda;
+    x_ptr = x;
+    //zero_y(NB,ybuffer);
+    memset(ybuffer, 0, NB * 8);
+
+    if (inc_x == 2) {
+
+      for (i = 0; i < n1; i++) {
+        cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
+        ap[0] += lda4;
+        ap[1] += lda4;
+        ap[2] += lda4;
+        ap[3] += lda4;
+        a_ptr += lda4;
+        x_ptr += 8;
+      }
+
+      if (n2 & 2) {
+        cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
+        x_ptr += 4;
+        a_ptr += 2 * lda;
+
+      }
+
+      if (n2 & 1) {
+        cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
+        /* x_ptr += 2;
+           a_ptr += lda; */
+
+      }
+    } else {
+
+      for (i = 0; i < n1; i++) {
+
+        xbuffer[0] = x_ptr[0];
+        xbuffer[1] = x_ptr[1];
+        x_ptr += inc_x;
+        xbuffer[2] = x_ptr[0];
+        xbuffer[3] = x_ptr[1];
+        x_ptr += inc_x;
+        xbuffer[4] = x_ptr[0];
+        xbuffer[5] = x_ptr[1];
+        x_ptr += inc_x;
+        xbuffer[6] = x_ptr[0];
+        xbuffer[7] = x_ptr[1];
+        x_ptr += inc_x;
+
+        cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer);
+        ap[0] += lda4;
+        ap[1] += lda4;
+        ap[2] += lda4;
+        ap[3] += lda4;
+        a_ptr += lda4;
+      }
+
+      for (i = 0; i < n2; i++) {
+        xbuffer[0] = x_ptr[0];
+        xbuffer[1] = x_ptr[1];
+        x_ptr += inc_x;
+        cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
+        a_ptr += 1 * lda;
+
+      }
+
+    }
+
+    add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
+    a += 2 * NB;
+    y_ptr += NB * inc_y;
+  }
+
+  if (m3 == 0)
+    return (0);
+
+  if (m3 == 1) {
+    a_ptr = a;
+    x_ptr = x;
+    FLOAT temp_r = 0.0;
+    FLOAT temp_i = 0.0;
+
+    if (lda == 2 && inc_x == 2) {
+
+      for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+        temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
+        temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
+#else
+        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+        temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
+        temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
+#endif
+
+        a_ptr += 4;
+        x_ptr += 4;
+      }
+
+      for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+        a_ptr += 2;
+        x_ptr += 2;
+      }
+
+    } else {
+
+      for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+        temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+        a_ptr += lda;
+        x_ptr += inc_x;
+      }
+
+    }
+#if !defined(XCONJ)
+    y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+    y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+    y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+    y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+    return (0);
+  }
+
+  if (m3 == 2) {
+    a_ptr = a;
+    x_ptr = x;
+    FLOAT temp_r0 = 0.0;
+    FLOAT temp_i0 = 0.0;
+    FLOAT temp_r1 = 0.0;
+    FLOAT temp_i1 = 0.0;
+
+    if (lda == 4 && inc_x == 2) {
+
+      for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+
+        temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
+        temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
+        temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
+        temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
+
+#else
+        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+
+        temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
+        temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
+        temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+        temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
+
+#endif
+
+        a_ptr += 8;
+        x_ptr += 4;
+      }
+
+      for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+        a_ptr += 4;
+        x_ptr += 2;
+      }
+
+    } else {
+
+      for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+        a_ptr += lda;
+        x_ptr += inc_x;
+      }
+
+    }
+#if !defined(XCONJ)
+    y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y_ptr += inc_y;
+    y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+    y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y_ptr += inc_y;
+    y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+    return (0);
+  }
+
+  if (m3 == 3) {
+    a_ptr = a;
+    x_ptr = x;
+    FLOAT temp_r0 = 0.0;
+    FLOAT temp_i0 = 0.0;
+    FLOAT temp_r1 = 0.0;
+    FLOAT temp_i1 = 0.0;
+    FLOAT temp_r2 = 0.0;
+    FLOAT temp_i2 = 0.0;
+
+    if (lda == 6 && inc_x == 2) {
+
+      for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+        temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+        temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+        temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+        temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+        a_ptr += 6;
+        x_ptr += 2;
+      }
+
+    } else {
+
+      for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+        temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+        temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+        temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+        temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+        temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+        temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+        temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+        temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+        a_ptr += lda;
+        x_ptr += inc_x;
+      }
+
+    }
+#if !defined(XCONJ)
+    y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y_ptr += inc_y;
+    y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+    y_ptr += inc_y;
+    y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
+    y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+#else
+    y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y_ptr += inc_y;
+    y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+    y_ptr += inc_y;
+    y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
+    y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+#endif
+    return (0);
+  }
+
+  return (0);
+}
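For reference, a minimal scalar sketch (not part of the patch; the name cgemv_n_ref and the single-column scope are illustrative, and only the no-conjugation case is shown) of the per-column update the cgemv_n kernels above vectorize with vperm/vfmasb:

  /* Sketch: y += (x_re + i*x_im) * a for one column of A, interleaved
     re/im storage; mirrors the scalar m3 tail loops in the file. */
  static void cgemv_n_ref(BLASLONG n, FLOAT *a, FLOAT x_re, FLOAT x_im,
                          FLOAT *y) {
    BLASLONG i;
    for (i = 0; i < 2 * n; i += 2) {
      y[i]     += x_re * a[i]     - x_im * a[i + 1]; /* real part */
      y[i + 1] += x_re * a[i + 1] + x_im * a[i];     /* imaginary part */
    }
  }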
diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c
new file mode 100644
index 0000000000..e10edfab02
--- /dev/null
+++ b/kernel/zarch/cgemv_t_4.c
@@ -0,0 +1,724 @@
+/***************************************************************************
+Copyright (c) 2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#define NBMAX 2048
+
+static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
+                             FLOAT *alpha) {
+  register FLOAT *ap0 = ap[0];
+  register FLOAT *ap1 = ap[1];
+  register FLOAT *ap2 = ap[2];
+  register FLOAT *ap3 = ap[3];
+
+  __asm__("vzero %%v16\n\t"
+          "vzero %%v17\n\t"
+          "vzero %%v18\n\t"
+          "vzero %%v19\n\t"
+          "vzero %%v20\n\t"
+          "vzero %%v21\n\t"
+          "vzero %%v22\n\t"
+          "vzero %%v23\n\t"
+          "vleib %%v2,0,0\n\t"
+          "vleib %%v2,1,1\n\t"
+          "vleib %%v2,2,2\n\t"
+          "vleib %%v2,3,3\n\t"
+          "vleib %%v2,0,4\n\t"
+          "vleib %%v2,1,5\n\t"
+          "vleib %%v2,2,6\n\t"
+          "vleib %%v2,3,7\n\t"
+          "vleib %%v2,8,8\n\t"
+          "vleib %%v2,9,9\n\t"
+          "vleib %%v2,10,10\n\t"
+          "vleib %%v2,11,11\n\t"
+          "vleib %%v2,8,12\n\t"
+          "vleib %%v2,9,13\n\t"
+          "vleib %%v2,10,14\n\t"
+          "vleib %%v2,11,15\n\t"
+          "vleib %%v3,4,0\n\t"
+          "vleib %%v3,5,1\n\t"
+          "vleib %%v3,6,2\n\t"
+          "vleib %%v3,7,3\n\t"
+          "vleib %%v3,4,4\n\t"
+          "vleib %%v3,5,5\n\t"
+          "vleib %%v3,6,6\n\t"
+          "vleib %%v3,7,7\n\t"
+          "vleib %%v3,12,8\n\t"
+          "vleib %%v3,13,9\n\t"
+          "vleib %%v3,14,10\n\t"
+          "vleib %%v3,15,11\n\t"
+          "vleib %%v3,12,12\n\t"
+          "vleib %%v3,13,13\n\t"
+          "vleib %%v3,14,14\n\t"
+          "vleib %%v3,15,15\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],1\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[ap0])\n\t"
+          "pfd 1,1024(%%r1,%[ap1])\n\t"
+          "pfd 1,1024(%%r1,%[ap2])\n\t"
+          "pfd 1,1024(%%r1,%[ap3])\n\t"
+          "pfd 1,1024(%%r1,%[x])\n\t"
+          "vl %%v0,0(%%r1,%[x])\n\t"
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+          "vlef %%v1,4(%%r1,%[x]),0\n\t"
+          "vlef %%v1,12(%%r1,%[x]),2\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,0(%%r1,%[x]),1\n\t"
+          "vlef %%v1,8(%%r1,%[x]),3\n\t"
+#else
+          "vlef %%v1,0(%%r1,%[x]),1\n\t"
+          "vlef %%v1,8(%%r1,%[x]),3\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,4(%%r1,%[x]),0\n\t"
+          "vlef %%v1,12(%%r1,%[x]),2\n\t"
+#endif
+          "vl %%v24,0(%%r1,%[ap0])\n\t"
+          "vperm %%v25,%%v24,%%v24,%%v3\n\t"
+          "vperm %%v24,%%v24,%%v24,%%v2\n\t"
+          "vl %%v26,0(%%r1,%[ap1])\n\t"
+          "vperm %%v27,%%v26,%%v26,%%v3\n\t"
+          "vperm %%v26,%%v26,%%v26,%%v2\n\t"
+          "vl %%v28,0(%%r1,%[ap2])\n\t"
+          "vperm %%v29,%%v28,%%v28,%%v3\n\t"
+          "vperm %%v28,%%v28,%%v28,%%v2\n\t"
+          "vl %%v30,0(%%r1,%[ap3])\n\t"
+          "vperm %%v31,%%v30,%%v30,%%v3\n\t"
+          "vperm %%v30,%%v30,%%v30,%%v2\n\t"
+          "vfmasb %%v16,%%v24,%%v0,%%v16\n\t"
+          "vfmasb %%v20,%%v25,%%v1,%%v20\n\t"
+          "vfmasb %%v17,%%v26,%%v0,%%v17\n\t"
+          "vfmasb %%v21,%%v27,%%v1,%%v21\n\t"
+          "vfmasb %%v18,%%v28,%%v0,%%v18\n\t"
+          "vfmasb %%v22,%%v29,%%v1,%%v22\n\t"
+          "vfmasb %%v19,%%v30,%%v0,%%v19\n\t"
+          "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
+          "agfi %%r1,16\n\t"
+          "brctg %[n],0b\n\t"
+          "vfasb %%v16,%%v16,%%v20\n\t"
+          "vfasb %%v17,%%v17,%%v21\n\t"
+          "vfasb %%v18,%%v18,%%v22\n\t"
+          "vfasb %%v19,%%v19,%%v23\n\t"
+          "vrepg %%v20,%%v16,1\n\t"
+          "vrepg %%v21,%%v17,1\n\t"
+          "vrepg %%v22,%%v18,1\n\t"
+          "vrepg %%v23,%%v19,1\n\t"
+          "vfasb %%v16,%%v16,%%v20\n\t"
+          "vfasb %%v17,%%v17,%%v21\n\t"
+          "vfasb %%v18,%%v18,%%v22\n\t"
+          "vfasb %%v19,%%v19,%%v23\n\t"
+          "vmrhg %%v16,%%v16,%%v17\n\t"
+          "vmrhg %%v17,%%v18,%%v19\n\t"
+          "verllg %%v18,%%v16,32\n\t"
+          "verllg %%v19,%%v17,32\n\t"
+#if !defined(XCONJ)
+          "vlrepf %%v20,0(%[alpha])\n\t"
+          "vlef %%v21,4(%[alpha]),0\n\t"
+          "vlef %%v21,4(%[alpha]),2\n\t"
+          "vflcsb %%v21,%%v21\n\t"
+          "vlef %%v21,4(%[alpha]),1\n\t"
+          "vlef %%v21,4(%[alpha]),3\n\t"
+#else
+          "vlef %%v20,0(%[alpha]),1\n\t"
+          "vlef %%v20,0(%[alpha]),3\n\t"
+          "vflcsb %%v20,%%v20\n\t"
+          "vlef %%v20,0(%[alpha]),0\n\t"
+          "vlef %%v20,0(%[alpha]),2\n\t"
+          "vlrepf %%v21,4(%[alpha])\n\t"
+#endif
+          "vl %%v22,0(%[y])\n\t"
+          "vl %%v23,16(%[y])\n\t"
+          "vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
+          "vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
+          "vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
+          "vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
+          "vst %%v22,0(%[y])\n\t"
+          "vst %%v23,16(%[y])"
+          : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
+            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
+            "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+            "v31");
+}
+
+static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
+                             FLOAT *alpha) {
+  register FLOAT *ap0 = ap[0];
+  register FLOAT *ap1 = ap[1];
+
+  __asm__("vzero %%v16\n\t"
+          "vzero %%v17\n\t"
+          "vzero %%v18\n\t"
+          "vzero %%v19\n\t"
+          "vleib %%v2,0,0\n\t"
+          "vleib %%v2,1,1\n\t"
+          "vleib %%v2,2,2\n\t"
+          "vleib %%v2,3,3\n\t"
+          "vleib %%v2,0,4\n\t"
+          "vleib %%v2,1,5\n\t"
+          "vleib %%v2,2,6\n\t"
+          "vleib %%v2,3,7\n\t"
+          "vleib %%v2,8,8\n\t"
+          "vleib %%v2,9,9\n\t"
+          "vleib %%v2,10,10\n\t"
+          "vleib %%v2,11,11\n\t"
+          "vleib %%v2,8,12\n\t"
+          "vleib %%v2,9,13\n\t"
+          "vleib %%v2,10,14\n\t"
+          "vleib %%v2,11,15\n\t"
+          "vleib %%v3,4,0\n\t"
+          "vleib %%v3,5,1\n\t"
+          "vleib %%v3,6,2\n\t"
+          "vleib %%v3,7,3\n\t"
+          "vleib %%v3,4,4\n\t"
+          "vleib %%v3,5,5\n\t"
+          "vleib %%v3,6,6\n\t"
+          "vleib %%v3,7,7\n\t"
+          "vleib %%v3,12,8\n\t"
+          "vleib %%v3,13,9\n\t"
+          "vleib %%v3,14,10\n\t"
+          "vleib %%v3,15,11\n\t"
+          "vleib %%v3,12,12\n\t"
+          "vleib %%v3,13,13\n\t"
+          "vleib %%v3,14,14\n\t"
+          "vleib %%v3,15,15\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],1\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[ap0])\n\t"
+          "pfd 1,1024(%%r1,%[ap1])\n\t"
+          "pfd 1,1024(%%r1,%[x])\n\t"
+          "vl %%v0,0(%%r1,%[x])\n\t"
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+          "vlef %%v1,4(%%r1,%[x]),0\n\t"
+          "vlef %%v1,12(%%r1,%[x]),2\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,0(%%r1,%[x]),1\n\t"
+          "vlef %%v1,8(%%r1,%[x]),3\n\t"
+#else
+          "vlef %%v1,0(%%r1,%[x]),1\n\t"
+          "vlef %%v1,8(%%r1,%[x]),3\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,4(%%r1,%[x]),0\n\t"
+          "vlef %%v1,12(%%r1,%[x]),2\n\t"
+#endif
+          "vl %%v20,0(%%r1,%[ap0])\n\t"
+          "vperm %%v21,%%v20,%%v20,%%v3\n\t"
+          "vperm %%v20,%%v20,%%v20,%%v2\n\t"
+          "vl %%v22,0(%%r1,%[ap1])\n\t"
+          "vperm %%v23,%%v22,%%v22,%%v3\n\t"
+          "vperm %%v22,%%v22,%%v22,%%v2\n\t"
+          "vfmasb %%v16,%%v20,%%v0,%%v16\n\t"
+          "vfmasb %%v18,%%v21,%%v1,%%v18\n\t"
+          "vfmasb %%v17,%%v22,%%v0,%%v17\n\t"
+          "vfmasb %%v19,%%v23,%%v1,%%v19\n\t"
+          "agfi %%r1,16\n\t"
+          "brctg %[n],0b\n\t"
+          "vfasb %%v16,%%v16,%%v18\n\t"
+          "vfasb %%v17,%%v17,%%v19\n\t"
+          "vrepg %%v18,%%v16,1\n\t"
+          "vrepg %%v19,%%v17,1\n\t"
+          "vfasb %%v16,%%v16,%%v18\n\t"
+          "vfasb %%v17,%%v17,%%v19\n\t"
+          "vmrhg %%v16,%%v16,%%v17\n\t"
+          "verllg %%v17,%%v16,32\n\t"
+#if !defined(XCONJ)
+          "vlrepf %%v18,0(%[alpha])\n\t"
+          "vlef %%v19,4(%[alpha]),0\n\t"
+          "vlef %%v19,4(%[alpha]),2\n\t"
+          "vflcsb %%v19,%%v19\n\t"
+          "vlef %%v19,4(%[alpha]),1\n\t"
+          "vlef %%v19,4(%[alpha]),3\n\t"
+#else
+          "vlef %%v18,0(%[alpha]),1\n\t"
+          "vlef %%v18,0(%[alpha]),3\n\t"
+          "vflcsb %%v18,%%v18\n\t"
+          "vlef %%v18,0(%[alpha]),0\n\t"
+          "vlef %%v18,0(%[alpha]),2\n\t"
+          "vlrepf %%v19,4(%[alpha])\n\t"
+#endif
+          "vl %%v20,0(%[y])\n\t"
+          "vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
+          "vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
+          "vst %%v20,0(%[y])"
+          : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
+            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
+            "v21", "v22", "v23");
+}
+
+static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
+                             FLOAT *alpha) {
+  __asm__("vzero %%v16\n\t"
+          "vzero %%v17\n\t"
+          "vleib %%v2,0,0\n\t"
+          "vleib %%v2,1,1\n\t"
+          "vleib %%v2,2,2\n\t"
+          "vleib %%v2,3,3\n\t"
+          "vleib %%v2,0,4\n\t"
+          "vleib %%v2,1,5\n\t"
+          "vleib %%v2,2,6\n\t"
+          "vleib %%v2,3,7\n\t"
+          "vleib %%v2,8,8\n\t"
+          "vleib %%v2,9,9\n\t"
+          "vleib %%v2,10,10\n\t"
+          "vleib %%v2,11,11\n\t"
+          "vleib %%v2,8,12\n\t"
+          "vleib %%v2,9,13\n\t"
+          "vleib %%v2,10,14\n\t"
+          "vleib %%v2,11,15\n\t"
+          "vleib %%v3,4,0\n\t"
+          "vleib %%v3,5,1\n\t"
+          "vleib %%v3,6,2\n\t"
+          "vleib %%v3,7,3\n\t"
+          "vleib %%v3,4,4\n\t"
+          "vleib %%v3,5,5\n\t"
+          "vleib %%v3,6,6\n\t"
+          "vleib %%v3,7,7\n\t"
+          "vleib %%v3,12,8\n\t"
+          "vleib %%v3,13,9\n\t"
+          "vleib %%v3,14,10\n\t"
+          "vleib %%v3,15,11\n\t"
+          "vleib %%v3,12,12\n\t"
+          "vleib %%v3,13,13\n\t"
+          "vleib %%v3,14,14\n\t"
+          "vleib %%v3,15,15\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "srlg %[n],%[n],1\n\t"
+          "0:\n\t"
+          "pfd 1,1024(%%r1,%[ap])\n\t"
+          "pfd 1,1024(%%r1,%[x])\n\t"
+          "vl %%v0,0(%%r1,%[x])\n\t"
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+          "vlef %%v1,4(%%r1,%[x]),0\n\t"
+          "vlef %%v1,12(%%r1,%[x]),2\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,0(%%r1,%[x]),1\n\t"
+          "vlef %%v1,8(%%r1,%[x]),3\n\t"
+#else
+          "vlef %%v1,0(%%r1,%[x]),1\n\t"
+          "vlef %%v1,8(%%r1,%[x]),3\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,4(%%r1,%[x]),0\n\t"
+          "vlef %%v1,12(%%r1,%[x]),2\n\t"
+#endif
+          "vl %%v18,0(%%r1,%[ap])\n\t"
+          "vperm %%v19,%%v18,%%v18,%%v3\n\t"
+          "vperm %%v18,%%v18,%%v18,%%v2\n\t"
+          "vfmasb %%v16,%%v18,%%v0,%%v16\n\t"
+          "vfmasb %%v17,%%v19,%%v1,%%v17\n\t"
+          "agfi %%r1,16\n\t"
+          "brctg %[n],0b\n\t"
+          "vfasb %%v16,%%v16,%%v17\n\t"
+          "vrepg %%v17,%%v16,1\n\t"
+          "vfasb %%v16,%%v16,%%v17\n\t"
+          "verllg %%v17,%%v16,32\n\t"
+#if !defined(XCONJ)
+          "vlrepf %%v18,0(%[alpha])\n\t"
+          "vlef %%v19,4(%[alpha]),0\n\t"
+          "vflcsb %%v19,%%v19\n\t"
+          "vlef %%v19,4(%[alpha]),1\n\t"
+#else
+          "vlef %%v18,0(%[alpha]),1\n\t"
+          "vflcsb %%v18,%%v18\n\t"
+          "vlef %%v18,0(%[alpha]),0\n\t"
+          "vlrepf %%v19,4(%[alpha])\n\t"
+#endif
+          "vleg %%v0,0(%[y]),0\n\t"
+          "vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
+          "vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
+          "vsteg %%v0,0(%[y]),0"
+          : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
+          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
+            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
+            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+}
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+  BLASLONG i;
+  for (i = 0; i < n; i++) {
+    *dest = *src;
+    *(dest + 1) = *(src + 1);
+    dest += 2;
+    src += inc_src;
+  }
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+          FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+          BLASLONG inc_y, FLOAT *buffer) {
+  BLASLONG i;
+  BLASLONG j;
+  FLOAT *a_ptr;
+  FLOAT *x_ptr;
+  FLOAT *y_ptr;
+  FLOAT *ap[8];
+  BLASLONG n1;
+  BLASLONG m1;
+  BLASLONG m2;
+  BLASLONG m3;
+  BLASLONG n2;
+  BLASLONG lda4;
+  FLOAT ybuffer[8], *xbuffer;
+  FLOAT alpha[2];
+
+  if (m < 1)
+    return (0);
+  if (n < 1)
+    return (0);
+
+  inc_x <<= 1;
+  inc_y <<= 1;
+  lda <<= 1;
+  lda4 = lda << 2;
+
+  xbuffer = buffer;
+
+  n1 = n >> 2;
+  n2 = n & 3;
+
+  m3 = m & 3;
+  m1 = m - m3;
+  m2 = (m & (NBMAX - 1)) - m3;
+
+  alpha[0] = alpha_r;
+  alpha[1] = alpha_i;
+
+  BLASLONG NB = NBMAX;
+
+  while (NB == NBMAX) {
+
+    m1 -= NB;
+    if (m1 < 0) {
+      if (m2 == 0)
+        break;
+      NB = m2;
+    }
+
+    y_ptr = y;
+    a_ptr = a;
+    x_ptr = x;
+    ap[0] = a_ptr;
+    ap[1] = a_ptr + lda;
+    ap[2] = ap[1] + lda;
+    ap[3] = ap[2] + lda;
+    if (inc_x != 2)
+      copy_x(NB, x_ptr, xbuffer, inc_x);
+    else
+      xbuffer = x_ptr;
+
+    if (inc_y == 2) {
+
+      for (i = 0; i < n1; i++) {
+        cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
+        ap[0] += lda4;
+        ap[1] += lda4;
+        ap[2] += lda4;
+        ap[3] += lda4;
+        a_ptr += lda4;
+        y_ptr += 8;
+
+      }
+
+      if (n2 & 2) {
+        cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
+        a_ptr += lda * 2;
+        y_ptr += 4;
+
+      }
+
+      if (n2 & 1) {
+        cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
+        /* a_ptr += lda;
+           y_ptr += 2; */
+
+      }
+
+    } else {
+
+      for (i = 0; i < n1; i++) {
+        memset(ybuffer, 0, sizeof(ybuffer));
+        cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
+        ap[0] += lda4;
+        ap[1] += lda4;
+        ap[2] += lda4;
+        ap[3] += lda4;
+        a_ptr += lda4;
+
+        y_ptr[0] += ybuffer[0];
+        y_ptr[1] += ybuffer[1];
+        y_ptr += inc_y;
+        y_ptr[0] += ybuffer[2];
+        y_ptr[1] += ybuffer[3];
+        y_ptr += inc_y;
+        y_ptr[0] += ybuffer[4];
+        y_ptr[1] += ybuffer[5];
+        y_ptr += inc_y;
+        y_ptr[0] += ybuffer[6];
+        y_ptr[1] += ybuffer[7];
+        y_ptr += inc_y;
+
+      }
+
+      for (i = 0; i < n2; i++) {
+        memset(ybuffer, 0, sizeof(ybuffer));
+        cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
+        a_ptr += lda;
+        y_ptr[0] += ybuffer[0];
+        y_ptr[1] += ybuffer[1];
+        y_ptr += inc_y;
+
+      }
+
+    }
+    a += 2 * NB;
+    x += NB * inc_x;
+  }
+
+  if (m3 == 0)
+    return (0);
+
+  x_ptr = x;
+  j = 0;
+  a_ptr = a;
+  y_ptr = y;
+
+  if (m3 == 3) {
+
+    FLOAT temp_r;
+    FLOAT temp_i;
+    FLOAT x0 = x_ptr[0];
+    FLOAT x1 = x_ptr[1];
+    x_ptr += inc_x;
+    FLOAT x2 = x_ptr[0];
+    FLOAT x3 = x_ptr[1];
+    x_ptr += inc_x;
+    FLOAT x4 = x_ptr[0];
+    FLOAT x5 = x_ptr[1];
+    while (j < n) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+      temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
+      temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
+#else
+
+      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+      temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
+      temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
+#endif
+
+#if !defined(XCONJ)
+      y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+      y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+      y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+      y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+      a_ptr += lda;
+      y_ptr += inc_y;
+      j++;
+    }
+    return (0);
+  }
+
+  if (m3 == 2) {
+
+    FLOAT temp_r;
+    FLOAT temp_i;
+    FLOAT temp_r1;
+    FLOAT temp_i1;
+    FLOAT x0 = x_ptr[0];
+    FLOAT x1 = x_ptr[1];
+    x_ptr += inc_x;
+    FLOAT x2 = x_ptr[0];
+    FLOAT x3 = x_ptr[1];
+    FLOAT ar = alpha[0];
+    FLOAT ai = alpha[1];
+
+    while (j < (n & -2)) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+      a_ptr += lda;
+      temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+      temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
+      temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+      a_ptr += lda;
+      temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+      temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
+      temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ)
+      y_ptr[0] += ar * temp_r - ai * temp_i;
+      y_ptr[1] += ar * temp_i + ai * temp_r;
+      y_ptr += inc_y;
+      y_ptr[0] += ar * temp_r1 - ai * temp_i1;
+      y_ptr[1] += ar * temp_i1 + ai * temp_r1;
+#else
+      y_ptr[0] += ar * temp_r + ai * temp_i;
+      y_ptr[1] -= ar * temp_i - ai * temp_r;
+      y_ptr += inc_y;
+      y_ptr[0] += ar * temp_r1 + ai * temp_i1;
+      y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
+#endif
+
+      a_ptr += lda;
+      y_ptr += inc_y;
+      j += 2;
+    }
+
+    while (j < n) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+      temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+      temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+      temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+      temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ)
+      y_ptr[0] += ar * temp_r - ai * temp_i;
+      y_ptr[1] += ar * temp_i + ai * temp_r;
+#else
+      y_ptr[0] += ar * temp_r + ai * temp_i;
+      y_ptr[1] -= ar * temp_i - ai * temp_r;
+#endif
+
+      a_ptr += lda;
+      y_ptr += inc_y;
+      j++;
+    }
+
+    return (0);
+  }
+
+  if (m3 == 1) {
+
+    FLOAT temp_r;
+    FLOAT temp_i;
+    FLOAT temp_r1;
+    FLOAT temp_i1;
+    FLOAT x0 = x_ptr[0];
+    FLOAT x1 = x_ptr[1];
+    FLOAT ar = alpha[0];
+    FLOAT ai = alpha[1];
+
+    while (j < (n & -2)) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+      a_ptr += lda;
+      temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+      a_ptr += lda;
+      temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ)
+      y_ptr[0] += ar * temp_r - ai * temp_i;
+      y_ptr[1] += ar * temp_i + ai * temp_r;
+      y_ptr += inc_y;
+      y_ptr[0] += ar * temp_r1 - ai * temp_i1;
+      y_ptr[1] += ar * temp_i1 + ai * temp_r1;
+#else
+      y_ptr[0] += ar * temp_r + ai * temp_i;
+      y_ptr[1] -= ar * temp_i - ai * temp_r;
+      y_ptr += inc_y;
+      y_ptr[0] += ar * temp_r1 + ai * temp_i1;
+      y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
+#endif
+
+      a_ptr += lda;
+      y_ptr += inc_y;
+      j += 2;
+    }
+
+    while (j < n) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+      temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+      temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+      temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ)
+      y_ptr[0] += ar * temp_r - ai * temp_i;
+      y_ptr[1] += ar * temp_i + ai * temp_r;
+#else
+      y_ptr[0] += ar * temp_r + ai * temp_i;
+      y_ptr[1] -= ar * temp_i - ai * temp_r;
+#endif
+
+      a_ptr += lda;
+      y_ptr += inc_y;
+      j++;
+    }
+    return (0);
+  }
+
+  return (0);
+}
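For reference, a minimal scalar sketch (not part of the patch; cgemv_t_ref is a hypothetical name, no-conjugation case only) of the per-column dot product the transposed kernels above reduce with vfasb/vrepg before the final alpha scaling:

  /* Sketch: temp = sum_i a[i] * x[i] over one column, interleaved re/im;
     the caller then applies y += alpha * temp, as in the m3 tails above. */
  static void cgemv_t_ref(BLASLONG n, FLOAT *a, FLOAT *x, FLOAT *temp) {
    BLASLONG i;
    temp[0] = temp[1] = 0.0;
    for (i = 0; i < 2 * n; i += 2) {
      temp[0] += a[i] * x[i]     - a[i + 1] * x[i + 1]; /* real part */
      temp[1] += a[i] * x[i + 1] + a[i + 1] * x[i];     /* imaginary part */
    }
  }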
diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c
new file mode 100644
index 0000000000..aab155f8b5
--- /dev/null
+++ b/kernel/zarch/crot.c
@@ -0,0 +1,236 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
+  __asm__("vlrepf %%v0,%[c]\n\t"
+          "vlrepf %%v1,%[s]\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "pfd 2, 1024(%%r1,%[y])\n\t"
+          "vl %%v24, 0(%%r1,%[x])\n\t"
+          "vl %%v25, 16(%%r1,%[x])\n\t"
+          "vl %%v26, 32(%%r1,%[x])\n\t"
+          "vl %%v27, 48(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[y])\n\t"
+          "vl %%v17, 16(%%r1,%[y])\n\t"
+          "vl %%v18, 32(%%r1,%[y])\n\t"
+          "vl %%v19, 48(%%r1,%[y])\n\t"
+          "vfmsb %%v28,%%v24,%%v0\n\t"
+          "vfmsb %%v29,%%v25,%%v0\n\t"
+          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v30,%%v26,%%v0\n\t"
+          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v31,%%v27,%%v0\n\t"
+          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+          /* 2nd parts */
+          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
+          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
+          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
+          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
+          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+          "vst %%v28, 0(%%r1,%[x])\n\t"
+          "vst %%v29, 16(%%r1,%[x])\n\t"
+          "vst %%v30, 32(%%r1,%[x])\n\t"
+          "vst %%v31, 48(%%r1,%[x])\n\t"
+          "vst %%v20, 0(%%r1,%[y])\n\t"
+          "vst %%v21, 16(%%r1,%[y])\n\t"
+          "vst %%v22, 32(%%r1,%[y])\n\t"
+          "vst %%v23, 48(%%r1,%[y])\n\t"
+          "vl %%v24, 64(%%r1,%[x])\n\t"
+          "vl %%v25, 80(%%r1,%[x])\n\t"
+          "vl %%v26, 96(%%r1,%[x])\n\t"
+          "vl %%v27, 112(%%r1,%[x])\n\t"
+          "vl %%v16, 64(%%r1,%[y])\n\t"
+          "vl %%v17, 80(%%r1,%[y])\n\t"
+          "vl %%v18, 96(%%r1,%[y])\n\t"
+          "vl %%v19, 112(%%r1,%[y])\n\t"
+          "vfmsb %%v28,%%v24,%%v0\n\t"
+          "vfmsb %%v29,%%v25,%%v0\n\t"
+          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v30,%%v26,%%v0\n\t"
+          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v31,%%v27,%%v0\n\t"
+          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+          /* 2nd parts */
+          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
+          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
+          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
+          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
+          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+          "vst %%v28, 64(%%r1,%[x])\n\t"
+          "vst %%v29, 80(%%r1,%[x])\n\t"
+          "vst %%v30, 96(%%r1,%[x])\n\t"
+          "vst %%v31, 112(%%r1,%[x])\n\t"
+          "vst %%v20, 64(%%r1,%[y])\n\t"
+          "vst %%v21, 80(%%r1,%[y])\n\t"
+          "vst %%v22, 96(%%r1,%[y])\n\t"
+          "vst %%v23, 112(%%r1,%[y])\n\t"
+          "vl %%v24, 128(%%r1,%[x])\n\t"
+          "vl %%v25, 144(%%r1,%[x])\n\t"
+          "vl %%v26, 160(%%r1,%[x])\n\t"
+          "vl %%v27, 176(%%r1,%[x])\n\t"
+          "vl %%v16, 128(%%r1,%[y])\n\t"
+          "vl %%v17, 144(%%r1,%[y])\n\t"
+          "vl %%v18, 160(%%r1,%[y])\n\t"
+          "vl %%v19, 176(%%r1,%[y])\n\t"
+          "vfmsb %%v28,%%v24,%%v0\n\t"
+          "vfmsb %%v29,%%v25,%%v0\n\t"
+          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v30,%%v26,%%v0\n\t"
+          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v31,%%v27,%%v0\n\t"
+          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+          /* 2nd parts */
+          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
+          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
+          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
+          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
+          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+          "vst %%v28, 128(%%r1,%[x])\n\t"
+          "vst %%v29, 144(%%r1,%[x])\n\t"
+          "vst %%v30, 160(%%r1,%[x])\n\t"
+          "vst %%v31, 176(%%r1,%[x])\n\t"
+          "vst %%v20, 128(%%r1,%[y])\n\t"
+          "vst %%v21, 144(%%r1,%[y])\n\t"
+          "vst %%v22, 160(%%r1,%[y])\n\t"
+          "vst %%v23, 176(%%r1,%[y])\n\t"
+          "vl %%v24, 192(%%r1,%[x])\n\t"
+          "vl %%v25, 208(%%r1,%[x])\n\t"
+          "vl %%v26, 224(%%r1,%[x])\n\t"
+          "vl %%v27, 240(%%r1,%[x])\n\t"
+          "vl %%v16, 192(%%r1,%[y])\n\t"
+          "vl %%v17, 208(%%r1,%[y])\n\t"
+          "vl %%v18, 224(%%r1,%[y])\n\t"
+          "vl %%v19, 240(%%r1,%[y])\n\t"
+          "vfmsb %%v28,%%v24,%%v0\n\t"
+          "vfmsb %%v29,%%v25,%%v0\n\t"
+          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v30,%%v26,%%v0\n\t"
+          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
+          "vfmsb %%v31,%%v27,%%v0\n\t"
+          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
+          /* 2nd parts */
+          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
+          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
+          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
+          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
+          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
+          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
+          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
+          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
+          "vst %%v28, 192(%%r1,%[x])\n\t"
+          "vst %%v29, 208(%%r1,%[x])\n\t"
+          "vst %%v30, 224(%%r1,%[x])\n\t"
+          "vst %%v31, 240(%%r1,%[x])\n\t"
+          "vst %%v20, 192(%%r1,%[y])\n\t"
+          "vst %%v21, 208(%%r1,%[y])\n\t"
+          "vst %%v22, 224(%%r1,%[y])\n\t"
+          "vst %%v23, 240(%%r1,%[y])\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
+            "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
+          : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
+          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
+            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+            "v31");
+}
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT c, FLOAT s) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0, iy = 0;
+  FLOAT temp[2];
+  BLASLONG inc_x2;
+  BLASLONG inc_y2;
+
+  if (n <= 0)
+    return (0);
+
+  if ((inc_x == 1) && (inc_y == 1)) {
+
+    BLASLONG n1 = n & -32;
+    if (n1 > 0) {
+      FLOAT cosa, sina;
+      cosa = c;
+      sina = s;
+      crot_kernel_32(n1, x, y, &cosa, &sina);
+      i = n1;
+      ix = 2 * n1;
+    }
+
+    while (i < n) {
+      temp[0] = c * x[ix] + s * y[ix];
+      temp[1] = c * x[ix + 1] + s * y[ix + 1];
+      y[ix] = c * y[ix] - s * x[ix];
+      y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
+      x[ix] = temp[0];
+      x[ix + 1] = temp[1];
+
+      ix += 2;
+      i++;
+
+    }
+
+  } else {
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+    while (i < n) {
+      temp[0] = c * x[ix] + s * y[iy];
+      temp[1] = c * x[ix + 1] + s * y[iy + 1];
+      y[iy] = c * y[iy] - s * x[ix];
+      y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
+      x[ix] = temp[0];
+      x[ix + 1] = temp[1];
+
+      ix += inc_x2;
+      iy += inc_y2;
+      i++;
+
+    }
+
+  }
+  return (0);
+
+}
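For reference, a minimal scalar sketch (not part of the patch; crot_ref is a hypothetical name) of the Givens rotation the crot kernel applies: because c and s are real here, the rotation acts componentwise on the interleaved real and imaginary parts, exactly as in the scalar cleanup loop above:

  /* Sketch: (x_k, y_k) <- (c*x_k + s*y_k, c*y_k - s*x_k) over 2*n floats. */
  static void crot_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) {
    BLASLONG i;
    for (i = 0; i < 2 * n; i++) {
      FLOAT t = c * x[i] + s * y[i];
      y[i] = c * y[i] - s * x[i];
      x[i] = t;
    }
  }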
diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c
new file mode 100644
index 0000000000..9fc54cf295
--- /dev/null
+++ b/kernel/zarch/cscal.c
@@ -0,0 +1,429 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
+  __asm__("vlrepf %%v0,0(%[alpha])\n\t"
+          "vlef %%v1,4(%[alpha]),0\n\t"
+          "vlef %%v1,4(%[alpha]),2\n\t"
+          "vflcsb %%v1,%%v1\n\t"
+          "vlef %%v1,4(%[alpha]),1\n\t"
+          "vlef %%v1,4(%[alpha]),3\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "verllg %%v24,%%v16,32\n\t"
+          "verllg %%v25,%%v17,32\n\t"
+          "verllg %%v26,%%v18,32\n\t"
+          "verllg %%v27,%%v19,32\n\t"
+          "verllg %%v28,%%v20,32\n\t"
+          "verllg %%v29,%%v21,32\n\t"
+          "verllg %%v30,%%v22,32\n\t"
+          "verllg %%v31,%%v23,32\n\t"
+          "vfmsb %%v16,%%v16,%%v0\n\t"
+          "vfmsb %%v17,%%v17,%%v0\n\t"
+          "vfmsb %%v18,%%v18,%%v0\n\t"
+          "vfmsb %%v19,%%v19,%%v0\n\t"
+          "vfmsb %%v20,%%v20,%%v0\n\t"
+          "vfmsb %%v21,%%v21,%%v0\n\t"
+          "vfmsb %%v22,%%v22,%%v0\n\t"
+          "vfmsb %%v23,%%v23,%%v0\n\t"
+          "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
+          "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
+          "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
+          "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
+          "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
+          "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
+          "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
+          "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
+          "vst %%v16,0(%%r1,%[x])\n\t"
+          "vst %%v17,16(%%r1,%[x])\n\t"
+          "vst %%v18,32(%%r1,%[x])\n\t"
+          "vst %%v19,48(%%r1,%[x])\n\t"
+          "vst %%v20,64(%%r1,%[x])\n\t"
+          "vst %%v21,80(%%r1,%[x])\n\t"
+          "vst %%v22,96(%%r1,%[x])\n\t"
+          "vst %%v23,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
+          : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
+            [alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
+            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+            "v31");
+}
+
+static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
+  __asm__("vlef %%v0,4(%[alpha]),0\n\t"
+          "vlef %%v0,4(%[alpha]),2\n\t"
+          "vflcsb %%v0,%%v0\n\t"
+          "vlef %%v0,4(%[alpha]),1\n\t"
+          "vlef %%v0,4(%[alpha]),3\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "verllg %%v16,%%v16,32\n\t"
+          "verllg %%v17,%%v17,32\n\t"
+          "verllg %%v18,%%v18,32\n\t"
+          "verllg %%v19,%%v19,32\n\t"
+          "verllg %%v20,%%v20,32\n\t"
+          "verllg %%v21,%%v21,32\n\t"
+          "verllg %%v22,%%v22,32\n\t"
+          "verllg %%v23,%%v23,32\n\t"
+          "vfmsb %%v16,%%v16,%%v0\n\t"
+          "vfmsb %%v17,%%v17,%%v0\n\t"
+          "vfmsb %%v18,%%v18,%%v0\n\t"
+          "vfmsb %%v19,%%v19,%%v0\n\t"
+          "vfmsb %%v20,%%v20,%%v0\n\t"
+          "vfmsb %%v21,%%v21,%%v0\n\t"
+          "vfmsb %%v22,%%v22,%%v0\n\t"
+          "vfmsb %%v23,%%v23,%%v0\n\t"
+          "vst %%v16,0(%%r1,%[x])\n\t"
+          "vst %%v17,16(%%r1,%[x])\n\t"
+          "vst %%v18,32(%%r1,%[x])\n\t"
+          "vst %%v19,48(%%r1,%[x])\n\t"
+          "vst %%v20,64(%%r1,%[x])\n\t"
+          "vst %%v21,80(%%r1,%[x])\n\t"
+          "vst %%v22,96(%%r1,%[x])\n\t"
+          "vst %%v23,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
+          : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
+            [alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+            "v23");
+}
+
+static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
+  __asm__("vlrepf %%v0,0(%[alpha])\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vfmsb %%v16,%%v16,%%v0\n\t"
+          "vfmsb %%v17,%%v17,%%v0\n\t"
+          "vfmsb %%v18,%%v18,%%v0\n\t"
+          "vfmsb %%v19,%%v19,%%v0\n\t"
+          "vfmsb %%v20,%%v20,%%v0\n\t"
+          "vfmsb %%v21,%%v21,%%v0\n\t"
+          "vfmsb %%v22,%%v22,%%v0\n\t"
+          "vfmsb %%v23,%%v23,%%v0\n\t"
+          "vst %%v16,0(%%r1,%[x])\n\t"
+          "vst %%v17,16(%%r1,%[x])\n\t"
+          "vst %%v18,32(%%r1,%[x])\n\t"
+          "vst %%v19,48(%%r1,%[x])\n\t"
+          "vst %%v20,64(%%r1,%[x])\n\t"
+          "vst %%v21,80(%%r1,%[x])\n\t"
+          "vst %%v22,96(%%r1,%[x])\n\t"
+          "vst %%v23,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
+          : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
+            [alpha] "a"(alpha)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+            "v23");
+}
+
+static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
+  __asm__("vzero %%v0\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 2, 1024(%%r1,%[x])\n\t"
+          "vst %%v0,0(%%r1,%[x])\n\t"
+          "vst %%v0,16(%%r1,%[x])\n\t"
+          "vst %%v0,32(%%r1,%[x])\n\t"
+          "vst %%v0,48(%%r1,%[x])\n\t"
+          "vst %%v0,64(%%r1,%[x])\n\t"
+          "vst %%v0,80(%%r1,%[x])\n\t"
+          "vst %%v0,96(%%r1,%[x])\n\t"
+          "vst %%v0,112(%%r1,%[x])\n\t"
+          "agfi %%r1,128\n\t"
+          "brctg %[n],0b"
+          : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
+          : [x] "a"(x)
+          : "cc", "r1", "v0");
+}
+
+static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
+                               BLASLONG inc_x) {
+  BLASLONG i;
+  BLASLONG inc_x2 = 2 * inc_x;
+  BLASLONG inc_x3 = inc_x2 + inc_x;
+  FLOAT t0, t1, t2, t3;
+  FLOAT da_r = alpha[0];
+  FLOAT da_i = alpha[1];
+
+  for (i = 0; i < n; i += 4) {
+    t0 = da_r * x[0] - da_i * x[1];
+    t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
+    t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
+    t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];
+
+    x[1] = da_i * x[0] + da_r * x[1];
+    x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
+    x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
+    x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
+
+    x[0] = t0;
+    x[inc_x] = t1;
+    x[inc_x2] = t2;
+    x[inc_x3] = t3;
+
+    x += 4 * inc_x;
+  }
+}
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+          BLASLONG dummy2) {
+  BLASLONG i = 0, j = 0;
+  FLOAT temp0;
+  FLOAT temp1;
+  FLOAT alpha[2] __attribute__ ((aligned(16)));
+
+  if (inc_x != 1) {
+    inc_x <<= 1;
+
+    if (da_r == 0.0) {
+
+      BLASLONG n1 = n & -2;
+
+      if (da_i == 0.0) {
+
+        while (j < n1) {
+
+          x[i] = 0.0;
+          x[i + 1] = 0.0;
+          x[i + inc_x] = 0.0;
+          x[i + 1 + inc_x] = 0.0;
+          i += 2 * inc_x;
+          j += 2;
+
+        }
+
+        while (j < n) {
+
+          x[i] = 0.0;
+          x[i + 1] = 0.0;
+          i += inc_x;
+          j++;
+
+        }
+
+      } else {
+
+        while (j < n1) {
+
+          temp0 = -da_i * x[i + 1];
+          x[i + 1] = da_i * x[i];
+          x[i] = temp0;
+          temp1 = -da_i * x[i + 1 + inc_x];
+          x[i + 1 + inc_x] = da_i * x[i + inc_x];
+          x[i + inc_x] = temp1;
+          i += 2 * inc_x;
+          j += 2;
+
+        }
+
+        while (j < n) {
+
+          temp0 = -da_i * x[i + 1];
+          x[i + 1] = da_i * x[i];
+          x[i] = temp0;
+          i += inc_x;
+          j++;
+
+        }
+
+      }
+
+    } else {
+
+      if (da_i == 0.0) {
+        BLASLONG n1 = n & -2;
+
+        while (j < n1) {
+
+          temp0 = da_r * x[i];
+          x[i + 1] = da_r * x[i + 1];
+          x[i] = temp0;
+          temp1 = da_r * x[i + inc_x];
+          x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
+          x[i + inc_x] = temp1;
+          i += 2 * inc_x;
+          j += 2;
+
+        }
+
+        while (j < n) {
+
+          temp0 = da_r * x[i];
+          x[i + 1] = da_r * x[i + 1];
+          x[i] = temp0;
+          i += inc_x;
+          j++;
+
+        }
+
+      } else {
+
+        BLASLONG n1 = n & -8;
+        if (n1 > 0) {
+          alpha[0] = da_r;
+          alpha[1] = da_i;
+          cscal_kernel_inc_8(n1, alpha, x, inc_x);
+          j = n1;
+          i = n1 * inc_x;
+        }
+
+        while (j < n) {
+
+          temp0 = da_r * x[i] - da_i * x[i + 1];
+          x[i + 1] = da_r * x[i + 1] + da_i * x[i];
+          x[i] = temp0;
+          i += inc_x;
+          j++;
+
+        }
+
+      }
+
+    }
+
+    return (0);
+  }
+
+  BLASLONG n1 = n & -16;
+  if (n1 > 0) {
+
+    alpha[0] = da_r;
+    alpha[1] = da_i;
+
+    if (da_r == 0.0)
+      if (da_i == 0)
+        cscal_kernel_16_zero(n1, x);
+      else
+        cscal_kernel_16_zero_r(n1, alpha, x);
+    else if (da_i == 0)
+      cscal_kernel_16_zero_i(n1, alpha, x);
+    else
+      cscal_kernel_16(n1, alpha, x);
+
+    i = n1 << 1;
+    j = n1;
+  }
+
+  if (da_r == 0.0) {
+
+    if (da_i == 0.0) {
+
+      while (j < n) {
+
+        x[i] = 0.0;
+        x[i + 1] = 0.0;
+        i += 2;
+        j++;
+
+      }
+
+    } else {
+
+      while (j < n) {
+
+        temp0 = -da_i * x[i + 1];
+        x[i + 1] = da_i * x[i];
+        x[i] = temp0;
+        i += 2;
+        j++;
+
+      }
+
+    }
+
+  } else {
+
+    if (da_i == 0.0) {
+
+      while (j < n) {
+
+        temp0 = da_r * x[i];
+        x[i + 1] = da_r * x[i + 1];
+        x[i] = temp0;
+        i += 2;
+        j++;
+
+      }
+
+    } else {
+
+      while (j < n) {
+
+        temp0 = da_r * x[i] - da_i * x[i + 1];
+        x[i + 1] = da_r * x[i + 1] + da_i * x[i];
+        x[i] = temp0;
+        i += 2;
+        j++;
+
+      }
+
+    }
+
+  }
+
+  return (0);
+}
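For reference, a minimal scalar sketch (not part of the patch; cscal_ref is a hypothetical name) of the general case the cscal kernels implement, x_k *= (da_r + i*da_i); note that the file special-cases da_r == 0 and/or da_i == 0 so those paths store zeros or products directly instead of going through this full formula:

  /* Sketch: in-place complex scale of n elements, interleaved re/im. */
  static void cscal_ref(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
    BLASLONG i;
    for (i = 0; i < 2 * n; i += 2) {
      FLOAT t = da_r * x[i] - da_i * x[i + 1];
      x[i + 1] = da_r * x[i + 1] + da_i * x[i];
      x[i] = t;
    }
  }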
diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c
new file mode 100644
index 0000000000..c0b8c6371d
--- /dev/null
+++ b/kernel/zarch/csum.c
@@ -0,0 +1,137 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT sum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero %%v25\n\t"
+          "vzero %%v26\n\t"
+          "vzero %%v27\n\t"
+          "vzero %%v28\n\t"
+          "vzero %%v29\n\t"
+          "vzero %%v30\n\t"
+          "vzero %%v31\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vfasb %%v24,%%v24,%%v16\n\t"
+          "vfasb %%v25,%%v25,%%v17\n\t"
+          "vfasb %%v26,%%v26,%%v18\n\t"
+          "vfasb %%v27,%%v27,%%v19\n\t"
+          "vfasb %%v28,%%v28,%%v20\n\t"
+          "vfasb %%v29,%%v29,%%v21\n\t"
+          "vfasb %%v30,%%v30,%%v22\n\t"
+          "vfasb %%v31,%%v31,%%v23\n\t"
+          "vl %%v16, 128(%%r1,%[x])\n\t"
+          "vl %%v17, 144(%%r1,%[x])\n\t"
+          "vl %%v18, 160(%%r1,%[x])\n\t"
+          "vl %%v19, 176(%%r1,%[x])\n\t"
+          "vl %%v20, 192(%%r1,%[x])\n\t"
+          "vl %%v21, 208(%%r1,%[x])\n\t"
+          "vl %%v22, 224(%%r1,%[x])\n\t"
+          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vfasb %%v24,%%v24,%%v16\n\t"
+          "vfasb %%v25,%%v25,%%v17\n\t"
+          "vfasb %%v26,%%v26,%%v18\n\t"
+          "vfasb %%v27,%%v27,%%v19\n\t"
+          "vfasb %%v28,%%v28,%%v20\n\t"
+          "vfasb %%v29,%%v29,%%v21\n\t"
+          "vfasb %%v30,%%v30,%%v22\n\t"
+          "vfasb %%v31,%%v31,%%v23\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vfasb %%v24,%%v24,%%v26\n\t"
+          "vfasb %%v24,%%v24,%%v27\n\t"
+          "vfasb %%v24,%%v24,%%v28\n\t"
+          "vfasb %%v24,%%v24,%%v29\n\t"
+          "vfasb %%v24,%%v24,%%v30\n\t"
+          "vfasb %%v24,%%v24,%%v31\n\t"
+          "veslg %%v25,%%v24,32\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vrepf %%v25,%%v24,2\n\t"
+          "vfasb %%v24,%%v24,%%v25\n\t"
+          "vstef %%v24,%[sum],0"
+          : [sum] "=Q"(sum),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return sum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG ip = 0;
+  FLOAT sumf = 0.0;
+  BLASLONG n1;
+  BLASLONG inc_x2;
+
+  if (n <= 0 || inc_x <= 0)
+    return (sumf);
+
+  if (inc_x == 1) {
+
+    n1 = n & -32;
+    if (n1 > 0) {
+
+      sumf = csum_kernel_32(n1, x);
+      i = n1;
+      ip = 2 * n1;
+    }
+
+    while (i < n) {
+      sumf += x[ip] + x[ip + 1];
+      i++;
+      ip += 2;
+    }
+
+  } else {
+    inc_x2 = 2 * inc_x;
+
+    while (i < n) {
+      sumf += x[ip] + x[ip + 1];
+      ip += inc_x2;
+      i++;
+    }
+
+  }
+  return (sumf);
+}
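For reference, a minimal scalar sketch (not part of the patch; csum_ref is a hypothetical name) of what csum computes: the non-absolute SUM variant mentioned in the changelog, i.e. the plain sum of all real and imaginary parts with no fabsf(), which is exactly the cleanup loop above:

  /* Sketch: sum of 2*n interleaved re/im components, no absolute value. */
  static FLOAT csum_ref(BLASLONG n, FLOAT *x) {
    BLASLONG i;
    FLOAT sum = 0.0;
    for (i = 0; i < 2 * n; i++)
      sum += x[i];
    return sum;
  }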
+*****************************************************************************/ + +#include "common.h" + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + cswap_kernel_32(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + + while (i < n) { + + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + + ix += 2; + iy += 2; + i++; + + } + + } else { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while (i < n) { + + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 
1] = temp[1]; + + ix += inc_x2; + iy += inc_y2; + i++; + + } + + } + return (0); + +} diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c new file mode 100644 index 0000000000..caacb50dc1 --- /dev/null +++ b/kernel/zarch/damax.c @@ -0,0 +1,150 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
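One detail of cswap_kernel_32 worth calling out: rather than a blanket "memory" clobber, the asm statement declares exactly which bytes it reads and writes by casting each pointer to a variable-length struct, as in "+m"(*(struct { FLOAT x[n * 2]; } *) x). A stripped-down sketch of that GCC constraint idiom, with an empty asm body as a placeholder:

/* Sketch only: the "m" operand tells GCC that all n doubles behind x
   may be read by the asm, so cached register copies are flushed and
   reloaded, while unrelated memory stays fully optimizable. */
static inline void asm_reads_array(long n, const double *x) {
  __asm__ volatile("" /* target instructions would go here */
                   : /* no outputs */
                   : "m"(*(const struct { double v[n]; } *) x));
}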
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define ABS fabs
+
+static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT amax;
+
+  __asm__("vl %%v0,0(%[x])\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vl %%v24,128(%%r1,%[x])\n\t"
+          "vl %%v25,144(%%r1,%[x])\n\t"
+          "vl %%v26,160(%%r1,%[x])\n\t"
+          "vl %%v27,176(%%r1,%[x])\n\t"
+          "vl %%v28,192(%%r1,%[x])\n\t"
+          "vl %%v29,208(%%r1,%[x])\n\t"
+          "vl %%v30,224(%%r1,%[x])\n\t"
+          "vl %%v31,240(%%r1,%[x])\n\t"
+          "vfmaxdb %%v16,%%v16,%%v24,8\n\t"
+          "vfmaxdb %%v17,%%v17,%%v25,8\n\t"
+          "vfmaxdb %%v18,%%v18,%%v26,8\n\t"
+          "vfmaxdb %%v19,%%v19,%%v27,8\n\t"
+          "vfmaxdb %%v20,%%v20,%%v28,8\n\t"
+          "vfmaxdb %%v21,%%v21,%%v29,8\n\t"
+          "vfmaxdb %%v22,%%v22,%%v30,8\n\t"
+          "vfmaxdb %%v23,%%v23,%%v31,8\n\t"
+          "vfmaxdb %%v16,%%v16,%%v20,8\n\t"
+          "vfmaxdb %%v17,%%v17,%%v21,8\n\t"
+          "vfmaxdb %%v18,%%v18,%%v22,8\n\t"
+          "vfmaxdb %%v19,%%v19,%%v23,8\n\t"
+          "vfmaxdb %%v16,%%v16,%%v18,8\n\t"
+          "vfmaxdb %%v17,%%v17,%%v19,8\n\t"
+          "vfmaxdb %%v16,%%v16,%%v17,8\n\t"
+          "vfmaxdb %%v0,%%v0,%%v16,8\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfmaxdb %%v0,%%v0,%%v16,8\n\t"
+          "lpdr %[amax],%%f0"
+          : [amax] "=f"(amax),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+          "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG j = 0;
+  FLOAT maxf = 0.0;
+
+  if (n <= 0 || inc_x <= 0)
+    return (maxf);
+
+  if (inc_x == 1) {
+
+    BLASLONG n1 = n & -32;
+    if (n1 > 0) {
+
+      maxf = damax_kernel_32(n1, x);
+
+      i = n1;
+    } else {
+      maxf = ABS(x[0]);
+      i++;
+    }
+
+    while (i < n) {
+      if (ABS(x[i]) > maxf) {
+        maxf = ABS(x[i]);
+      }
+      i++;
+    }
+    return (maxf);
+
+  } else {
+
+    maxf = ABS(x[0]);
+
+    BLASLONG n1 = n & -4;
+    while (j < n1) {
+
+      if (ABS(x[i]) > maxf) {
+        maxf = ABS(x[i]);
+      }
+      if (ABS(x[i + inc_x]) > maxf) {
+        maxf = ABS(x[i + inc_x]);
+      }
+      if (ABS(x[i + 2 * inc_x]) > maxf) {
+        maxf = ABS(x[i + 2 * inc_x]);
+      }
+      if (ABS(x[i + 3 * inc_x]) > maxf) {
+        maxf = ABS(x[i + 3 * inc_x]);
+      }
+
+      i += inc_x * 4;
+
+      j += 4;
+
+    }
+
+    while (j < n) {
+      if (ABS(x[i]) > maxf) {
+        maxf = ABS(x[i]);
+      }
+      i += inc_x;
+      j++;
+    }
+    return (maxf);
+  }
+}
diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c
new file mode 100644
index 0000000000..f3db4c108f
--- /dev/null
+++ b/kernel/zarch/damax_z13.c
@@ -0,0 +1,184 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define ABS fabs
+
+static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT amax;
+
+  __asm__("vl %%v0,0(%[x])\n\t"
+          "vflpdb %%v0,%%v0\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfchdb %%v24,%%v16,%%v17\n\t"
+          "vfchdb %%v25,%%v18,%%v19\n\t"
+          "vfchdb %%v26,%%v20,%%v21\n\t"
+          "vfchdb %%v27,%%v22,%%v23\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+          "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+          "vfchdb %%v28,%%v24,%%v25\n\t"
+          "vfchdb %%v29,%%v26,%%v27\n\t"
+          "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+          "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+          "vfchdb %%v30,%%v28,%%v29\n\t"
+          "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+          "vfchdb %%v31,%%v30,%%v0\n\t"
+          "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+          "vl %%v16,128(%%r1,%[x])\n\t"
+          "vl %%v17,144(%%r1,%[x])\n\t"
+          "vl %%v18,160(%%r1,%[x])\n\t"
+          "vl %%v19,176(%%r1,%[x])\n\t"
+          "vl %%v20,192(%%r1,%[x])\n\t"
+          "vl %%v21,208(%%r1,%[x])\n\t"
+          "vl %%v22,224(%%r1,%[x])\n\t"
+          "vl %%v23,240(%%r1,%[x])\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfchdb %%v24,%%v16,%%v17\n\t"
+          "vfchdb %%v25,%%v18,%%v19\n\t"
+          "vfchdb %%v26,%%v20,%%v21\n\t"
+          "vfchdb %%v27,%%v22,%%v23\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+          "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+          "vfchdb %%v28,%%v24,%%v25\n\t"
+          "vfchdb %%v29,%%v26,%%v27\n\t"
+          "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+          "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+          "vfchdb %%v30,%%v28,%%v29\n\t"
+          "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+          "vfchdb %%v31,%%v30,%%v0\n\t"
+          "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfchdb %%v17,%%v0,%%v16\n\t"
+          "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+          "ldr %[amax],%%f0"
+          : [amax]
"=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c new file mode 100644 index 0000000000..0163a144b3 --- /dev/null +++ b/kernel/zarch/damin.c @@ -0,0 +1,150 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define ABS fabs
+
+static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT amin;
+
+  __asm__("vl %%v0,0(%[x])\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vl %%v24,128(%%r1,%[x])\n\t"
+          "vl %%v25,144(%%r1,%[x])\n\t"
+          "vl %%v26,160(%%r1,%[x])\n\t"
+          "vl %%v27,176(%%r1,%[x])\n\t"
+          "vl %%v28,192(%%r1,%[x])\n\t"
+          "vl %%v29,208(%%r1,%[x])\n\t"
+          "vl %%v30,224(%%r1,%[x])\n\t"
+          "vl %%v31,240(%%r1,%[x])\n\t"
+          "vfmindb %%v16,%%v16,%%v24,8\n\t"
+          "vfmindb %%v17,%%v17,%%v25,8\n\t"
+          "vfmindb %%v18,%%v18,%%v26,8\n\t"
+          "vfmindb %%v19,%%v19,%%v27,8\n\t"
+          "vfmindb %%v20,%%v20,%%v28,8\n\t"
+          "vfmindb %%v21,%%v21,%%v29,8\n\t"
+          "vfmindb %%v22,%%v22,%%v30,8\n\t"
+          "vfmindb %%v23,%%v23,%%v31,8\n\t"
+          "vfmindb %%v16,%%v16,%%v20,8\n\t"
+          "vfmindb %%v17,%%v17,%%v21,8\n\t"
+          "vfmindb %%v18,%%v18,%%v22,8\n\t"
+          "vfmindb %%v19,%%v19,%%v23,8\n\t"
+          "vfmindb %%v16,%%v16,%%v18,8\n\t"
+          "vfmindb %%v17,%%v17,%%v19,8\n\t"
+          "vfmindb %%v16,%%v16,%%v17,8\n\t"
+          "vfmindb %%v0,%%v0,%%v16,8\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfmindb %%v0,%%v0,%%v16,8\n\t"
+          "lpdr %[amin],%%f0"
+          : [amin] "=f"(amin),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+          "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG j = 0;
+  FLOAT minf = 0.0;
+
+  if (n <= 0 || inc_x <= 0)
+    return (minf);
+
+  if (inc_x == 1) {
+
+    BLASLONG n1 = n & -32;
+    if (n1 > 0) {
+
+      minf = damin_kernel_32(n1, x);
+
+      i = n1;
+    } else {
+      minf = ABS(x[0]);
+      i++;
+    }
+
+    while (i < n) {
+      if (ABS(x[i]) < minf) {
+        minf = ABS(x[i]);
+      }
+      i++;
+    }
+    return (minf);
+
+  } else {
+
+    minf = ABS(x[0]);
+
+    BLASLONG n1 = n & -4;
+    while (j < n1) {
+
+      if (ABS(x[i]) < minf) {
+        minf = ABS(x[i]);
+      }
+      if (ABS(x[i + inc_x]) < minf) {
+        minf = ABS(x[i + inc_x]);
+      }
+      if (ABS(x[i + 2 * inc_x]) < minf) {
+        minf = ABS(x[i + 2 * inc_x]);
+      }
+      if (ABS(x[i + 3 * inc_x]) < minf) {
+        minf = ABS(x[i + 3 * inc_x]);
+      }
+
+      i += inc_x * 4;
+
+      j += 4;
+
+    }
+
+    while (j < n) {
+      if (ABS(x[i]) < minf) {
+        minf = ABS(x[i]);
+      }
+      i += inc_x;
+      j++;
+    }
+    return (minf);
+  }
+}
diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c
new file mode 100644
index 0000000000..4196b2e15f
--- /dev/null
+++ b/kernel/zarch/damin_z13.c
@@ -0,0 +1,184 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define ABS fabs
+
+static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT amin;
+
+  __asm__("vl %%v0,0(%[x])\n\t"
+          "vflpdb %%v0,%%v0\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v17,16(%%r1,%[x])\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v19,48(%%r1,%[x])\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v21,80(%%r1,%[x])\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v23,112(%%r1,%[x])\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfchdb %%v24,%%v17,%%v16\n\t"
+          "vfchdb %%v25,%%v19,%%v18\n\t"
+          "vfchdb %%v26,%%v21,%%v20\n\t"
+          "vfchdb %%v27,%%v23,%%v22\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+          "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+          "vfchdb %%v28,%%v25,%%v24\n\t"
+          "vfchdb %%v29,%%v27,%%v26\n\t"
+          "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+          "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+          "vfchdb %%v30,%%v29,%%v28\n\t"
+          "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+          "vfchdb %%v31,%%v0,%%v30\n\t"
+          "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+          "vl %%v16,128(%%r1,%[x])\n\t"
+          "vl %%v17,144(%%r1,%[x])\n\t"
+          "vl %%v18,160(%%r1,%[x])\n\t"
+          "vl %%v19,176(%%r1,%[x])\n\t"
+          "vl %%v20,192(%%r1,%[x])\n\t"
+          "vl %%v21,208(%%r1,%[x])\n\t"
+          "vl %%v22,224(%%r1,%[x])\n\t"
+          "vl %%v23,240(%%r1,%[x])\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfchdb %%v24,%%v17,%%v16\n\t"
+          "vfchdb %%v25,%%v19,%%v18\n\t"
+          "vfchdb %%v26,%%v21,%%v20\n\t"
+          "vfchdb %%v27,%%v23,%%v22\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vsel %%v26,%%v20,%%v21,%%v26\n\t"
+          "vsel %%v27,%%v22,%%v23,%%v27\n\t"
+          "vfchdb %%v28,%%v25,%%v24\n\t"
+          "vfchdb %%v29,%%v27,%%v26\n\t"
+          "vsel %%v28,%%v24,%%v25,%%v28\n\t"
+          "vsel %%v29,%%v26,%%v27,%%v29\n\t"
+          "vfchdb %%v30,%%v29,%%v28\n\t"
+          "vsel %%v30,%%v28,%%v29,%%v30\n\t"
+          "vfchdb %%v31,%%v0,%%v30\n\t"
+          "vsel %%v0,%%v30,%%v0,%%v31\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfchdb %%v17,%%v16,%%v0\n\t"
+          "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+          "ldr %[amin],%%f0"
+          : [amin]
"=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 7a42a08634..aa1382b103 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,142 +23,144 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ - +*****************************************************************************/ #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - - - - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_temp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_temp],256(%[ptr_temp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "clgrjl %[ptr_temp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v2,%%v3 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum],%%f0 \n\t" - : [asum] "=f"(asum),[ptr_temp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; - +#define ABS fabs + +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb 
%%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } - - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - n1 = n & -32; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = dasum_kernel_32(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -32; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = dasum_kernel_32(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 16f82a5879..5b0208c20e 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,159 +25,143 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
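The rewritten dasum kernel above keeps eight independent accumulators (v24 to v31) and folds them only after the loop; because each vfadb feeds its own chain, the floating-point add latency overlaps across chains instead of serializing. vflpdb ("load positive") supplies the absolute value. A scalar sketch of the accumulator splitting, assuming n is a multiple of 4 the way the caller's n & -32 blocking guarantees:

#include <math.h>

/* Four partial sums stand in for the kernel's eight vector
   accumulators; asum_split is an illustrative name. */
static double asum_split(long n, const double *x) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
  long i;
  for (i = 0; i < n; i += 4) {
    s0 += fabs(x[i]);
    s1 += fabs(x[i + 1]);
    s2 += fabs(x[i + 2]);
    s3 += fabs(x[i + 3]);
  }
  return (s0 + s1) + (s2 + s3);  /* fold once, at the end */
}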
*****************************************************************************/ - #include "common.h" -#define PREFETCH_INS 1 -#if defined(Z13_A) -#include - -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) -{ - BLASLONG i = 0; - __vector double v_a = {alpha,alpha}; - __vector double * v_y=(__vector double *)y; - __vector double * v_x=(__vector double *)x; - - for(; i 0) { - dcopy_kernel_32(n1, x, y); - i = n1; - } - - while (i < n) { - y[i] = x[i]; - i++; - - } - + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - } else { - - BLASLONG n1 = n & -4; + if (n <= 0) + return 0; - while (i < n1) { + if ((inc_x == 1) && (inc_y == 1)) { - y[iy] = x[ix]; - y[iy + inc_y] = x[ix + inc_x]; - y[iy + 2 * inc_y] = x[ix + 2 * inc_x]; - y[iy + 3 * inc_y] = x[ix + 3 * inc_x]; + BLASLONG n1 = n & -32; + if (n1 > 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } - ix += inc_x * 4; - iy += inc_y * 4; - i += 4; + while (i < n) { + y[i] = x[i]; + i++; - } + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } - - diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index c70cbd00d5..9cad68f4b6 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,184 +25,129 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
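The copy routine above also shows the dispatch shared by these level-1 rewrites: unit strides go through the vector kernel for the largest multiple-of-32 block plus a scalar tail, and anything else falls back to a plain indexed loop. A compact sketch, with copy_block32 as a hypothetical stand-in for the assembly kernel:

/* Unit-stride fast path vs. strided fallback (sketch). */
extern void copy_block32(long n1, const double *x, double *y);

static void copy_sketch(long n, const double *x, long inc_x,
                        double *y, long inc_y) {
  long i = 0;
  if (n <= 0)
    return;
  if (inc_x == 1 && inc_y == 1) {
    long n1 = n & -32;            /* bulk handled by the kernel */
    if (n1 > 0) {
      copy_block32(n1, x, y);
      i = n1;
    }
    for (; i < n; i++)            /* scalar tail */
      y[i] = x[i];
  } else {
    long ix = 0, iy = 0;
    for (; i < n; i++, ix += inc_x, iy += inc_y)
      y[iy] = x[ix];
  }
}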
*****************************************************************************/ - #include "common.h" - -#if defined(Z13) -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" - - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" - - "vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" - - - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" - "vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t" - "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v25,%%v24 \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v24,%%v27,%%v24 \n\t" - "vrepg %%v1,%%v24,1 \n\t" - "vfadb %%v1,%%v24,%%v1 \n\t" - "ldr %[dot], %%f1 \n\t" - : [dot] "=f"(dot) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[n])x), - [mem_y] "m"( *(const double (*)[n])y), - [ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y) - :"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23", - "v24","v25","v26","v27","v28","v29","v30","v31" - - ); - return dot; - +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] 
"=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; -#else - -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y ) -{ - BLASLONG register i = 0; - FLOAT dot = 0.0; - - while(i < n) - { - dot += y[i] * x[i] - + y[i+1] * x[i+1] - + y[i+2] * x[i+2] - + y[i+3] * x[i+3] - + y[i+4] * x[i+4] - + y[i+5] * x[i+5] - + y[i+6] * x[i+6] - + y[i+7] * x[i+7] ; - dot += y[i+8] * x[i+8] - + y[i+9] * x[i+9] - + y[i+10] * x[i+10] - + y[i+11] * x[i+11] - + y[i+12] * x[i+12] - + y[i+13] * x[i+13] - + y[i+14] * x[i+14] - + y[i+15] * x[i+15] ; - - - i+=16 ; - - } - return dot; - -} - -#endif - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; - - if ( n1 ){ - dot = ddot_kernel_16(n1, x, y ); - i = n1; - } + BLASLONG n1 = n & -16; - - while(i < n) - { + if (n1) + dot = ddot_kernel_16(n1, x, y); - dot += y[i] * x[i] ; - i++ ; - - } - return(dot); + i = n1; + while (i < n) { + dot += y[i] * x[i]; + i++; } + return (dot); - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - - BLASLONG n1 = n & -4; + } - while(i < n1) - { + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; - FLOAT m1 = y[iy] * x[ix] ; - FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + BLASLONG n1 = n & -4; - FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; - FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + while (i < n1) { - ix += inc_x*4 ; - iy += inc_y*4 ; + FLOAT m1 = y[iy] * x[ix]; + FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; - temp1 += m1+m3; - temp2 += m2+m4; + FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; + FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; - i+=4 ; + ix += inc_x * 4; + iy += inc_y * 4; - } + temp1 += m1 + m3; + temp2 += m2 + m4; - while(i < n) - { + i += 4; - temp1 += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + } - } - dot = temp1 + temp2; - return(dot); + while (i < n) { -} + temp1 += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + dot = temp1 + temp2; + return (dot); +} diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index bb202e754e..502ba837ea 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,461 +25,593 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" #define NBMAX 2048 -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) - #include -#endif - -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double v_x2 = {x2,x2}; - __vector double v_x3 = {x3,x3}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; - } +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl 
%%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -#else - -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i<4; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr 
%%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl 
%%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} -#endif - -#ifdef HAVE_KERNEL_4x2 +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } +} -#elif HAVE_KERNEL_4x2_VEC +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 8); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; */ + + } + + } else { + + for (i = 0; i < n1; i++) { 
+ xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } -} -#else - -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0,*a1; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - - for ( i=0; i<2; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; } -} + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; -#endif + } -#ifdef HAVE_KERNEL_4x1 + if (m3 == 0) + return (0); -#elif HAVE_KERNEL_4x1_VEC -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0; - x0 = xo[0] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap; + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - - -} + for (i = 0; i < (n & -4); i += 4) { -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT *a0; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap; - - for ( i=0; i<1; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } -} + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; -#endif + a_ptr += 12; + x_ptr += 4; + } - + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) 
- { - dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; - } + } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha 
* temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp; + return (0); + } - - return(0); + return (0); } - - diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 96af0139ce..de72a1798a 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,517 +25,736 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) - #include -#endif #define NBMAX 2048 -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - __vector double temp2 = {0,0}; - __vector double temp3 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; - y[2] = temp2[0] + temp2[1]; - y[3] = temp3[0] + temp3[1];; +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl 
%%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -#else -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; - temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - y[2] = temp2; - y[3] = temp3; + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero 
%%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC - -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; + +static void 
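/* The transposed kernels compute plain column dot products over n rows,
     y[k] = sum over i of ap[k][i] * x[i],
   storing rather than accumulating the results; the driver applies alpha
   only when it merges ybuffer/ytemp into y. The 4x1 variant that follows
   handles a single remaining column the same way. */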
dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -#else -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepg %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 
64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } - y[0] = temp0; - y[1] = temp1; - + } } -#endif -#ifdef HAVE_KERNEL_4x1 +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? 
m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } -#elif HAVE_KERNEL_4x1_VEC + y_ptr = y; + a_ptr = a; + x_ptr = x; -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - __vector double* va0 = (__vector double*)a0; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; -} -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - - - FLOAT temp0 = 0.0; + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - } - y[0] = temp0; -} -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for ( i=0; i 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[4],*xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? 
m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; + yp = ytemp; + for (i = 0; i < nb1; i++) { + dgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } + for (i = 0; i < n1; i++) { + dgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + if (n1 > 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - if ( n2 & 2 ) - { + if (n2 & 2) { - dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - } + } - if ( n2 & 1 ) - { + if (n2 & 1) { - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; - } - a += NB; - x += NB * inc_x; } + a += NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; - if ( m3 == 0 ) return(0); + FLOAT *aj = a_ptr; + y_ptr = y; - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c new file mode 100644 index 0000000000..c4e8d91f87 --- /dev/null +++ b/kernel/zarch/dmax_z13.c @@ -0,0 +1,164 @@ 
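/* kernel/zarch/dmax_z13.c below is the z13-compatible fallback: it reduces
   with compare-and-select (VFCHDB + VSEL) because the single-instruction
   vector maximum (VFMAXDB, analogous to the VFMINDB visible in dmin.c
   further down) is not available on z13. A minimal scalar sketch of the
   reduction both variants implement follows; dmax_ref is an illustrative
   name, not part of the file, and FLOAT/BLASLONG come from common.h. */
static FLOAT dmax_ref(BLASLONG n, const FLOAT *x) {
  FLOAT m = x[0]; /* seed from the first element, as the vector code does */
  BLASLONG i;
  for (i = 1; i < n; i++)
    if (x[i] > m) /* plain signed maximum, not absolute value */
      m = x[i];
  return m;
}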
+/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT max; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + 
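/* Loop epilogue: VREPG copies the second doubleword of %v0 into %v16,
   WFCHDB/VSEL keep the larger of the two lanes, and LDR returns the scalar
   maximum in %f0. */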
"vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return max; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c new file mode 100644 index 0000000000..f9b129cbd9 --- /dev/null +++ b/kernel/zarch/dmin.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c new file mode 100644 index 0000000000..77f021c1d9 --- /dev/null +++ b/kernel/zarch/dmin_z13.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } else { + 
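/* Fewer than 32 elements (n1 == 0): seed the scalar search from x[0]
   instead of calling the 32-wide vector kernel. */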
minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index bf29538c7a..11fbe15b6d 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,226 +27,200 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) -{ - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 
\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc", "r1" ,"v0","v1","v16", - 
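/* Both the removed kernel above and its replacement below apply the plane
   rotation
     temp = c*x[i] + s*y[i]; y[i] = c*y[i] - s*x[i]; x[i] = temp;
   to 32 elements per iteration; the rewrite splats c and s with VLREPG from
   memory instead of transferring them from floating-point registers via
   LGDR/VLVGP. */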
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb 
%%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - - drot_kernel_32(n1, x, y, c, s); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + drot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + } - i++ ; + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; - } + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); -} + } + return (0); +} diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index e29f51012c..2961eff202 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All 
rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,237 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#ifdef Z13_A -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) -{ - - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "srlg %[n],%[n],4 \n\t" - "vlr %%v1,%%v0 \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "la %[x_ptr], 128(%[x_ptr]) \n\t" - "aghik %[n], %[n], -1 \n\t" - "jle 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v0 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" - "lay %[x_ptr], -128(%[x_ptr]) \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "brctg %[n],1b \n\t" - "2: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v1 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "lay %[x_ptr] , -128(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) - : [alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - } -#else -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) -{ - - /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v1 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v1 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v1 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v1 \n\t" - "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vfmdb %%v27,%%v27,%%v1 \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vfmdb %%v30,%%v30,%%v0 \n\t" - "vfmdb %%v31,%%v31,%%v1 \n\t" - "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" - "la %[x_ptr], 256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - } -#endif -static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) -{ - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "sllg %%r0,%[n],3 \n\t" - "vzero %%v25 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vst %%v24, 32(%[x_ptr]) \n\t" - "vst %%v25, 48(%[x_ptr]) \n\t" - "vst %%v24, 64(%[x_ptr]) \n\t" - "vst %%v25, 80(%[x_ptr]) \n\t" - "vst %%v24, 96(%[x_ptr]) 
\n\t" - "vst %%v25, 112(%[x_ptr]) \n\t" - "vst %%v24, 128(%[x_ptr]) \n\t" - "vst %%v25, 144(%[x_ptr]) \n\t" - "vst %%v24, 160(%[x_ptr]) \n\t" - "vst %%v25, 176(%[x_ptr]) \n\t" - "vst %%v24, 192(%[x_ptr]) \n\t" - "vst %%v25, 208(%[x_ptr]) \n\t" - "vst %%v24, 224(%[x_ptr]) \n\t" - "vst %%v25, 240(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" , "r0", "v24" ,"v25" - ); +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepg %%v0,%[da]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } - - +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); +} -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); - if ( da == 0.0 ) - { + if (inc_x == 1) { - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - - dscal_kernel_32_zero(n1 , x); - j=n1; - } + if (da == 0.0) { - while(j < n) - { + BLASLONG n1 = n & -16; + if (n1 > 0) { - x[j]=0.0; - j++; - } + dscal_kernel_16_zero(n1, x); + j = n1; + } - } - else - { + while (j < n) { - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - dscal_kernel_32(n1 , da , x); - j=n1; - } - while(j < n) - { + x[j] = 0.0; + j++; + } - x[j] = da * x[j] ; - j++; - } - } + } else { + BLASLONG n1 = n & -16; + if (n1 > 0) { + dscal_kernel_16(n1, da, x); + j = n1; + } + while (j < n) { + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -4; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -4; - x[i]=0.0; - x[i + inc_x]=0.0; - x[i + 2 * inc_x]=0.0; - x[i + 3 * inc_x]=0.0; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = 0.0; + 
x[i + inc_x] = 0.0; + x[i + 2 * inc_x] = 0.0; + x[i + 3 * inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 4; + j += 4; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -4; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -4; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; - x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; - x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; - } + i += inc_x * 4; + j += 4; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; -} \ No newline at end of file + } + return 0; + +} diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c new file mode 100644 index 0000000000..5fa88c3b92 --- /dev/null +++ b/kernel/zarch/dsdot.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2013-2019,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + double dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef %%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls %%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; +} + +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + double dot = 0.0; + + if (n <= 0) + return (dot); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + dot = dsdot_kernel_16(n1, x, y); + + i = n1; + while (i < n) { + + dot += (double) y[i] * (double) x[i]; + i++; + + } + return (dot); + + } + + BLASLONG n1 = n & -2; + + while (i < n1) { + + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; + + } 
+
+  while (i < n) {
+
+    dot += (double) y[iy] * (double) x[ix];
+    ix += inc_x;
+    iy += inc_y;
+    i++;
+
+  }
+  return (dot);
+
+}
diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c
new file mode 100644
index 0000000000..178bc3462d
--- /dev/null
+++ b/kernel/zarch/dsum.c
@@ -0,0 +1,148 @@
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
+  FLOAT sum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero %%v25\n\t"
+          "vzero %%v26\n\t"
+          "vzero %%v27\n\t"
+          "vzero %%v28\n\t"
+          "vzero %%v29\n\t"
+          "vzero %%v30\n\t"
+          "vzero %%v31\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16, 0(%%r1,%[x])\n\t"
+          "vl %%v17, 16(%%r1,%[x])\n\t"
+          "vl %%v18, 32(%%r1,%[x])\n\t"
+          "vl %%v19, 48(%%r1,%[x])\n\t"
+          "vl %%v20, 64(%%r1,%[x])\n\t"
+          "vl %%v21, 80(%%r1,%[x])\n\t"
+          "vl %%v22, 96(%%r1,%[x])\n\t"
+          "vl %%v23, 112(%%r1,%[x])\n\t"
+          "vfadb %%v24,%%v24,%%v16\n\t"
+          "vfadb %%v25,%%v25,%%v17\n\t"
+          "vfadb %%v26,%%v26,%%v18\n\t"
+          "vfadb %%v27,%%v27,%%v19\n\t"
+          "vfadb %%v28,%%v28,%%v20\n\t"
+          "vfadb %%v29,%%v29,%%v21\n\t"
+          "vfadb %%v30,%%v30,%%v22\n\t"
+          "vfadb %%v31,%%v31,%%v23\n\t"
+          "vl %%v16, 128(%%r1,%[x])\n\t"
+          "vl %%v17, 144(%%r1,%[x])\n\t"
+          "vl %%v18, 160(%%r1,%[x])\n\t"
+          "vl %%v19, 176(%%r1,%[x])\n\t"
+          "vl %%v20, 192(%%r1,%[x])\n\t"
+          "vl %%v21, 208(%%r1,%[x])\n\t"
+          "vl %%v22, 224(%%r1,%[x])\n\t"
+          "vl %%v23, 240(%%r1,%[x])\n\t"
+          "vfadb %%v24,%%v24,%%v16\n\t"
+          "vfadb %%v25,%%v25,%%v17\n\t"
+          "vfadb %%v26,%%v26,%%v18\n\t"
+          "vfadb %%v27,%%v27,%%v19\n\t"
+          "vfadb %%v28,%%v28,%%v20\n\t"
+          "vfadb %%v29,%%v29,%%v21\n\t"
+          "vfadb %%v30,%%v30,%%v22\n\t"
+          "vfadb %%v31,%%v31,%%v23\n\t"
+          "agfi %%r1,256\n\t"
+          "brctg %[n],0b\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vfadb %%v24,%%v24,%%v26\n\t"
+          "vfadb %%v24,%%v24,%%v27\n\t"
+          "vfadb %%v24,%%v24,%%v28\n\t"
+          "vfadb %%v24,%%v24,%%v29\n\t"
+          "vfadb %%v24,%%v24,%%v30\n\t"
+          "vfadb %%v24,%%v24,%%v31\n\t"
+          "vrepg %%v25,%%v24,1\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vsteg %%v24,%[sum],0"
+          : [sum] "=Q"(sum),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return sum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG j = 0;
+  FLOAT sumf = 0.0;
+  BLASLONG n1;
+
+  if (n <= 0 || inc_x <= 0)
+    return sumf;
+
+  if (inc_x == 1) {
+
+    n1 = n & -32;
+
+    if (n1 > 0) {
+
+      sumf = dsum_kernel_32(n1, x);
+      i = n1;
+    }
+
+    while (i < n) {
+      sumf += x[i];
+      i++;
+    }
+
+  } else {
+    BLASLONG n1 = n & -4;
+    register FLOAT sum1, sum2;
+    sum1 = 0.0;
+    sum2 = 0.0;
+    while (j < n1) {
+
+      sum1 += x[i];
+      sum2 += x[i + inc_x];
+      sum1 += x[i + 2 * inc_x];
+      sum2 += x[i + 3 * inc_x];
+
+      i += inc_x * 4;
+      j += 4;
+
+    }
+    sumf = sum1 + sum2;
+    while (j < n) {
+
+      sumf += x[i];
+      i += inc_x;
+      j++;
+    }
+
+  }
+  return sumf;
+}
diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c
index d7e079147e..f0c9ded511 100644
--- a/kernel/zarch/dswap.c
+++ b/kernel/zarch/dswap.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -25,264 +25,127 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ - - #include "common.h" - - -#if defined(Z13_SWAP_A) -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" 
+ "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -#else - -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 
32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; -#endif + if (n <= 0) + return (0); -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; + if ((inc_x == 1) && (inc_y == 1)) { - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - dswap_kernel_32(n1, x, y); - i=n1; - } - - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dswap_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c new file mode 100644 index 0000000000..a2546b8124 --- /dev/null +++ b/kernel/zarch/icamax.c @@ -0,0 +1,302 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
+
+static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
+  BLASLONG iamax;
+
+  __asm__("vlef %%v0,0(%[x]),0\n\t"
+          "vlef %%v1,4(%[x]),0\n\t"
+          "vlef %%v0,8(%[x]),1\n\t"
+          "vlef %%v1,12(%[x]),1\n\t"
+          "vlef %%v0,16(%[x]),2\n\t"
+          "vlef %%v1,20(%[x]),2\n\t"
+          "vlef %%v0,24(%[x]),3\n\t"
+          "vlef %%v1,28(%[x]),3\n\t"
+          "vflpsb %%v0,%%v0\n\t"
+          "vflpsb %%v1,%%v1\n\t"
+          "vfasb %%v0,%%v0,%%v1\n\t"
+          "vleig %%v1,0,0\n\t"
+          "vleig %%v1,2,1\n\t"
+          "vleig %%v2,1,0\n\t"
+          "vleig %%v2,3,1\n\t"
+          "vrepig %%v3,16\n\t"
+          "vzero %%v4\n\t"
+          "vleib %%v9,0,0\n\t"
+          "vleib %%v9,1,1\n\t"
+          "vleib %%v9,2,2\n\t"
+          "vleib %%v9,3,3\n\t"
+          "vleib %%v9,8,4\n\t"
+          "vleib %%v9,9,5\n\t"
+          "vleib %%v9,10,6\n\t"
+          "vleib %%v9,11,7\n\t"
+          "vleib %%v9,16,8\n\t"
+          "vleib %%v9,17,9\n\t"
+          "vleib %%v9,18,10\n\t"
+          "vleib %%v9,19,11\n\t"
+          "vleib %%v9,24,12\n\t"
+          "vleib %%v9,25,13\n\t"
+          "vleib %%v9,26,14\n\t"
+          "vleib %%v9,27,15\n\t"
+          "vleif %%v24,0,0\n\t"
+          "vleif %%v24,1,1\n\t"
+          "vleif %%v24,2,2\n\t"
+          "vleif %%v24,3,3\n\t"
+          "vleif %%v25,4,0\n\t"
+          "vleif %%v25,5,1\n\t"
+          "vleif %%v25,6,2\n\t"
+          "vleif %%v25,7,3\n\t"
+          "vleif %%v26,8,0\n\t"
+          "vleif %%v26,9,1\n\t"
+          "vleif %%v26,10,2\n\t"
+          "vleif %%v26,11,3\n\t"
+          "vleif %%v27,12,0\n\t"
+          "vleif %%v27,13,1\n\t"
+          "vleif %%v27,14,2\n\t"
+          "vleif %%v27,15,3\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v28,16(%%r1,%[x])\n\t"
+          "vpkg %%v17,%%v16,%%v28\n\t"
+          "vperm %%v16,%%v16,%%v28,%%v9\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v29,48(%%r1,%[x])\n\t"
+          "vpkg %%v19,%%v18,%%v29\n\t"
+          "vperm %%v18,%%v18,%%v29,%%v9\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v30,80(%%r1,%[x])\n\t"
+          "vpkg %%v21,%%v20,%%v30\n\t"
+          "vperm %%v20,%%v20,%%v30,%%v9\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v31,112(%%r1,%[x])\n\t"
+          "vpkg %%v23,%%v22,%%v31\n\t"
+          "vperm %%v22,%%v22,%%v31,%%v9\n\t"
+          "vflpsb %%v16, %%v16\n\t"
+          "vflpsb %%v17, %%v17\n\t"
+          "vflpsb %%v18, %%v18\n\t"
+          "vflpsb %%v19, %%v19\n\t"
+          "vflpsb %%v20, %%v20\n\t"
+          "vflpsb %%v21, %%v21\n\t"
+          "vflpsb %%v22, %%v22\n\t"
+          "vflpsb %%v23, %%v23\n\t"
+          "vfasb %%v16,%%v16,%%v17\n\t"
+          "vfasb %%v17,%%v18,%%v19\n\t"
+
"vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) 
{ + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = icamax_kernel_32(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + max = 0; + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c new file mode 100644 index 0000000000..09654b7426 --- /dev/null +++ b/kernel/zarch/icamin.c @@ -0,0 +1,302 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
+
+static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
+  BLASLONG iamin;
+
+  __asm__("vlef %%v0,0(%[x]),0\n\t"
+          "vlef %%v1,4(%[x]),0\n\t"
+          "vlef %%v0,8(%[x]),1\n\t"
+          "vlef %%v1,12(%[x]),1\n\t"
+          "vlef %%v0,16(%[x]),2\n\t"
+          "vlef %%v1,20(%[x]),2\n\t"
+          "vlef %%v0,24(%[x]),3\n\t"
+          "vlef %%v1,28(%[x]),3\n\t"
+          "vflpsb %%v0,%%v0\n\t"
+          "vflpsb %%v1,%%v1\n\t"
+          "vfasb %%v0,%%v0,%%v1\n\t"
+          "vleig %%v1,0,0\n\t"
+          "vleig %%v1,2,1\n\t"
+          "vleig %%v2,1,0\n\t"
+          "vleig %%v2,3,1\n\t"
+          "vrepig %%v3,16\n\t"
+          "vzero %%v4\n\t"
+          "vleib %%v9,0,0\n\t"
+          "vleib %%v9,1,1\n\t"
+          "vleib %%v9,2,2\n\t"
+          "vleib %%v9,3,3\n\t"
+          "vleib %%v9,8,4\n\t"
+          "vleib %%v9,9,5\n\t"
+          "vleib %%v9,10,6\n\t"
+          "vleib %%v9,11,7\n\t"
+          "vleib %%v9,16,8\n\t"
+          "vleib %%v9,17,9\n\t"
+          "vleib %%v9,18,10\n\t"
+          "vleib %%v9,19,11\n\t"
+          "vleib %%v9,24,12\n\t"
+          "vleib %%v9,25,13\n\t"
+          "vleib %%v9,26,14\n\t"
+          "vleib %%v9,27,15\n\t"
+          "vleif %%v24,0,0\n\t"
+          "vleif %%v24,1,1\n\t"
+          "vleif %%v24,2,2\n\t"
+          "vleif %%v24,3,3\n\t"
+          "vleif %%v25,4,0\n\t"
+          "vleif %%v25,5,1\n\t"
+          "vleif %%v25,6,2\n\t"
+          "vleif %%v25,7,3\n\t"
+          "vleif %%v26,8,0\n\t"
+          "vleif %%v26,9,1\n\t"
+          "vleif %%v26,10,2\n\t"
+          "vleif %%v26,11,3\n\t"
+          "vleif %%v27,12,0\n\t"
+          "vleif %%v27,13,1\n\t"
+          "vleif %%v27,14,2\n\t"
+          "vleif %%v27,15,3\n\t"
+          "srlg %[n],%[n],5\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vl %%v16,0(%%r1,%[x])\n\t"
+          "vl %%v28,16(%%r1,%[x])\n\t"
+          "vpkg %%v17,%%v16,%%v28\n\t"
+          "vperm %%v16,%%v16,%%v28,%%v9\n\t"
+          "vl %%v18,32(%%r1,%[x])\n\t"
+          "vl %%v29,48(%%r1,%[x])\n\t"
+          "vpkg %%v19,%%v18,%%v29\n\t"
+          "vperm %%v18,%%v18,%%v29,%%v9\n\t"
+          "vl %%v20,64(%%r1,%[x])\n\t"
+          "vl %%v30,80(%%r1,%[x])\n\t"
+          "vpkg %%v21,%%v20,%%v30\n\t"
+          "vperm %%v20,%%v20,%%v30,%%v9\n\t"
+          "vl %%v22,96(%%r1,%[x])\n\t"
+          "vl %%v31,112(%%r1,%[x])\n\t"
+          "vpkg %%v23,%%v22,%%v31\n\t"
+          "vperm %%v22,%%v22,%%v31,%%v9\n\t"
+          "vflpsb %%v16, %%v16\n\t"
+          "vflpsb %%v17, %%v17\n\t"
+          "vflpsb %%v18, %%v18\n\t"
+          "vflpsb %%v19, %%v19\n\t"
+          "vflpsb %%v20, %%v20\n\t"
+          "vflpsb %%v21, %%v21\n\t"
+          "vflpsb %%v22, %%v22\n\t"
+          "vflpsb %%v23, %%v23\n\t"
+          "vfasb %%v16,%%v16,%%v17\n\t"
+          "vfasb %%v17,%%v18,%%v19\n\t"
+          "vfasb %%v18,%%v20,%%v21\n\t"
+          "vfasb %%v19,%%v22,%%v23\n\t"
+          "vfchesb %%v5,%%v17,%%v16\n\t"
+          "vfchesb %%v6,%%v19,%%v18\n\t"
+          "vsel %%v16,%%v16,%%v17,%%v5\n\t"
+          "vsel %%v5,%%v24,%%v25,%%v5\n\t"
+          "vsel %%v17,%%v18,%%v19,%%v6\n\t"
+          "vsel %%v6,%%v26,%%v27,%%v6\n\t"
+          "vfchesb %%v18,%%v17,%%v16\n\t"
+          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
+          "vsel %%v5,%%v5,%%v6,%%v18\n\t"
+          "vsegf %%v6,%%v5\n\t"
+          "vesrlg %%v5,%%v5,32\n\t"
+          "vag %%v5,%%v5,%%v4\n\t"
+          "vag %%v6,%%v6,%%v4\n\t"
+          "vfchesb %%v7,%%v16,%%v0\n\t"
+          "vsel %%v0,%%v0,%%v16,%%v7\n\t"
+          "vsegf %%v8,%%v7\n\t"
+          "vesrlg %%v7,%%v7,32\n\t"
+          "vsegf %%v7,%%v7\n\t"
+          "vsel %%v1,%%v1,%%v5,%%v7\n\t"
+          "vsel %%v2,%%v2,%%v6,%%v8\n\t"
+          "vag %%v4,%%v4,%%v3\n\t"
+          "vl %%v16,128(%%r1,%[x])\n\t"
+          "vl %%v28,144(%%r1,%[x])\n\t"
+          "vpkg %%v17,%%v16,%%v28\n\t"
+          "vperm %%v16,%%v16,%%v28,%%v9\n\t"
+          "vl %%v18,160(%%r1,%[x])\n\t"
+          "vl %%v29,176(%%r1,%[x])\n\t"
+          "vpkg %%v19,%%v18,%%v29\n\t"
+          "vperm %%v18,%%v18,%%v29,%%v9\n\t"
+          "vl %%v20,192(%%r1,%[x])\n\t"
+          "vl %%v30,208(%%r1,%[x])\n\t"
+          "vpkg %%v21,%%v20,%%v30\n\t"
+          "vperm %%v20,%%v20,%%v30,%%v9\n\t"
+          "vl %%v22,224(%%r1,%[x])\n\t"
+          "vl
%%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = icamin_kernel_32(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + min = 0; + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b670911480..b292c1d151 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,225 +23,223 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include -#if defined(DOUBLE) - #define ABS fabs -#else - -#define ABS fabsf - -#endif - - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel 
%%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v26,%%v18 \n\t" - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[maxf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return index; - +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag 
%%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamax; } - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); + if (n <= 0 || inc_x <= 0) + return (max); - if (inc_x == 1) { + if (inc_x == 1) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - max = diamax_kernel_32_TUNED(n1, x, &maxf); + max = idamax_kernel_32(n1, x, &maxf); - i = n1; - } + i = n1; + } else { + maxf = ABS(x[0]); + i++; + } - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - } else { + } else { + + max = 0; + maxf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = 
ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 8a7ff1659a..f9a8119e15 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,241 +23,223 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include -#if defined(DOUBLE) - #define ABS fabs -#else - -#define ABS fabsf - -#endif - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%[ptr_x]) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t " - "vfchdb %%v17,%%v26 ,%%v27 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28, %%v29 \n\t " - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb 
%%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t" - "vfchdb %%v17,%%v26 ,%%v27 \n\t" - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28 ,%%v29 \n\t" - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v18 ,%%v26 \n\t " - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[minf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; - +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel 
%%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamin; } - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - BLASLONG ix = 0; - BLASLONG min = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (min); - minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant - if (inc_x == 1) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = diamin_kernel_32(n1, x, &minf); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + BLASLONG n1 = n & -32; + if (n1 > 0) { + min = 
idamin_kernel_32(n1, x, &minf); + + i = n1; } else { + minf = ABS(x[0]); + i++; + } - BLASLONG n1 = n & -4; - while (j < n1) { + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + min = 0; + minf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c new file mode 100644 index 0000000000..8f283bc170 --- /dev/null +++ b/kernel/zarch/idmax.c @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
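Every wrapper in this series follows the same shape as the idamin.c code above: compute n1 = n & -32 (the largest multiple of the kernel's 32-element block that fits in n), let the assembly kernel scan those n1 elements, then finish the remainder, and any non-unit stride, in portable C; when n1 == 0 the kernel is skipped entirely. A minimal scalar model of the contract idamin_kernel_32 has to satisfy, using plain long/double in place of BLASLONG/FLOAT (an illustrative sketch, not the shipped code):

    #include <math.h>

    /* 0-based index of the smallest |x[i]| over x[0..n-1]; the minimum
     * itself is written through *minf.  The real kernel requires n > 0
     * and n % 32 == 0.  Ties keep the lowest index, which the vector
     * code's compare/select ordering is also arranged to preserve. */
    static long idamin_scalar_model(long n, const double *x, double *minf) {
      long min = 0;
      double m = fabs(x[0]);
      for (long i = 1; i < n; i++) {
        double v = fabs(x[i]);
        if (v < m) { m = v; min = i; }
      }
      *minf = m;
      return min;
    }

The wrapper then adds 1 to the winning index ("return (min + 1)"), since the BLAS i?amin/i?amax interfaces are 1-based.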
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct 
{ FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = idmax_kernel_32(n1, x, &maxf); + + i = n1; + } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + max = 0; + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c new file mode 100644 index 0000000000..e4b7bb4fe3 --- /dev/null +++ b/kernel/zarch/idmin.c @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
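idmax.c above (and idmin.c below) differ from idamax/idamin in exactly one respect: there is no vflpdb (vector load floating-point positive, i.e. absolute value) pass over the loaded data, so the comparison is on signed values rather than magnitudes. A small self-contained illustration of the difference, on hypothetical data, with 1-based results as the wrappers return them:

    #include <stdio.h>
    #include <math.h>

    int main(void) {
      double x[] = { -5.0, 1.0, 4.0 };
      int n = 3, iamax = 0, imax = 0;
      for (int i = 1; i < n; i++) {
        if (fabs(x[i]) > fabs(x[iamax])) iamax = i;  /* idamax: largest |x[i]| */
        if (x[i] > x[imax]) imax = i;                /* idmax:  largest  x[i]  */
      }
      printf("idamax=%d idmax=%d\n", iamax + 1, imax + 1);  /* prints 1 and 3 */
      return 0;
    }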
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct 
{ FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = idmin_kernel_32(n1, x, &minf); + + i = n1; + } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + min = 0; + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c new file mode 100644 index 0000000000..ac86435d77 --- /dev/null +++ b/kernel/zarch/isamax.c @@ -0,0 +1,289 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
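The single-precision kernels starting with isamax.c below use a 64-element block where the double kernels above use 32. The data volume per loop trip is identical; only the element size changes:

    /* Per loop iteration: 2 passes x 8 vector loads x 16 bytes = 256 bytes.
     *   double (8 bytes/elem): 256 / 8 = 32 elems -> n & -32, "srlg %[n],%[n],5"
     *   float  (4 bytes/elem): 256 / 4 = 64 elems -> n & -64, "srlg %[n],%[n],6" */

so both the shift count in the srlg that derives the trip count and the mask in the wrapper's n1 computation move by one power of two.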
+*****************************************************************************/ + +#include "common.h" +#include + +#define ABS fabsf + +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel 
%%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = isamax_kernel_64(n1, x, &maxf); + + i = n1; + } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + max = 0; + maxf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c new file mode 100644 index 0000000000..3f2d039eb9 --- /dev/null +++ b/kernel/zarch/isamin.c @@ -0,0 +1,289 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#define ABS fabsf + +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + 
"vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = isamin_kernel_64(n1, x, &minf); + + i = n1; + } else { + minf = ABS(x[0]); + i++; + } + + 
while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + min = 0; + minf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c new file mode 100644 index 0000000000..41172c1bd3 --- /dev/null +++ b/kernel/zarch/ismax.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
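isamin.c completes the single-precision pair. The extra vsegf/vesrlg shuffling the float kernels carry, and the split of candidate indices across two vectors (%%v1 for the even word lanes, %%v2 for the odd), exist because vfchesb produces 32-bit comparison masks while the final result must be a 64-bit BLASLONG, so each mask and index word is widened to a doubleword before the running offset in %%v4 is added. The wrapper arithmetic is the same power-of-two blocking as before; a runnable check of the n & -64 tail split, on illustrative values:

    #include <stdio.h>

    int main(void) {
      for (long n = 50; n <= 70; n += 10) {
        long n1 = n & -64;  /* two's complement: clears the low 6 bits */
        printf("n=%ld -> kernel part %ld, scalar tail %ld\n", n, n1, n - n1);
      }
      /* n=50 -> 0/50 (kernel skipped), n=60 -> 0/60, n=70 -> 64/6 */
      return 0;
    }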
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf 
%%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = ismax_kernel_64(n1, x, &maxf); + + i = n1; + } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + max = 0; + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c new file mode 100644 index 0000000000..e2684df416 --- /dev/null +++ b/kernel/zarch/ismin.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
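ismax.c above also shows the epilogue every kernel in this diff shares: after the loop, the two 64-bit lanes of the running-best vector are compared (wfcdb/wfcsb); if the values differ, the better lane's value and index are selected, and if they tie, vmnlg takes the unsigned minimum of the two candidate indices so the earlier occurrence wins. A scalar model of that two-lane reduction (a sketch, min variant shown; the max kernels flip the comparison):

    /* Reduce two (value, index) candidates to one result, matching the
     * kernels' tail sequence: equal values -> lower index (vmnlg),
     * otherwise the index travelling with the better value. */
    static long reduce_two_lanes_min(const double v[2], const long idx[2],
                                     double *out) {
      if (v[0] == v[1]) {                  /* wfcdb: lanes compare equal */
        *out = v[0];
        return idx[0] < idx[1] ? idx[0] : idx[1];
      }
      int take1 = v[1] < v[0];             /* wfchdb-style select */
      *out = take1 ? v[1] : v[0];
      return take1 ? idx[1] : idx[0];
    }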
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl 
%%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = ismin_kernel_64(n1, x, &minf); + + i = n1; + } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + min = 0; + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 216c3414a6..daca1d6f71 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. 
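In the izamax.c hunk below, note that the rewritten CABS1 macro is fully parenthesized, (fabs(x[i]) + fabs(x[i + 1])), where the old ABS(x[i])+ABS(x[i+1]) expansion could associate wrongly inside a larger arithmetic expression. The existing call sites only assign or compare the result, where + already binds tighter, so this is hardening rather than a bug fix, but the hazard is real:

    #include <stdio.h>
    #include <math.h>

    #define CABS1_OLD(x,i) fabs(x[i]) + fabs(x[i+1])
    #define CABS1_NEW(x,i) (fabs(x[i]) + fabs(x[i+1]))

    int main(void) {
      double x[] = { 3.0, -4.0 };
      /* Doubling |re| + |im| should give 14.0; the unparenthesized macro
       * expands to 2*fabs(x[0]) + fabs(x[1]) = 10.0. */
      printf("old: %g  new: %g\n", 2 * CABS1_OLD(x, 0), 2 * CABS1_NEW(x, 0));
      return 0;
    }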
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,243 +24,222 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include - -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb 
%%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamax; +} - -/** - * Find maximum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - - - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 
208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v26,%%v24 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v30,%%v28 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30, %%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[maxf] \n\t" - "3: \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + if (n <= 0 || inc_x <= 0) + return (max); - ); - return index; + if (inc_x == 1) { -} + BLASLONG n1 = n & -16; + if (n1 > 0) { - + max = izamax_kernel_16(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - - + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (max + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + max = 0; + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -16; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - max = ziamax_kernel_16_TUNED(n1, x, &maxf); - i = n1; - ix = n1 << 1; + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if 
(CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); } - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; + ix += inc_x2 * 4; - maxf = CABS1(x,0); - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } - + return (max + 1); + } } - - diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9b2a653a77..9ababb91fd 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,253 +24,222 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include - -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -/** - * Find minimum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index ; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%[ptr_x]) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%[ptr_x]) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb 
%%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[minf] \n\t" - "3: \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg 
%%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamin; } - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG 
ix=0; - FLOAT minf; - BLASLONG min=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(min); - +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; - if (inc_x == 1) { + if (n <= 0 || inc_x <= 0) + return (min); - BLASLONG n1 = n & -16; - if (n1 > 0) { + if (inc_x == 1) { - min = ziamin_kernel_16_TUNED(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - else { - //assign minf - minf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); + BLASLONG n1 = n & -16; + if (n1 > 0) { + min = izamin_kernel_16(n1, x, &minf); + ix = n1 * 2; + i = n1; } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; + minf = CABS1(x, 0); + ix += 2; + i++; + } - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += 2; + i++; } - -} + return (min + 1); + + } else { + + min = 0; + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c new file mode 100644 index 0000000000..fdda6dd321 --- /dev/null +++ b/kernel/zarch/samax.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
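For reference, the rewritten izamax/izamin kernels above must reproduce the following scalar semantics, shown here as a minimal sketch (izamax_ref is a hypothetical name, not part of the patch): CABS1 is the |re| + |im| norm, the strict '>' keeps the first occurrence of the maximum, and the result is a Fortran-style 1-based index, with 0 returned for invalid input.

#include <math.h>

/* Minimal scalar sketch of the izamax contract (not code from the patch). */
static long izamax_ref(long n, const double *x, long inc_x) {
  if (n <= 0 || inc_x <= 0)
    return 0;                              /* BLAS convention on bad input */
  long best = 0;
  double maxf = fabs(x[0]) + fabs(x[1]);   /* CABS1 of element 0 */
  for (long i = 1; i < n; i++) {
    long ix = 2 * i * inc_x;               /* one complex number = 2 doubles */
    double v = fabs(x[ix]) + fabs(x[ix + 1]);
    if (v > maxf) {                        /* strict '>': first maximum wins */
      maxf = v;
      best = i;
    }
  }
  return best + 1;                         /* 1-based index */
}

izamin_ref would be identical with '<' in place of '>', matching the CNAME wrappers in both files.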
+*****************************************************************************/ + +#include "common.h" +#include + +#define ABS fabsf + +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + "vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = samax_kernel_64(n1, x); + + i = n1; + } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c new file mode 100644 index 0000000000..f05e851f96 --- /dev/null +++ b/kernel/zarch/samin.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#define ABS fabsf + +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = samin_kernel_64(n1, x); + + i = n1; + } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf = ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + 
if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c new file mode 100644 index 0000000000..d56f2697b1 --- /dev/null +++ b/kernel/zarch/sasum.c @@ -0,0 +1,168 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
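The samax/samin wrappers share one blocking scheme, sketched below under the assumption of unit stride and n > 0 (samin_ref is a hypothetical name, not from the patch): n & -64 clears the low six bits, so n1 is n rounded down to a multiple of 64, exactly the shape the 64-way vector kernel requires; a plain loop stands in for the kernel here, and the scalar tail finishes the remainder.

#include <math.h>

/* Minimal sketch of the block-plus-tail structure (not code from the patch). */
static float samin_ref(long n, const float *x) {
  long n1 = n & -64;                /* largest multiple of 64 not above n */
  long i = 1;
  float minf = fabsf(x[0]);
  for (; i < n1; i++)               /* stand-in for samin_kernel_64(n1, x) */
    if (fabsf(x[i]) < minf)
      minf = fabsf(x[i]);
  for (; i < n; i++)                /* scalar tail, as in the C wrapper */
    if (fabsf(x[i]) < minf)
      minf = fabsf(x[i]);
  return minf;
}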
+*****************************************************************************/ + +#include "common.h" +#include + +#define ABS fabsf + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) + return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = sasum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + } + return sumf; +} diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c new file mode 100644 index 0000000000..ca34a47ff3 --- /dev/null +++ b/kernel/zarch/saxpy.c @@ -0,0 +1,167 @@ +/*************************************************************************** 
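The sasum kernel above accumulates into eight independent vector registers (v24..v31) before a final tree reduction; the same idea appears in the strided C path with its sum1/sum2 pair. A minimal two-accumulator sketch follows (sasum_ref is a hypothetical name, not from the patch). Independent partial sums break the add-to-add dependency chain; note that regrouping float additions can change the result in the last bits.

#include <math.h>

/* Minimal sketch of multiple-accumulator reduction (not code from the patch). */
static float sasum_ref(long n, const float *x) {
  float s0 = 0.0f, s1 = 0.0f;       /* two independent partial sums */
  long i = 0;
  for (; i + 1 < n; i += 2) {
    s0 += fabsf(x[i]);
    s1 += fabsf(x[i + 1]);
  }
  if (i < n)                        /* odd-length tail */
    s0 += fabsf(x[i]);
  return s0 + s1;                   /* final reduction */
}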
+Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepf %%v0,%[alpha]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb 
%%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + [alpha] "Q"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) + return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + + if (n1) + saxpy_kernel_64(n1, x, y, &da); + + i = n1; + while (i < n) { + + y[i] += da * x[i]; + i++; + + } + return 0; + + } + + BLASLONG n1 = n & -4; + + while (i < n1) { + + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; + + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; + + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; + + } + + while (i < n) { + + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + return 0; + +} diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c new file mode 100644 index 0000000000..5c453cfbb9 --- /dev/null +++ b/kernel/zarch/scopy.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
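Per element, the saxpy kernel above is a single fused multiply-add; the assembly merely retires 64 elements per loop iteration via vfmasb. A minimal sketch of the unit-stride semantics (saxpy_ref is a hypothetical name, not from the patch):

/* Minimal sketch of y := y + da * x for inc_x == inc_y == 1
   (not code from the patch). */
static void saxpy_ref(long n, float da, const float *x, float *y) {
  for (long i = 0; i < n; i++)
    y[i] += da * x[i];              /* one FMA per element */
}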
+*****************************************************************************/ + +#include "common.h" + +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) + return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + } else { + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + +} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c new file mode 100644 index 0000000000..d870b30f07 --- /dev/null +++ b/kernel/zarch/sdot.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2013-2019,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
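The scopy kernel above issues one 256-byte mvc (64 floats) per loop iteration; for the unit-stride block it is equivalent to a straight memory copy, as in this minimal sketch (scopy_ref is a hypothetical name, not from the patch). memcpy assumes the two buffers do not overlap, which matches the usual BLAS copy contract.

#include <string.h>

/* Minimal sketch of the unit-stride copy, n a multiple of 64 on entry
   (not code from the patch). */
static void scopy_ref(long n, const float *x, float *y) {
  memcpy(y, x, (size_t)n * sizeof(float));
}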
+*****************************************************************************/ + +#include "common.h" + +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT dot = 0.0; + + if (n <= 0) + return (dot); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + + if (n1) + dot = sdot_kernel_32(n1, x, y); + + i = n1; + while (i < n) { + + dot += y[i] * x[i]; + i++; + + } + return (dot); + + } + + BLASLONG n1 = n & -2; + + while (i < n1) { + + dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; + + } + + while (i < n) { + + dot += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + return (dot); + +} diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c new file mode 100644 index 0000000000..a1efef373f --- /dev/null +++ b/kernel/zarch/sgemv_n_4.c @@ -0,0 +1,597 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl 
%%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb 
%%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] 
"a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + + } + + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { 
+ temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return (0); + } + + return (0); +} diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c new file mode 100644 index 0000000000..81d7c9fe74 --- /dev/null +++ b/kernel/zarch/sgemv_t_4.c @@ -0,0 +1,753 @@ +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl 
%%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl 
%%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepf 
%%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? 
m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + + yp = ytemp; + for (i = 0; i < nb1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; + + } + + } + + yp = ytemp; + + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + if (n1 > 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } + + if (n2 & 2) { + + sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if (n2 & 1) { + + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += + *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + + 2) * xtemp2; + y_ptr[j + 2] += + *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + + 2) * xtemp2; + y_ptr[j + 3] += + *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + 
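+ /* scalar cleanup: the n & 3 leftover y elements, one output row at a time */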
+ for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); +} diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c new file mode 100644 index 0000000000..7015aaa1da --- /dev/null +++ b/kernel/zarch/smax.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT max; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v17,%%v17,%%v25,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v19,%%v19,%%v27,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v21,%%v21,%%v29,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v23,%%v23,%%v31,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v17,%%v17,%%v21,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v19,%%v19,%%v23,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v17,%%v17,%%v19,0\n\t" + "vfmaxsb %%v16,%%v16,%%v17,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return max; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = smax_kernel_64(n1, x); + + i = n1; + } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c new file mode 100644 index 0000000000..b6875c5c69 --- /dev/null +++ b/kernel/zarch/smin.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = smin_kernel_64(n1, x); + + i = n1; + } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } 
+ i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c new file mode 100644 index 0000000000..4f471d8668 --- /dev/null +++ b/kernel/zarch/srot.c @@ -0,0 +1,226 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb 
%%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i = n1; + } + + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + + i++; + + } + + } else { + + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return (0); + +} diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c new file mode 100644 index 0000000000..9b9930dc87 --- /dev/null +++ b/kernel/zarch/sscal.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepf %%v0,%[da]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); +} + +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + + if (inc_x == 1) { + + if (da == 0.0) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + sscal_kernel_32_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + sscal_kernel_32(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } + } + + } else { + + if (da == 0.0) { + + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = 0.0; + x[i + inc_x] = 0.0; + + i += inc_x * 2; + j += 2; + + } + while (j < n) { + + x[i] = 0.0; + i += inc_x; + j++; + } + + } else { + 
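+ /* non-unit stride with da != 0: scale in place, two elements per unrolled pass */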
BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; + + i += inc_x * 2; + j += 2; + + } + + while (j < n) { + + x[i] = da * x[i]; + i += inc_x; + j++; + } + } + + } + return 0; + +} diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c new file mode 100644 index 0000000000..a433ab5923 --- /dev/null +++ b/kernel/zarch/ssum.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT sum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[sum],0" + : [sum] "=Q"(sum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return sum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) + return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = ssum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += x[i]; + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += x[i]; + sum2 += x[i + inc_x]; + sum1 += x[i + 2 * inc_x]; + sum2 += x[i + 3 * inc_x]; + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += x[i]; + i += inc_x; + j++; + } + + } + return sumf; +} diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c new file mode 100644 index 0000000000..0c62f189d7 --- /dev/null +++ b/kernel/zarch/sswap.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi 
%%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + sswap_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; + + } + + } else { + + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return (0); + +} diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c new file mode 100644 index 0000000000..aa04ab91fe --- /dev/null +++ b/kernel/zarch/zamax.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x;
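+ /* each complex element spans two FLOATs, so the strided scan steps by inc_x2 and compares four CABS1 values per unrolled pass */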
+ + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c new file mode 100644 index 0000000000..37278d6dbb --- /dev/null +++ b/kernel/zarch/zamax_z13.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
+
+static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
+  FLOAT amax;
+
+  __asm__("vleg %%v0,0(%[x]),0\n\t"
+          "vleg %%v16,8(%[x]),0\n\t"
+          "vleg %%v0,16(%[x]),1\n\t"
+          "vleg %%v16,24(%[x]),1\n\t"
+          "vflpdb %%v0,%%v0\n\t"
+          "vflpdb %%v16,%%v16\n\t"
+          "vfadb %%v0,%%v0,%%v16\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vleg %%v16,0(%%r1,%[x]),0\n\t"
+          "vleg %%v17,8(%%r1,%[x]),0\n\t"
+          "vleg %%v16,16(%%r1,%[x]),1\n\t"
+          "vleg %%v17,24(%%r1,%[x]),1\n\t"
+          "vleg %%v18,32(%%r1,%[x]),0\n\t"
+          "vleg %%v19,40(%%r1,%[x]),0\n\t"
+          "vleg %%v18,48(%%r1,%[x]),1\n\t"
+          "vleg %%v19,56(%%r1,%[x]),1\n\t"
+          "vleg %%v20,64(%%r1,%[x]),0\n\t"
+          "vleg %%v21,72(%%r1,%[x]),0\n\t"
+          "vleg %%v20,80(%%r1,%[x]),1\n\t"
+          "vleg %%v21,88(%%r1,%[x]),1\n\t"
+          "vleg %%v22,96(%%r1,%[x]),0\n\t"
+          "vleg %%v23,104(%%r1,%[x]),0\n\t"
+          "vleg %%v22,112(%%r1,%[x]),1\n\t"
+          "vleg %%v23,120(%%r1,%[x]),1\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfadb %%v16,%%v16,%%v17\n\t"
+          "vfadb %%v17,%%v18,%%v19\n\t"
+          "vfadb %%v18,%%v20,%%v21\n\t"
+          "vfadb %%v19,%%v22,%%v23\n\t"
+          "vfchdb %%v24,%%v16,%%v17\n\t"
+          "vfchdb %%v25,%%v18,%%v19\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vfchdb %%v26,%%v24,%%v25\n\t"
+          "vsel %%v26,%%v24,%%v25,%%v26\n\t"
+          "vfchdb %%v27,%%v26,%%v0\n\t"
+          "vsel %%v0,%%v26,%%v0,%%v27\n\t"
+          "vleg %%v16,128(%%r1,%[x]),0\n\t"
+          "vleg %%v17,136(%%r1,%[x]),0\n\t"
+          "vleg %%v16,144(%%r1,%[x]),1\n\t"
+          "vleg %%v17,152(%%r1,%[x]),1\n\t"
+          "vleg %%v18,160(%%r1,%[x]),0\n\t"
+          "vleg %%v19,168(%%r1,%[x]),0\n\t"
+          "vleg %%v18,176(%%r1,%[x]),1\n\t"
+          "vleg %%v19,184(%%r1,%[x]),1\n\t"
+          "vleg %%v20,192(%%r1,%[x]),0\n\t"
+          "vleg %%v21,200(%%r1,%[x]),0\n\t"
+          "vleg %%v20,208(%%r1,%[x]),1\n\t"
+          "vleg %%v21,216(%%r1,%[x]),1\n\t"
+          "vleg %%v22,224(%%r1,%[x]),0\n\t"
+          "vleg %%v23,232(%%r1,%[x]),0\n\t"
+          "vleg %%v22,240(%%r1,%[x]),1\n\t"
+          "vleg %%v23,248(%%r1,%[x]),1\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfadb %%v16,%%v16,%%v17\n\t"
+          "vfadb %%v17,%%v18,%%v19\n\t"
+          "vfadb %%v18,%%v20,%%v21\n\t"
+          "vfadb %%v19,%%v22,%%v23\n\t"
+          "vfchdb %%v24,%%v16,%%v17\n\t"
+          "vfchdb %%v25,%%v18,%%v19\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vfchdb %%v26,%%v24,%%v25\n\t"
+          "vsel %%v26,%%v24,%%v25,%%v26\n\t"
+          "vfchdb %%v27,%%v26,%%v0\n\t"
+          "vsel %%v0,%%v26,%%v0,%%v27\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfchdb %%v17,%%v0,%%v16\n\t"
+          "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+          "ldr %[amax],%%f0"
+          : [amax] "=f"(amax),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+            "v23", "v24", "v25", "v26", "v27");
+
+  return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0;
+  FLOAT maxf = 0.0;
+  BLASLONG inc_x2;
+
+  if (n <= 0 || inc_x <= 0)
+    return (maxf);
+
+  if (inc_x == 1) {
+
+    BLASLONG n1 = n & -16;
+    if (n1 > 0) {
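/* A hedged aside on the _z13 variant above: it cannot use vfmaxdb, which
 * belongs to the z14 vector-enhancements facility, so it synthesizes a
 * vector max from a compare-high (vfchdb), which produces an all-ones mask
 * per lane, followed by a select (vsel).  The same idea in portable C;
 * vmax2_ref is an illustrative name, not project code. */
static void vmax2_ref(const double a[2], const double b[2], double out[2]) {
  for (int i = 0; i < 2; i++) {
    int take_a = a[i] > b[i];      /* vfchdb: compare high, lane mask */
    out[i] = take_a ? a[i] : b[i]; /* vsel: mask-driven select */
  }
}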
+ + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c new file mode 100644 index 0000000000..0b54028532 --- /dev/null +++ b/kernel/zarch/zamin.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
+
+static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
+  FLOAT amin;
+
+  __asm__("vleg %%v0,0(%[x]),0\n\t"
+          "vleg %%v16,8(%[x]),0\n\t"
+          "vleg %%v0,16(%[x]),1\n\t"
+          "vleg %%v16,24(%[x]),1\n\t"
+          "vflpdb %%v0,%%v0\n\t"
+          "vflpdb %%v16,%%v16\n\t"
+          "vfadb %%v0,%%v0,%%v16\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vleg %%v16,0(%%r1,%[x]),0\n\t"
+          "vleg %%v17,8(%%r1,%[x]),0\n\t"
+          "vleg %%v16,16(%%r1,%[x]),1\n\t"
+          "vleg %%v17,24(%%r1,%[x]),1\n\t"
+          "vleg %%v18,32(%%r1,%[x]),0\n\t"
+          "vleg %%v19,40(%%r1,%[x]),0\n\t"
+          "vleg %%v18,48(%%r1,%[x]),1\n\t"
+          "vleg %%v19,56(%%r1,%[x]),1\n\t"
+          "vleg %%v20,64(%%r1,%[x]),0\n\t"
+          "vleg %%v21,72(%%r1,%[x]),0\n\t"
+          "vleg %%v20,80(%%r1,%[x]),1\n\t"
+          "vleg %%v21,88(%%r1,%[x]),1\n\t"
+          "vleg %%v22,96(%%r1,%[x]),0\n\t"
+          "vleg %%v23,104(%%r1,%[x]),0\n\t"
+          "vleg %%v22,112(%%r1,%[x]),1\n\t"
+          "vleg %%v23,120(%%r1,%[x]),1\n\t"
+          "vleg %%v24,128(%%r1,%[x]),0\n\t"
+          "vleg %%v25,136(%%r1,%[x]),0\n\t"
+          "vleg %%v24,144(%%r1,%[x]),1\n\t"
+          "vleg %%v25,152(%%r1,%[x]),1\n\t"
+          "vleg %%v26,160(%%r1,%[x]),0\n\t"
+          "vleg %%v27,168(%%r1,%[x]),0\n\t"
+          "vleg %%v26,176(%%r1,%[x]),1\n\t"
+          "vleg %%v27,184(%%r1,%[x]),1\n\t"
+          "vleg %%v28,192(%%r1,%[x]),0\n\t"
+          "vleg %%v29,200(%%r1,%[x]),0\n\t"
+          "vleg %%v28,208(%%r1,%[x]),1\n\t"
+          "vleg %%v29,216(%%r1,%[x]),1\n\t"
+          "vleg %%v30,224(%%r1,%[x]),0\n\t"
+          "vleg %%v31,232(%%r1,%[x]),0\n\t"
+          "vleg %%v30,240(%%r1,%[x]),1\n\t"
+          "vleg %%v31,248(%%r1,%[x]),1\n\t"
+          "vflpdb %%v16,%%v16\n\t"
+          "vflpdb %%v17,%%v17\n\t"
+          "vflpdb %%v18,%%v18\n\t"
+          "vflpdb %%v19,%%v19\n\t"
+          "vflpdb %%v20,%%v20\n\t"
+          "vflpdb %%v21,%%v21\n\t"
+          "vflpdb %%v22,%%v22\n\t"
+          "vflpdb %%v23,%%v23\n\t"
+          "vflpdb %%v24,%%v24\n\t"
+          "vflpdb %%v25,%%v25\n\t"
+          "vflpdb %%v26,%%v26\n\t"
+          "vflpdb %%v27,%%v27\n\t"
+          "vflpdb %%v28,%%v28\n\t"
+          "vflpdb %%v29,%%v29\n\t"
+          "vflpdb %%v30,%%v30\n\t"
+          "vflpdb %%v31,%%v31\n\t"
+          "vfadb %%v16,%%v16,%%v17\n\t"
+          "vfadb %%v18,%%v18,%%v19\n\t"
+          "vfadb %%v20,%%v20,%%v21\n\t"
+          "vfadb %%v22,%%v22,%%v23\n\t"
+          "vfadb %%v24,%%v24,%%v25\n\t"
+          "vfadb %%v26,%%v26,%%v27\n\t"
+          "vfadb %%v28,%%v28,%%v29\n\t"
+          "vfadb %%v30,%%v30,%%v31\n\t"
+          "vfmindb %%v16,%%v16,%%v24,0\n\t"
+          "vfmindb %%v18,%%v18,%%v26,0\n\t"
+          "vfmindb %%v20,%%v20,%%v28,0\n\t"
+          "vfmindb %%v22,%%v22,%%v30,0\n\t"
+          "vfmindb %%v16,%%v16,%%v20,0\n\t"
+          "vfmindb %%v18,%%v18,%%v22,0\n\t"
+          "vfmindb %%v16,%%v16,%%v18,0\n\t"
+          "vfmindb %%v0,%%v0,%%v16,0\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfmindb %%v0,%%v0,%%v16,0\n\t"
+          "ldr %[amin],%%f0"
+          : [amin] "=f"(amin),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+
+  return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0;
+  FLOAT minf = 0.0;
+  BLASLONG inc_x2;
+
+  if (n <= 0 || inc_x <= 0)
+    return (minf);
+
+  if (inc_x == 1) {
+
+    BLASLONG n1 = n & -16;
+    if (n1 > 0) {
+
+      minf = zamin_kernel_16(n1, x);
+      ix = n1 * 2;
+      i = n1;
+    } else {
+      minf = CABS1(x, 0);
+      ix += 2;
+      i++;
+    }
+
+    while (i < n) {
+      if (CABS1(x, ix) < minf) {
+        minf = CABS1(x, ix);
+      }
+      ix += 2;
+      i++;
+    }
+    return (minf);
+
+  } else {
+
+    minf = CABS1(x, 0);
+    inc_x2 = 2 * inc_x;
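/* All of these zarch drivers follow the same blocking pattern: n1 = n & -16
 * rounds n down to a multiple of 16, the assembly kernel processes those
 * elements, and plain C finishes the remainder as well as the non-unit-stride
 * case.  A hedged sketch of the pattern with a stand-in loop for the kernel;
 * sum_blocked is an illustrative name, not project code. */
static double sum_blocked(long n, const double *x) {
  long n1 = n & -16; /* largest multiple of 16 <= n */
  double s = 0.0;
  long i = 0;
  for (; i < n1; i++) /* a vectorized kernel would process [0, n1) */
    s += x[i];
  for (; i < n; i++)  /* scalar tail for the remaining n - n1 elements */
    s += x[i];
  return s;
}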
+ + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; + } + return (minf); + } +} diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c new file mode 100644 index 0000000000..e37bb2236f --- /dev/null +++ b/kernel/zarch/zamin_z13.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
+
+static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
+  FLOAT amin;
+
+  __asm__("vleg %%v0,0(%[x]),0\n\t"
+          "vleg %%v16,8(%[x]),0\n\t"
+          "vleg %%v0,16(%[x]),1\n\t"
+          "vleg %%v16,24(%[x]),1\n\t"
+          "vflpdb %%v0,%%v0\n\t"
+          "vflpdb %%v16,%%v16\n\t"
+          "vfadb %%v0,%%v0,%%v16\n\t"
+          "srlg %[n],%[n],4\n\t"
+          "xgr %%r1,%%r1\n\t"
+          "0:\n\t"
+          "pfd 1, 1024(%%r1,%[x])\n\t"
+          "vleg %%v16,0(%%r1,%[x]),0\n\t"
+          "vleg %%v17,8(%%r1,%[x]),0\n\t"
+          "vleg %%v16,16(%%r1,%[x]),1\n\t"
+          "vleg %%v17,24(%%r1,%[x]),1\n\t"
+          "vleg %%v18,32(%%r1,%[x]),0\n\t"
+          "vleg %%v19,40(%%r1,%[x]),0\n\t"
+          "vleg %%v18,48(%%r1,%[x]),1\n\t"
+          "vleg %%v19,56(%%r1,%[x]),1\n\t"
+          "vleg %%v20,64(%%r1,%[x]),0\n\t"
+          "vleg %%v21,72(%%r1,%[x]),0\n\t"
+          "vleg %%v20,80(%%r1,%[x]),1\n\t"
+          "vleg %%v21,88(%%r1,%[x]),1\n\t"
+          "vleg %%v22,96(%%r1,%[x]),0\n\t"
+          "vleg %%v23,104(%%r1,%[x]),0\n\t"
+          "vleg %%v22,112(%%r1,%[x]),1\n\t"
+          "vleg %%v23,120(%%r1,%[x]),1\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfadb %%v16,%%v16,%%v17\n\t"
+          "vfadb %%v17,%%v18,%%v19\n\t"
+          "vfadb %%v18,%%v20,%%v21\n\t"
+          "vfadb %%v19,%%v22,%%v23\n\t"
+          "vfchdb %%v24,%%v17,%%v16\n\t"
+          "vfchdb %%v25,%%v19,%%v18\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vfchdb %%v26,%%v25,%%v24\n\t"
+          "vsel %%v26,%%v24,%%v25,%%v26\n\t"
+          "vfchdb %%v27,%%v0,%%v26\n\t"
+          "vsel %%v0,%%v26,%%v0,%%v27\n\t"
+          "vleg %%v16,128(%%r1,%[x]),0\n\t"
+          "vleg %%v17,136(%%r1,%[x]),0\n\t"
+          "vleg %%v16,144(%%r1,%[x]),1\n\t"
+          "vleg %%v17,152(%%r1,%[x]),1\n\t"
+          "vleg %%v18,160(%%r1,%[x]),0\n\t"
+          "vleg %%v19,168(%%r1,%[x]),0\n\t"
+          "vleg %%v18,176(%%r1,%[x]),1\n\t"
+          "vleg %%v19,184(%%r1,%[x]),1\n\t"
+          "vleg %%v20,192(%%r1,%[x]),0\n\t"
+          "vleg %%v21,200(%%r1,%[x]),0\n\t"
+          "vleg %%v20,208(%%r1,%[x]),1\n\t"
+          "vleg %%v21,216(%%r1,%[x]),1\n\t"
+          "vleg %%v22,224(%%r1,%[x]),0\n\t"
+          "vleg %%v23,232(%%r1,%[x]),0\n\t"
+          "vleg %%v22,240(%%r1,%[x]),1\n\t"
+          "vleg %%v23,248(%%r1,%[x]),1\n\t"
+          "vflpdb %%v16, %%v16\n\t"
+          "vflpdb %%v17, %%v17\n\t"
+          "vflpdb %%v18, %%v18\n\t"
+          "vflpdb %%v19, %%v19\n\t"
+          "vflpdb %%v20, %%v20\n\t"
+          "vflpdb %%v21, %%v21\n\t"
+          "vflpdb %%v22, %%v22\n\t"
+          "vflpdb %%v23, %%v23\n\t"
+          "vfadb %%v16,%%v16,%%v17\n\t"
+          "vfadb %%v17,%%v18,%%v19\n\t"
+          "vfadb %%v18,%%v20,%%v21\n\t"
+          "vfadb %%v19,%%v22,%%v23\n\t"
+          "vfchdb %%v24,%%v17,%%v16\n\t"
+          "vfchdb %%v25,%%v19,%%v18\n\t"
+          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
+          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
+          "vfchdb %%v26,%%v25,%%v24\n\t"
+          "vsel %%v26,%%v24,%%v25,%%v26\n\t"
+          "vfchdb %%v27,%%v0,%%v26\n\t"
+          "vsel %%v0,%%v26,%%v0,%%v27\n\t"
+          "agfi %%r1, 256\n\t"
+          "brctg %[n], 0b\n\t"
+          "vrepg %%v16,%%v0,1\n\t"
+          "wfchdb %%v17,%%v16,%%v0\n\t"
+          "vsel %%v0,%%v0,%%v16,%%v17\n\t"
+          "ldr %[amin],%%f0"
+          : [amin] "=f"(amin),[n] "+&r"(n)
+          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
+          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
+            "v23", "v24", "v25", "v26", "v27");
+
+  return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0;
+  FLOAT minf = 0.0;
+  BLASLONG inc_x2;
+
+  if (n <= 0 || inc_x <= 0)
+    return (minf);
+
+  if (inc_x == 1) {
+
+    BLASLONG n1 = n & -16;
+    if (n1 > 0) {
+
+      minf = zamin_kernel_16(n1, x);
+      ix = n1 * 2;
+      i = n1;
+    } else {
+      minf = CABS1(x, 0);
+      ix += 2;
+      i++;
+    }
+
+    while (i < n) {
+      if (CABS1(x, ix) < minf) {
+        minf = CABS1(x, ix);
+      }
+      ix += 2;
+      i++;
+    }
+    return (minf);
+
+  } else {
+
+    minf = CABS1(x, 0);
+    inc_x2 = 2 * inc_x;
+
+    BLASLONG n1 = n & -4;
+    while (i < n1) {
+
+      if (CABS1(x, ix) < minf) {
+        minf = CABS1(x, ix);
+      }
+      if (CABS1(x, ix + inc_x2) < minf) {
+        minf = CABS1(x, ix + inc_x2);
+      }
+      if (CABS1(x, ix + inc_x2 * 2) < minf) {
+        minf = CABS1(x, ix + inc_x2 * 2);
+      }
+      if (CABS1(x, ix + inc_x2 * 3) < minf) {
+        minf = CABS1(x, ix + inc_x2 * 3);
+      }
+
+      ix += inc_x2 * 4;
+
+      i += 4;
+
+    }
+
+    while (i < n) {
+      if (CABS1(x, ix) < minf) {
+        minf = CABS1(x, ix);
+      }
+      ix += inc_x2;
+      i++;
+    }
+    return (minf);
+  }
+}
diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c
index 0fc5c9ecbe..aeef8d77e6 100644
--- a/kernel/zarch/zasum.c
+++ b/kernel/zarch/zasum.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -25,135 +25,129 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
 #include <math.h>
-#if defined(DOUBLE)
-
 #define ABS fabs
-#else
-
-#define ABS fabsf
-
-#endif
-
-
 static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
-
-    FLOAT asum;
-    __asm__ (
-        "pfd 1, 0(%[ptr_x]) \n\t"
-        "sllg %%r0,%[n],4 \n\t"
-        "agr %%r0,%[ptr_x] \n\t"
-        "vzero %%v0 \n\t"
-        "vzero %%v1 \n\t"
-        "vzero %%v22 \n\t"
-        "vzero %%v23 \n\t"
-        ".align 16 \n\t"
-        "1: \n\t"
-        "pfd 1, 256(%[ptr_tmp] ) \n\t"
-        "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
-
-        "vflpdb %%v24, %%v24 \n\t"
-        "vflpdb %%v25, %%v25 \n\t"
-        "vflpdb %%v26, %%v26 \n\t"
-        "vflpdb %%v27, %%v27 \n\t"
-        "vflpdb %%v28, %%v28 \n\t"
-        "vflpdb %%v29, %%v29 \n\t"
-        "vflpdb %%v30, %%v30 \n\t"
-        "vflpdb %%v31, %%v31 \n\t"
-
-        "vfadb %%v0,%%v0,%%v24 \n\t"
-        "vfadb %%v1,%%v1,%%v25 \n\t"
-        "vfadb %%v23,%%v23,%%v26 \n\t"
-        "vfadb %%v22,%%v22,%%v27 \n\t"
-        "vfadb %%v0,%%v0,%%v28 \n\t"
-        "vfadb %%v1,%%v1,%%v29 \n\t"
-        "vfadb %%v23,%%v23,%%v30 \n\t"
-        "vfadb %%v22,%%v22,%%v31 \n\t"
-
-        "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
-
-        "vflpdb %%v24, %%v24 \n\t"
-        "vflpdb %%v25, %%v25 \n\t"
-        "vflpdb %%v26, %%v26 \n\t"
-        "vflpdb %%v27, %%v27 \n\t"
-        "vflpdb %%v28, %%v28 \n\t"
-        "vflpdb %%v29, %%v29 \n\t"
-        "vflpdb %%v30, %%v30 \n\t"
-        "vflpdb %%v31, %%v31 \n\t"
-        "la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
-        "vfadb %%v0,%%v0,%%v24 \n\t"
-        "vfadb %%v1,%%v1,%%v25 \n\t"
-        "vfadb %%v23,%%v23,%%v26 \n\t"
-        "vfadb %%v22,%%v22,%%v27 \n\t"
-        "vfadb %%v0,%%v0,%%v28 \n\t"
-        "vfadb %%v1,%%v1,%%v29 \n\t"
-        "vfadb %%v23,%%v23,%%v30 \n\t"
-        "vfadb %%v22,%%v22,%%v31 \n\t"
-
-        "clgrjl %[ptr_tmp],%%r0,1b \n\t"
-        "vfadb %%v24,%%v0,%%v1 \n\t"
-        "vfadb %%v25,%%v23,%%v22 \n\t"
-        "vfadb %%v0,%%v25,%%v24 \n\t"
-        "vrepg %%v1,%%v0,1 \n\t"
-        "adbr %%f0,%%f1 \n\t"
-        "ldr %[asum] ,%%f0"
-        : [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
-        : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
-        : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
-        );
-    return asum;
-
+  FLOAT asum;
+
+  __asm__("vzero %%v24\n\t"
+          "vzero
%%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } - +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; + if (n <= 0 || inc_x <= 0) + return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { - if ( inc_x == 1 ) - { + n1 = n & -16; + if (n1 > 0) { - n1 = n & -16; - if ( n1 > 0 ) - { - - sumf=zasum_kernel_16(n1, x ); - i=n1; - ip=2*n1; - } - - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = zasum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 212de25c81..9363ec32df 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -1,5 +1,5 @@ 
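/* A hedged aside on the rewritten zasum kernel above: it keeps eight
 * independent accumulators (v24..v31) so that consecutive vfadb additions
 * do not form one long dependency chain, and only folds them together after
 * the loop.  The same multiple-accumulator reduction in scalar C, as a
 * sketch; asum_ref is an illustrative name, not project code. */
#include <math.h>
static double asum_ref(long n, const double *x) {
  /* n doubles; four chains stand in for the kernel's eight */
  double a0 = 0.0, a1 = 0.0, a2 = 0.0, a3 = 0.0;
  long i;
  for (i = 0; i + 4 <= n; i += 4) {
    a0 += fabs(x[i]);
    a1 += fabs(x[i + 1]);
    a2 += fabs(x[i + 2]);
    a3 += fabs(x[i + 3]);
  }
  for (; i < n; i++)
    a0 += fabs(x[i]);
  return (a0 + a1) + (a2 + a3); /* fold the chains at the end */
}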
/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,190 +23,140 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" - -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - - BLASLONG tempR1 ; - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" #else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 1, 256(%[t1],%[x_tmp]) \n\t" - "pfd 2, 256(%[t1],%[y_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, 
%%v23, 4 \n\t" - - "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" - "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" - "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" - "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" - "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" - "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" +#endif + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + "vpdi %%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst 
%%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - if (n <= 0) return (0); + if (n <= 0) + return (0); - if ((inc_x == 1) && (inc_y == 1)) { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -8; - if (n1) { - zaxpy_kernel_8(n1, x, y, da_r,da_i); - ix = 2 * n1; - } - i = n1; - while (i < n) { + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; - - } - return (0); - + i++; + ix += 2; } + return (0); - inc_x *= 2; - inc_y *= 2; + } - while (i < n) { + inc_x *= 2; + inc_y *= 2; + + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } - - diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index b5bf383f70..5a46aec1c9 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,122 +24,65 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - -#include "common.h" - -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) - : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; +#include "common.h" +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zcopy_kernel_16(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } + if (n <= 0) + return (0); - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -16; + if (n1 > 0) { + zcopy_kernel_16(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[iy]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c 
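/* A hedged aside on the rewritten zcopy kernel above: it replaces eight
 * vl/vst vector-register round-trips per iteration with a single
 * "mvc 0(256,%[y]),0(%[x])", a 256-byte storage-to-storage move, so the
 * copy no longer touches vector registers at all.  A C sketch of the same
 * blocked copy; zcopy_mvc_ref is an illustrative name, not project code. */
#include <string.h>
static void zcopy_mvc_ref(long n, const double *x, double *y) {
  long n1 = n & -16;                   /* 16 complex doubles = 256 bytes */
  for (long i = 0; i < n1; i += 16)
    memcpy(y + 2 * i, x + 2 * i, 256); /* stands in for one mvc */
  for (long i = n1; i < n; i++) {      /* scalar tail */
    y[2 * i] = x[2 * i];
    y[2 * i + 1] = x[2 * i + 1];
  }
}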
index 61c5d6b98a..ac6e69c23f 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,203 +23,150 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" -#if defined(Z13) static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - - "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24, 0(%[ptr_d]),0 \n\t" - "vsteg %%v24, 8(%[ptr_d]),1 \n\t" - "vsteg %%v25,16(%[ptr_d]),1 \n\t" - "vsteg %%v25,24(%[ptr_d]),0 \n\t" - : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[2*n])x), - [mem_y] "m"( *(const double (*)[2*n])y), - [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + __asm__("vzero 
%%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -#else - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - BLASLONG register i = 0; - FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; - BLASLONG j = 0; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - dot[0] += x[j + 2] * y[j + 2]; - dot[1] += x[j + 3] * y[j + 3]; - dot[2] += x[j + 2] * y[j + 3]; - dot[3] += x[j + 3] * y[j + 2]; +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - dot[0] += x[j + 4] * y[j + 4]; - dot[1] += x[j + 5] * y[j + 5]; - dot[2] += x[j + 4] * y[j + 5]; - dot[3] += x[j + 5] * y[j + 4]; + } - dot[0] += x[j + 6] * y[j + 6]; - dot[1] += x[j + 7] * y[j + 7]; - dot[2] += x[j + 6] * y[j + 7]; - dot[3] += x[j + 7] * y[j + 6]; + if ((inc_x == 1) && (inc_y == 1)) { - j += 8; - i += 4; + BLASLONG n1 = n & -8; - } - d[0] = dot[0]; - d[1] = dot[1]; - d[2] = dot[2]; - d[3] = dot[3]; + if (n1) + zdot_kernel_8(n1, x, 
y, dot); -} + i = n1; + BLASLONG j = i * 2; -#endif + while (i < n) { -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix=0, iy=0; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); + j += 2; + i++; } - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -8; - BLASLONG j=0; - - if (n1){ - zdot_kernel_8(n1, x, y, dot); - i = n1; - j = n1 <<1; - } - - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - j += 2; - i++; - - } - - - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 484db30734..13045a3591 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2018, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,898 +23,642 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ -#include -#include #include "common.h" -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 -#define HAVE_KERNEL_ADDY 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) -#include -#endif - -// #define NBMAX 1024 -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - register __vector double vx2_r = {x[4], x[4]}; - register __vector double vx2_i = {-x[5], x[5]}; - register __vector double vx3_r = {x[6], x[6]}; - register __vector double vx3_i = {-x[7], x[7]}; - + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; - register __vector double vx2_r = {x[4], -x[4]}; - register __vector double vx2_i = {x[5], x[5]}; - register __vector double vx3_r = {x[6], -x[6]}; - register __vector double vx3_i = {x[7], x[7]}; + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - register __vector double *vptr_a2 = (__vector double *) a2; - register __vector double *vptr_a3 = (__vector double *) a3; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - register __vector 
double va2 = vptr_a2[i]; - register __vector double va2_1 = vptr_a2[i + 1]; - register __vector double va2_2 = vptr_a2[i + 2]; - register __vector double va2_3 = vptr_a2[i + 3]; - - register __vector double va3 = vptr_a3[i]; - register __vector double va3_1 = vptr_a3[i + 1]; - register __vector double va3_2 = vptr_a3[i + 2]; - register __vector double va3_3 = vptr_a3[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va2*vx2_r; - vy_1 += va2_1*vx2_r; - vy_2 += va2_2*vx2_r; - vy_3 += va2_3*vx2_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va3*vx3_r; - vy_1 += va3_1*vx3_r; - vy_2 += va3_2*vx3_r; - vy_3 += va3_3*vx3_r; - - va2 = vec_permi(va2, va2, 2); - va2_1 = vec_permi(va2_1, va2_1, 2); - va2_2 = vec_permi(va2_2, va2_2, 2); - va2_3 = vec_permi(va2_3, va2_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - va3 = vec_permi(va3, va3, 2); - va3_1 = vec_permi(va3_1, va3_1, 2); - va3_2 = vec_permi(va3_2, va3_2, 2); - va3_3 = vec_permi(va3_3, va3_3, 2); - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy_0 += va2*vx2_i; - vy_1 += va2_1*vx2_i; - vy_2 += va2_2*vx2_i; - vy_3 += va2_3*vx2_i; - - vy_0 += va3*vx3_i; - vy_1 += va3_1*vx3_i; - vy_2 += va3_2*vx3_i; - vy_3 += va3_3*vx3_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } -} -#else - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; -#endif - } + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb 
%%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = 
vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; -#endif - } -} - -#endif - -#ifdef HAVE_KERNEL_4x1_VEC static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - -#else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; -#endif - - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = 
vptr_a0[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } -} - + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - - } + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } -#endif - -#ifdef HAVE_KERNEL_ADDY - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - - -#if !defined(XCONJ) - - register __vector double valpha_r = {alpha_r, alpha_r}; - register __vector double valpha_i = {-alpha_i, alpha_i}; - +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - register __vector double valpha_r = {alpha_r, -alpha_r}; - register __vector double valpha_i = {alpha_i, alpha_i}; + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - - register __vector double *vptr_src = (__vector double *) src; - if (inc_dest != 2) { - register __vector double *vptr_y = (__vector double *) dest; - //note that inc_dest is already 2x. 
so we should add it to double* - register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); - register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); - register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); - BLASLONG dest_t=0; - BLASLONG add_dest=inc_dest<<1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times - for (i = 0; i < n; i += 4) { - - register __vector double vy_0=vptr_y[dest_t]; - register __vector double vy_1=vptr_y1[dest_t]; - register __vector double vy_2=vptr_y2[dest_t]; - register __vector double vy_3=vptr_y3[dest_t]; - - register __vector double vsrc = vptr_src[i]; - register __vector double vsrc_1 = vptr_src[i + 1]; - register __vector double vsrc_2 = vptr_src[i + 2]; - register __vector double vsrc_3 = vptr_src[i + 3]; - - vy_0 += vsrc*valpha_r; - vy_1 += vsrc_1*valpha_r; - vy_2 += vsrc_2*valpha_r; - vy_3 += vsrc_3*valpha_r; - - vsrc = vec_permi(vsrc, vsrc, 2); - vsrc_1 = vec_permi(vsrc_1, vsrc_1, 2); - vsrc_2 = vec_permi(vsrc_2, vsrc_2, 2); - vsrc_3 = vec_permi(vsrc_3, vsrc_3, 2); - - vy_0 += vsrc*valpha_i; - vy_1 += vsrc_1*valpha_i; - vy_2 += vsrc_2*valpha_i; - vy_3 += vsrc_3*valpha_i; - - vptr_y[dest_t] = vy_0; - vptr_y1[dest_t ] = vy_1; - vptr_y2[dest_t] = vy_2; - vptr_y3[dest_t] = vy_3; - - dest_t+=add_dest; - - } - - return; - } else { - register __vector double *vptr_y = (__vector double *) dest; - for (i = 0; i < n; i += 4) { - - register __vector double vy_0=vptr_y[i]; - register __vector double vy_1=vptr_y[i+1]; - register __vector double vy_2=vptr_y[i+2]; - register __vector double vy_3=vptr_y[i+3]; - - register __vector double vsrc = vptr_src[i]; - register __vector double vsrc_1 = vptr_src[i + 1]; - register __vector double vsrc_2 = vptr_src[i + 2]; - register __vector double vsrc_3 = vptr_src[i + 3]; - - vy_0 += vsrc*valpha_r; - vy_1 += vsrc_1*valpha_r; - vy_2 += vsrc_2*valpha_r; - vy_3 += vsrc_3*valpha_r; - - vsrc = vec_permi(vsrc, vsrc, 2); - vsrc_1 = vec_permi(vsrc_1, vsrc_1, 2); - vsrc_2 = vec_permi(vsrc_2, vsrc_2, 2); - vsrc_3 = vec_permi(vsrc_3, vsrc_3, 2); - - vy_0 += vsrc*valpha_i; - vy_1 += vsrc_1*valpha_i; - vy_2 += vsrc_2*valpha_i; - vy_3 += vsrc_3*valpha_i; - - vptr_y[i] = vy_0; - vptr_y[i + 1 ] = vy_1; - vptr_y[i + 2] = vy_2; - vptr_y[i + 3] = vy_3; - - } - - return; - } - return; + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) 
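/* [Editorial sketch, not part of the patch] The add_y_4 assembly above splats alpha into %%v0/%%v1 (wflcdb negating one half) and uses vpdi to swap the real and imaginary halves of each source vector, so that two vfmadb chains implement a complex axpy. A scalar reference for the !defined(XCONJ) case, under the hypothetical name add_y_4_ref: */
static void add_y_4_ref(BLASLONG n, FLOAT *src, FLOAT *dest,
                        FLOAT alpha_r, FLOAT alpha_i) {
  BLASLONG i;
  for (i = 0; i < n; i++) {
    FLOAT re = src[2 * i], im = src[2 * i + 1];
    dest[2 * i]     += alpha_r * re - alpha_i * im; /* Re(dest + alpha*src) */
    dest[2 * i + 1] += alpha_r * im + alpha_i * re; /* Im(dest + alpha*src) */
  }
}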
+ : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -#else - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if (inc_dest != 2) { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for (i = 0; i < n; i++) { -#if !defined(XCONJ) - temp_r = alpha_r * src[0] - alpha_i * src[1]; - temp_i = alpha_r * src[1] + alpha_i * src[0]; + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; #else - temp_r = alpha_r * src[0] + alpha_i * src[1]; - temp_i = -alpha_r * src[1] + alpha_i * src[0]; + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; #endif - *dest += temp_r; - *(dest + 1) += temp_i; + *dest += temp_r; + *(dest + 1) += temp_i; - src += 2; - dest += inc_dest; - } - return; - } - - FLOAT temp_r0; - FLOAT temp_i0; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT temp_r2; - FLOAT temp_i2; - FLOAT temp_r3; - FLOAT temp_i3; - for (i = 0; i < n; i += 4) { -#if !defined(XCONJ) - temp_r0 = alpha_r * src[0] - alpha_i * src[1]; - temp_i0 = alpha_r * src[1] + alpha_i * src[0]; - temp_r1 = alpha_r * src[2] - alpha_i * src[3]; - temp_i1 = alpha_r * src[3] + alpha_i * src[2]; - temp_r2 = alpha_r * src[4] - alpha_i * src[5]; - temp_i2 = alpha_r * src[5] + alpha_i * src[4]; - temp_r3 = alpha_r * src[6] - alpha_i * src[7]; - temp_i3 = alpha_r * src[7] + alpha_i * src[6]; -#else - temp_r0 = alpha_r * src[0] + alpha_i * src[1]; - temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; - temp_r1 = alpha_r * src[2] + alpha_i * src[3]; - temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; - temp_r2 = alpha_r * src[4] + alpha_i * src[5]; - temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; - temp_r3 = alpha_r * src[6] + alpha_i * src[7]; - temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; -#endif - - dest[0] += temp_r0; - dest[1] += temp_i0; - dest[2] += temp_r1; - dest[3] += temp_i1; - dest[4] += temp_r2; - dest[5] += temp_i2; - dest[6] += temp_r3; - dest[7] += temp_i3; - - src += 8; - dest += 8; + src += 2; + dest += inc_dest; } return; -} -#endif - - int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT xbuffer[8], *ybuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - ybuffer = buffer; - - inc_x *= 2; - inc_y *= 2; - lda *= 2; - - n1 = n / 4; - n2 = n % 4; - - m3 = m % 4; - m1 = m - (m % 4); - m2 = (m % NBMAX) - (m % 4); + } - y_ptr = y; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - a_ptr = a; - - x_ptr = x; - //zero_y(NB,ybuffer); - memset(ybuffer, 0, NB * 16); - - if (inc_x == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); - - a_ptr += lda << 2; - x_ptr += 8; - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); - x_ptr += 4; - a_ptr += 2 * lda; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); - x_ptr += 2; - a_ptr += lda; - - } - } 
else { - - for (i = 0; i < n1; i++) { - - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - xbuffer[3] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - xbuffer[5] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - xbuffer[7] = x_ptr[1]; - x_ptr += inc_x; - - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); + add_y_4(n, src, dest, alpha_r, alpha_i); +} - a_ptr += lda << 2; - } +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + lda4 = 4 * lda; + + n1 = n / 4; + n2 = n % 4; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - for (i = 0; i < n2; i++) { - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); - a_ptr += lda; + a_ptr = a; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + /* x_ptr += 2; + a_ptr += lda; */ + + } + } else { - } + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + + zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += 1 * lda; + + } - } + } - add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); - a += 2 * NB; - y_ptr += NB * inc_y; - } + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr += NB * inc_y; + } - if (m3 == 0) return (0); + if (m3 == 0) + return (0); - if (m3 == 1) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r = 0.0; - FLOAT temp_i = 0.0; + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; - if (lda == 2 && inc_x == 2) { + if (lda == 2 && inc_x == 2) { - for (i = 0; i < (n & -2); i += 2) { + for (i = 0; i < (n & -2); i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += 
a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; #else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; #endif - a_ptr += 4; - x_ptr += 4; - } + a_ptr += 4; + x_ptr += 4; + } - for (; i < n; i++) { + for (; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; #else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; #endif - a_ptr += 2; - x_ptr += 2; - } + a_ptr += 2; + x_ptr += 2; + } - } else { + } else { - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; #else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; #endif - a_ptr += lda; - x_ptr += inc_x; - } + a_ptr += lda; + x_ptr += inc_x; + } - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - return (0); - } + return (0); + } - if (m3 == 2) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i1 = 0.0; + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; - if (lda == 4 && inc_x == 2) { + if (lda == 4 && inc_x == 2) { - for (i = 0; i < (n & -2); i += 2) { + for (i = 0; i < (n & -2); i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * 
x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; - temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; - temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; - temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; - temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - - temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; - temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; - temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; + #endif - a_ptr += 8; - x_ptr += 4; - } + a_ptr += 8; + x_ptr += 4; + } - for (; i < n; i++) { + for (; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; #endif - a_ptr += 4; - x_ptr += 2; - } + a_ptr += 4; + x_ptr += 2; + } - } else { + } else { - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; #endif - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; 
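/* [Editorial sketch, not part of the patch] Every CONJ/XCONJ branch in these tail loops reduces to one of two complex multiply-accumulate forms: a*x when CONJ and XCONJ agree, conj(a)*x when they differ. In scalar form, under the hypothetical name cmac_step, with a and x each an interleaved (re, im) pair: */
static void cmac_step(const FLOAT *a, const FLOAT *x,
                      FLOAT *temp_r, FLOAT *temp_i, int conjugate_a) {
  if (!conjugate_a) { /* temp += a * x */
    *temp_r += a[0] * x[0] - a[1] * x[1];
    *temp_i += a[0] * x[1] + a[1] * x[0];
  } else {            /* temp += conj(a) * x */
    *temp_r += a[0] * x[0] + a[1] * x[1];
    *temp_i += a[0] * x[1] - a[1] * x[0];
  }
}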
- y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif - return (0); - } - - if (m3 == 3) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_i2 = 0.0; - - if (lda == 6 && inc_x == 2) { - - for (i = 0; i < n; i++) { + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; #endif - a_ptr += 6; - x_ptr += 2; - } + a_ptr += 6; + x_ptr += 2; + } - } else { + } else { - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * 
x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; #endif - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; - y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; -#else - y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; - y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; -#endif - return (0); - } + a_ptr += lda; + x_ptr += inc_x; + } - return (0); } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; +#endif + return (0); + } + return (0); +} diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 8b2be83947..031c31e29b 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2018, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,825 +23,635 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
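/* [Editorial sketch, not part of the patch] In both files of this hunk the 4-column kernels change signature from (n, lda, ap, ...) to an explicit FLOAT **ap array of column pointers, which the driver advances by lda4 = 4 * lda after each call (lda is already scaled by 2 for the interleaved complex storage). The helper names below are illustrative only; the drivers inline this logic: */
static void setup_columns(FLOAT *a, BLASLONG lda, FLOAT *ap[4]) {
  ap[0] = a;           /* column j     */
  ap[1] = a + lda;     /* column j + 1 */
  ap[2] = ap[1] + lda; /* column j + 2 */
  ap[3] = ap[2] + lda; /* column j + 3 */
}
static void advance_columns(FLOAT *ap[4], BLASLONG lda4) {
  ap[0] += lda4;
  ap[1] += lda4;
  ap[2] += lda4;
  ap[3] += lda4;
}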
- *****************************************************************************/ +*****************************************************************************/ #include "common.h" #define NBMAX 1024 -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) -#include <vecintrin.h> -#endif - -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - register __vector double vtemp2_p = {0.0, 0.0}; - register __vector double vtemp2_r = {0.0, 0.0}; - register __vector double vtemp3_p = {0.0, 0.0}; - register __vector double vtemp3_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double va2 = *(__vector double*) (&a2[i]); - register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); - register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); - register __vector double va2_3 = *(__vector double*) (&a2[i + 6]); - - register __vector double va3 = *(__vector double*) (&a3[i]); - register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); - register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); - register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vtemp2_p += vx_0*va2; - vtemp2_r += vxr_0*va2; - - vtemp3_p += vx_0*va3; - vtemp3_r += vxr_0*va3; - - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp2_p += vx_1*va2_1; - vtemp2_r += vxr_1*va2_1; - - vtemp3_p += vx_1*va3_1; - vtemp3_r += vxr_1*va3_1; - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp2_p += vx_2*va2_2; - vtemp2_r += vxr_0*va2_2; - - vtemp3_p += vx_2*va3_2; - 
vtemp3_r += vxr_0*va3_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - vtemp2_p += vx_3*va2_3; - vtemp2_r += vxr_1*va2_3; - - vtemp3_p += vx_3*va3_3; - vtemp3_r += vxr_1*va3_3; - - } +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + + __asm__("vzero %%v16\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "vzero %%v20\n\t" + "vzero %%v21\n\t" + "vzero %%v22\n\t" + "vzero %%v23\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif -} - -#else - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_r3 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_i2 = 0.0; - FLOAT temp_i3 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,0(%%r1,%[ap2])\n\t" + "vlrepg %%v29,8(%%r1,%[ap2])\n\t" + 
"vlrepg %%v30,0(%%r1,%[ap3])\n\t" + "vlrepg %%v31,8(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - } - + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v20\n\t" + "vfadb %%v17,%%v17,%%v21\n\t" + "vfadb %%v18,%%v18,%%v22\n\t" + "vfadb %%v19,%%v19,%%v23\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vlrepg %%v24,0(%[alpha])\n\t" + "vleg %%v25,8(%[alpha]),0\n\t" + "wflcdb %%v25,%%v25\n\t" + "vleg %%v25,8(%[alpha]),1\n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v24,0(%[alpha]),1\n\t" + "vflcdb %%v24,%%v24\n\t" + "vleg %%v24,0(%[alpha]),0\n\t" + "vlrepg %%v25,8(%[alpha])\n\t" #endif + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl 
%%v29,48(%[y])\n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" + : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - } - +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + + __asm__("vzero %%v16\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT 
temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif -} - -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - } - + "vlrepg %%v20,16(%%r1,%[ap0])\n\t" + "vlrepg %%v21,24(%%r1,%[ap0])\n\t" + "vlrepg %%v22,16(%%r1,%[ap1])\n\t" + "vlrepg %%v23,24(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v18\n\t" + "vfadb %%v17,%%v17,%%v19\n\t" + "vpdi %%v18,%%v16,%%v16,4\n\t" + "vpdi %%v19,%%v17,%%v17,4\n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vlrepg %%v20,0(%[alpha])\n\t" + "vleg %%v21,8(%[alpha]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,8(%[alpha]),1\n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v20,0(%[alpha]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[alpha]),0\n\t" + "vlrepg %%v21,8(%[alpha])\n\t" #endif + 
"vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" + : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0 ; - a0 = ap; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - } - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vzero %%v16\n\t" + "vzero %%v17\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - -} - -#else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - } - + "vlrepg %%v18,16(%%r1,%[ap])\n\t" + "vlrepg %%v19,24(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vpdi %%v17,%%v16,%%v16,4\n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - + "vlrepg %%v18,0(%[alpha])\n\t" + "vleg %%v19,8(%[alpha]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,8(%[alpha]),1\n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - + "vleg %%v18,0(%[alpha]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[alpha]),0\n\t" + "vlrepg %%v19,8(%[alpha])\n\t" #endif - + "vl %%v0,0(%[y])\n\t" + "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" + "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" + "vst %%v0,0(%[y])\n\t" + : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); } -#endif - -static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT 
alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; - } + xbuffer = buffer; - } else { + n1 = n >> 2; + n2 = n & 3; - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; - a_ptr += lda << 2; + alpha[0] = alpha_r; + alpha[1] = alpha_i; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; + BLASLONG NB = NBMAX; - } + while (NB == NBMAX) { - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - } + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } - } - a += 2 * NB; - x += NB * inc_x; } + a += 2 * NB; + x += NB * inc_x; + } - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; + if (m3 == 0) + return (0); - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 
- a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i 
+ alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while (j < n) { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; } - if (m3 == 1) { + return (0); + } - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; + if (m3 == 1) { - while (j < (n & -2)) { + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += 
inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while (j < n) { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; } - return (0); + } + return (0); } - diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 380f0140e8..6284d5a474 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,235 +27,210 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) -{ - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27,112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb 
%%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b \n\t" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc","r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb 
%%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb 
%%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + zrot_kernel_16(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zrot_kernel_16(n1, x, y, c, s); - i=n1; - ix=2*n1; - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; + ix += 2; + i++; - ix += 2 ; - i++ ; + } - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + iy += inc_y2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + } + return (0); - } - - } - return(0); - } - diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4764c0a522..e497a6d7b9 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,490 +23,403 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ #include "common.h" - - -static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - BLASLONG tempR1 ; - __asm__ ( - "pfd 2, 0(%[x_tmp]) \n\t" -#if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 2, 256(%[t1],%[x_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmdb %%v30, %%v20, %%v28 \n\t" - "vfmdb %%v31, %%v21, %%v28 \n\t" - "vfmdb %%v6, %%v22, %%v28 \n\t" - "vfmdb %%v7, %%v23, %%v28 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - - "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmdb %%v16, 
%%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - - +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%1) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint - "vflcdb %%v16,%%v16 \n\t" //complement both - "vlvgg %%v16,%%r0,0 \n\t" //restore 1st - "vlr %%v17 ,%%v16 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v16 \n\t" - "vsteg %%v24, 0(%[x_ptr]),1 \n\t" - "vsteg %%v24, 8(%[x_ptr]),0 \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v17 \n\t" - "vsteg %%v25, 16(%[x_ptr]),1 \n\t" - "vsteg %%v25, 24(%[x_ptr]),0 \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 
32(%[x_ptr]),1 \n\t" - "vsteg %%v26, 40(%[x_ptr]),0 \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 48(%[x_ptr]),1 \n\t" - "vsteg %%v27, 56(%[x_ptr]),0 \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v16 \n\t" - "vsteg %%v28, 64(%[x_ptr]),1 \n\t" - "vsteg %%v28, 72(%[x_ptr]),0 \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v17 \n\t" - "vsteg %%v29, 80(%[x_ptr]),1 \n\t" - "vsteg %%v29, 88(%[x_ptr]),0 \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vsteg %%v30, 96(%[x_ptr]),1 \n\t" - "vsteg %%v30, 104(%[x_ptr]),0 \n\t" - "vl %%v31, 112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%[x_ptr]),1 \n\t" - "vsteg %%v31, 120(%[x_ptr]),0 \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_i) - :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vleg %%v0,8(%[alpha]),0\n\t" + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v18,%%r0,%%r0 \n\t" - "vlr %%v19,%%v18 \n\t" - "vlr %%v16,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%[x_ptr]) \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%[x_ptr]) \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%[x_ptr]) \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%[x_ptr]) \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%[x_ptr]) \n\t" 
- "vl %%v31,112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31,112(%[x_ptr]) \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_r) - : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" - ); - +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256( %[x_ptr]) \n\t" - "vst %%v24, 0( %[x_ptr]) \n\t" - "vst %%v25, 16( %[x_ptr]) \n\t" - "vst %%v26, 32( %[x_ptr]) \n\t" - "vst %%v27, 48( %[x_ptr]) \n\t" - "vst %%v24, 64( %[x_ptr]) \n\t" - "vst %%v25, 80( %[x_ptr]) \n\t" - "vst %%v26, 96( %[x_ptr]) \n\t" - "vst %%v27,112( %[x_ptr]) \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" ,"r0","v24","v25","v26","v27" - ); - +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } - - - - -static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - - for (i = 0; i < n; i += 4) { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = 
t2; - x[inc_x3] = t3; - - x += 4 * inc_x; - - } - - +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - if (inc_x != 1) { - inc_x <<= 1; + if (inc_x != 1) { + inc_x <<= 1; - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { - - while (j < n1) { - - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; - - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while (j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < 
n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } - + } - BLASLONG n1 = n & -8; - if (n1 > 0) { + } + return (0); + } - if (da_r == 0.0) - if (da_i == 0) - zscal_kernel_8_zero(n1, x); - else - zscal_kernel_8_zero_r(n1, da_i, x); - else - if (da_i == 0) - zscal_kernel_8_zero_i(n1, da_r, x); - else - zscal_kernel_8(n1, da_r,da_i, x); + BLASLONG n1 = n & -8; + if (n1 > 0) { - i = n1 << 1; - j = n1; - } + alpha[0] = da_r; + alpha[1] = da_i; + if (da_r == 0.0) + if (da_i == 0) + zscal_kernel_8_zero(n1, x); + else + zscal_kernel_8_zero_r(n1, alpha, x); + else if (da_i == 0) + zscal_kernel_8_zero_i(n1, alpha, x); + else + zscal_kernel_8(n1, alpha, x); - if (da_r == 0.0) { + i = n1 << 1; + j = n1; + } - if (da_i == 0.0) { + if (da_r == 0.0) { - while (j < n) { + if (da_i == 0.0) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } else { + } - while (j < n) { + } else { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + while (j < n) { - } + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } - if (da_i == 0.0) { + } else { - while (j < n) { + if (da_i == 0.0) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + while (j < n) { - } + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } else { + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } } - return (0); -} - + } + return (0); +} diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c new file mode 100644 index 0000000000..7cfc1f17f9 --- /dev/null +++ b/kernel/zarch/zsum.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + + +static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT sum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[sum],0" + : [sum] "=Q"(sum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return sum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (sumf); + + if (inc_x == 1) { + + n1 = n & -16; + if (n1 > 0) { + + sumf = zsum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + i++; + ip += 2; + } + + } else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + + } + return (sumf); +} diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0620790020..bc466866cb 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,286 +25,145 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - -#if defined(Z13_SWAP_A) -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#else - -static void 
zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 
144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -#endif - - - - - - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zswap_kernel_16(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + zswap_kernel_16(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - 
-#endif
-
-
-
-
-
-
-int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
-{
-	BLASLONG i=0;
-	BLASLONG ix=0,iy=0;
-	FLOAT temp[2];
-	BLASLONG inc_x2, inc_y2;
-
-	if ( n <= 0 ) return(0);
-
-	if ( (inc_x == 1) && (inc_y == 1 ))
-	{
-
-		BLASLONG n1 = n & -16;
-		if ( n1 > 0 )
-		{
-			zswap_kernel_16(n1, x, y);
-			i=n1;
-			ix = 2* n1;
-			iy = 2* n1;
-		}
-
-		while(i < n)
-		{
-
-			temp[0] = x[ix] ;
-			temp[1] = x[ix+1] ;
-			x[ix] = y[iy] ;
-			x[ix+1] = y[iy+1] ;
-			y[iy] = temp[0] ;
-			y[iy+1] = temp[1] ;
-
-			ix += 2 ;
-			iy += 2 ;
-			i++ ;
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
+          FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *dummy, BLASLONG dummy2) {
+  BLASLONG i = 0;
+  BLASLONG ix = 0, iy = 0;
+  FLOAT temp[2];
+  BLASLONG inc_x2, inc_y2;
+
+  if (n <= 0)
+    return (0);
+
+  if ((inc_x == 1) && (inc_y == 1)) {
+
+    BLASLONG n1 = n & -16;
+    if (n1 > 0) {
+      zswap_kernel_16(n1, x, y);
+      i = n1;
+      ix = 2 * n1;
+      iy = 2 * n1;
+    }
+    while (i < n) {
-		}
+      temp[0] = x[ix];
+      temp[1] = x[ix + 1];
+      x[ix] = y[iy];
+      x[ix + 1] = y[iy + 1];
+      y[iy] = temp[0];
+      y[iy + 1] = temp[1];
+      ix += 2;
+      iy += 2;
+      i++;
     }
-	else
-	{
-		inc_x2 = 2 * inc_x;
-		inc_y2 = 2 * inc_y;
+  } else {
-		while(i < n)
-		{
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
-			temp[0] = x[ix] ;
-			temp[1] = x[ix+1] ;
-			x[ix] = y[iy] ;
-			x[ix+1] = y[iy+1] ;
-			y[iy] = temp[0] ;
-			y[iy+1] = temp[1] ;
+    while (i < n) {
-			ix += inc_x2 ;
-			iy += inc_y2 ;
-			i++ ;
+      temp[0] = x[ix];
+      temp[1] = x[ix + 1];
+      x[ix] = y[iy];
+      x[ix + 1] = y[iy + 1];
+      y[iy] = temp[0];
+      y[iy + 1] = temp[1];
-		}
+      ix += inc_x2;
+      iy += inc_y2;
+      i++;
     }
-	return(0);
-
-
-}
+  }
+  return (0);
+}
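Behavior of the CNAME wrapper is unchanged by the reformatting: the vector kernel handles unit-stride data in blocks of 16 complex elements (n & -16), and the scalar loop covers the remainder plus all strided cases. For reference, the whole routine is equivalent to this portable sketch (hypothetical name zswap_ref, not part of the patch):

#include <stddef.h>

/* Swap n complex (re,im) pairs between x and y; strides are given
   in complex elements, matching inc_x and inc_y above. */
static void zswap_ref(size_t n, double *x, ptrdiff_t inc_x,
                      double *y, ptrdiff_t inc_y) {
  ptrdiff_t ix = 0, iy = 0;
  for (size_t i = 0; i < n; i++) {
    double re = x[ix], im = x[ix + 1];
    x[ix] = y[iy];
    x[ix + 1] = y[iy + 1];
    y[iy] = re;
    y[iy + 1] = im;
    ix += 2 * inc_x; /* one complex element = 2 doubles */
    iy += 2 * inc_y;
  }
}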
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 6ded78c8b7..c5ea465e0d 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -70,7 +70,11 @@
 
 /* Complex type (single precision) */
 #ifndef lapack_complex_float
+#ifndef __cplusplus
 #include <complex.h>
+#else
+#include <complex>
+#endif
 #define lapack_complex_float    float _Complex
 #endif
 
@@ -86,7 +90,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im );
 
 /* Complex type (double precision) */
 #ifndef lapack_complex_double
+#ifndef __cplusplus
 #include <complex.h>
+#else
+#include <complex>
+#endif
 #define lapack_complex_double   double _Complex
 #endif
 
diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f
index 8dbdb521ed..5aff649042 100644
--- a/lapack-netlib/TESTING/EIG/chet21.f
+++ b/lapack-netlib/TESTING/EIG/chet21.f
@@ -304,7 +304,8 @@ SUBROUTINE CHET21( ITYPE, UPLO, N, KBAND, A, LDA, D, E, U, LDU, V,
    10    CONTINUE
 *
       IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-         DO 20 J = 1, N - 1
+CMK      DO 20 J = 1, N - 1
+         DO 20 J = 2, N - 1
             CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1,
      $                  U( 1, J-1 ), 1, WORK, N )
    20    CONTINUE
diff --git a/lapack-netlib/TESTING/EIG/chpt21.f b/lapack-netlib/TESTING/EIG/chpt21.f
index 4b92794702..e151a8bd8f 100644
--- a/lapack-netlib/TESTING/EIG/chpt21.f
+++ b/lapack-netlib/TESTING/EIG/chpt21.f
@@ -323,7 +323,7 @@ SUBROUTINE CHPT21( ITYPE, UPLO, N, KBAND, AP, D, E, U, LDU, VP,
    10    CONTINUE
 *
       IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-         DO 20 J = 1, N - 1
+         DO 20 J = 2, N - 1
            CALL CHPR2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1,
      $                 U( 1, J-1 ), 1, WORK )
    20    CONTINUE
diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f
index 32a09741e4..f6cb2d70a0 100644
--- a/lapack-netlib/TESTING/EIG/zhet21.f
+++ b/lapack-netlib/TESTING/EIG/zhet21.f
@@ -304,7 +304,8 @@ SUBROUTINE ZHET21( ITYPE, UPLO, N, KBAND, A, LDA, D, E, U, LDU, V,
    10    CONTINUE
 *
       IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-         DO 20 J = 1, N - 1
+CMK      DO 20 J = 1, N - 1
+         DO 20 J = 2, N - 1
             CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1,
      $                  U( 1, J-1 ), 1, WORK, N )
    20    CONTINUE
diff --git a/lapack-netlib/TESTING/EIG/zhpt21.f b/lapack-netlib/TESTING/EIG/zhpt21.f
index f9268661ac..ef9e4418dc 100644
--- a/lapack-netlib/TESTING/EIG/zhpt21.f
+++ b/lapack-netlib/TESTING/EIG/zhpt21.f
@@ -323,7 +323,8 @@ SUBROUTINE ZHPT21( ITYPE, UPLO, N, KBAND, AP, D, E, U, LDU, VP,
    10    CONTINUE
 *
       IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-         DO 20 J = 1, N - 1
+CMK      DO 20 J = 1, N - 1
+         DO 20 J = 2, N - 1
            CALL ZHPR2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1,
      $                 U( 1, J-1 ), 1, WORK )
    20    CONTINUE
diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt
index c0a7543caa..d48a270ab7 100644
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -63,7 +63,6 @@ if (USE_THREAD)
 
   # these do not have 'z' versions
   set(PARALLEL_SOURCES
-    ${GETRF_SRC}
     lauum/lauum_U_parallel.c
     lauum/lauum_L_parallel.c
     potrf/potrf_U_parallel.c
@@ -81,6 +80,10 @@ if (USE_THREAD)
     trtri/trtri_L_parallel.c
   )
 
+  foreach (float_type ${FLOAT_TYPES})
+    GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type})
+  endforeach()
+
   GenerateNamedObjects("${PARALLEL_SOURCES}")
 endif ()
diff --git a/param.h b/param.h
index fa6730208d..f094fb0f25 100644
--- a/param.h
+++ b/param.h
@@ -605,7 +605,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P	8
 
-#define SWITCH_RATIO	4
+#define SWITCH_RATIO	16
 
 #ifdef ARCH_X86
 
@@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#ifdef OS_LINUX
+#if defined(OS_LINUX) || defined(OS_DARWIN)
 #if L2_SIZE == 1024976
 #define SGEMM_DEFAULT_P 320
 #define DGEMM_DEFAULT_P 256
@@ -2230,6 +2230,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#if defined(POWER9)
+
+#define SNUMOPT		16
+#define DNUMOPT		8
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 1280
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 640
+#define ZGEMM_DEFAULT_P 320
+
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 384
+#define CGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 640
+
+#define SYMV_P 8
+
+#endif
 
 #if defined(SPARC) && defined(V7)
 
@@ -2591,7 +2622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(CORTEXA53) || defined(CORTEXA57) || \
     defined(CORTEXA72) || defined(CORTEXA73) || \
-    defined(FALKOR)
+    defined(FALKOR) || defined(TSV110)
 
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
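The POWER9 block above and the Z14 block below only add tuning constants consumed by the generic GEMM driver: *_UNROLL_M/N fix the register-tile shape of the microkernel, while *_DEFAULT_P and *_DEFAULT_Q bound the panel of A that gets packed to fit the cache hierarchy. Roughly, they shape a loop nest like the following sketch (illustrative only; P and Q stand in for DGEMM_DEFAULT_P/Q from the POWER9 block, and the real driver packs panels into contiguous buffers instead of indexing in place):

#include <stddef.h>

enum { P = 128, Q = 384 };   /* POWER9 DGEMM_DEFAULT_P / _Q above */

/* Naive cache-blocked column-major dgemm skeleton: C += A * B. */
void dgemm_blocked(int m, int n, int k, const double *A, int lda,
                   const double *B, int ldb, double *C, int ldc) {
  for (int kk = 0; kk < k; kk += Q) {
    int kb = (k - kk < Q) ? k - kk : Q;
    for (int ii = 0; ii < m; ii += P) {
      int mb = (m - ii < P) ? m - ii : P;
      /* OpenBLAS packs this mb x kb panel of A and runs an
         UNROLL_M x UNROLL_N microkernel over it; a plain triple
         loop stands in for that kernel here. */
      for (int j = 0; j < n; j++)
        for (int i = 0; i < mb; i++) {
          double s = 0.0;
          for (int p = 0; p < kb; p++)
            s += A[(ii + i) + (size_t)(kk + p) * lda]
               * B[(kk + p) + (size_t)j * ldb];
          C[(ii + i) + (size_t)j * ldc] += s;
        }
    }
  }
}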
@@ -2915,6 +2946,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#if defined(Z14)
+#define SNUMOPT		2
+#define DNUMOPT		2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P	456
+#define DGEMM_DEFAULT_P	320
+#define CGEMM_DEFAULT_P 480
+#define ZGEMM_DEFAULT_P 224
+
+#define SGEMM_DEFAULT_Q 488
+#define DGEMM_DEFAULT_Q 384
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 352
+
+#define SGEMM_DEFAULT_R 8192
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 2048
+
+
+#define SYMV_P	16
+#endif
+
+
 #ifdef GENERIC
diff --git a/relapack/config.h b/relapack/config.h
index 9113a712da..e4fab0a124 100644
--- a/relapack/config.h
+++ b/relapack/config.h
@@ -36,8 +36,8 @@
 // allow malloc in xsygst for improved performance
 #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
 // allow malloc in xsytrf if the passed work buffer is too small
-#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
-
+//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
+#define XSYTRF_ALLOW_MALLOC 0
 
 ////////////////////////////////
 // LAPACK routine replacement //
diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h
index e421f352b1..7f283e04d5 100644
--- a/relapack/inc/relapack.h
+++ b/relapack/inc/relapack.h
@@ -1,67 +1,79 @@
 #ifndef RELAPACK_H
 #define RELAPACK_H
 
-void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
-void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
-void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
-void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
+#ifdef USE64BITINT
+    typedef BLASLONG blasint;
+    #if defined(OS_WINDOWS) && defined(__64BIT__)
+        #define blasabs(x) llabs(x)
+    #else
+        #define blasabs(x) labs(x)
+    #endif
+#else
+    typedef int blasint;
+    #define blasabs(x) abs(x)
+#endif
 
-void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
-void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
-void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
-void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
+void RELAPACK_slauum(const char *, const blasint *, float *, const blasint *, blasint *);
+void RELAPACK_dlauum(const char *, const blasint *, double *, const blasint *, blasint *);
+void RELAPACK_clauum(const char *, const blasint *, float *, const blasint *, blasint *);
+void RELAPACK_zlauum(const char *, const blasint *, double *, const blasint *, blasint *);
 
-void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
-void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
-void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
-void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
+void RELAPACK_strtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *);
+void RELAPACK_dtrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *);
+void RELAPACK_ctrtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *);
+void RELAPACK_ztrtri(const char *, const char *, const
blasint *, double *, const blasint *, blasint *); -void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *); -void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); -void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); -void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); +void RELAPACK_spotrf(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dpotrf(const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_cpotrf(const char *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zpotrf(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_spbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_dpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_cpbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +void RELAPACK_zpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_ssytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void 
RELAPACK_zhetrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_sgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_cgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_zgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_sgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_cgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_zgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, 
const int *, double *, int *); +void RELAPACK_ssygst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +void RELAPACK_dsygst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +void RELAPACK_chegst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +void RELAPACK_zhegst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); -void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); -void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); -void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); -void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_strsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_dtrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_ctrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_ztrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); -void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_stgsyl(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, 
const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_dtgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_ctgsyl(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); +void RELAPACK_ztgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); + +void RELAPACK_sgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); #endif /* RELAPACK_H */ diff --git a/relapack/src/blas.h b/relapack/src/blas.h index 7441c1033d..6d9f1a42a2 100644 --- a/relapack/src/blas.h +++ b/relapack/src/blas.h @@ -1,61 +1,61 @@ #ifndef BLAS_H #define BLAS_H -extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); -extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); -extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); -extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); - -extern void BLAS(sscal)(const int *, const float *, float *, const int *); -extern void BLAS(dscal)(const int *, const double *, double *, const int *); -extern void BLAS(cscal)(const int *, const float *, float *, const int *); -extern void BLAS(zscal)(const int *, const double *, double *, const int *); - -extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const 
float *, const int*); -extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); - -extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); - -extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); -extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); -extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); - -extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); -extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); -extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); -extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); - -extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, 
const float *, const int *, const float *, float *, const int *); -extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); - -extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); -extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); -extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(sswap)(const blasint *, float *, const blasint *, float *, const blasint *); +extern void BLAS(dswap)(const blasint *, double *, const blasint *, double *, const blasint *); +extern void BLAS(cswap)(const blasint *, float *, const blasint *, float *, const blasint *); +extern void BLAS(zswap)(const blasint *, double *, const blasint *, double *, const blasint *); + +extern void BLAS(sscal)(const blasint *, const float *, float *, const blasint *); +extern void BLAS(dscal)(const blasint *, const double *, double *, const blasint *); +extern void BLAS(cscal)(const blasint *, const float *, float *, const blasint *); +extern void BLAS(zscal)(const blasint *, const double *, double *, const blasint *); + +extern void BLAS(saxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(daxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(caxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(zaxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(sgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); + +extern void BLAS(sgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, 
const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); + +extern void BLAS(ssyrk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsyrk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(cherk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zherk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, const blasint *, const double *, double *, const blasint *); + +extern void BLAS(ssymm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsymm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(chemm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, 
const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zhemm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); + +extern void BLAS(ssyr2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(dsyr2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(cher2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); +extern void BLAS(zher2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); #if HAVE_XGEMMT -extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); -extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); -extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(sgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); #endif #endif /* BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index 90b2c87895..61332c6a6c 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, - const int *, float *, const int *, int *, float *, const int *, float *, - const int *, int *); +static void RELAPACK_cgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, + const blasint *, blasint *); /** CGBTRF 
computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html * */ void RELAPACK_cgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_cgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo, strlen("CGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_cgbtrf( const float ZERO[] = { 0., 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { float *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_cgbtrf( } // Allocate work space - const int n1 = CREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = CREC_SPLIT(*n); + const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const blasint nWorkl = (kv > n1) ? n1 : kv; + const blasint mWorku = (*kl > n1) ? n1 : *kl; + const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_cgbtrf( /** cgbtrf's recursive compute kernel */ static void RELAPACK_cgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_cgbtrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Splitting - const int n1 = MIN(CREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(CREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_cgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_cgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_cgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const float tmpr = A_Rrj[2 * i]; const float tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_cgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c index 28e2b00b01..3af4d790f5 100644 --- a/relapack/src/cgemmt.c +++ b/relapack/src/cgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); /** CGEMMT computes a matrix-matrix product with general matrices but updates @@ 
-20,10 +20,10 @@ static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_cgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_cgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int ctransA = LAPACK(lsame)(transA, "C"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - const int ctransB = LAPACK(lsame)(transB, "C"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint ctransA = LAPACK(lsame)(transA, "C"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + const blasint ctransB = LAPACK(lsame)(transB, "C"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_cgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("CGEMMT", &info); + LAPACK(xerbla)("CGEMMT", &info, strlen("CGEMMT")); return; } @@ -76,10 +76,10 @@ void RELAPACK_cgemmt( /** cgemmt's recursive compute kernel */ static void RELAPACK_cgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_cgemmt_rec( } // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_cgemmt_rec( /** cgemmt's unblocked compute kernel */ static void RELAPACK_cgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_cgemmt_rec2( float *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index b31a711d0f..878c9ec15b 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cgetrf_rec(const int *, const int *, float *, - const int *, int *, int *); +static void RELAPACK_cgetrf_rec(const blasint *, const blasint *, float *, + const blasint *, blasint *, blasint *); /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_cgetrf_rec(const int *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html */ void RELAPACK_cgetrf( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -22,15 +22,15 @@ void RELAPACK_cgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CGETRF", &minfo, strlen("CGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_cgetrf( if (*m < *n) { // Constants const float ONE[] = { 1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_cgetrf( /** cgetrf's recursive compute kernel */ static void RELAPACK_cgetrf_rec( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_CGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_cgetrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R float *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_cgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_cgetrf_rec( // apply pivots to A_BL LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c index dff875017d..fe77b03eae 100644 --- a/relapack/src/chegst.c +++ b/relapack/src/chegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_chegst_rec(const int *, const char *, const int *, - float *, const int *, const float *, const int *, - float *, const int *, int *); +static void RELAPACK_chegst_rec(const blasint *, const char *, const blasint *, + float *, const blasint *, const float *, const blasint *, + float *, const blasint *, blasint *); /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_chegst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html * */ void RELAPACK_chegst( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_chegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHEGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo, strlen("CHEGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_chegst( // Allocate work space float *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = CREC_SPLIT(*n); + const blasint n1 = CREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_chegst( /** chegst's recursive compute kernel */ static void RELAPACK_chegst_rec( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - float *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_CHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_chegst_rec( const float MONE[] = { -1., 0. }; const float HALF[] = { .5, 0. }; const float MHALF[] = { -.5, 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c index 2928235e47..8cd3c07742 100644 --- a/relapack/src/chetrf.c +++ b/relapack/src/chetrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_chetrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html * */ void RELAPACK_chetrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf( /** chetrf's recursive compute kernel */ static void RELAPACK_chetrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c index b5c8341b6b..412f64cf76 100644 --- a/relapack/src/chetrf_rec2.c +++ b/relapack/src/chetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * - nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3, q__4; @@ -38,22 +38,22 @@ static int c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k; + static blasint j, k; static float t, r1; static complex d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen), ccopy_(int *, complex *, int *, - complex *, int *), cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, + complex *, blasint *), cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float absakk; - extern /* Subroutine */ int clacgv_(int *, complex *, int *); - extern int icamax_(int *, complex *, int *); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int + extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); + extern blasint icamax_(int *, complex *, blasint *); + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c index 086393d576..3d2fa32160 100644 --- a/relapack/src/chetrf_rook.c +++ b/relapack/src/chetrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *,
int *, float *, const int *, int *); +static void RELAPACK_chetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html * */ void RELAPACK_chetrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf_rook( /** chetrf_rook's recursive compute kernel */ static void RELAPACK_chetrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c index a42cbfd44d..e0b2ff9628 100644 --- a/relapack/src/chetrf_rook_rec2.c +++ b/relapack/src/chetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm.
* */ -/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, complex *a, int *lda, int *ipiv, - complex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, + complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4, q__5; @@ -38,29 +38,29 @@ static int c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static float t, r1; static complex d11, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen); + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen); static float sfmin; - extern /* Subroutine */ int ccopy_(int *, complex *, int *, - complex *, int *); - static int itemp; - extern /* Subroutine */ int cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, + complex *, blasint *); + static blasint itemp; + extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float stemp, absakk; - extern /* Subroutine */ int clacgv_(int *, complex *, int *); - extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); + extern blasint icamax_(int *, complex *, blasint *); extern double slamch_(char *, ftnlen); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c index 36d6297cfc..2bc93f182b 100644 --- a/relapack/src/clauum.c +++ b/relapack/src/clauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_clauum_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_clauum_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. 
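The int-to-blasint sweep running through all of these hunks is the ReLAPACK INTERFACE64 repair announced in the changelog: with INTERFACE64=1, OpenBLAS builds its BLAS and LAPACK interfaces with 64-bit integers, so any kernel still taking int * reads or writes the wrong width. A minimal sketch of the failure mode, using a blasint typedef modeled loosely on OpenBLAS's config header (the typedef, the INTERFACE64 macro test, and the demo functions are illustrative assumptions, not code from this patch):

#include <stdint.h>
#include <stdio.h>

#ifdef INTERFACE64
typedef int64_t blasint;            /* assumed: 64-bit interface build */
#else
typedef int32_t blasint;            /* assumed: default 32-bit interface */
#endif

/* stand-in for any BLAS/LAPACK-style kernel taking a dimension by pointer */
static void demo_kernel(const blasint *n) {
    printf("kernel sees n = %lld\n", (long long)*n);
}

int main(void) {
    blasint n = 1000;
    demo_kernel(&n);                /* correct: widths match */

    int n32 = 1000;
    /* On a 64-bit-interface build this reads 8 bytes from a 4-byte object:
     * undefined behavior, typically a garbage dimension at run time. This
     * is the class of bug the int -> blasint edits remove. */
    demo_kernel((const blasint *)&n32);
    return 0;
}

The same width mismatch is why the few prototypes above that still take plain int (for example the ldw argument of some rec2 kernels) remain suspect on 64-bit-integer builds.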
@@ -11,14 +11,14 @@ static void RELAPACK_clauum_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html * */ void RELAPACK_clauum( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_clauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo, strlen("CLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_clauum( /** clauum's recursive compute kernel */ static void RELAPACK_clauum_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_clauum_rec( const float ONE[] = { 1., 0. }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c index e0ea7b944a..971e547c64 100644 --- a/relapack/src/cpbtrf.c +++ b/relapack/src/cpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, - float *, const int *, float *, const int *, int *); +static void RELAPACK_cpbtrf_rec(const char *, const blasint *, const blasint *, + float *, const blasint *, float *, const blasint *, blasint *); /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html * */ void RELAPACK_cpbtrf( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_cpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo, strlen("CPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpbtrf( const float ZERO[] = { 0., 0. }; // Allocate work space - const int n1 = CREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = CREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * 2 * sizeof(float)); LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_cpbtrf( /** cpbtrf's recursive compute kernel */ static void RELAPACK_cpbtrf_rec( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - float *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + float *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_cpbtrf_rec( const float MONE[] = { -1., 0. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(CREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(CREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_cpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c index e35caa7fa8..0f8e7ebb06 100644 --- a/relapack/src/cpotrf.c +++ b/relapack/src/cpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cpotrf_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_cpotrf_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_cpotrf_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html * */ void RELAPACK_cpotrf( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_cpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CPOTRF", &minfo, strlen("CPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpotrf( /** cpotrf's recursive compute kernel */ static void RELAPACK_cpotrf_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_cpotrf_rec( const float MONE[] = { -1., 0. 
}; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c index 01c161d1ae..2ebc310014 100644 --- a/relapack/src/csytrf.c +++ b/relapack/src/csytrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_csytrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html * */ void RELAPACK_csytrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf( /** csytrf's recursive compute kernel */ static void RELAPACK_csytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_csytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_csytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_csytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_csytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c index 9d6bd849d0..216a9e2484 100644 --- a/relapack/src/csytrf_rec2.c +++ b/relapack/src/csytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int * - nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3; @@ -38,21 +38,21 @@ static int c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k; + static blasint j, k; static complex t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; - extern /* Subroutine */ int cscal_(int *, complex *, complex *, - int *); + extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen), ccopy_(int *, complex *, int *, - complex *, int *), cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, + complex *, blasint *), cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float absakk; - extern int icamax_(int *, complex *, int *); + extern blasint icamax_(int *, complex *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c index aa7dd0e57a..e8a9865cca 100644 --- a/relapack/src/csytrf_rook.c +++ b/relapack/src/csytrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_csytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const
blasint *, blasint *, float *, const blasint *, blasint *); /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html * */ void RELAPACK_csytrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf_rook( /** csytrf_rook's recursive compute kernel */ static void RELAPACK_csytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_csytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = CREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = CREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_csytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_csytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = CREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = CREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c index 6638338a60..2561065d7b 100644 --- a/relapack/src/csytrf_rook_rec2.c +++ b/relapack/src/csytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static int c__1 = 1; +static blasint c__1 = 1; /** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm.
* */ -/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, complex *a, int *lda, int *ipiv, - complex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, + complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -38,27 +38,27 @@ static int c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static complex t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; - extern /* Subroutine */ int cscal_(int *, complex *, complex *, - int *); + extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * - , complex *, int *, complex *, int *, complex *, complex * - , int *, ftnlen); + extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * + , complex *, blasint *, complex *, blasint *, complex *, complex * + , blasint *, ftnlen); static float sfmin; - extern /* Subroutine */ int ccopy_(int *, complex *, int *, - complex *, int *); - static int itemp; - extern /* Subroutine */ int cswap_(int *, complex *, int *, - complex *, int *); - static int kstep; + extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, + complex *, blasint *); + static blasint itemp; + extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, + complex *, blasint *); + static blasint kstep; static float stemp, absakk; - extern int icamax_(int *, complex *, int *); + extern blasint icamax_(int *, complex *, blasint *); extern double slamch_(char *, ftnlen); static float colmax, rowmax; diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c index 15c738baf2..704f3ef232 100644 --- a/relapack/src/ctgsyl.c +++ b/relapack/src/ctgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include <math.h> -static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, - const int *, const float *, const int *, const float *, const int *, - float *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, float *, float *, int *); +static void RELAPACK_ctgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const float *, const blasint *, const float *, const blasint *, + float *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, float *, float *, blasint *); /** CTGSYL solves the generalized Sylvester equation.
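The other change repeated in every file is the trailing strlen(...) added to each LAPACK(xerbla) call. Fortran passes CHARACTER arguments with a hidden length, and f2c-style C bindings expose it as a trailing ftnlen parameter; calling xerbla without it leaves the callee reading a length argument that was never passed. A compilable sketch of the convention, with stand-in names only (the real xerbla is LAPACK's and takes its integer argument as a BLAS integer; the ftnlen typedef below is an assumption, since the exact type is compiler-dependent):

#include <stdio.h>
#include <string.h>

typedef size_t ftnlen;   /* assumed hidden Fortran CHARACTER length type */

/* f2c-style binding: srname need not be NUL-terminated, srname_len bounds it */
static void xerbla_demo(const char *srname, const int *info, ftnlen srname_len) {
    printf(" ** On entry to %.*s parameter number %d had an illegal value\n",
           (int)srname_len, srname, *info);
}

int main(void) {
    int minfo = 2;
    /* mirrors the patched call sites: routine name, negated info, explicit length */
    xerbla_demo("CTGSYL", &minfo, strlen("CTGSYL"));
    return 0;
}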
@@ -14,21 +14,21 @@ static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html * */ void RELAPACK_ctgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "C"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ctgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTGSYL", &minfo, strlen("CTGSYL")); return; } @@ -74,8 +74,8 @@ void RELAPACK_ctgsyl( // Constant const float ZERO[] = { 0., 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ctgsyl( } float scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ctgsyl( /** ctgsyl's recursive compute kernel */ static void RELAPACK_ctgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dsum, float *dscale, - int *info + blasint *info ) { if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ctgsyl_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0.
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = CREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = CREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ctgsyl_rec( } } else { // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c index b548d5354d..fed6e847e5 100644 --- a/relapack/src/ctrsyl.c +++ b/relapack/src/ctrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, int *); +static void RELAPACK_ctrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, blasint *); /** CTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html * */ void RELAPACK_ctrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ctrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo, strlen("CTRSYL")); return; } @@ -58,11 +58,11 @@ void RELAPACK_ctrsyl( /** ctrsyl's recursive compute kernel */ static void RELAPACK_ctrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ctrsyl_rec( const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; const float MSGN[] = { -*isgn, 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = CREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = CREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ctrsyl_rec( } } else { // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 518574868a..556491c7a1 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { - extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); +complex cdotu_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { + extern void cdotu_(complex *, blasint *, complex *, blasint *, complex *, blasint *); complex result; cdotu_(&result, n, x, incx, y, incy); return result; } #define cdotu_ cdotu_fun -complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { - extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); +complex cdotc_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { + extern void cdotc_(complex *, blasint *, complex *, blasint *, complex *, blasint *); complex result; cdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ complex cladiv_fun(complex *a, complex *b) { /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; /** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static int c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. 
* */ /* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int - *isgn, int *m, int *n, complex *a, int *lda, complex *b, - int *ldb, complex *c__, int *ldc, float *scale, int *info, + *isgn, blasint *m, blasint *n, complex *a, blasint *lda, complex *b, + int *ldb, complex *c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -66,7 +66,7 @@ static int c__1 = 1; void r_cnjg(complex *, complex *); /* Local variables */ - static int j, k, l; + static blasint j, k, l; static complex a11; static float db; static complex x11; @@ -75,20 +75,20 @@ static int c__1 = 1; static float dum[1], eps, sgn, smin; static complex suml, sumr; /* Complex */ complex cdotc_(int *, complex *, int - *, complex *, int *); - extern int lsame_(char *, char *, ftnlen, ftnlen); + *, complex *, blasint *); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); /* Complex */ complex cdotu_(int *, complex *, int - *, complex *, int *); - extern /* Subroutine */ int slabad_(float *, float *); - extern float clange_(char *, int *, int *, complex *, - int *, float *, ftnlen); + *, complex *, blasint *); + extern /* Subroutine */ blasint slabad_(float *, float *); + extern float clange_(char *, blasint *, blasint *, complex *, + blasint *, float *, ftnlen); /* Complex */ complex cladiv_(complex *, complex *); static float scaloc; extern float slamch_(char *, ftnlen); - extern /* Subroutine */ int csscal_(int *, float *, complex *, int - *), xerbla_(char *, int *, ftnlen); + extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int + *), xerbla_(char *, blasint *, ftnlen); static float bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c index 0262cb59d9..5201a24c73 100644 --- a/relapack/src/ctrtri.c +++ b/relapack/src/ctrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, - float *, const int *, int *); +static void RELAPACK_ctrtri_rec(const char *, const char *, const blasint *, + float *, const blasint *, blasint *); /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. 
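All of the *_rec routines in this patch share one skeleton: split the problem at roughly half (CREC_SPLIT/DREC_SPLIT, defined in relapack.h), recurse on the leading block, apply a level-3 BLAS update to the coupling block, recurse on the trailing block, and drop to an unblocked *_rec2 kernel once the dimension falls below a CROSSOVER_* threshold. A stripped-down sketch of that control flow; the split rule and the threshold below are placeholders, not the values from relapack.h:

#include <stdio.h>

typedef long blasint;                 /* placeholder index type */
#define CROSSOVER 24                  /* placeholder unblocked threshold */

static blasint split(blasint n) { return n / 2; }  /* stand-in for xREC_SPLIT */

static void rec_kernel(blasint n, int depth) {
    if (n <= CROSSOVER) {             /* base case, cf. the *_rec2 files */
        printf("%*sunblocked kernel, n = %ld\n", depth * 2, "", (long)n);
        return;
    }
    const blasint n1 = split(n);      /* leading block (A_TL) */
    const blasint n2 = n - n1;        /* trailing block (A_BR) */
    rec_kernel(n1, depth + 1);        /* recursion(A_TL) */
    printf("%*slevel-3 update of the n1 x n2 coupling block\n", depth * 2, "");
    rec_kernel(n2, depth + 1);        /* recursion(A_BR) */
}

int main(void) { rec_kernel(100, 0); return 0; }

Because the integer work (splits, pivot shifts, loop counters) recurs at every level, a single remaining int in this skeleton is enough to corrupt a 64-bit dimension, which is why the conversion has to be as mechanical and complete as these diffs are.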
@@ -11,16 +11,16 @@ static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html * */ void RELAPACK_ctrtri( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ctrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("CTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo, strlen("CTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_ctrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ctrtri( /** ctrtri's recursive compute kernel */ static void RELAPACK_ctrtri_rec( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ctrtri_rec( const float MONE[] = { -1., 0. }; // Splitting - const int n1 = CREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = CREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index 1a1757d311..cdf06ad5be 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" -#include "stdlib.h" - -static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, - const int *, double *, const int *, int *, double *, const int *, double *, - const int *, int *); +#include <stdlib.h> +#include <string.h> +static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, + const blasint *, blasint *); /** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html * */ void RELAPACK_dgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_dgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo, strlen("DGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_dgbtrf( const double ZERO[] = { 0.
}; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { double *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,12 @@ void RELAPACK_dgbtrf( } // Allocate work space - const int n1 = DREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = DREC_SPLIT(*n); + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl); +// const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl); + const blasint nWorku = abs( (*kl > n1) ? MAX(1, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); double *Worku = malloc(mWorku * nWorku * sizeof(double)); LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +77,10 @@ void RELAPACK_dgbtrf( /** dgbtrf's recursive compute kernel */ static void RELAPACK_dgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { @@ -91,25 +92,25 @@ static void RELAPACK_dgbtrf_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Splitting - const int n1 = MIN(DREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(DREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +130,14 @@ static void RELAPACK_dgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +165,7 @@ static void RELAPACK_dgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -180,7 +181,7 @@ static void RELAPACK_dgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const double tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -208,7 +209,7 @@ static void RELAPACK_dgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c index 9c925b5861..1ceab6c377 100644 --- a/relapack/src/dgemmt.c +++ b/relapack/src/dgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); /** DGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void 
RELAPACK_dgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_dgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_dgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_dgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("DGEMMT", &info); + LAPACK(xerbla)("DGEMMT", &info, strlen("DGEMMT")); return; } @@ -74,10 +74,10 @@ void RELAPACK_dgemmt( /** dgemmt's recursive compute kernel */ static void RELAPACK_dgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_dgemmt_rec( } // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_dgemmt_rec( /** dgemmt's unblocked compute kernel */ static void RELAPACK_dgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB;
+    const blasint incC = 1;
 
-    int i;
+    blasint i;
     for (i = 0; i < *n; i++) {
         // A_0
         // A_i
@@ -149,13 +149,13 @@ static void RELAPACK_dgemmt_rec2(
         double *const C_ii = C + *ldC * i + i;
 
         if (*uplo == 'L') {
-            const int nmi = *n - i;
+            const blasint nmi = *n - i;
             if (*transA == 'N')
                 BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
             else
                 BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
         } else {
-            const int ip1 = i + 1;
+            const blasint ip1 = i + 1;
             if (*transA == 'N')
                 BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
             else
diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c
index 07f5472fd1..be960fde9e 100644
--- a/relapack/src/dgetrf.c
+++ b/relapack/src/dgetrf.c
@@ -1,7 +1,7 @@
 #include "relapack.h"
 
-static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
-    const int *, int *, int *);
+static void RELAPACK_dgetrf_rec(const blasint *, const blasint *, double *,
+    const blasint *, blasint *, blasint *);
 
 /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 *
@@ -11,26 +11,25 @@ static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
 * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
 * */
 void RELAPACK_dgetrf(
-    const int *m, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    int *info
+    const blasint *m, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    blasint *info
 ) {
-
     // Check arguments
     *info = 0;
     if (*m < 0)
         *info = -1;
     else if (*n < 0)
         *info = -2;
-    else if (*ldA < MAX(1, *n))
+    else if (*ldA < MAX(1, *m))
         *info = -4;
-    if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DGETRF", &minfo);
+    if (*info!=0) {
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF"));
         return;
     }
 
-    const int sn = MIN(*m, *n);
+    const blasint sn = MIN(*m, *n);
 
     RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
 
@@ -38,10 +37,10 @@ void RELAPACK_dgetrf(
     if (*m < *n) {
         // Constants
         const double ONE[] = { 1. };
-        const int iONE[] = { 1. };
+        const blasint iONE[] = { 1 };
 
         // Splitting
-        const int rn = *n - *m;
+        const blasint rn = *n - *m;
 
         // A_L A_R
         const double *const A_L = A;
@@ -57,9 +56,9 @@ void RELAPACK_dgetrf(
 
 /** dgetrf's recursive compute kernel */
 static void RELAPACK_dgetrf_rec(
-    const int *m, const int *n,
-    double *A, const int *ldA, int *ipiv,
-    int *info
+    const blasint *m, const blasint *n,
+    double *A, const blasint *ldA, blasint *ipiv,
+    blasint *info
 ) {
 
     if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
@@ -71,12 +70,12 @@ static void RELAPACK_dgetrf_rec(
 
     // Constants
     const double ONE[] = { 1. };
    const double MONE[] = { -1.
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -91,8 +90,8 @@ static void RELAPACK_dgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +110,7 @@ static void RELAPACK_dgetrf_rec( // apply pivots to A_BL LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c index d722ea809f..6c7dcccb33 100644 --- a/relapack/src/dlauum.c +++ b/relapack/src/dlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dlauum_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_dlauum_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_dlauum_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html * */ void RELAPACK_dlauum( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_dlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DLAUUM", &minfo, strlen("DLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_dlauum( /** dlauum's recursive compute kernel */ static void RELAPACK_dlauum_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_dlauum_rec( const double ONE[] = { 1. }; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c index 6fd0ebe481..9380b28ad6 100644 --- a/relapack/src/dpbtrf.c +++ b/relapack/src/dpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, - double *, const int *, double *, const int *, int *); +static void RELAPACK_dpbtrf_rec(const char *, const blasint *, const blasint *, + double *, const blasint *, double *, const blasint *, blasint *); /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. 
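
For orientation, here is a minimal calling sketch for the converted banded Cholesky entry point, with the prototype transcribed from the hunks below. The 2x2 example matrix, the lower band-storage layout comment, and the assumption that blasint is a plain int (a 32-bit, non-INTERFACE64 build) are illustrative assumptions, not part of the patch.

    /* Usage sketch (not part of the patch): factor A = [4 1; 1 4],
       an SPD matrix with bandwidth 1, in lower band storage. */
    #include <stdio.h>

    typedef int blasint;   /* assumes a 32-bit (non-INTERFACE64) build */

    extern void RELAPACK_dpbtrf(const char *uplo, const blasint *n,
        const blasint *kd, double *Ab, const blasint *ldAb, blasint *info);

    int main(void) {
        const blasint n = 2, kd = 1, ldAb = 2;
        /* column j holds A(j,j) in row 0 and A(j+1,j) in row 1 */
        double Ab[4] = { 4.0, 1.0, 4.0, 0.0 };
        blasint info;
        RELAPACK_dpbtrf("L", &n, &kd, Ab, &ldAb, &info);
        printf("info = %d\n", (int)info);
        return 0;
    }
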
@@ -12,14 +12,14 @@ static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html * */ void RELAPACK_dpbtrf( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_dpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DPBTRF", &minfo, strlen("DPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_dpbtrf( const double ZERO[] = { 0. }; // Allocate work space - const int n1 = DREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = DREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * sizeof(double)); LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_dpbtrf( /** dpbtrf's recursive compute kernel */ static void RELAPACK_dpbtrf_rec( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - double *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + double *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_dpbtrf_rec( const double MONE[] = { -1. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(DREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(DREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_dpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, n1); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, n1); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c index c14fb3d718..cf326b18fd 100644 --- a/relapack/src/dpotrf.c +++ b/relapack/src/dpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dpotrf_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_dpotrf_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. 
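
The strlen(...) argument added to every xerbla call in this patch supplies the hidden length that Fortran calling conventions pass after a CHARACTER argument; the new prototype appears in lapack.h further down in this diff. A minimal sketch, assuming the conventional trailing-underscore name mangling for LAPACK(xerbla) and a 32-bit blasint; the helper name is hypothetical.

    typedef int blasint;   /* 32-bit build assumed */

    /* The trailing int is the hidden CHARACTER length that gfortran-style
       calling conventions append for the routine-name argument. */
    extern blasint xerbla_(const char *srname, const blasint *info,
                           int srname_len);

    static void report_bad_arg(blasint argnum) {
        const blasint minfo = argnum;
        xerbla_("DPOTRF", &minfo, 6);   /* 6 == strlen("DPOTRF") */
    }
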
@@ -11,14 +11,14 @@ static void RELAPACK_dpotrf_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html * */ void RELAPACK_dpotrf( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_dpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DPOTRF", &minfo, strlen("DPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_dpotrf( /** dpotrf's recursive compute kernel */ static void RELAPACK_dpotrf_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_dpotrf_rec( const double MONE[] = { -1. }; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c index 0228068cef..f68241e3ab 100644 --- a/relapack/src/dsygst.c +++ b/relapack/src/dsygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_dsygst_rec(const int *, const char *, const int *, - double *, const int *, const double *, const int *, - double *, const int *, int *); +static void RELAPACK_dsygst_rec(const blasint *, const char *, const blasint *, + double *, const blasint *, const double *, const blasint *, + double *, const blasint *, blasint *); /** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. 
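
The int-to-blasint conversion matters most for workspace arithmetic such as the lWork product in the dsygst hunks below: under USE64BITINT (see the f2c.h typedef later in this diff) blasint is BLASLONG-sized, so products that would wrap a 32-bit int stay exact. A standalone sketch of the failure mode, using a hypothetical problem size and n/2 as a stand-in for DREC_SPLIT, whose definition is not shown in this diff:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        int64_t n  = 100000;     /* hypothetical problem size */
        int64_t n1 = n / 2;      /* stand-in for DREC_SPLIT(n) */
        int32_t bad  = (int32_t)(n1 * (n - n1));   /* 2.5e9 wraps an int32 */
        int64_t good = n1 * (n - n1);
        printf("as 32-bit int: %d\nas 64-bit blasint: %lld\n",
               bad, (long long)good);
        return 0;
    }
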
@@ -15,14 +15,14 @@ static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
 * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html
 * */
 void RELAPACK_dsygst(
-    const int *itype, const char *uplo, const int *n,
-    double *A, const int *ldA, const double *B, const int *ldB,
-    int *info
+    const blasint *itype, const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    blasint *info
 ) {
 
     // Check arguments
-    const int lower = LAPACK(lsame)(uplo, "L");
-    const int upper = LAPACK(lsame)(uplo, "U");
+    const blasint lower = LAPACK(lsame)(uplo, "L");
+    const blasint upper = LAPACK(lsame)(uplo, "U");
     *info = 0;
     if (*itype < 1 || *itype > 3)
         *info = -1;
@@ -35,8 +35,8 @@ void RELAPACK_dsygst(
     else if (*ldB < MAX(1, *n))
         *info = -7;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DSYGST", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DSYGST", &minfo, strlen("DSYGST"));
         return;
     }
 
@@ -45,10 +45,10 @@
     // Allocate work space
     double *Work = NULL;
-    int lWork = 0;
+    blasint lWork = 0;
 #if XSYGST_ALLOW_MALLOC
-    const int n1 = DREC_SPLIT(*n);
-    lWork = n1 * (*n - n1);
+    const blasint n1 = DREC_SPLIT(*n);
+    lWork = abs( n1 * (*n - n1) );
     Work = malloc(lWork * sizeof(double));
     if (!Work)
         lWork = 0;
@@ -67,9 +67,9 @@
 
 /** dsygst's recursive compute kernel */
 static void RELAPACK_dsygst_rec(
-    const int *itype, const char *uplo, const int *n,
-    double *A, const int *ldA, const double *B, const int *ldB,
-    double *Work, const int *lWork, int *info
+    const blasint *itype, const char *uplo, const blasint *n,
+    double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *Work, const blasint *lWork, blasint *info
 ) {
 
     if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
@@ -84,14 +84,14 @@
     const double MONE[] = { -1. };
     const double HALF[] = { .5 };
     const double MHALF[] = { -.5 };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Loop iterator
-    int i;
+    blasint i;
 
     // Splitting
-    const int n1 = DREC_SPLIT(*n);
-    const int n2 = *n - n1;
+    const blasint n1 = DREC_SPLIT(*n);
+    const blasint n2 = *n - n1;
 
     // A_TL A_TR
     // A_BL A_BR
diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c
index 80b119336a..43d28f94eb 100644
--- a/relapack/src/dsytrf.c
+++ b/relapack/src/dsytrf.c
@@ -3,8 +3,8 @@
 #include <stdlib.h>
 #endif
 
-static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
-    double *, const int *, int *, double *, const int *, int *);
+static void RELAPACK_dsytrf_rec(const char *, const blasint *, const blasint *, blasint *,
+    double *, const blasint *, blasint *, double *, const blasint *, blasint *);
 
 /** DSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
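
A hypothetical caller for the converted DSYTRF wrapper, sized from the cleanlWork expression visible in the hunks below (n*(n/2) doubles); the helper name, the malloc-based allocation, and the 32-bit blasint typedef are illustrative only.

    #include <stdlib.h>

    typedef int blasint;   /* 32-bit build assumed */

    extern void RELAPACK_dsytrf(const char *uplo, const blasint *n,
        double *A, const blasint *ldA, blasint *ipiv,
        double *Work, const blasint *lWork, blasint *info);

    /* factor_sym is a hypothetical helper, not part of the patch */
    static void factor_sym(double *A, blasint n, blasint ldA,
                           blasint *ipiv, blasint *info) {
        const blasint lWork = n * (n / 2);   /* cleanlWork from the wrapper */
        double *Work = malloc((size_t)lWork * sizeof(double));
        if (!Work) { *info = -7; return; }   /* lWork is argument 7 */
        RELAPACK_dsytrf("L", &n, A, &ldA, ipiv, Work, &lWork, info);
        free(Work);
    }
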
@@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html * */ void RELAPACK_dsytrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_dsytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_dsytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_dsytrf( /** dsytrf's recursive compute kernel */ static void RELAPACK_dsytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_dsytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = DREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = DREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_dsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + n1; double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_dsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = DREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = DREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_dsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c index 72ef827b16..6ed1a47a25 100644 --- a/relapack/src/dsytrf_rec2.c +++ b/relapack/src/dsytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static double c_b8 = -1.; static double c_b9 = 1.; @@ -25,33 +25,33 @@ static double c_b9 = 1.; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * - nb, int *kb, double *a, int *lda, int *ipiv, - double *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, double *a, blasint *lda, blasint *ipiv, + double *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; double d__1, d__2, d__3; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k; + static blasint j, k; static double t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; - extern /* Subroutine */ int dscal_(int *, double *, double *, - int *); + extern /* Subroutine */ blasint dscal_(int *, double *, double *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int dgemv_(char *, int *, int *, - double *, double *, int *, double *, int *, - double *, double *, int *, ftnlen), dcopy_(int *, - double *, int *, double *, int *), dswap_(int - *, double *, int *, double *, int *); - static int kstep; + extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *, ftnlen), dcopy_(int *, + double *, blasint *, double *, blasint *), dswap_(int + *, double *, blasint *, double *, blasint *); + static blasint kstep; static double absakk; - extern int idamax_(int *, double *, int *); + extern blasint idamax_(int *, double *, blasint *); static double colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c index 19a875c7ad..78fa652abe 100644 --- a/relapack/src/dsytrf_rook.c +++ b/relapack/src/dsytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_dsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. 
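
For context on the ipiv handling in the hunks below: Bunch-Kaufman pivot arrays encode 1x1 pivots as positive entries and 2x2 pivots as negative ones, which is why the pivot shift after the recursive call tests the sign. A sketch of that convention, assuming the negative branch (truncated in this excerpt) mirrors the positive one shown:

    typedef int blasint;

    static void shift_rook_pivots(blasint *ipiv_B, blasint n2, blasint n1) {
        blasint i;
        for (i = 0; i < n2; i++) {
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;    /* 1x1 pivot: plain row index */
            else
                ipiv_B[i] -= n1;    /* 2x2 pivot: negated row index */
        }
    }
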
@@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html * */ void RELAPACK_dsytrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_dsytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_dsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_dsytrf_rook( /** dsytrf_rook's recursive compute kernel */ static void RELAPACK_dsytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_dsytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = DREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = DREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_dsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + n1; double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_dsytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_dsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = DREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = DREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_dsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c index 105ef5ed3e..bdb5c6e29c 100644 --- a/relapack/src/dsytrf_rook_rec2.c +++ b/relapack/src/dsytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static double c_b9 = -1.; static double c_b10 = 1.; @@ -25,39 +25,39 @@ static double c_b10 = 1.; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
* */ -/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, double *a, int *lda, int *ipiv, - double *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, double *a, blasint *lda, blasint *ipiv, + double *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; double d__1; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static double t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; - extern /* Subroutine */ int dscal_(int *, double *, double *, - int *); + extern /* Subroutine */ blasint dscal_(int *, double *, double *, + blasint *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int dgemv_(char *, int *, int *, - double *, double *, int *, double *, int *, - double *, double *, int *, ftnlen); + extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *, ftnlen); static double dtemp, sfmin; - static int itemp; - extern /* Subroutine */ int dcopy_(int *, double *, int *, - double *, int *), dswap_(int *, double *, int - *, double *, int *); - static int kstep; + static blasint itemp; + extern /* Subroutine */ blasint dcopy_(int *, double *, blasint *, + double *, blasint *), dswap_(int *, double *, int + *, double *, blasint *); + static blasint kstep; extern double dlamch_(char *, ftnlen); static double absakk; - extern int idamax_(int *, double *, int *); + extern blasint idamax_(int *, double *, blasint *); static double colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c index c506926af2..9bbc987e71 100644 --- a/relapack/src/dtgsyl.c +++ b/relapack/src/dtgsyl.c @@ -1,11 +1,11 @@ #include "relapack.h" #include -static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, - const int *, const double *, const int *, const double *, const int *, - double *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, double *, double *, int *, - int *, int *); +static void RELAPACK_dtgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const double *, const blasint *, const double *, const blasint *, + double *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, double *, double *, blasint *, + blasint *, blasint *); /** DTGSYL solves the generalized Sylvester equation. 
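
Per the netlib documentation referenced in the hunk that follows, DTGSYL solves the pair A*R - L*B = scale*C, D*R - L*E = scale*F for R and L. The wrapper's workspace rule, restated as a standalone helper for clarity; the function name is hypothetical and the logic is transcribed from the lwmin computation below.

    typedef int blasint;
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* dtgsyl_lwmin is hypothetical; it restates the wrapper's logic */
    static blasint dtgsyl_lwmin(char trans, blasint ijob, blasint m, blasint n) {
        if (trans == 'N' && (ijob == 1 || ijob == 2))
            return MAX(1, 2 * m * n);   /* Dif estimate needs a 2*m*n buffer */
        return 1;
    }
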
@@ -15,21 +15,21 @@ static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
 * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
 * */
 void RELAPACK_dtgsyl(
-    const char *trans, const int *ijob, const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC,
-    const double *D, const int *ldD, const double *E, const int *ldE,
-    double *F, const int *ldF,
+    const char *trans, const blasint *ijob, const blasint *m, const blasint *n,
+    const double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *C, const blasint *ldC,
+    const double *D, const blasint *ldD, const double *E, const blasint *ldE,
+    double *F, const blasint *ldF,
     double *scale, double *dif,
-    double *Work, const int *lWork, int *iWork, int *info
+    double *Work, const blasint *lWork, blasint *iWork, blasint *info
 ) {
 
     // Parse arguments
-    const int notran = LAPACK(lsame)(trans, "N");
-    const int tran = LAPACK(lsame)(trans, "T");
+    const blasint notran = LAPACK(lsame)(trans, "N");
+    const blasint tran = LAPACK(lsame)(trans, "T");
 
     // Compute work buffer size
-    int lwmin = 1;
+    blasint lwmin = 1;
     if (notran && (*ijob == 1 || *ijob == 2))
         lwmin = MAX(1, 2 * *m * *n);
     *info = 0;
@@ -58,8 +58,8 @@ void RELAPACK_dtgsyl(
     else if (*lWork < lwmin && *lWork != -1)
         *info = -20;
     if (*info) {
-        const int minfo = -*info;
-        LAPACK(xerbla)("DTGSYL", &minfo);
+        const blasint minfo = -*info;
+        LAPACK(xerbla)("DTGSYL", &minfo, strlen("DTGSYL"));
         return;
     }
 
@@ -75,8 +75,8 @@ void RELAPACK_dtgsyl(
     // Constant
     const double ZERO[] = { 0. };
 
-    int isolve = 1;
-    int ifunc = 0;
+    blasint isolve = 1;
+    blasint ifunc = 0;
     if (notran) {
         if (*ijob >= 3) {
             ifunc = *ijob - 2;
@@ -87,12 +87,12 @@
     }
 
     double scale2;
-    int iround;
+    blasint iround;
     for (iround = 1; iround <= isolve; iround++) {
         *scale = 1;
         double dscale = 0;
         double dsum = 1;
-        int pq;
+        blasint pq;
         RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
         if (dscale != 0) {
             if (*ijob == 1 || *ijob == 3)
@@ -121,13 +121,13 @@
 
 /** dtgsyl's recursive compute kernel */
 static void RELAPACK_dtgsyl_rec(
-    const char *trans, const int *ifunc, const int *m, const int *n,
-    const double *A, const int *ldA, const double *B, const int *ldB,
-    double *C, const int *ldC,
-    const double *D, const int *ldD, const double *E, const int *ldE,
-    double *F, const int *ldF,
+    const char *trans, const blasint *ifunc, const blasint *m, const blasint *n,
+    const double *A, const blasint *ldA, const double *B, const blasint *ldB,
+    double *C, const blasint *ldC,
+    const double *D, const blasint *ldD, const double *E, const blasint *ldE,
+    double *F, const blasint *ldF,
     double *scale, double *dsum, double *dscale,
-    int *iWork, int *pq, int *info
+    blasint *iWork, blasint *pq, blasint *info
 ) {
 
     if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
@@ -139,20 +139,20 @@ static void RELAPACK_dtgsyl_rec(
     // Constants
     const double ONE[] = { 1. };
     const double MONE[] = { -1. };
-    const int iONE[] = { 1 };
+    const blasint iONE[] = { 1 };
 
     // Outputs
     double scale1[] = { 1. };
    double scale2[] = { 1.
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = DREC_SPLIT(*m); + blasint m1 = DREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_dtgsyl_rec( } } else { // Splitting - int n1 = DREC_SPLIT(*n); + blasint n1 = DREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c index c87b53ae52..7663773007 100644 --- a/relapack/src/dtrsyl.c +++ b/relapack/src/dtrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, int *); +static void RELAPACK_dtrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, blasint *); /** DTRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html * */ void RELAPACK_dtrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int transA = LAPACK(lsame)(tranA, "T"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int transB = LAPACK(lsame)(tranB, "T"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint transA = LAPACK(lsame)(tranA, "T"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint transB = LAPACK(lsame)(tranB, "T"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_dtrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DTRSYL", &minfo, strlen("DTRSYL")); return; } @@ -60,11 +60,11 @@ void RELAPACK_dtrsyl( /** dtrsyl's recursive compute kernel */ static void RELAPACK_dtrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_dtrsyl_rec( const double ONE[] = { 1. 
}; const double MONE[] = { -1. }; const double MSGN[] = { -*isgn }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1. }; double scale2[] = { 1. }; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = DREC_SPLIT(*m); + blasint m1 = DREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -126,10 +126,10 @@ static void RELAPACK_dtrsyl_rec( } } else { // Splitting - int n1 = DREC_SPLIT(*n); + blasint n1 = DREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c index 479c7f340a..50dabf76d7 100644 --- a/relapack/src/dtrsyl_rec2.c +++ b/relapack/src/dtrsyl_rec2.c @@ -14,52 +14,52 @@ /* Table of constant values */ -static int c__1 = 1; -static int c_false = FALSE_; -static int c__2 = 2; +static blasint c__1 = 1; +static blasint c_false = FALSE_; +static blasint c__2 = 2; static double c_b26 = 1.; static double c_b30 = 0.; -static int c_true = TRUE_; +static blasint c_true = TRUE_; -int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int - *m, int *n, double *a, int *lda, double *b, int * - ldb, double *c__, int *ldc, double *scale, int *info, +int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, blasint *isgn, int + *m, blasint *n, double *a, blasint *lda, double *b, blasint * + ldb, double *c__, blasint *ldc, double *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; /* Local variables */ - static int j, k, l; + static blasint j, k, l; static double x[4] /* was [2][2] */; - static int k1, k2, l1, l2; + static blasint k1, k2, l1, l2; static double a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; - extern double ddot_(int *, double *, int *, double *, - int *); - static int ierr; + extern double ddot_(int *, double *, blasint *, double *, + blasint *); + static blasint ierr; static double smin, suml, sumr; - extern /* Subroutine */ int dscal_(int *, double *, double *, - int *); - extern int lsame_(char *, char *, ftnlen, ftnlen); - static int knext, lnext; + extern /* Subroutine */ blasint dscal_(int *, double *, double *, + blasint *); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); + static blasint knext, lnext; static double xnorm; - extern /* Subroutine */ int dlaln2_(int *, int *, int *, - double *, double *, double *, int *, double *, - double *, double *, int *, double *, double * - , double *, int *, double *, double *, int *), - dlasy2_(int *, int *, int *, int *, int *, - double *, int *, double *, int *, double *, - int *, double *, double *, int *, double *, - int *), dlabad_(double *, double *); - extern double dlamch_(char *, ftnlen), dlange_(char *, int *, - int *, double *, int *, double *, ftnlen); + extern /* Subroutine */ blasint dlaln2_(int *, blasint *, blasint *, + double *, double *, double *, blasint *, double *, + double *, double *, blasint *, double *, double * + , double *, blasint *, double *, double *, blasint *), + dlasy2_(int *, blasint *, blasint *, blasint *, blasint *, + double *, blasint *, double *, blasint *, double *, + blasint *, double *, double *, blasint *, double *, + 
blasint *), dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen), dlange_(char *, blasint *, + blasint *, double *, blasint *, double *, ftnlen); static double scaloc; - extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); static double bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c index 0462609e9e..72777e7e49 100644 --- a/relapack/src/dtrtri.c +++ b/relapack/src/dtrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, - double *, const int *, int *); +static void RELAPACK_dtrtri_rec(const char *, const char *, const blasint *, + double *, const blasint *, blasint *); /** DTRTRI computes the inverse of a real upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html * */ void RELAPACK_dtrtri( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_dtrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("DTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("DTRTRI", &minfo, strlen("DTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_dtrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[i + *ldA * i] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_dtrtri( /** dtrtri's recursive compute kernel */ static void RELAPACK_dtrtri_rec( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_dtrtri_rec( const double MONE[] = { -1. 
}; // Splitting - const int n1 = DREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = DREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c index 5a34524191..48539c4b9e 100644 --- a/relapack/src/f2c.c +++ b/relapack/src/f2c.c @@ -9,7 +9,7 @@ #endif #endif -void sig_die(const char *s, int kill) { +void sig_die(const char *s, blasint kill) { /* print error message, then clear buffers */ fprintf(stderr, "%s\n", s); diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h index b94ee7c8e1..85337becfa 100644 --- a/relapack/src/f2c.h +++ b/relapack/src/f2c.h @@ -7,6 +7,19 @@ #ifndef F2C_INCLUDE #define F2C_INCLUDE +#ifdef USE64BITINT +typedef BLASLONG blasint; +#if defined(OS_WINDOWS) && defined(__64BIT__) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + + typedef long int integer; typedef unsigned long int uinteger; typedef char *address; diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h index 064276b7e0..776b0589fa 100644 --- a/relapack/src/lapack.h +++ b/relapack/src/lapack.h @@ -1,80 +1,80 @@ #ifndef LAPACK_H #define LAPACK_H -extern int LAPACK(lsame)(const char *, const char *); -extern int LAPACK(xerbla)(const char *, const int *); +extern blasint LAPACK(lsame)(const char *, const char *); +extern blasint LAPACK(xerbla)(const char *, const blasint *, int); -extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(slaswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(dlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(claswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(zlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); -extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); -extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); -extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); -extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); +extern void LAPACK(slaset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *); +extern void LAPACK(dlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *); +extern void LAPACK(claset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *); +extern void LAPACK(zlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *); -extern void LAPACK(slacpy)(const char *, 
const int *, const int *, const float *, const int *, float *, const int *); -extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); -extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); -extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); +extern void LAPACK(slacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *); +extern void LAPACK(dlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *); +extern void LAPACK(clacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *); +extern void LAPACK(zlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *); -extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); -extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(slascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(clascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); -extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(slauu2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dlauu2)(const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(clauu2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zlauu2)(const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); -extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, 
int *); +extern void LAPACK(ssygs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +extern void LAPACK(dsygs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +extern void LAPACK(chegs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); +extern void LAPACK(zhegs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); -extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); -extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); -extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); -extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); +extern void LAPACK(strti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dtrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(ctrti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(ztrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); -extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); -extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(spotf2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dpotf2)(const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(cpotf2)(const char *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zpotf2)(const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); -extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); -extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(spbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(dpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(cpbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); +extern void LAPACK(zpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(zhetf2)(const char *, const int *, double *, 
const int *, int *, int *); -extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(ssytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(csytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(chetf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(zhetf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(ssytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(csytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(chetf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(zhetf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); -extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(sgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(cgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); -extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(sgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(cgbtf2)(const 
blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); -extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); -extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); -extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); +extern void LAPACK(stgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, blasint *); +extern void LAPACK(dtgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *, blasint *, blasint *); +extern void LAPACK(ctgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *); +extern void LAPACK(ztgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *); #endif /* LAPACK_H */ diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 4885472603..0252f3d92b 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -6,9 +6,9 @@ #if INCLUDE_SLAUUM void LAPACK(slauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_slauum(uplo, n, A, ldA, info); } @@ -16,9 +16,9 @@ void LAPACK(slauum)( #if INCLUDE_DLAUUM void LAPACK(dlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dlauum(uplo, n, A, ldA, info); } @@ -26,9 +26,9 @@ void LAPACK(dlauum)( #if INCLUDE_CLAUUM void LAPACK(clauum)( - const char 
*uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_clauum(uplo, n, A, ldA, info); } @@ -36,9 +36,9 @@ void LAPACK(clauum)( #if INCLUDE_ZLAUUM void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_zlauum(uplo, n, A, ldA, info); } @@ -51,9 +51,9 @@ void LAPACK(zlauum)( #if INCLUDE_SSYGST void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -61,9 +61,9 @@ void LAPACK(ssygst)( #if INCLUDE_DSYGST void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -71,9 +71,9 @@ void LAPACK(dsygst)( #if INCLUDE_CHEGST void LAPACK(chegst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -81,9 +81,9 @@ void LAPACK(chegst)( #if INCLUDE_ZHEGST void LAPACK(zhegst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -96,9 +96,9 @@ void LAPACK(zhegst)( #if INCLUDE_STRTRI void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_strtri(uplo, diag, n, A, ldA, info); } @@ -106,9 +106,9 @@ void LAPACK(strtri)( #if INCLUDE_DTRTRI void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); } @@ -116,9 +116,9 @@ void LAPACK(dtrtri)( #if INCLUDE_CTRTRI void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); } @@ -126,9 +126,9 @@ void LAPACK(ctrtri)( #if INCLUDE_ZTRTRI void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); } @@ -141,9 +141,9 @@ void LAPACK(ztrtri)( #if INCLUDE_SPOTRF void LAPACK(spotrf)( - const char *uplo, const int *n, - 
float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_spotrf(uplo, n, A, ldA, info); } @@ -151,9 +151,9 @@ void LAPACK(spotrf)( #if INCLUDE_DPOTRF void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_dpotrf(uplo, n, A, ldA, info); } @@ -161,9 +161,9 @@ void LAPACK(dpotrf)( #if INCLUDE_CPOTRF void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { RELAPACK_cpotrf(uplo, n, A, ldA, info); } @@ -171,9 +171,9 @@ void LAPACK(cpotrf)( #if INCLUDE_ZPOTRF void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { RELAPACK_zpotrf(uplo, n, A, ldA, info); } @@ -186,9 +186,9 @@ void LAPACK(zpotrf)( #if INCLUDE_SPBTRF void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -196,9 +196,9 @@ void LAPACK(spbtrf)( #if INCLUDE_DPBTRF void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -206,9 +206,9 @@ void LAPACK(dpbtrf)( #if INCLUDE_CPBTRF void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -216,9 +216,9 @@ void LAPACK(cpbtrf)( #if INCLUDE_ZPBTRF void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -231,9 +231,9 @@ void LAPACK(zpbtrf)( #if INCLUDE_SSYTRF void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -241,9 +241,9 @@ void LAPACK(ssytrf)( #if INCLUDE_DSYTRF void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -251,9 +251,9 @@ void LAPACK(dsytrf)( #if INCLUDE_CSYTRF void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, 
info); } @@ -261,9 +261,9 @@ void LAPACK(csytrf)( #if INCLUDE_ZSYTRF void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -271,9 +271,9 @@ void LAPACK(zsytrf)( #if INCLUDE_CHETRF void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -281,9 +281,9 @@ void LAPACK(chetrf)( #if INCLUDE_ZHETRF void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -291,9 +291,9 @@ void LAPACK(zhetrf)( #if INCLUDE_SSYTRF_ROOK void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -301,9 +301,9 @@ void LAPACK(ssytrf_rook)( #if INCLUDE_DSYTRF_ROOK void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -311,9 +311,9 @@ void LAPACK(dsytrf_rook)( #if INCLUDE_CSYTRF_ROOK void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -321,9 +321,9 @@ void LAPACK(csytrf_rook)( #if INCLUDE_ZSYTRF_ROOK void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -331,9 +331,9 @@ void LAPACK(zsytrf_rook)( #if INCLUDE_CHETRF_ROOK void LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -341,9 +341,9 @@ void LAPACK(chetrf_rook)( #if INCLUDE_ZHETRF_ROOK void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint 
*n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -356,9 +356,9 @@ void LAPACK(zhetrf_rook)( #if INCLUDE_SGETRF void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); } @@ -366,9 +366,9 @@ void LAPACK(sgetrf)( #if INCLUDE_DGETRF void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); } @@ -376,9 +376,9 @@ void LAPACK(dgetrf)( #if INCLUDE_CGETRF void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); } @@ -386,9 +386,9 @@ void LAPACK(cgetrf)( #if INCLUDE_ZGETRF void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); } @@ -401,9 +401,9 @@ void LAPACK(zgetrf)( #if INCLUDE_SGBTRF void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -411,9 +411,9 @@ void LAPACK(sgbtrf)( #if INCLUDE_DGBTRF void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -421,9 +421,9 @@ void LAPACK(dgbtrf)( #if INCLUDE_CGBTRF void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -431,9 +431,9 @@ void LAPACK(cgbtrf)( #if INCLUDE_ZGBTRF void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -446,11 +446,11 @@ void LAPACK(zgbtrf)( #if INCLUDE_STRSYL void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { 
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -458,11 +458,11 @@ void LAPACK(strsyl)( #if INCLUDE_DTRSYL void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -470,11 +470,11 @@ void LAPACK(dtrsyl)( #if INCLUDE_CTRSYL void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -482,11 +482,11 @@ void LAPACK(ctrsyl)( #if INCLUDE_ZTRSYL void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -499,13 +499,13 @@ void LAPACK(ztrsyl)( #if INCLUDE_STGSYL void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -513,13 +513,13 @@ void LAPACK(stgsyl)( #if INCLUDE_DTGSYL void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int 
*info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -527,13 +527,13 @@ void LAPACK(dtgsyl)( #if INCLUDE_CTGSYL void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -541,13 +541,13 @@ void LAPACK(ctgsyl)( #if INCLUDE_ZTGSYL void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -561,10 +561,10 @@ void LAPACK(ztgsyl)( #if INCLUDE_SGEMMT void LAPACK(sgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { RELAPACK_sgemmt(uplo, n, A, ldA, info); } @@ -573,10 +573,10 @@ void LAPACK(sgemmt)( #if INCLUDE_DGEMMT void LAPACK(dgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { RELAPACK_dgemmt(uplo, n, A, ldA, info); } @@ -585,10 +585,10 @@ void LAPACK(dgemmt)( #if INCLUDE_CGEMMT void LAPACK(cgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const 
float *beta, float *C, const blasint *ldC ) { RELAPACK_cgemmt(uplo, n, A, ldA, info); } @@ -597,10 +597,10 @@ void LAPACK(cgemmt)( #if INCLUDE_ZGEMMT void LAPACK(zgemmt)( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { RELAPACK_zgemmt(uplo, n, A, ldA, info); } diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 2cb061c323..38c5c30d09 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,6 +1,14 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H - +#include <string.h> +#include "../../config.h" +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif #include "../config.h" #include "../inc/relapack.h" @@ -38,23 +46,23 @@ #include "blas.h" // sytrf helper routines -void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); -void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rec2(const char *, const blasint *, const blasint
*, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_dsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_csytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_chetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); +void RELAPACK_zsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_zhetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); // trsyl helper routines -void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); -void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_strsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_dtrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_ctrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); +void RELAPACK_ztrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); #endif /* RELAPACK_INT_H */ diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index bc20e744b2..3e3fdf4555 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, - const int *, float *, const int *, int *, float *, const int *, float *, - const int *, int *); +static void RELAPACK_sgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, + const blasint *, blasint *); /** SGBTRF computes 
an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,11 +13,10 @@ static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html * */ void RELAPACK_sgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { - // Check arguments *info = 0; if (*m < 0) @@ -28,11 +27,11 @@ void RELAPACK_sgbtrf( *info = -3; else if (*ku < 0) *info = -4; - else if (*ldAb < 2 * *kl + *ku + 1) + else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SGBTRF", &minfo, strlen("SGBTRF")); return; } @@ -40,14 +39,14 @@ void RELAPACK_sgbtrf( const float ZERO[] = { 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { float *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,16 +54,17 @@ void RELAPACK_sgbtrf( } // Allocate work space - const int n1 = SREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = SREC_SPLIT(*n); + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); + const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + // Recursive kernel RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); @@ -76,12 +76,13 @@ /** sgbtrf's recursive compute kernel */ static void RELAPACK_sgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + float *Ab, const blasint *ldAb, blasint *ipiv, + float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, + blasint *info ) { + if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); @@ -91,25 +92,25 @@ static void RELAPACK_sgbtrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Splitting - const int n1 = MIN(SREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(SREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -128,15 +129,15 @@ static void RELAPACK_sgbtrf_rec( float *const A_BR = A + *ldA * n1 + m1; // ipiv_T - // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + // ipiv_B + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -156,6 +157,7 @@ float *const A_BRbl = A_BR + m21; float *const A_BRbr = A_BR + *ldA * n21 + m21; + // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); @@ -164,7 +166,7 @@ // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -180,7 +182,7 @@ for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const float tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -208,7 +210,7 @@ // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -217,8 +219,11 @@ } } + // recursion(Ab_BR, ipiv_B) - RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + // NOTE: recursing into RELAPACK_sgbtrf_rec here triggered infinite recursion, so the trailing block falls back to the unblocked kernel below 
+// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c index 75f78fabd1..93438858c6 100644 --- a/relapack/src/sgemmt.c +++ b/relapack/src/sgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const float *, const float *, const int *, - const float *, const int *, const float *, float *, const int *); + const blasint *, const blasint *, const float *, const float *, const blasint *, + const float *, const blasint *, const float *, float *, const blasint *); /** SGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_sgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_sgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_sgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("SGEMMT", &info); + LAPACK(xerbla)("SGEMMT", &info, strlen("SGEMMT")); return; } @@ -74,10 +74,10 @@ void RELAPACK_sgemmt( /** sgemmt's recursive compute kernel */ static void RELAPACK_sgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_sgemmt_rec( } // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_sgemmt_rec( /** 
sgemmt's unblocked compute kernel */ static void RELAPACK_sgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC + const blasint *n, const blasint *k, + const float *alpha, const float *A, const blasint *ldA, + const float *B, const blasint *ldB, + const float *beta, float *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -149,13 +149,13 @@ float *const C_ii = C + *ldC * i + i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 284f8cff67..0231cc166f 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,7 +1,6 @@ #include "relapack.h" - -static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, - int *, int *); +static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, + blasint *, blasint *); /** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +10,9 @@ static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html * */ void RELAPACK_sgetrf( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -22,26 +21,24 @@ *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - - const int sn = MIN(*m, *n); - + const blasint sn = MIN(*m, *n); RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder if (*m < *n) { // Constants const float ONE[] = { 1. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,11 +54,10 @@ /** sgetrf's recursive compute kernel */ static void RELAPACK_sgetrf_rec( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); @@ -71,13 +67,12 @@ // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; - + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R float *const A_L = A; float *const A_R = A + *ldA * n1; @@ -91,8 +86,8 @@ static void RELAPACK_sgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +106,7 @@ static void RELAPACK_sgetrf_rec( // apply pivots to A_BL LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c index 280f141b31..79212817f8 100644 --- a/relapack/src/slauum.c +++ b/relapack/src/slauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_slauum_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_slauum_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_slauum_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html * */ void RELAPACK_slauum( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_slauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SLAUUM", &minfo, strlen("SLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_slauum( /** slauum's recursive compute kernel */ static void RELAPACK_slauum_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_slauum_rec( const float ONE[] = { 1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c index ee0a5546e9..26804dcc2f 100644 --- a/relapack/src/spbtrf.c +++ b/relapack/src/spbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, - float *, const int *, float *, const int *, int *); +static void RELAPACK_spbtrf_rec(const char *, const blasint *, const blasint *, + float *, const blasint *, float *, const blasint *, blasint *); /** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. 
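As an aside on how the retyped interface is consumed: a minimal caller of the banded Cholesky wrapper might look as follows. This is an illustrative sketch, not code from the patch — it assumes an LP64 build (blasint is a plain int; an INTERFACE64=1 build would use a 64-bit type), a library configured with BUILD_RELAPACK, and a hand-written declaration of the entry point. Ab uses LAPACK's lower band storage, where column j holds A(j..j+kd, j).

    #include <stdio.h>

    typedef int blasint;  /* assumption: LP64 build */
    void RELAPACK_spbtrf(const char *uplo, const blasint *n, const blasint *kd,
                         float *Ab, const blasint *ldAb, blasint *info);

    int main(void) {
        /* 3x3 tridiagonal SPD matrix: 2 on the diagonal, -1 on the subdiagonal */
        const blasint n = 3, kd = 1, ldAb = 2;
        float Ab[6] = { 2.f, -1.f,   2.f, -1.f,   2.f, 0.f };  /* column-major lower band */
        blasint info;
        RELAPACK_spbtrf("L", &n, &kd, Ab, &ldAb, &info);
        printf("info = %d, L(1,1) = %f\n", (int)info, Ab[0]);  /* expect 0 and sqrt(2) */
        return 0;
    }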
@@ -12,14 +12,14 @@ static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html * */ void RELAPACK_spbtrf( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_spbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo, strlen("SPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_spbtrf( const float ZERO[] = { 0. }; // Allocate work space - const int n1 = SREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = SREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * sizeof(float)); LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_spbtrf( /** spbtrf's recursive compute kernel */ static void RELAPACK_spbtrf_rec( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - float *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + float *Ab, const blasint *ldAb, + float *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_spbtrf_rec( const float MONE[] = { -1. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; float *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(SREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(SREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_spbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c index 2a609321be..b22e917f75 100644 --- a/relapack/src/spotrf.c +++ b/relapack/src/spotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_spotrf_rec(const char *, const int *, float *, - const int *, int *); +static void RELAPACK_spotrf_rec(const char *, const blasint *, float *, + const blasint *, blasint *); /** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. 
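The strlen argument appended to every xerbla call in this patch deserves a note: XERBLA is a Fortran subroutine taking a CHARACTER argument, and Fortran compilers pass the length of such an argument as an extra hidden value parameter after the declared ones. The prototype below is an illustrative guess at the convention being matched, not a declaration from the sources — the hidden length's C type in particular varies between compilers (newer gfortran uses size_t, older ABIs used int):

    #include <string.h>

    typedef int blasint;  /* assumption: LP64 build */

    /* Fortran: SUBROUTINE XERBLA( SRNAME, INFO ) -- the trailing parameter is
       the hidden length of SRNAME that the Fortran side expects. */
    void xerbla_(const char *srname, const blasint *info, size_t srname_len);

    static void report_bad_argument(void) {
        const blasint minfo = 4;  /* e.g. argument 4 was invalid */
        /* Without the third argument the callee reads an undefined length;
           passing strlen() keeps the C call ABI-compatible. */
        xerbla_("SPOTRF", &minfo, strlen("SPOTRF"));
    }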
@@ -11,14 +11,14 @@ static void RELAPACK_spotrf_rec(const char *, const int *, float *, * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html * */ void RELAPACK_spotrf( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_spotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo, strlen("SPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_spotrf( /** spotrf's recursive compute kernel */ static void RELAPACK_spotrf_rec( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_spotrf_rec( const float MONE[] = { -1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c index 7f145cdec9..4259f90319 100644 --- a/relapack/src/ssygst.c +++ b/relapack/src/ssygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_ssygst_rec(const int *, const char *, const int *, - float *, const int *, const float *, const int *, - float *, const int *, int *); +static void RELAPACK_ssygst_rec(const blasint *, const char *, const blasint *, + float *, const blasint *, const float *, const blasint *, + float *, const blasint *, blasint *); /** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. 
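The pattern repeated through all of these kernels — n1 = SREC_SPLIT(*n), recurse on the leading block, update with Level 3 BLAS, recurse on the trailing block — is ReLAPACK's core idea, and it explains why every dimension variable had to move to blasint in lock-step. The skeleton below only makes that shape explicit; the split macro and crossover constant are placeholders, not ReLAPACK's actual definitions:

    #define TOY_SPLIT(n)  ((n) / 2)  /* placeholder for SREC_SPLIT */
    #define TOY_CROSSOVER 24         /* placeholder for the CROSSOVER_* constants */

    /* Recursive shape shared by the factorization routines in this patch. */
    static void toy_factor_rec(int n /* , matrix arguments ... */) {
        if (n <= TOY_CROSSOVER) {
            /* base case: an unblocked LAPACK kernel such as spotf2 or sgetf2 */
            return;
        }
        const int n1 = TOY_SPLIT(n);
        const int n2 = n - n1;
        toy_factor_rec(n1);  /* factor the leading n1-by-n1 block */
        /* ... TRSM/GEMM updates of the off-diagonal and trailing blocks ... */
        toy_factor_rec(n2);  /* factor the updated trailing n2-by-n2 block */
    }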
@@ -15,14 +15,14 @@ static void RELAPACK_ssygst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html * */ void RELAPACK_ssygst( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_ssygst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo, strlen("SSYGST")); return; } @@ -45,9 +45,9 @@ // Allocate work space float *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = SREC_SPLIT(*n); + const blasint n1 = SREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ /** ssygst's recursive compute kernel */ static void RELAPACK_ssygst_rec( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - float *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_SSYGST, 1)) { @@ -84,14 +84,14 @@ const float MONE[] = { -1. }; const float HALF[] = { .5 }; const float MHALF[] = { -.5 }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c index 8a4fad9f2a..9fe7ce4a6e 100644 --- a/relapack/src/ssytrf.c +++ b/relapack/src/ssytrf.c @@ -2,9 +2,8 @@ #if XSYTRF_ALLOW_MALLOC #include <stdlib.h> #endif - -static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_ssytrf_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** SSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. 
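Unlike the other wrappers in this patch, ssytrf takes a caller-provided workspace, and the check in the hunk that follows requires *lWork to be at least n * (n/2) elements unless XSYTRF_ALLOW_MALLOC lets the wrapper allocate internally. A sketch of a conforming call, under the same assumptions as before (LP64 build, hand-written declaration of the entry point):

    #include <stdlib.h>

    typedef int blasint;  /* assumption: LP64 build */
    void RELAPACK_ssytrf(const char *uplo, const blasint *n,
                         float *A, const blasint *ldA, blasint *ipiv,
                         float *Work, const blasint *lWork, blasint *info);

    /* Factor a symmetric n-by-n column-major matrix in place; returns info. */
    int factor_symmetric(float *A, blasint n) {
        blasint *ipiv = malloc(n * sizeof(blasint));
        const blasint lWork = n * (n / 2);  /* minimum size enforced by the wrapper */
        float *Work = malloc(lWork * sizeof(float));
        blasint info = -1;
        if (ipiv && Work)
            RELAPACK_ssytrf("L", &n, A, &n, ipiv, Work, &lWork, &info);
        free(Work);
        free(ipiv);
        return (int)info;  /* 0 on success */
    }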
@@ -14,21 +13,21 @@ static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html * */ void RELAPACK_ssytrf( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +54,8 @@ void RELAPACK_ssytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); return; } @@ -64,7 +63,7 @@ void RELAPACK_ssytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +77,13 @@ void RELAPACK_ssytrf( /** ssytrf's recursive compute kernel */ static void RELAPACK_ssytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { // Unblocked @@ -96,34 +95,34 @@ static void RELAPACK_ssytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = SREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = SREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +138,23 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -182,22 +181,22 @@ static void RELAPACK_ssytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = SREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = SREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +212,19 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c index edc9269eca..13856f0646 100644 --- a/relapack/src/ssytrf_rec2.c +++ b/relapack/src/ssytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static float c_b8 = -1.f; static float c_b9 = 1.f; @@ -25,32 +25,32 @@ static float c_b9 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
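[Annotation] The f2c-translated *_rec2.c files, like the one above, still keep several "int *" parameters next to the new "blasint *" ones (see the sscal_/scopy_ prototypes below). Why the distinction matters: with INTERFACE64=1 the two types differ in size, and reading a 64-bit value through an "int *" silently truncates it. A self-contained demonstration; the function names are hypothetical and the truncated output assumes a little-endian machine:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t blasint;                 /* as in an INTERFACE64=1 build */

    static void stale_callee(const int *n)     /* prototype left as "int *" */
        { printf("stale callee sees %d\n", *n); }
    static void fixed_callee(const blasint *n) /* converted to blasint */
        { printf("fixed callee sees %lld\n", (long long)*n); }

    int main(void) {
        blasint n = ((blasint)1 << 32) + 7;    /* does not fit in 32 bits */
        /* deliberately broken call, mimicking a stale f2c prototype: */
        stale_callee((const int *)&n);         /* little-endian: prints 7 */
        fixed_callee(&n);                      /* prints 4294967303 */
        return 0;
    }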
* */ -/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * - nb, int *kb, float *a, int *lda, int *ipiv, float *w, - int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float *w, + int *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; float r__1, r__2, r__3; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k; + static blasint j, k; static float t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int sscal_(int *, float *, float *, int *), - sgemv_(char *, int *, int *, float *, float *, int *, - float *, int *, float *, float *, int *, ftnlen); - static int kstep; - extern /* Subroutine */ int scopy_(int *, float *, int *, float *, - int *), sswap_(int *, float *, int *, float *, int * + extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *), + sgemv_(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *, ftnlen); + static blasint kstep; + extern /* Subroutine */ blasint scopy_(int *, float *, blasint *, float *, + blasint *), sswap_(int *, float *, blasint *, float *, blasint * ); static float absakk; - extern int isamax_(int *, float *, int *); + extern blasint isamax_(int *, float *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c index 040df24840..abcf29d1cb 100644 --- a/relapack/src/ssytrf_rook.c +++ b/relapack/src/ssytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, - float *, const int *, int *, float *, const int *, int *); +static void RELAPACK_ssytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + float *, const blasint *, blasint *, float *, const blasint *, blasint *); /** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. 
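[Annotation] All of the recursive kernels above share one pattern: SREC_SPLIT (and its Z-counterpart) halves the problem, the leading block is factored recursively, the trailing block is updated with Level-3 BLAS, and the recursion bottoms out below a CROSSOVER_* threshold where the unblocked *_rec2 code takes over. A sketch of that control flow, assuming a plain halving rule for the split; the exact rounding and threshold values live in ReLAPACK's configuration headers:

    typedef int blasint;

    #define CROSSOVER 24                     /* illustrative threshold */

    static blasint rec_split(blasint n) {    /* assumed: roughly n / 2 */
        return (n + 1) / 2;
    }

    /* Skeleton of a ReLAPACK-style recursion (no actual math). */
    static void factor_rec(blasint n /* , matrix arguments ... */) {
        if (n <= CROSSOVER) {
            /* the unblocked *_rec2 kernel would run here */
            return;
        }
        blasint n1 = rec_split(n);
        blasint n2 = n - n1;
        factor_rec(n1);                      /* recurse on the leading block */
        /* Level-3 BLAS update of the trailing n2-sized block here */
        factor_rec(n2);                      /* recurse on the trailing block */
    }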
@@ -14,21 +14,21 @@ static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html * */ void RELAPACK_ssytrf_rook( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_ssytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_ssytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_ssytrf_rook( /** ssytrf_rook's recursive compute kernel */ static void RELAPACK_ssytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - float *A, const int *ldA, int *ipiv, - float *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + float *A, const blasint *ldA, blasint *ipiv, + float *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_ssytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = SREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = SREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_ssytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_ssytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = SREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = SREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c index 3308826d7e..41659cb3e5 100644 --- a/relapack/src/ssytrf_rook_rec2.c +++ b/relapack/src/ssytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; static float c_b9 = -1.f; static float c_b10 = 1.f; @@ -25,39 +25,39 @@ static float c_b10 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. 
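[Annotation] The "shift pivots" loop above is needed because the sub-factorization of the trailing block returns pivot indices relative to that block. Positive entries are 1x1 Bunch-Kaufman pivots; by LAPACK convention, negative entries mark the rows of a 2x2 pivot. A sketch of the merge, assuming the negative branch mirrors the positive one with the sign encoding preserved (the diff shows only the positive branch):

    typedef int blasint;

    /* Offset block-local pivot indices by the size n1 of the already
     * factored leading block, keeping the 1x1 / 2x2 sign encoding intact. */
    static void shift_pivots(blasint *ipiv_B, blasint n2, blasint n1) {
        for (blasint i = 0; i < n2; i++) {
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;   /* 1x1 pivot: plain row index */
            else
                ipiv_B[i] -= n1;   /* 2x2 pivot: negated row index (assumption) */
        }
    }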
* */ -/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, float *a, int *lda, int *ipiv, float * - w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, blasint *n, + int *nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float * + w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; float r__1; /* Builtin functions */ double sqrt(double); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static float t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *); static float sfmin; - static int itemp; - extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, - float *, int *, float *, int *, float *, float *, int *, + static blasint itemp; + extern /* Subroutine */ blasint sgemv_(char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *, ftnlen); - static int kstep; + static blasint kstep; static float stemp; - extern /* Subroutine */ int scopy_(int *, float *, int *, float *, - int *), sswap_(int *, float *, int *, float *, int * + extern /* Subroutine */ blasint scopy_(int *, float *, blasint *, float *, + blasint *), sswap_(int *, float *, blasint *, float *, blasint * ); static float absakk; extern double slamch_(char *, ftnlen); - extern int isamax_(int *, float *, int *); + extern blasint isamax_(int *, float *, blasint *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c index 1870fb9289..6bace9f173 100644 --- a/relapack/src/stgsyl.c +++ b/relapack/src/stgsyl.c @@ -1,11 +1,11 @@ #include "relapack.h" #include -static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, - const int *, const float *, const int *, const float *, const int *, - float *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, float *, float *, int *, int *, - int *); +static void RELAPACK_stgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const float *, const blasint *, const float *, const blasint *, + float *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, + blasint *); /** STGSYL solves the generalized Sylvester equation. 
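[Annotation] For reference, the system the stgsyl wrapper below sets up (per the netlib documentation it links): with trans = 'N' the routine solves the coupled pair

    A R - L B = \mathrm{scale} \cdot C, \qquad
    D R - L E = \mathrm{scale} \cdot F

for R and L, where (A, D) and (B, E) are in generalized real Schur form. This is also why the recursive kernel's split point is nudged past nonzero subdiagonal entries: a 2x2 diagonal block of the real Schur form (a complex conjugate eigenvalue pair) must never be cut in half.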
@@ -15,21 +15,21 @@ static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html * */ void RELAPACK_stgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info + float *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "T"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "T"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -58,8 +58,8 @@ void RELAPACK_stgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STGSYL", &minfo, strlen("STGSYL")); return; } @@ -75,8 +75,8 @@ void RELAPACK_stgsyl( // Constant const float ZERO[] = { 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -87,12 +87,12 @@ void RELAPACK_stgsyl( } float scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; float dsum = 1; - int pq; + blasint pq; RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); if (dscale != 0) { if (*ijob == 1 || *ijob == 3) @@ -121,13 +121,13 @@ void RELAPACK_stgsyl( /** stgsyl's recursive vompute kernel */ static void RELAPACK_stgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, + const float *D, const blasint *ldD, const float *E, const blasint *ldE, + float *F, const blasint *ldF, float *scale, float *dsum, float *dscale, - int *iWork, int *pq, int *info + blasint *iWork, blasint *pq, blasint *info ) { if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) { @@ -139,20 +139,20 @@ static void RELAPACK_stgsyl_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = SREC_SPLIT(*m); + blasint m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_stgsyl_rec( } } else { // Splitting - int n1 = SREC_SPLIT(*n); + blasint n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c index 83947ef1a0..012fb35486 100644 --- a/relapack/src/strsyl.c +++ b/relapack/src/strsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_strsyl_rec(const char *, const char *, const int *, - const int *, const int *, const float *, const int *, const float *, - const int *, float *, const int *, float *, int *); +static void RELAPACK_strsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const float *, const blasint *, const float *, + const blasint *, float *, const blasint *, float *, blasint *); /** STRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_strsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html * */ void RELAPACK_strsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int transA = LAPACK(lsame)(tranA, "T"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int transB = LAPACK(lsame)(tranB, "T"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint transA = LAPACK(lsame)(tranA, "T"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint transB = LAPACK(lsame)(tranB, "T"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_strsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STRSYL", &minfo, strlen("STRSYL")); return; } @@ -60,11 +60,11 @@ void RELAPACK_strsyl( /** strsyl's recursive compute kernel */ static void RELAPACK_strsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const float *A, const blasint *ldA, const float *B, const blasint *ldB, + float *C, const blasint *ldC, float *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_strsyl_rec( const float ONE[] = { 1. }; const float MONE[] = { -1. 
}; const float MSGN[] = { -*isgn }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. }; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - int m1 = SREC_SPLIT(*m); + blasint m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const int m2 = *m - m1; + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -126,10 +126,10 @@ static void RELAPACK_strsyl_rec( } } else { // Splitting - int n1 = SREC_SPLIT(*n); + blasint n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const int n2 = *n - n1; + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c index 6d40a475d7..37a24c7dc2 100644 --- a/relapack/src/strsyl_rec2.c +++ b/relapack/src/strsyl_rec2.c @@ -14,48 +14,48 @@ /* Table of constant values */ -static int c__1 = 1; -static int c_false = FALSE_; -static int c__2 = 2; +static blasint c__1 = 1; +static blasint c_false = FALSE_; +static blasint c__2 = 2; static float c_b26 = 1.f; static float c_b30 = 0.f; -static int c_true = TRUE_; +static blasint c_true = TRUE_; -void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int - *m, int *n, float *a, int *lda, float *b, int *ldb, float * - c__, int *ldc, float *scale, int *info, ftnlen trana_len, +void RELAPACK_strsyl_rec2(char *trana, char *tranb, blasint *isgn, int + *m, blasint *n, float *a, blasint *lda, float *b, blasint *ldb, float * + c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; /* Local variables */ - static int j, k, l; + static blasint j, k, l; static float x[4] /* was [2][2] */; - static int k1, k2, l1, l2; + static blasint k1, k2, l1, l2; static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; - static int ierr; + static blasint ierr; static float smin; - extern float sdot_(int *, float *, int *, float *, int *); + extern float sdot_(int *, float *, blasint *, float *, blasint *); static float suml, sumr; - extern int lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int sscal_(int *, float *, float *, int *); - static int knext, lnext; + extern blasint lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *); + static blasint knext, lnext; static float xnorm; - extern /* Subroutine */ int slaln2_(int *, int *, int *, float - *, float *, float *, int *, float *, float *, float *, int *, - float *, float *, float *, int *, float *, float *, int *), - slasy2_(int *, int *, int *, int *, int *, - float *, int *, float *, int *, float *, int *, float *, - float *, int *, float *, int *), slabad_(float *, float *); + extern /* Subroutine */ blasint slaln2_(int *, blasint *, blasint *, float + *, float *, float *, blasint *, float *, float *, float *, blasint *, + float *, float *, float *, blasint *, float *, float *, blasint *), + slasy2_(int *, blasint *, blasint *, blasint *, blasint *, + float *, blasint *, float *, blasint *, float *, blasint *, float *, + float *, blasint *, float *, blasint *), slabad_(float *, float *); static float scaloc; - extern float slamch_(char *, ftnlen), slange_(char *, int *, - int *, float *, int *, float *, ftnlen); - extern /* Subroutine 
*/ int xerbla_(char *, int *, ftnlen); + extern float slamch_(char *, ftnlen), slange_(char *, blasint *, + blasint *, float *, blasint *, float *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); static float bignum; - static int notrna, notrnb; + static blasint notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c index d35bbd49f4..18d11f5ebc 100644 --- a/relapack/src/strtri.c +++ b/relapack/src/strtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_strtri_rec(const char *, const char *, const int *, - float *, const int *, int *); +static void RELAPACK_strtri_rec(const char *, const char *, const blasint *, + float *, const blasint *, blasint *); /** CTRTRI computes the inverse of a real upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_strtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html * */ void RELAPACK_strtri( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_strtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("STRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("STRTRI", &minfo, strlen("STRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_strtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[i + *ldA * i] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_strtri( /** strtri's recursive compute kernel */ static void RELAPACK_strtri_rec( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + float *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_STRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_strtri_rec( const float MONE[] = { -1. }; // Splitting - const int n1 = SREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = SREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index 3aa6bf5318..d4ba417531 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, - const int *, double *, const int *, int *, double *, const int *, double *, - const int *, int *); +static void RELAPACK_zgbtrf_rec(const blasint *, const blasint *, const blasint *, + const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, + const blasint *, blasint *); /** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. 
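[Annotation] The "unskew" step in the zgbtrf code below deserves a note. LAPACK band storage keeps column j of the band matrix in column j of Ab, with element A(i,j) in row kv + i - j (0-based, kv = kl + ku), so that kv rows of fill-in fit above the stored band; setting ldA = ldAb - 1 and offsetting by kv then lets the code address the band with ordinary dense indexing. The factors of 2 in the pointer arithmetic come from the interleaved real/imaginary layout of complex values. A small indexing helper capturing this; the helper name is illustrative and the caller must keep i - j within the band:

    #include <stddef.h>

    typedef int blasint;

    /* Element (i, j) of the band matrix inside the LAPACK band buffer Ab
     * (0-based indices, complex values interleaved as re,im pairs). */
    static double *band_elem(double *Ab, blasint ldAb, blasint kv,
                             blasint i, blasint j) {
        return Ab + 2 * ((size_t)ldAb * j + (size_t)(kv + i - j));
    }

    /* The "unskew" in the patch is the same arithmetic folded into a fake
     * leading dimension: with A = Ab + 2*kv and ldA = ldAb - 1, the address
     * A + 2*(i + ldA*j) reaches the identical location. */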
@@ -13,9 +13,9 @@ static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html * */ void RELAPACK_zgbtrf( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + blasint *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_zgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZGBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo, strlen("ZGBTRF")); return; } @@ -40,14 +40,14 @@ void RELAPACK_zgbtrf( const double ZERO[] = { 0., 0. }; // Result upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - int i, j; + blasint i, j; for (j = 0; j < *n; j++) { double *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_zgbtrf( } // Allocate work space - const int n1 = ZREC_SPLIT(*n); - const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const int nWorkl = (kv > n1) ? n1 : kv; - const int mWorku = (*kl > n1) ? n1 : *kl; - const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint n1 = ZREC_SPLIT(*n); + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ( (*kl > n1) ? MAX(0, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_zgbtrf( /** zgbtrf's recursive compute kernel */ static void RELAPACK_zgbtrf_rec( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, - int *info + const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, + double *Ab, const blasint *ldAb, blasint *ipiv, + double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_zgbtrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterators - int i, j; + blasint i, j; // Output upper band width - const int kv = *ku + *kl; + const blasint kv = *ku + *kl; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Splitting - const int n1 = MIN(ZREC_SPLIT(*n), *kl); - const int n2 = *n - n1; - const int m1 = MIN(n1, *m); - const int m2 = *m - m1; - const int mn1 = MIN(m1, n1); - const int mn2 = MIN(m2, n2); + const blasint n1 = MIN(ZREC_SPLIT(*n), *kl); + const blasint n2 = *n - n1; + const blasint m1 = MIN(n1, *m); + const blasint m2 = *m - m1; + const blasint mn1 = MIN(m1, n1); + const blasint mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_zgbtrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // Banded splitting - const int n21 = MIN(n2, kv - n1); - const int n22 = MIN(n2 - n21, n1); - const int m21 = MIN(m2, *kl - m1); - const int m22 = MIN(m2 - m21, m1); + const blasint n21 = MIN(n2, kv - n1); + const blasint n22 = MIN(n2 - n21, n1); + const blasint m21 = MIN(m2, *kl - m1); + const blasint m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_zgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_zgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { const double tmpr = A_Rrj[2 * i]; const double tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_zgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const int ip = ipiv_T[i] - 1; + const blasint ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c index aa59302386..f53a3ca6f7 100644 --- a/relapack/src/zgemmt.c +++ b/relapack/src/zgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, - const int *, const int *, const double *, const double *, const int *, - const double *, const int *, const double *, double *, const int *); + const blasint *, const blasint *, const double *, const double *, const blasint *, + const double *, const blasint *, const double *, double *, const blasint *); /** ZGEMMT computes a matrix-matrix product with general 
matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_zgemmt( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_zgemmt( #else // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int notransA = LAPACK(lsame)(transA, "N"); - const int tranA = LAPACK(lsame)(transA, "T"); - const int ctransA = LAPACK(lsame)(transA, "C"); - const int notransB = LAPACK(lsame)(transB, "N"); - const int tranB = LAPACK(lsame)(transB, "T"); - const int ctransB = LAPACK(lsame)(transB, "C"); - int info = 0; + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint notransA = LAPACK(lsame)(transA, "N"); + const blasint tranA = LAPACK(lsame)(transA, "T"); + const blasint ctransA = LAPACK(lsame)(transA, "C"); + const blasint notransB = LAPACK(lsame)(transB, "N"); + const blasint tranB = LAPACK(lsame)(transB, "T"); + const blasint ctransB = LAPACK(lsame)(transB, "C"); + blasint info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_zgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("ZGEMMT", &info); + LAPACK(xerbla)("ZGEMMT", &info, strlen("ZGEMMT")); return; } @@ -76,10 +76,10 @@ void RELAPACK_zgemmt( /** zgemmt's recursive compute kernel */ static void RELAPACK_zgemmt_rec( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_zgemmt_rec( } // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_zgemmt_rec( /** zgemmt's unblocked compute kernel */ static void RELAPACK_zgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC + const blasint *n, const blasint *k, + const double *alpha, const double *A, const blasint *ldA, + const double *B, const blasint *ldB, + const double *beta, double *C, const blasint *ldC ) { - const int incB = (*transB == 'N') ? 1 : *ldB; - const int incC = 1; + const blasint incB = (*transB == 'N') ? 
1 : *ldB; + const blasint incC = 1; - int i; + blasint i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_zgemmt_rec2( double *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const int nmi = *n - i; + const blasint nmi = *n - i; if (*transA == 'N') BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const int ip1 = i + 1; + const blasint ip1 = i + 1; if (*transA == 'N') BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index cf8921e1f0..b0d14ffb1e 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zgetrf_rec(const int *, const int *, double *, - const int *, int *, int *); +static void RELAPACK_zgetrf_rec(const blasint *, const blasint *, double *, + const blasint *, blasint *, blasint *); /** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_zgetrf_rec(const int *, const int *, double *, * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html * */ void RELAPACK_zgetrf( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { // Check arguments @@ -22,15 +22,15 @@ void RELAPACK_zgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZGETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZGETRF", &minfo, strlen("ZGETRF")); return; } - const int sn = MIN(*m, *n); + const blasint sn = MIN(*m, *n); RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_zgetrf( if (*m < *n) { // Constants const double ONE[] = { 1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Splitting - const int rn = *n - *m; + const blasint rn = *n - *m; // A_L A_R const double *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_zgetrf( /** zgetrf's recursive compute kernel */ static void RELAPACK_zgetrf_rec( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info + const blasint *m, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_zgetrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1. }; + const blasint iONE[] = { 1. 
}; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; - const int m2 = *m - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; + const blasint m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_zgetrf_rec( // ipiv_T // ipiv_B - int *const ipiv_T = ipiv; - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_T = ipiv; + blasint *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_zgetrf_rec( // apply pivots to A_BL LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c index d0ece21481..dc9b7eacee 100644 --- a/relapack/src/zhegst.c +++ b/relapack/src/zhegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_zhegst_rec(const int *, const char *, const int *, - double *, const int *, const double *, const int *, - double *, const int *, int *); +static void RELAPACK_zhegst_rec(const blasint *, const char *, const blasint *, + double *, const blasint *, const double *, const blasint *, + double *, const blasint *, blasint *); /** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_zhegst_rec(const int *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html * */ void RELAPACK_zhegst( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_zhegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHEGST", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo, strlen("ZHEGST")); return; } @@ -45,9 +45,9 @@ void RELAPACK_zhegst( // Allocate work space double *Work = NULL; - int lWork = 0; + blasint lWork = 0; #if XSYGST_ALLOW_MALLOC - const int n1 = ZREC_SPLIT(*n); + const blasint n1 = ZREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(double)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_zhegst( /** zhegst's recursive compute kernel */ static void RELAPACK_zhegst_rec( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - double *Work, const int *lWork, int *info + const blasint *itype, const char *uplo, const blasint *n, + double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *Work, const blasint *lWork, blasint *info ) { if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_zhegst_rec( const double MONE[] = { -1., 0. }; const double HALF[] = { .5, 0. }; const double MHALF[] = { -.5, 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c index ef4e1f5d5d..3d458fecf8 100644 --- a/relapack/src/zhetrf.c +++ b/relapack/src/zhetrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zhetrf_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html * */ void RELAPACK_zhetrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf( /** zhetrf's recursive compute kernel */ static void RELAPACK_zhetrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c index 867ea64e15..c14cf04406 100644 --- a/relapack/src/zhetrf_rec2.c +++ b/relapack/src/zhetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * - nb, int *kb, doublecomplex *a, int *lda, int *ipiv, - doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, + doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3, z__4; @@ -39,26 +39,26 @@ static int c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k; + static blasint j, k; static double t, r1; static doublecomplex d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - static int kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + static blasint kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(int *, doublecomplex *, blasint *, + doublecomplex *, blasint *); static double absakk; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(int *, double *, + doublecomplex *, blasint *); static double colmax; - extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) ; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(int *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c index 15ceaeae7a..285aea96e8 100644 
--- a/relapack/src/zhetrf_rook.c +++ b/relapack/src/zhetrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zhetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html * */ void RELAPACK_zhetrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf_rook( /** zhetrf_rook's recursive compute kernel */ static void RELAPACK_zhetrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. 
}; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c index a56ad710b7..e5033ad491 100644 --- a/relapack/src/zhetrf_rook_rec2.c +++ b/relapack/src/zhetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, doublecomplex *a, int *lda, int * - ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, blasint *n, + blasint *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * + ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4, z__5; @@ -39,30 +39,30 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static double t, r1; static doublecomplex d11, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - static int itemp, kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + static blasint itemp, kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(blasint *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(blasint *, doublecomplex *, blasint *, + doublecomplex *, blasint *); extern double dlamch_(char *, ftnlen); static double absakk; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(blasint *, double *, + doublecomplex *, blasint *); static double colmax; - extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) + extern /* Subroutine */ blasint zlacgv_(blasint *, doublecomplex *, blasint *) ; - extern int izamax_(int *, doublecomplex *, int *); + extern
blasint izamax_(blasint *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c index 490dcc82e9..14fcd92138 100644 --- a/relapack/src/zlauum.c +++ b/relapack/src/zlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zlauum_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_zlauum_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_zlauum_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html * */ void RELAPACK_zlauum( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZLAUUM", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo, strlen("ZLAUUM")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zlauum( /** zlauum's recursive compute kernel */ static void RELAPACK_zlauum_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_zlauum_rec( const double ONE[] = { 1., 0. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c index 37e711c9dd..fb0e1e97b5 100644 --- a/relapack/src/zpbtrf.c +++ b/relapack/src/zpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, - double *, const int *, double *, const int *, int *); +static void RELAPACK_zpbtrf_rec(const char *, const blasint *, const blasint *, + double *, const blasint *, double *, const blasint *, blasint *); /** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
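A note on the "Unskew A" idiom that the zpbtrf hunks below retype but otherwise leave alone: in LAPACK 'L' band storage, element A(i,j) of the band lives at Ab[(i-j) + ldAb*j], and dense column-major indexing with a leading dimension of ldAb-1 computes i + (ldAb-1)*j = (i-j) + ldAb*j, i.e. the very same offset. The stand-alone sketch below (illustrative values, not taken from the patch) checks that identity:

#include <stdio.h>

/* Hedged sketch: verify that dense indexing with ld = ldAb - 1 reaches
 * the same storage as LAPACK 'L' band indexing with ldAb = kd + 1. */
int main(void) {
    const int ldAb = 5;                    /* band rows, kd + 1 for kd = 4 */
    const int ld = ldAb - 1;               /* the "unskewed" leading dimension */
    const int i = 3, j = 2;                /* any in-band element, i >= j */
    const int band  = (i - j) + ldAb * j;  /* banded address of A(i,j) */
    const int dense = i + ld * j;          /* dense address, same memory */
    printf("band offset %d == dense offset %d\n", band, dense);
    return 0;
}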
@@ -12,14 +12,14 @@ static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html * */ void RELAPACK_zpbtrf( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_zpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZPBTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo, strlen("ZPBTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpbtrf( const double ZERO[] = { 0., 0. }; // Allocate work space - const int n1 = ZREC_SPLIT(*n); - const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint n1 = ZREC_SPLIT(*n); + const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * 2 * sizeof(double)); LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_zpbtrf( /** zpbtrf's recursive compute kernel */ static void RELAPACK_zpbtrf_rec( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - double *Work, const int *ldWork, - int *info + const char *uplo, const blasint *n, const blasint *kd, + double *Ab, const blasint *ldAb, + double *Work, const blasint *ldWork, + blasint *info ){ if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_zpbtrf_rec( const double MONE[] = { -1., 0. }; // Unskew A - const int ldA[] = { *ldAb - 1 }; + const blasint ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const int n1 = MIN(ZREC_SPLIT(*n), *kd); - const int n2 = *n - n1; + const blasint n1 = MIN(ZREC_SPLIT(*n), *kd); + const blasint n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_zpbtrf_rec( return; // Banded splitting - const int n21 = MIN(n2, *kd - n1); - const int n22 = MIN(n2 - n21, *kd); + const blasint n21 = MIN(n2, *kd - n1); + const blasint n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c index 411ac5fc0c..9259279c1f 100644 --- a/relapack/src/zpotrf.c +++ b/relapack/src/zpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zpotrf_rec(const char *, const int *, double *, - const int *, int *); +static void RELAPACK_zpotrf_rec(const char *, const blasint *, double *, + const blasint *, blasint *); /** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. 
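The zpotrf hunks below only retype an existing recursion, but the splitting scheme is the one all of these *_rec kernels follow: factor the top-left block, triangular-solve for the off-diagonal block, downdate the bottom-right block, recurse. A real-valued stand-in with plain loops in place of the BLAS ztrsm/zherk calls (a sketch of the scheme, not the library's code):

#include <math.h>
#include <stdio.h>

/* Recursive lower Cholesky on a column-major n x n matrix (sketch). */
static void potrf_rec(int n, double *A, int ldA) {
    if (n == 1) { A[0] = sqrt(A[0]); return; }
    const int n1 = n / 2, n2 = n - n1;           /* cf. ZREC_SPLIT */
    double *A_TL = A, *A_BL = A + n1, *A_BR = A + ldA * n1 + n1;
    potrf_rec(n1, A_TL, ldA);                    /* recursion(A_TL) */
    for (int j = 0; j < n1; j++)                 /* A_BL <- A_BL * A_TL^-T */
        for (int i = 0; i < n2; i++) {
            double s = A_BL[i + ldA * j];
            for (int k = 0; k < j; k++)
                s -= A_BL[i + ldA * k] * A_TL[j + ldA * k];
            A_BL[i + ldA * j] = s / A_TL[j + ldA * j];
        }
    for (int j = 0; j < n2; j++)                 /* A_BR -= A_BL * A_BL^T */
        for (int i = j; i < n2; i++)
            for (int k = 0; k < n1; k++)
                A_BR[i + ldA * j] -= A_BL[i + ldA * k] * A_BL[j + ldA * k];
    potrf_rec(n2, A_BR, ldA);                    /* recursion(A_BR) */
}

int main(void) {
    double A[4] = { 4., 2., 0., 3. };            /* [4 2; 2 3], lower half used */
    potrf_rec(2, A, 2);
    printf("L = [%g 0; %g %g]\n", A[0], A[1], A[3]);  /* 2 0; 1 sqrt(2) */
    return 0;
}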
@@ -11,14 +11,14 @@ static void RELAPACK_zpotrf_rec(const char *, const int *, double *, * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html * */ void RELAPACK_zpotrf( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZPOTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZPOTRF", &minfo, strlen("ZPOTRF")); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpotrf( /** zpotrf's recursive compute kernel */ static void RELAPACK_zpotrf_rec( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_zpotrf_rec( const double MONE[] = { -1., 0. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c index 3be21563a7..f3412ad8f3 100644 --- a/relapack/src/zsytrf.c +++ b/relapack/src/zsytrf.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zsytrf_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html * */ void RELAPACK_zsytrf( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf( const char cleanuplo = lower ?
'L' : 'U'; // Dummy arguments - int nout; + blasint nout; // Recursive kernel RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf( /** zsytrf's recursive compute kernel */ static void RELAPACK_zsytrf_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_zsytrf_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Loop iterator - int i; + blasint i; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_zsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? 
n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c index 33902ee9ed..ff17267c70 100644 --- a/relapack/src/zsytrf_rec2.c +++ b/relapack/src/zsytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int * - nb, int *kb, doublecomplex *a, int *lda, int *ipiv, - doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, blasint *n, blasint * + nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, + doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3; @@ -38,22 +38,22 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k; + static blasint j, k; static doublecomplex t, r1, d11, d21, d22; - static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ int zscal_(int *, doublecomplex *, - doublecomplex *, int *); - static int kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zscal_(blasint *, doublecomplex *, + doublecomplex *, blasint *); + static blasint kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(blasint *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(blasint *, doublecomplex *, blasint *, + doublecomplex *, blasint *); static double absakk, colmax; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(blasint *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c index c598f7b1eb..fc6d736455 100644 --- a/relapack/src/zsytrf_rook.c +++ b/relapack/src/zsytrf_rook.c @@ -3,8 +3,8 @@ #include <stdlib.h> #endif -static void RELAPACK_zsytrf_rook_rec(const char *,
const int *, const int *, int *, - double *, const int *, int *, double *, const int *, int *); +static void RELAPACK_zsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, + double *, const blasint *, blasint *, double *, const blasint *, blasint *); /** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html * */ void RELAPACK_zsytrf_rook( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info + const char *uplo, const blasint *n, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *lWork, blasint *info ) { // Required work size - const int cleanlWork = *n * (*n / 2); - int minlWork = cleanlWork; + const blasint cleanlWork = *n * (*n / 2); + blasint minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf_rook( #endif if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - int nout; + blasint nout; // Recursive kernel RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf_rook( /** zsytrf_rook's recursive compute kernel */ static void RELAPACK_zsytrf_rook_rec( - const char *uplo, const int *n_full, const int *n, int *n_out, - double *A, const int *ldA, int *ipiv, - double *Work, const int *ldWork, int *info + const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, + double *A, const blasint *ldA, blasint *ipiv, + double *Work, const blasint *ldWork, blasint *info ) { // top recursion level? - const int top = *n_full == *n; + const blasint top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zsytrf_rook_rec( return; } - int info1, info2; + blasint info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; - const int n_rest = *n_full - *n; + const blasint n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - int n1 = ZREC_SPLIT(*n); - int n2 = *n - n1; + blasint n1 = ZREC_SPLIT(*n); + blasint n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const int n_full2 = *n_full - n1; + const blasint n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const int ldWork_BR = top ? n2 : *ldWork; + const blasint ldWork_BR = top ? 
n2 : *ldWork; // ipiv_T // ipiv_B - int *const ipiv_B = ipiv + n1; + blasint *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zsytrf_rook_rec( n2 = n2_out; // shift pivots - int i; + blasint i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - int n2 = ZREC_SPLIT(*n); - int n1 = *n - n2; + blasint n2 = ZREC_SPLIT(*n); + blasint n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - int n2_out; + blasint n2_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const int n2_diff = n2 - n2_out; + const blasint n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const int n_full1 = *n_full - n2; + const blasint n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const int ldWork_L = top ? n1 : *ldWork; + const blasint ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - int n1_out; + blasint n1_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const int n_restp1 = n_rest + 1; + const blasint n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c index 9e111fe0cd..4dbf8733af 100644 --- a/relapack/src/zsytrf_rook_rec2.c +++ b/relapack/src/zsytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static int c__1 = 1; +static blasint c__1 = 1; /** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static int c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm.
* */ -/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n, - int *nb, int *kb, doublecomplex *a, int *lda, int * - ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, blasint *n, + blasint *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * + ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) { /* System generated locals */ - int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -38,26 +38,26 @@ static int c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, p; + static blasint j, k, p; static doublecomplex t, r1, d11, d12, d21, d22; - static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static int imax, jmax; + static blasint imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - extern /* Subroutine */ int zscal_(int *, doublecomplex *, - doublecomplex *, int *); - static int itemp, kstep; - extern /* Subroutine */ int zgemv_(char *, int *, int *, - doublecomplex *, doublecomplex *, int *, doublecomplex *, - int *, doublecomplex *, doublecomplex *, int *, ftnlen), - zcopy_(int *, doublecomplex *, int *, doublecomplex *, - int *), zswap_(int *, doublecomplex *, int *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zscal_(blasint *, doublecomplex *, + doublecomplex *, blasint *); + static blasint itemp, kstep; + extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, + doublecomplex *, doublecomplex *, blasint *, doublecomplex *, + blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), + zcopy_(blasint *, doublecomplex *, blasint *, doublecomplex *, + blasint *), zswap_(blasint *, doublecomplex *, blasint *, + doublecomplex *, blasint *); extern double dlamch_(char *, ftnlen); static double absakk, colmax; - extern int izamax_(int *, doublecomplex *, int *); + extern blasint izamax_(blasint *, doublecomplex *, blasint *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c index 2c8a35256d..6a41475e86 100644 --- a/relapack/src/ztgsyl.c +++ b/relapack/src/ztgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include <math.h> -static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, - const int *, const double *, const int *, const double *, const int *, - double *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, double *, double *, int *); +static void RELAPACK_ztgsyl_rec(const char *, const blasint *, const blasint *, + const blasint *, const double *, const blasint *, const double *, const blasint *, + double *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, double *, double *, blasint *); /** ZTGSYL solves the generalized Sylvester equation.
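The xerbla calls gaining a strlen(...) argument throughout these files reflect the Fortran calling convention: xerbla's SRNAME is a CHARACTER*(*) argument, and compilers pass such strings as a pointer plus a hidden length appended after the declared arguments, so a C caller that omits the length leaves the callee reading an arbitrary value. A toy illustration with a hypothetical xerbla_like stand-in (not the real routine), showing that the callee sees only (pointer, length) and never relies on a NUL terminator:

#include <stdio.h>
#include <string.h>

typedef int ftnlen;   /* hidden-length type, following the f2c convention */

/* Stand-in for a Fortran routine with a CHARACTER*(*) argument. */
static void xerbla_like(const char *srname, int *info, ftnlen len) {
    printf("** On entry to %.*s parameter number %d had an illegal value\n",
           (int)len, srname, *info);
}

int main(void) {
    int minfo = 20;
    xerbla_like("ZTGSYL", &minfo, (ftnlen)strlen("ZTGSYL"));
    return 0;
}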
@@ -14,21 +14,21 @@ static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html * */ void RELAPACK_ztgsyl( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ijob, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info + double *Work, const blasint *lWork, blasint *iWork, blasint *info ) { // Parse arguments - const int notran = LAPACK(lsame)(trans, "N"); - const int tran = LAPACK(lsame)(trans, "C"); + const blasint notran = LAPACK(lsame)(trans, "N"); + const blasint tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - int lwmin = 1; + blasint lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ztgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTGSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTGSYL", &minfo, strlen("ZTGSYL")); return; } @@ -74,8 +74,8 @@ void RELAPACK_ztgsyl( // Constant const double ZERO[] = { 0., 0. }; - int isolve = 1; - int ifunc = 0; + blasint isolve = 1; + blasint ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ } double scale2; - int iround; + blasint iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; double dscale = 0; @@ -119,13 +119,13 @@ /** ztgsyl's recursive compute kernel */ static void RELAPACK_ztgsyl_rec( - const char *trans, const int *ifunc, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, + const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, + const double *D, const blasint *ldD, const double *E, const blasint *ldE, + double *F, const blasint *ldF, double *scale, double *dsum, double *dscale, - int *info + blasint *info ) { if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) { @@ -137,18 +137,18 @@ // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0.
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = ZREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = ZREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ztgsyl_rec( } } else { // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c index 82b2c88031..567ef115a8 100644 --- a/relapack/src/ztrsyl.c +++ b/relapack/src/ztrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, - const int *, const int *, const double *, const int *, const double *, - const int *, double *, const int *, double *, int *); +static void RELAPACK_ztrsyl_rec(const char *, const char *, const blasint *, + const blasint *, const blasint *, const double *, const blasint *, const double *, + const blasint *, double *, const blasint *, double *, blasint *); /** ZTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html * */ void RELAPACK_ztrsyl( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { // Check arguments - const int notransA = LAPACK(lsame)(tranA, "N"); - const int ctransA = LAPACK(lsame)(tranA, "C"); - const int notransB = LAPACK(lsame)(tranB, "N"); - const int ctransB = LAPACK(lsame)(tranB, "C"); + const blasint notransA = LAPACK(lsame)(tranA, "N"); + const blasint ctransA = LAPACK(lsame)(tranA, "C"); + const blasint notransB = LAPACK(lsame)(tranB, "N"); + const blasint ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ztrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTRSYL", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo, strlen("ZTRSYL")); return; } @@ -58,11 +58,11 @@ void RELAPACK_ztrsyl( /** ztrsyl's recursive compute kernel */ static void RELAPACK_ztrsyl_rec( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info + const char *tranA, const char *tranB, const blasint *isgn, + const blasint *m, const blasint *n, + const double *A, const blasint *ldA, const double *B, const blasint *ldB, + double *C, const blasint *ldC, double *scale, + blasint *info ) { if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ztrsyl_rec( const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; const double MSGN[] = { -*isgn, 0. }; - const int iONE[] = { 1 }; + const blasint iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. 
}; - int info1[] = { 0 }; - int info2[] = { 0 }; + blasint info1[] = { 0 }; + blasint info2[] = { 0 }; if (*m > *n) { // Splitting - const int m1 = ZREC_SPLIT(*m); - const int m2 = *m - m1; + const blasint m1 = ZREC_SPLIT(*m); + const blasint m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ } } else { // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index 526ab097cd..edc6ffc6bd 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { - extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); +doublecomplex zdotu_fun(blasint *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { + extern void zdotu_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); doublecomplex result; zdotu_(&result, n, x, incx, y, incy); return result; } #define zdotu_ zdotu_fun -doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { - extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); +doublecomplex zdotc_fun(blasint *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { + extern void zdotc_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); doublecomplex result; zdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { /* Table of constant values */ -static int c__1 = 1; +static blasint c__1 = 1; /** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static int c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms.
* */ /* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int - *isgn, int *m, int *n, doublecomplex *a, int *lda, - doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, - double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) + *isgn, blasint *m, blasint *n, doublecomplex *a, blasint *lda, + doublecomplex *b, blasint *ldb, doublecomplex *c__, blasint *ldc, + double *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -66,7 +66,7 @@ static int c__1 = 1; void d_cnjg(doublecomplex *, doublecomplex *); /* Local variables */ - static int j, k, l; + static blasint j, k, l; static doublecomplex a11; static double db; static doublecomplex x11; @@ -74,23 +74,23 @@ static int c__1 = 1; static doublecomplex vec; static double dum[1], eps, sgn, smin; static doublecomplex suml, sumr; - extern int lsame_(char *, char *, ftnlen, ftnlen); + extern blasint lsame_(char *, char *, ftnlen, ftnlen); /* Double Complex */ doublecomplex zdotc_(int *, - doublecomplex *, int *, doublecomplex *, int *), zdotu_( - int *, doublecomplex *, int *, - doublecomplex *, int *); - extern /* Subroutine */ int dlabad_(double *, double *); + doublecomplex *, blasint *, doublecomplex *, blasint *), zdotu_( + blasint *, doublecomplex *, blasint *, + doublecomplex *, blasint *); + extern /* Subroutine */ blasint dlabad_(double *, double *); extern double dlamch_(char *, ftnlen); static double scaloc; - extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); - extern double zlange_(char *, int *, int *, doublecomplex *, - int *, double *, ftnlen); + extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); + extern double zlange_(char *, blasint *, blasint *, doublecomplex *, + blasint *, double *, ftnlen); static double bignum; - extern /* Subroutine */ int zdscal_(int *, double *, - doublecomplex *, int *); + extern /* Subroutine */ blasint zdscal_(blasint *, double *, + doublecomplex *, blasint *); /* Double Complex */ doublecomplex zladiv_(doublecomplex *, doublecomplex *); - static int notrna, notrnb; + static blasint notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c index ac9fe7bd48..3f6606d84b 100644 --- a/relapack/src/ztrtri.c +++ b/relapack/src/ztrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, - double *, const int *, int *); +static void RELAPACK_ztrtri_rec(const char *, const char *, const blasint *, + double *, const blasint *, blasint *); /** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A.
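As background to the int -> blasint conversions themselves: blasint tracks the integer width of the BLAS/LAPACK interface, so in an INTERFACE64 (64-bit integer) build it is 8 bytes wide while a hard-coded int stays at 4, and passing an int's address to a routine expecting blasint makes the callee read past the object. A minimal sketch, assuming a typedef that mirrors (but is not copied from) the OpenBLAS configuration:

#include <stdint.h>
#include <stdio.h>

#ifdef USE64BITINT
typedef int64_t blasint;    /* INTERFACE64=1: 64-bit BLAS integers */
#else
typedef int32_t blasint;    /* default LP64 interface */
#endif

/* Stand-in for any LAPACK-style routine taking a dimension by pointer. */
static void takes_blasint(const blasint *n) {
    printf("n = %lld\n", (long long)*n);
}

int main(void) {
    blasint n = 1000;           /* width always matches the callee */
    takes_blasint(&n);
    /* The pre-patch hazard: int n32 = 1000; takes_blasint((blasint *)&n32);
     * would read 8 bytes from a 4-byte object when USE64BITINT is set. */
    return 0;
}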
@@ -11,16 +11,16 @@ static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html * */ void RELAPACK_ztrtri( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ) { // Check arguments - const int lower = LAPACK(lsame)(uplo, "L"); - const int upper = LAPACK(lsame)(uplo, "U"); - const int nounit = LAPACK(lsame)(diag, "N"); - const int unit = LAPACK(lsame)(diag, "U"); + const blasint lower = LAPACK(lsame)(uplo, "L"); + const blasint upper = LAPACK(lsame)(uplo, "U"); + const blasint nounit = LAPACK(lsame)(diag, "N"); + const blasint unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ztrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const int minfo = -*info; - LAPACK(xerbla)("ZTRTRI", &minfo); + const blasint minfo = -*info; + LAPACK(xerbla)("ZTRTRI", &minfo, strlen("ZTRTRI")); return; } @@ -42,7 +42,7 @@ void RELAPACK_ztrtri( // check for singularity if (nounit) { - int i; + blasint i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ztrtri( /** ztrtri's recursive compute kernel */ static void RELAPACK_ztrtri_rec( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info + const char *uplo, const char *diag, const blasint *n, + double *A, const blasint *ldA, + blasint *info ){ if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ztrtri_rec( const double MONE[] = { -1. }; // Splitting - const int n1 = ZREC_SPLIT(*n); - const int n2 = *n - n1; + const blasint n1 = ZREC_SPLIT(*n); + const blasint n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7e..dc306501f2 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -61,7 +61,7 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if (MSVC) +if (MSVC AND BUILD_SHARED_LIBS) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. diff --git a/utest/Makefile b/utest/Makefile index e40b3c6db5..550a655691 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -37,4 +37,3 @@ clean: -rm -f *.o $(UTESTBIN) libs: - diff --git a/utest/ctest.h b/utest/ctest.h index f297dafbae..d316b14943 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -83,10 +83,6 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if _MSC_VER < 1900 -#define snprintf _snprintf -#endif - #ifndef __cplusplus #define inline __inline #endif
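On the ctest.h hunk above: in a #if expression, an identifier that is not defined evaluates to 0, so the removed #if _MSC_VER < 1900 was also taken by every non-MSVC compiler and remapped snprintf to MSVC's _snprintf everywhere. If such a mapping were ever needed again, guarding with defined() scopes it to old MSVC only (a sketch based on standard preprocessor semantics, not on an OpenBLAS comment):

/* Only pre-VS2015 MSVC lacked a conforming snprintf. */
#if defined(_MSC_VER) && _MSC_VER < 1900
#define snprintf _snprintf
#endif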