Merge branch 'develop' into topic/sgemm_direct_sme1

OpenMathLib · Feb 2, 2025 · bcd59e0 · bcd59e0
2 parents c1d80e6 + c139b63
commit bcd59e0
Show file tree

Hide file tree

Showing 183 changed files with 7,525 additions and 2,609 deletions.
diff --git a/.cirrus.yml b/.cirrus.yml
@@ -125,9 +125,9 @@ task:
   - make USE_OPENMP=1
 
 FreeBSD_task:
-  name: FreeBSD-gcc12
+  name: FreeBSD-gcc
   freebsd_instance:
-    image_family: freebsd-13-3
+    image_family: freebsd-14-1
   install_script:
   - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
   compile_script:
@@ -136,9 +136,9 @@ FreeBSD_task:
 
 
 FreeBSD_task:
-  name: freebsd-gcc12-ilp64
+  name: freebsd-gcc-ilp64
   freebsd_instance:
-    image_family: freebsd-13-3
+    image_family: freebsd-14-1
   install_script:
   - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
   compile_script:
@@ -148,7 +148,7 @@ FreeBSD_task:
 FreeBSD_task:
   name: FreeBSD-clang-openmp
   freebsd_instance:
-    image_family: freebsd-13-3
+    image_family: freebsd-14-1
   install_script:
   - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
   - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so

diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
-          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
+          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev
 
       - name: checkout qemu
         uses: actions/checkout@v3
@@ -52,6 +52,7 @@ jobs:
           wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
           cd qemu
           patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
           ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
           make -j$(nproc)
           make install

diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-22.04]
         fortran: [gfortran]
         build: [make]
         pyver: ["3.12"]
@@ -147,7 +147,7 @@ jobs:
           OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'
 
       - name: Run benchmarks
-        uses: CodSpeedHQ/action@v2
+        uses: CodSpeedHQ/action@v3
         with:
           token: ${{ secrets.CODSPEED_TOKEN }}
           run: |

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -23,7 +23,7 @@ jobs:
           python-version: "3.10"
 
       - name: Install MkDocs and doc theme packages
-        run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin
+        run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin mkdocs-mermaid2-plugin
 
       - name: Build docs site
         run: mkdocs build

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
@@ -43,7 +43,9 @@ jobs:
         run: |
           if [ "$RUNNER_OS" == "Linux" ]; then
             sudo apt-get update
-            sudo apt-get install -y gfortran cmake ccache libtinfo5
+            sudo apt-get install -y gfortran cmake ccache
+            wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
+            sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
           elif [ "$RUNNER_OS" == "macOS" ]; then
             # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
             brew reinstall gcc
@@ -158,7 +160,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
+        msystem: [UCRT64, MINGW32, CLANG64]
         idx: [int32, int64]
         build-type: [Release]
         include:
@@ -174,14 +176,6 @@ jobs:
             idx: int32
             target-prefix: mingw-w64-clang-x86_64
             fc-pkg: fc
-            # Compiling with Flang 16 seems to cause test errors on machines
-            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
-            no-avx512-flags: -DNO_AVX512=1
-          - msystem: CLANG32
-            idx: int32
-            target-prefix: mingw-w64-clang-i686
-            fc-pkg: cc
-            c-lapack-flags: -DC_LAPACK=ON
           - msystem: UCRT64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
@@ -192,9 +186,6 @@ jobs:
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-clang-x86_64
             fc-pkg: fc
-            # Compiling with Flang 16 seems to cause test errors on machines
-            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
-            no-avx512-flags: -DNO_AVX512=1
           - msystem: UCRT64
             idx: int32
             target-prefix: mingw-w64-ucrt-x86_64
@@ -203,8 +194,6 @@ jobs:
         exclude:
           - msystem: MINGW32
             idx: int64
-          - msystem: CLANG32
-            idx: int64
 
     defaults:
       run:
@@ -280,8 +269,6 @@ jobs:
                 -DNUM_THREADS=64 \
                 -DTARGET=CORE2 \
                 ${{ matrix.idx64-flags }} \
-                ${{ matrix.c-lapack-flags }} \
-                ${{ matrix.no-avx512-flags }} \
                 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
                 -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
                 ..
@@ -369,3 +356,23 @@ jobs:
       - name: Build OpenBLAS
         run: |
           make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }}
+
+  neoverse_build:  # native arm64 build of TARGET=NEOVERSEN2 followed by the lapack-test target
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"  # only run in the OpenMathLib/OpenBLAS repository
+    runs-on: ubuntu-24.04-arm
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Install Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gcc gfortran make
+
+      - name: Build OpenBLAS  # uses $(nproc) command substitution; ${nproc} is an unset variable and would leave a bare, unlimited -j
+        run: |
+          make -j$(nproc) TARGET=NEOVERSEN2
+          make -j$(nproc) TARGET=NEOVERSEN2 lapack-test
+    
+
diff --git a/.github/workflows/harmonyos.yml b/.github/workflows/harmonyos.yml
@@ -0,0 +1,37 @@
+name: harmonyos  # cross-build check using the ohos-sdk native toolchain (ARMV8, no Fortran)
+
+on: [push, pull_request]
+
+concurrency:  # one live run per workflow and PR head ref (or per run id when head_ref is empty); newer runs cancel older ones
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"  # only run in the OpenMathLib/OpenBLAS repository
+    runs-on: ubuntu-latest
+    env:  # both values are pasted into the build-armv8 run script through the env context
+      OHOS_NDK_CMAKE: $GITHUB_WORKSPACE/ohos-sdk/linux/native/build-tools/cmake/bin/cmake  # NOTE(review): $GITHUB_WORKSPACE presumably stays literal here and is expanded by the shell in the run step - confirm
+      COMMON_CMAKE_OPTIONS: |
+        -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \
+        -DCMAKE_INSTALL_PREFIX=install \
+        -DCMAKE_BUILD_TYPE=Release \
+    steps:
+    - uses: actions/checkout@v4
+    - name: ndk-install  # download the ohos-sdk tarball, unpack it, then unzip the native-linux-x64 package under ohos-sdk/linux
+      run: |
+        wget https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz
+        tar -xf ohos-sdk-windows_linux-public.tar.gz
+        cd ohos-sdk/linux
+        unzip -q native-linux-x64-4.1.7.8-Release.zip
+        cd -
+    - name: build-armv8  # configure in ./build with the SDK's cmake and toolchain file for arm64-v8a, then build with $(nproc) jobs
+      run: |
+       mkdir build && cd build
+       ${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \
+       -DTARGET=ARMV8 -DNOFORTRAN=1 ..
+       ${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc)
+       
diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Install APT deps
         run: |
           sudo apt-get update
-          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev
 
       - name: Download and install loongarch64-toolchain
         run: |

diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml
@@ -41,14 +41,14 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
-          gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
+          gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev
 
       - name: checkout qemu
         uses: actions/checkout@v3
         with:
           repository: qemu/qemu
           path: qemu
-          ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
+          ref: ae35f033b874c627d81d51070187fbf55f0bf1a7
 
       - name: build qemu
         run: |

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,11 +4,12 @@
 
 cmake_minimum_required(VERSION 3.16.0)
 
+set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S")
 project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 28.dev)
+set(OpenBLAS_PATCH_VERSION 29.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -229,3 +229,14 @@ In chronological order:
 
 * Christopher Daley <https://github.com/cdaley>
   * [2024-01-24] Optimize GEMV forwarding on ARM64 systems
+
+* Aniket P. Garade <https://github.com/garadeaniket>   Sushil Pratap Singh <https://github.com/SushilPratap04>  Juliya James <https://github.com/Juliya32> 
+  *  [2024-12-13] Optimized swap and rot  Level-1 BLAS routines with ARM SVE
+
+* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
+  * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1
+  * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel
+
+* Marek Michalowski <https://github.com/michalowski-arm>
+  * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`
+
diff --git a/Changelog.txt b/Changelog.txt
@@ -1,4 +1,99 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.29
+12-Jan-2025
+
+general:
+ - fixed a potential NULL pointer dereference in multithreaded builds
+ - added function aliases for GEMMT using its new name GEMMTR adopted by Reference-BLAS
+ - fixed a build failure when building without LAPACK_DEPRECATED functions
+ - the minimum required CMake version for CMake-based builds was raised to 3.16.0 in order 
+   to remove many compatibility and deprecation warnings
+ - added more detailed CMake rules for OpenMP builds (mainly to support recent LLVM)
+ - fixed the behavior of the recently added CBLAS_?GEMMT functions with row-major data
+ - improved thread scaling of multithreaded SBGEMV 
+ - improved thread scaling of multithreaded TRTRI 
+ - fixed compilation of the CBLAS testsuite with gcc14 (and no Fortran compiler)
+ - added support for option handling changes in flang-new from LLVM18 onwards 
+ - added support for recent calling conventions changes in Cray and NVIDIA compilers
+ - added support for compilation with the NAG Fortran compiler
+ - fixed placement of the -fopenmp flag and libsuffix in the generated pkgconfig file
+ - improved the CMakeConfig file generated by the Makefile build
+ - fixed const-correctness of cblas_?geadd in cblas.h
+ - fixed a potential inaccuracy in multithreaded BLAS3 calls
+ - fixed empty implementations of get/set_affinity that print a warning in OpenMP builds
+ - fixed function signatures for TRTRS in the converted C version of LAPACK
+ - fixed omission of several single-precision LAPACK symbols in the shared library 
+ - improved build instructions for the provided "pybench" benchmarks
+ - improved documentation, including added build instructions for WoA and HarmonyOS
+   as well as descriptions of environment variables that affect build and runtime behavior
+ - added a separate "make install_tests" target for use with cross-compilations
+ - integrated improvements and corrections from Reference-LAPACK:
+   - removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062)
+   - fixed the leading dimension for B in tests for GGEV (LAPACK PR 1064)
+   - replaced the ?LARFT functions with a recursive implementation (LAPACK PR 1080)
+
+arm:
+ - fixed build with recent versions of the NDK (missing .type declaration of symbols)
+
+arm64:
+ - fixed a long-standing bug in the (generic) c/zgemm_beta kernel that could lead to
+   reads and writes outside the array bounds in some circumstances
+ - rewrote cpu autodetection to scan all cores and return the highest performing type
+ - improved the DGEMM performance for SVE targets and small matrix sizes
+ - improved dimension criteria for forwarding from GEMM to GEMV kernels
+ - added SVE kernels for ROT and SWAP
+ - improved SVE kernels for SGEMV and DGEMV on A64FX and NEOVERSEV1
+ - added support for using the "small matrix" kernels with CMake as well
+ - fixed compilation on Windows on Arm
+ - improved compile-time detection of SVE capability
+ - added cpu autodetection and initial support for Apple M4
+ - added support for compilation on systems running iOS
+ - added support for compilation on NetBSD ("evbarm" architecture)
+ - fixed NRM2 implementations for generic SVE targets and the Neoverse N2
+ - fixed compilation for SVE-capable targets with the NVIDIA compiler
+
+x86_64:
+ - fixed a wrong storage size in the SBGEMV kernel for Cooper Lake
+ - added cpu autodetection for Intel Granite Rapids
+ - added cpu autodetection for AMD Ryzen 5 series
+ - added optimized SOMATCOPY_CT for AVX-capable targets
+ - fixed the fallback implementation of GEMM3M in GENERIC builds
+ - tentatively re-enabled builds with the EXPRECISION option
+ - worked around a miscompilation of tests with mingw32-gfortran14
+ - added support for compilation with the Intel oneAPI 2025.0 compiler on Windows
+
+power:
+ - fixed multithreaded SBGEMM
+ - fixed a CMake build problem on POWER10
+ - improved the performance of SGEMV
+ - added vectorized implementations of SBGEMV and support for forwarding 1xN SBGEMM to them
+ - fixed illegal instructions and potential memory overflow in SGEMM on PPCG4
+ - fixed handling of NaN and Inf arguments in SSCAL and DSCAL on PPC440, G4 and 970
+ - added improved CGEMM and ZGEMM kernels for POWER10
+ - added Makefile logic to remove all optimization flags in DEBUG builds
+
+mips64:
+ - fixed compilation with gcc14
+ - fixed GEMM parameter selection for the MIPS64_GENERIC target
+ - fixed a potential build failure when compiling with OpenMP
+
+loongarch64:
+ - fixed compilation for Loongson3 with recent versions of gmake
+ - fixed a potential loss of precision in Loongson3A GEMM
+ - fixed a potential build failure when compiling with OpenMP
+ - added optimized SOMATCOPY for LASX-capable targets
+ - introduced a new cpu naming scheme while retaining compatibility
+ - added support for cross-compiling Loongarch64 targets with CMake
+ - added support for compilation with LLVM
+
+riscv64:
+ - removed thread yielding overhead caused by sched_yield
+ - replaced some non-standard intrinsics with their official names
+ - fixed and sped up the implementations of CGEMM/ZGEMM TCOPY for vector lengths 128 and 256
+ - improved the performance of SNRM2/DNRM2 for RVV1.0 targets
+ - added optimized ?OMATCOPY_CN kernels for RVV1.0 targets
+
 ====================================================================
 Version 0.3.28
  8-Aug-2024

diff --git a/Makefile b/Makefile
@@ -426,6 +426,9 @@ dummy :
 install :
 	$(MAKE) -f Makefile.install install
 
+install_tests :
+	$(MAKE) -f Makefile.install install_tests
+
 clean ::
 	@for d in $(SUBDIRS_ALL) ; \
 	do if test -d $$d; then \

diff --git a/Makefile.arm64 b/Makefile.arm64
@@ -356,4 +356,31 @@ endif
 
 endif
 
+else
+# NVIDIA HPC options necessary to enable SVE in the compiler
+ifeq ($(CORE), THUNDERX2T99)
+CCOMMON_OPT += -tp=thunderx2t99
+FCOMMON_OPT += -tp=thunderx2t99
+endif
+ifeq ($(CORE), NEOVERSEN1)
+CCOMMON_OPT += -tp=neoverse-n1
+FCOMMON_OPT += -tp=neoverse-n1
+endif
+ifeq ($(CORE), NEOVERSEV1)
+CCOMMON_OPT += -tp=neoverse-v1
+FCOMMON_OPT += -tp=neoverse-v1
+endif
+ifeq ($(CORE), NEOVERSEV2)
+CCOMMON_OPT += -tp=neoverse-v2
+FCOMMON_OPT += -tp=neoverse-v2
+endif
+ifeq ($(CORE), ARMV8SVE)
+CCOMMON_OPT += -tp=neoverse-v2
+FCOMMON_OPT += -tp=neoverse-v2
+endif
+ifeq ($(CORE), ARMV9SVE)
+CCOMMON_OPT += -tp=neoverse-v2
+FCOMMON_OPT += -tp=neoverse-v2
+endif
+
 endif