Skip to content

Commit

Permalink
Merge branch 'develop' into topic/sgemm_direct_sme1
Browse files Browse the repository at this point in the history
  • Loading branch information
vaiskv authored Feb 2, 2025
2 parents c1d80e6 + c139b63 commit bcd59e0
Show file tree
Hide file tree
Showing 183 changed files with 7,525 additions and 2,609 deletions.
10 changes: 5 additions & 5 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@ task:
- make USE_OPENMP=1

FreeBSD_task:
name: FreeBSD-gcc12
name: FreeBSD-gcc
freebsd_instance:
image_family: freebsd-13-3
image_family: freebsd-14-1
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
compile_script:
Expand All @@ -136,9 +136,9 @@ FreeBSD_task:


FreeBSD_task:
name: freebsd-gcc12-ilp64
name: freebsd-gcc-ilp64
freebsd_instance:
image_family: freebsd-13-3
image_family: freebsd-14-1
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
compile_script:
Expand All @@ -148,7 +148,7 @@ FreeBSD_task:
FreeBSD_task:
name: FreeBSD-clang-openmp
freebsd_instance:
image_family: freebsd-13-3
image_family: freebsd-14-1
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/c910v.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev
- name: checkout qemu
uses: actions/checkout@v3
Expand All @@ -52,6 +52,7 @@ jobs:
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
cd qemu
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
make -j$(nproc)
make install
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/codspeed-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
os: [ubuntu-22.04]
fortran: [gfortran]
build: [make]
pyver: ["3.12"]
Expand Down Expand Up @@ -147,7 +147,7 @@ jobs:
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'
- name: Run benchmarks
uses: CodSpeedHQ/action@v2
uses: CodSpeedHQ/action@v3
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
python-version: "3.10"

- name: Install MkDocs and doc theme packages
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin mkdocs-mermaid2-plugin

- name: Build docs site
run: mkdocs build
Expand Down
41 changes: 24 additions & 17 deletions .github/workflows/dynamic_arch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ jobs:
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get update
sudo apt-get install -y gfortran cmake ccache libtinfo5
sudo apt-get install -y gfortran cmake ccache
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
brew reinstall gcc
Expand Down Expand Up @@ -158,7 +160,7 @@ jobs:
strategy:
fail-fast: false
matrix:
msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
msystem: [UCRT64, MINGW32, CLANG64]
idx: [int32, int64]
build-type: [Release]
include:
Expand All @@ -174,14 +176,6 @@ jobs:
idx: int32
target-prefix: mingw-w64-clang-x86_64
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: CLANG32
idx: int32
target-prefix: mingw-w64-clang-i686
fc-pkg: cc
c-lapack-flags: -DC_LAPACK=ON
- msystem: UCRT64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
Expand All @@ -192,9 +186,6 @@ jobs:
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-clang-x86_64
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: UCRT64
idx: int32
target-prefix: mingw-w64-ucrt-x86_64
Expand All @@ -203,8 +194,6 @@ jobs:
exclude:
- msystem: MINGW32
idx: int64
- msystem: CLANG32
idx: int64

defaults:
run:
Expand Down Expand Up @@ -280,8 +269,6 @@ jobs:
-DNUM_THREADS=64 \
-DTARGET=CORE2 \
${{ matrix.idx64-flags }} \
${{ matrix.c-lapack-flags }} \
${{ matrix.no-avx512-flags }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
Expand Down Expand Up @@ -369,3 +356,23 @@ jobs:
- name: Build OpenBLAS
run: |
make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }}
neoverse_build:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-24.04-arm

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y gcc gfortran make
- name: Build OpenBLAS
run: |
make -j${nproc} TARGET=NEOVERSEN2
make -j${nproc} TARGET=NEOVERSEN2 lapack-test

37 changes: 37 additions & 0 deletions .github/workflows/harmonyos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: harmonyos

on: [push, pull_request]

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

permissions:
contents: read # to fetch code (actions/checkout)

jobs:
build:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-latest
env:
OHOS_NDK_CMAKE: $GITHUB_WORKSPACE/ohos-sdk/linux/native/build-tools/cmake/bin/cmake
COMMON_CMAKE_OPTIONS: |
-DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \
-DCMAKE_INSTALL_PREFIX=install \
-DCMAKE_BUILD_TYPE=Release \
steps:
- uses: actions/checkout@v4
- name: ndk-install
run: |
wget https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz
tar -xf ohos-sdk-windows_linux-public.tar.gz
cd ohos-sdk/linux
unzip -q native-linux-x64-4.1.7.8-Release.zip
cd -
- name: build-armv8
run: |
mkdir build && cd build
${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \
-DTARGET=ARMV8 -DNOFORTRAN=1 ..
${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc)
2 changes: 1 addition & 1 deletion .github/workflows/loongarch64_clang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Install APT deps
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev
- name: Download and install loongarch64-toolchain
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/mips64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev
- name: checkout qemu
uses: actions/checkout@v3
with:
repository: qemu/qemu
path: qemu
ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
ref: ae35f033b874c627d81d51070187fbf55f0bf1a7

- name: build qemu
run: |
Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

cmake_minimum_required(VERSION 3.16.0)

set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S")
project(OpenBLAS C ASM)

set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 28.dev)
set(OpenBLAS_PATCH_VERSION 29.dev)

set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

Expand Down
11 changes: 11 additions & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,14 @@ In chronological order:

* Christopher Daley <https://github.com/cdaley>
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems

* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel

* Marek Michalowski <https://github.com/michalowski-arm>
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1`

95 changes: 95 additions & 0 deletions Changelog.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,99 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.29
12-Jan-2025

general:
- fixed a potential NULL pointer dereference in multithreaded builds
- added function aliases for GEMMT using its new name GEMMTR adopted by Reference-BLAS
- fixed a build failure when building without LAPACK_DEPRECATED functions
- the minimum required CMake version for CMake-based builds was raised to 3.16.0 in order
to remove many compatibility and deprecation warnings
- added more detailed CMake rules for OpenMP builds (mainly to support recent LLVM)
- fixed the behavior of the recently added CBLAS_?GEMMT functions with row-major data
- improved thread scaling of multithreaded SBGEMV
- improved thread scaling of multithreaded TRTRI
- fixed compilation of the CBLAS testsuite with gcc14 (and no Fortran compiler)
- added support for option handling changes in flang-new from LLVM18 onwards
- added support for recent calling conventions changes in Cray and NVIDIA compilers
- added support for compilation with the NAG Fortran compiler
- fixed placement of the -fopenmp flag and libsuffix in the generated pkgconfig file
- improved the CMakeConfig file generated by the Makefile build
- fixed const-correctness of cblas_?geadd in cblas.h
- fixed a potential inaccuracy in multithreaded BLAS3 calls
- fixed empty implementations of get/set_affinity that print a warning in OpenMP builds
- fixed function signatures for TRTRS in the converted C version of LAPACK
- fixed omission of several single-precision LAPACK symbols in the shared library
- improved build instructions for the provided "pybench" benchmarks
- improved documentation, including added build instructions for WoA and HarmonyOS
as well as descriptions of environment variables that affect build and runtime behavior
- added a separate "make install_tests" target for use with cross-compilations
- integrated improvements and corrections from Reference-LAPACK:
- removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062)
- fixed the leading dimension for B in tests for GGEV (LAPACK PR 1064)
- replaced the ?LARFT functions with a recursive implementation (LAPACK PR 1080)

arm:
- fixed build with recent versions of the NDK (missing .type declaration of symbols)

arm64:
- fixed a long-standing bug in the (generic) c/zgemm_beta kernel that could lead to
reads and writes outside the array bounds in some circumstances
- rewrote cpu autodetection to scan all cores and return the highest performing type
- improved the DGEMM performance for SVE targets and small matrix sizes
- improved dimension criteria for forwarding from GEMM to GEMV kernels
- added SVE kernels for ROT and SWAP
- improved SVE kernels for SGEMV and DGEMV on A64FX and NEOVERSEV1
- added support for using the "small matrix" kernels with CMake as well
- fixed compilation on Windows on Arm
- improved compile-time detection of SVE capability
- added cpu autodetection and initial support for Apple M4
- added support for compilation on systems running IOS
- added support for compilation on NetBSD ("evbarm" architecture)
- fixed NRM2 implementations for generic SVE targets and the Neoverse N2
- fixed compilation for SVE-capable targets with the NVIDIA compiler

x86_64:
- fixed a wrong storage size in the SBGEMV kernel for Cooper Lake
- added cpu autodetection for Intel Granite Rapids
- added cpu autodetection for AMD Ryzen 5 series
- added optimized SOMATCOPY_CT for AVX-capable targets
- fixed the fallback implementation of GEMM3M in GENERIC builds
- tentatively re-enabled builds with the EXPRECISION option
- worked around a miscompilation of tests with mingw32-gfortran14
- added support for compilation with the Intel oneAPI 2025.0 compiler on Windows

power:
- fixed multithreaded SBGEMM
- fixed a CMake build problem on POWER10
- improved the performance of SGEMV
- added vectorized implementations of SBGEMV and support for forwarding 1xN SBGEMM to them
- fixed illegal instructions and potential memory overflow in SGEMM on PPCG4
- fixed handling of NaN and Inf arguments in SSCAL and DSCAL on PPC440,G4 and 970
- added improved CGEMM and ZGEMM kernels for POWER10
- added Makefile logic to remove all optimization flags in DEBUG builds

mips64:
- fixed compilation with gcc14
- fixed GEMM parameter selection for the MIPS64_GENERIC target
- fixed a potential build failure when compiling with OpenMP

loongarch64:
- fixed compilation for Loongson3 with recent versions of gmake
- fixed a potential loss of precision in Loongson3A GEMM
- fixed a potential build failure when compiling with OpenMP
- added optimized SOMATCOPY for LASX-capable targets
- introduced a new cpu naming scheme while retaining compatibility
- added support for cross-compiling Loongarch64 targets with CMake
- added support for compilation with LLVM

riscv64:
- removed thread yielding overhead caused by sched_yield
- replaced some non-standard intrinsics with their official names
- fixed and sped up the implementations of CGEMM/ZGEMM TCOPY for vector lenghts 128 and 256
- improved the performance of SNRM2/DNRM2 for RVV1.0 targets
- added optimized ?OMATCOPY_CN kernels for RVV1.0 targets

====================================================================
Version 0.3.28
8-Aug-2024
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,9 @@ dummy :
install :
$(MAKE) -f Makefile.install install

install_tests :
$(MAKE) -f Makefile.install install_tests

clean ::
@for d in $(SUBDIRS_ALL) ; \
do if test -d $$d; then \
Expand Down
27 changes: 27 additions & 0 deletions Makefile.arm64
Original file line number Diff line number Diff line change
Expand Up @@ -356,4 +356,31 @@ endif

endif

else
# NVIDIA HPC options necessary to enable SVE in the compiler
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -tp=thunderx2t99
FCOMMON_OPT += -tp=thunderx2t99
endif
ifeq ($(CORE), NEOVERSEN1)
CCOMMON_OPT += -tp=neoverse-n1
FCOMMON_OPT += -tp=neoverse-n1
endif
ifeq ($(CORE), NEOVERSEV1)
CCOMMON_OPT += -tp=neoverse-v1
FCOMMON_OPT += -tp=neoverse-v1
endif
ifeq ($(CORE), NEOVERSEV2)
CCOMMON_OPT += -tp=neoverse-v2
FCOMMON_OPT += -tp=neoverse-v2
endif
ifeq ($(CORE), ARMV8SVE)
CCOMMON_OPT += -tp=neoverse-v2
FCOMMON_OPT += -tp=neoverse-v2
endif
ifeq ($(CORE), ARMV9SVE)
CCOMMON_OPT += -tp=neoverse-v2
FCOMMON_OPT += -tp=neoverse-v2
endif

endif
Loading

0 comments on commit bcd59e0

Please sign in to comment.