Native half-precision floating-point arithmetic not working on Ampere Altra (AArch64 architecture with FP16 support) #49987
Replies: 5 comments 5 replies
-
What is and please use ``` for code -formatting |
Beta Was this translation helpful? Give feedback.
-
I believe the problem here is that BLAS/LAPACK do not have 16 bit support yet. It might work if you use https://github.com/JuliaLinearAlgebra/RecursiveFactorization.jl. |
Beta Was this translation helpful? Give feedback.
-
In addition, I see the same issue with basic operations (matrix multiplication, division, and transposition):
using Distributions, LinearAlgebra, BenchmarkTools, DataFrames, CSV
"""
    CREATE_DATABASE(N, K, T)

Build two random test matrices with element type `T`.

The entries are drawn with `rand` (default element type) and then converted
to `T`. The first returned matrix is `K`×`N`, the second is `N`×`K`.
"""
function CREATE_DATABASE(N, K, T)
    # Draw in the default precision first, then convert to the requested type.
    draw(rows, cols) = Array{T}(rand(rows, cols))
    wide = draw(K, N)
    tall = draw(N, K)
    return wide, tall
end
"""
    BASIC(matA, matB, option)

Run one basic linear-algebra operation on `matA` and `matB` and return the
result wrapped in a one-element vector.

# Arguments
- `option == 1`: `matB * matA`
- `option == 2`: `matA * matB`
- `option == 3`: `matB / matA'` (right division by the adjoint of `matA`)

# Throws
- `ArgumentError` if `option` is not 1, 2, or 3.
"""
function BASIC(matA, matB, option)
    res = if option == 1
        matB * matA
    elseif option == 2
        matA * matB
    elseif option == 3
        matB / matA'
    else
        # Previously an unknown option left `res` unassigned and surfaced as an
        # obscure UndefVarError at the return statement.
        throw(ArgumentError("option must be 1, 2, or 3, got $option"))
    end
    return [res]
end
"""
    benchmark_BASIC(matA, matB)

Benchmark the three `BASIC` operations (options 1 to 3) on `matA` and `matB`.

Returns a 3×1 `DataFrame` with the auto-generated column `x1`; row `k` holds
the `time` field of the mean of the `@benchmark` trial for option `k`.
"""
function benchmark_BASIC(matA, matB)
    results = DataFrame(zeros(3, 1), :auto)
    for option in 1:3
        # Interpolate with `$` so the benchmark uses this function's arguments.
        # Without it, BenchmarkTools looks the names up as globals, so the
        # arguments passed to this function would be silently ignored.
        results[option, 1] = mean(@benchmark(BASIC($matA, $matB, $option))).time
    end
    return results
end
# Initialization: create the globals that the benchmark loop below updates.
N = 0
K = 0
matA = 0.
matB = 0.
results = 0.
# Small-data timings
# Alternative sweep: Nsmall = [1000, 500, 100]
Nsmall = [1000]
# Alternative sweep: Ksmall = [25, 20, 15]
Ksmall = [100]
# Named `eltypes` rather than `Type` to avoid shadowing `Base.Type`.
eltypes = [Float64, Float32, Float16]
timmings = DataFrame()
for i in Nsmall
    for j in Ksmall
        for h in eltypes
            # Without this declaration, running the file as a script (rather
            # than pasting it into the REPL) makes these assignments create
            # loop-local variables, and `hcat(timmings, ...)` then fails with
            # an UndefVarError.
            global N, K, matA, matB, results, timmings
            println("Benchmark results for N = $i, K = $j and T = $h:")
            N = i
            K = j
            T = h
            matA, matB = CREATE_DATABASE(N, K, T)
            results = benchmark_BASIC(matA, matB)
            # Append the new 3×1 column and give it a label for this run.
            timmings = hcat(timmings, results)
            rename!(timmings, :x1 => ":t-$N-$K-$T")
            display(timmings)
        end
    end
end
CSV.write("methods_timmings_BASIC.csv", timmings)
Beta Was this translation helpful? Give feedback.
-
Following Valentin's suggestion, I have been working on simpler FP16 cases, such as saxpy (scalar-vector) operations:
using LinearAlgebra, BenchmarkTools
"""
    saxpy!(N, SA, SX, INCX::Int, SY, INCY::Int)

Update `SY` in place with `SY + SA * SX` over `N` elements, in the style of the
reference BLAS `saxpy` routine, and return `nothing`.

# Arguments
- `N`: number of elements to process; nothing happens when `N ≤ 0`.
- `SA`: scalar multiplier; nothing happens when it is zero.
- `SX`, `INCX`: input vector and the stride between the elements used from it.
- `SY`, `INCY`: vector updated in place and the stride between updated elements.

For a negative stride the corresponding vector is walked backwards, starting
from index `(-N + 1) * INC + 1`.

Bounds checks are disabled, so the caller must make sure that `N` and the
strides stay inside both vectors. Iterations run under `Threads.@threads`,
so every iteration must touch a distinct element of `SY` (`INCY` nonzero).
"""
function saxpy!(N,SA,SX,INCX::Int,SY,INCY::Int)
    # Nothing to do for an empty problem or a zero scale factor.
    if N ≤ 0 || iszero(SA)
        return nothing
    end
    if INCX == 1 && INCY == 1
        # Unit stride. A single plain loop produces the same element-wise
        # results as the hand-unrolled reference code.
        Threads.@threads for I in 1:N
            @inbounds SY[I] = SY[I] + SA*SX[I]
        end
    else
        # First index used in each vector (walk backwards for negative strides).
        ix0 = INCX < 0 ? (-N + 1)*INCX + 1 : 1
        iy0 = INCY < 0 ? (-N + 1)*INCY + 1 : 1
        Threads.@threads for I in 1:N
            # Derive the indices from the loop variable. Incrementing shared
            # counters inside a threaded loop is a data race and gives wrong
            # results when more than one thread is running.
            ix = ix0 + (I - 1)*INCX
            iy = iy0 + (I - 1)*INCY
            @inbounds SY[iy] = SY[iy] + SA*SX[ix]
        end
    end
    return nothing
end
# Data creation: one problem size, scalar and pair of vectors per precision.
N1::Int64 = 10^4
a1::Float64 = 0.3141592653589793
x1 = Float64.(1:N1)
y1 = Float64.(1:N1)
N2::Int32 = 10^4
a2::Float32 = 0.3141592653589793
x2 = Float32.(1:N2)
y2 = Float32.(1:N2)
N3::Int16 = 10^4
a3::Float16 = 0.3141592653589793
x3 = Float16.(1:N3)
y3 = Float16.(1:N3)

# NOTE: every benchmark below updates its `y` vector in place and is evaluated
# many times by @btime, so the contents of y1/y2/y3 are not meaningful afterwards.

# Float64: BLAS, hand-written saxpy!, and fused broadcast.
@btime BLAS.axpy!($a1, $x1, $y1)
@btime saxpy!($N1, $a1, $x1, 1, $y1, 1)
# Fully dotted so the whole expression fuses into one loop with no temporary.
@btime $y1 .= $a1 .* $x1 .+ $y1

# Float32 (uses the Float32 scalar a2 throughout).
@btime BLAS.axpy!($a2, $x2, $y2)
@btime saxpy!($N2, $a2, $x2, 1, $y2, 1)
@btime $y2 .= $a2 .* $x2 .+ $y2

# Float16: BLAS.axpy! is expected to have no method for this element type.
try
    @btime BLAS.axpy!($a3, $x3, $y3)
catch err
    # Only the "no such method" case means missing FP16 support; anything else
    # is a real error and must not be hidden.
    err isa MethodError || rethrow()
    println("FP16 not supported by BLAS")
end
@btime saxpy!($N3, $a3, $x3, 1, $y3, 1)
@btime $y3 .= $a3 .* $x3 .+ $y3
Beta Was this translation helpful? Give feedback.
-
Julia v1.9.0 has amazing new features (native code caching, package extensions, new sorting algorithms, etc.).
For econometric purposes, one of the most salient upgrades is native FP16 arithmetic.
Theoretically, it is a game changer for training models in economics, where single/double precision requirements are unusual (and range issues can be dealt with by using standardized data).
This post (https://julialang.org/blog/2023/04/julia-1.9-highlights/#native_half-precision_floating-point_arithmetic) emphasizes that this new feature is only available on hardware with the appropriate architecture (AArch64 with FP16 ALUs, like Apple's M series or Fujitsu's A64FX).
I'm not sure if I'm doing something wrong, but I tried to use this new feature on a t2a-standard-4 Google Cloud machine (Ampere Altra AArch64 CPUs, which allegedly support FP16 arithmetic). Unfortunately, my Julia code (see below) did not show the expected runtime gains. On the contrary, matrix factorization (particularly the Cholesky decomposition) is significantly slower with FP16 operations (just as on the x86 architecture, which suggests that the FP16 ALUs are not being used).
Has anyone tested this new Julia feature on other AArch64 hardware?
Thanks in advance, Demian
Beta Was this translation helpful? Give feedback.
All reactions