diff --git a/Manifest.toml b/Manifest.toml
index eb6e3b60..e22118bd 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.8.5"
 manifest_format = "2.0"
-project_hash = "7a3272ed59a2ad154cfe72c74eda180b1c208c6b"
+project_hash = "c2f50cf42770de6ef32bf0ade955b592c49b8da5"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra", "Requires"]
@@ -79,9 +79,9 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
 
 [[deps.GPUArrays]]
 deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
-git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1"
+git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "8.8.1"
+version = "9.0.0"
 
 [[deps.GPUArraysCore]]
 deps = ["Adapt"]
diff --git a/Project.toml b/Project.toml
index ebd87f9e..a556381b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -28,7 +28,7 @@ oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 Adapt = "2.0, 3.0"
 CEnum = "0.4"
 ExprTools = "0.1"
-GPUArrays = "8.4"
+GPUArrays = "9"
 GPUCompiler = "0.23"
 KernelAbstractions = "0.9.1"
 LLVM = "6"
diff --git a/src/array.jl b/src/array.jl
index f7d40e1c..03f5e8a3 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -1,36 +1,54 @@
 export oneArray, oneVector, oneMatrix, oneVecOrMat
 
-## array storage
-
-# array storage is shared by arrays that refer to the same data, while keeping track of
-# the number of outstanding references
-
-struct ArrayStorage{B}
-  buffer::B
+## array type
 
-  # the refcount also encodes the state of the array:
-  # < 0: unmanaged
-  # = 0: freed
-  # > 0: referenced
-  refcount::Threads.Atomic{Int}
+function hasfieldcount(@nospecialize(dt))
+  try
+    fieldcount(dt)
+  catch
+    return false
+  end
+  return true
 end
 
-ArrayStorage(buf::B, state::Int) where {B} =
-  ArrayStorage{B}(buf, Threads.Atomic{Int}(state))
-
+function contains_eltype(T, X)
+  if T === X
+    return true
+  elseif T isa Union
+    for U in Base.uniontypes(T)
+      contains_eltype(U, X) && return true
+    end
+  elseif hasfieldcount(T)
+    for U in fieldtypes(T)
+      contains_eltype(U, X) && return true
+    end
+  end
+  return false
+end
 
-## array type
+function check_eltype(T)
+  Base.allocatedinline(T) || error("oneArray only supports element types that are stored inline")
+  Base.isbitsunion(T) && error("oneArray does not yet support isbits-union arrays")
+  if oneL0.module_properties(device()).fp16flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP16 !=
+     oneL0.ZE_DEVICE_MODULE_FLAG_FP16
+    contains_eltype(T, Float16) && error("Float16 is not supported on this device")
+  end
+  if oneL0.module_properties(device()).fp64flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 !=
+     oneL0.ZE_DEVICE_MODULE_FLAG_FP64
+    contains_eltype(T, Float64) && error("Float64 is not supported on this device")
+  end
+end
 
 mutable struct oneArray{T,N,B} <: AbstractGPUArray{T,N}
-  storage::Union{Nothing,ArrayStorage{B}}
+  data::DataRef{B}
 
   maxsize::Int  # maximum data size; excluding any selector bytes
   offset::Int   # offset of the data in the buffer, in number of elements
 
   dims::Dims{N}
 
   function oneArray{T,N,B}(::UndefInitializer, dims::Dims{N}) where {T,N,B}
-    Base.allocatedinline(T) || error("oneArray only supports element types that are stored inline")
+    check_eltype(T)
     maxsize = prod(dims) * sizeof(T)
     bufsize = if Base.isbitsunion(T)
       # type tag array past the data
@@ -42,36 +60,22 @@ mutable struct oneArray{T,N,B} <: AbstractGPUArray{T,N}
     ctx = context()
     dev = device()
     buf = allocate(B, ctx, dev, bufsize, Base.datatype_alignment(T))
-    storage = ArrayStorage(buf, 1)
-    obj = new{T,N,B}(storage, maxsize, 0, dims)
+    data = DataRef(buf) do buf
+      release(buf)
+    end
+    obj = new{T,N,B}(data, maxsize, 0, dims)
     finalizer(unsafe_free!, obj)
   end
 
-  function oneArray{T,N}(storage::ArrayStorage{B}, dims::Dims{N};
+  function oneArray{T,N}(data::DataRef{B}, dims::Dims{N};
                          maxsize::Int=prod(dims) * sizeof(T), offset::Int=0) where {T,N,B}
-    Base.allocatedinline(T) || error("oneArray only supports element types that are stored inline")
-    return new{T,N,B}(storage, maxsize, offset, dims)
+    check_eltype(T)
+    obj = new{T,N,B}(copy(data), maxsize, offset, dims)
+    finalizer(unsafe_free!, obj)
   end
 end
 
-function unsafe_free!(xs::oneArray)
-  # this call should only have an effect once, because both the user and the GC can call it
-  if xs.storage === nothing
-    return
-  elseif xs.storage.refcount[] < 0
-    throw(ArgumentError("Cannot free an unmanaged buffer."))
-  end
-
-  refcount = Threads.atomic_add!(xs.storage.refcount, -1)
-  if refcount == 1
-    release(xs.storage.buffer)
-  end
-
-  # this array object is now dead, so replace its storage by a dummy one
-  xs.storage = nothing
-
-  return
-end
+unsafe_free!(a::oneArray) = GPUArrays.unsafe_free!(a.data)
 
 
 ## alias detection
@@ -86,6 +90,7 @@ function Base.mightalias(A::oneArray, B::oneArray)
   return first(rA) <= first(rB) < last(rA) || first(rB) <= first(rA) < last(rB)
 end
 
+
 ## convenience constructors
 
 const oneVector{T} = oneArray{T,1}
@@ -96,17 +101,23 @@ const oneVecOrMat{T} = Union{oneVector{T},oneMatrix{T}}
 oneArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
   oneArray{T,N,oneL0.DeviceBuffer}(undef, dims)
 
-# type and dimensionality specified, accepting dims as series of Ints
-oneArray{T,N,B}(::UndefInitializer, dims::Integer...) where {T,N,B} =
+# buffer, type and dimensionality specified
+oneArray{T,N,B}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N,B} =
+  oneArray{T,N,B}(undef, convert(Tuple{Vararg{Int}}, dims))
+oneArray{T,N,B}(::UndefInitializer, dims::Vararg{Integer,N}) where {T,N,B} =
   oneArray{T,N,B}(undef, convert(Tuple{Vararg{Int}}, dims))
-oneArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} =
+
+# type and dimensionality specified
+oneArray{T,N}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N} =
+  oneArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims))
+oneArray{T,N}(::UndefInitializer, dims::Vararg{Integer,N}) where {T,N} =
   oneArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims))
 
-# type but not dimensionality specified
-oneArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} =
-  oneArray{T,N}(undef, dims)
-oneArray{T}(::UndefInitializer, dims::Integer...) where {T} =
-  oneArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
+# only type specified
+oneArray{T}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N} =
+  oneArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims))
+oneArray{T}(::UndefInitializer, dims::Vararg{Integer,N}) where {T,N} =
+  oneArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims))
 
 # empty vector constructor
 oneArray{T,1,B}() where {T,B} = oneArray{T,1,B}(undef, 0)
@@ -150,13 +161,11 @@ Base.size(x::oneArray) = x.dims
 Base.sizeof(x::oneArray) = Base.elsize(x) * length(x)
 
 function context(A::oneArray)
-  A.storage === nothing && throw(UndefRefError())
-  return oneL0.context(A.storage.buffer)
+  return oneL0.context(A.data[])
 end
 
 function device(A::oneArray)
-  A.storage === nothing && throw(UndefRefError())
-  return oneL0.device(A.storage.buffer)
+  return oneL0.device(A.data[])
 end
 
 
@@ -166,22 +175,22 @@ export oneDenseArray, oneDenseVector, oneDenseMatrix, oneDenseVecOrMat,
        oneStridedArray, oneStridedVector, oneStridedMatrix, oneStridedVecOrMat,
        oneWrappedArray, oneWrappedVector, oneWrappedMatrix, oneWrappedVecOrMat
 
-oneContiguousSubArray{T,N,A<:oneArray} = Base.FastContiguousSubArray{T,N,A}
-
 # dense arrays: stored contiguously in memory
-const oneDenseReinterpretArray{T,N,A<:Union{oneArray,oneContiguousSubArray}} = Base.ReinterpretArray{T,N,S,A} where S
-const oneDenseReshapedArray{T,N,A<:Union{oneArray,oneContiguousSubArray,oneDenseReinterpretArray}} = Base.ReshapedArray{T,N,A}
-const DenseSuboneArray{T,N,A<:Union{oneArray,oneDenseReshapedArray,oneDenseReinterpretArray}} = Base.FastContiguousSubArray{T,N,A}
-const oneDenseArray{T,N} = Union{oneArray{T,N}, DenseSuboneArray{T,N}, oneDenseReshapedArray{T,N}, oneDenseReinterpretArray{T,N}}
+#
+# all common dense wrappers are currently represented as oneArray objects.
+# this simplifies common use cases, and greatly improves load time.
+const oneDenseArray{T,N} = oneArray{T,N}
 const oneDenseVector{T} = oneDenseArray{T,1}
 const oneDenseMatrix{T} = oneDenseArray{T,2}
 const oneDenseVecOrMat{T} = Union{oneDenseVector{T}, oneDenseMatrix{T}}
+# XXX: these dummy aliases (oneDenseArray=oneArray) break alias printing, as
+#      `Base.print_without_params` only handles the case of a single alias.
 
 # strided arrays
-const oneStridedSubArray{T,N,A<:Union{oneArray,oneDenseReshapedArray,oneDenseReinterpretArray},
-                         I<:Tuple{Vararg{Union{Base.RangeIndex, Base.ReshapedUnitRange,
-                                               Base.AbstractCartesianIndex}}}} = SubArray{T,N,A,I}
-const oneStridedArray{T,N} = Union{oneArray{T,N}, oneStridedSubArray{T,N}, oneDenseReshapedArray{T,N}, oneDenseReinterpretArray{T,N}}
+const oneStridedSubArray{T,N,I<:Tuple{Vararg{Union{Base.RangeIndex, Base.ReshapedUnitRange,
+                                                   Base.AbstractCartesianIndex}}}} =
+  SubArray{T,N,<:oneArray,I}
+const oneStridedArray{T,N} = Union{oneArray{T,N}, oneStridedSubArray{T,N}}
 const oneStridedVector{T} = oneStridedArray{T,1}
 const oneStridedMatrix{T} = oneStridedArray{T,2}
 const oneStridedVecOrMat{T} = Union{oneStridedVector{T}, oneStridedMatrix{T}}
@@ -191,7 +200,7 @@ Base.pointer(x::oneStridedArray{T}) where {T} = Base.unsafe_convert(ZePtr{T}, x)
   Base.unsafe_convert(ZePtr{T}, x) + Base._memory_offset(x, i)
 end
 
-# wrapped arrays: can be used in kernels
+# anything that's (secretly) backed by a oneArray
 const oneWrappedArray{T,N} = Union{oneArray{T,N}, WrappedArray{T,N,oneArray,oneArray{T,N}}}
 const oneWrappedVector{T} = oneWrappedArray{T,1}
 const oneWrappedMatrix{T} = oneWrappedArray{T,2}
@@ -237,7 +246,7 @@ Base.convert(::Type{T}, x::T) where T <: oneArray = x
 Base.unsafe_convert(::Type{Ptr{T}}, x::oneArray{T}) where {T} =
   throw(ArgumentError("cannot take the host address of a $(typeof(x))"))
 Base.unsafe_convert(::Type{ZePtr{T}}, x::oneArray{T}) where {T} =
-  convert(ZePtr{T}, x.storage.buffer) + x.offset*Base.elsize(x)
+  convert(ZePtr{T}, x.data[]) + x.offset*Base.elsize(x)
 
 
 ## interop with GPU arrays
@@ -255,7 +264,7 @@ Adapt.adapt_storage(::KernelAdaptor, xs::oneArray{T,N}) where {T,N} =
 
 typetagdata(a::Array, i=1) = ccall(:jl_array_typetagdata, Ptr{UInt8}, (Any,), a) + i - 1
 typetagdata(a::oneArray, i=1) =
-  convert(ZePtr{UInt8}, a.storage.buffer) + a.maxsize + a.offset + i - 1
+  convert(ZePtr{UInt8}, a.data[]) + a.maxsize + a.offset + i - 1
 
 function Base.copyto!(dest::oneArray{T}, doffs::Integer, src::Array{T}, soffs::Integer,
                       n::Integer) where T
@@ -305,7 +314,7 @@ function Base.unsafe_copyto!(ctx::ZeContext, dev::ZeDevice,
   GC.@preserve src dest unsafe_copyto!(ctx, dev, pointer(dest, doffs), pointer(src, soffs), n)
   if Base.isbitsunion(T)
     # copy selector bytes
-    error("Not implemented")
+    error("oneArray does not yet support isbits-union arrays")
   end
   return dest
 end
@@ -315,7 +324,7 @@ function Base.unsafe_copyto!(ctx::ZeContext, dev::ZeDevice,
   GC.@preserve src dest unsafe_copyto!(ctx, dev, pointer(dest, doffs), pointer(src, soffs), n)
   if Base.isbitsunion(T)
     # copy selector bytes
-    error("Not implemented")
+    error("oneArray does not yet support isbits-union arrays")
   end
 
   # copies to the host are synchronizing
@@ -329,7 +338,7 @@ function Base.unsafe_copyto!(ctx::ZeContext, dev::ZeDevice,
   GC.@preserve src dest unsafe_copyto!(ctx, dev, pointer(dest, doffs), pointer(src, soffs), n)
   if Base.isbitsunion(T)
     # copy selector bytes
-    error("Not implemented")
+    error("oneArray does not yet support isbits-union arrays")
   end
   return dest
 end
@@ -364,37 +373,24 @@ function Base.fill!(A::oneDenseArray{T}, val) where T
 end
 
 
+## derived arrays
+
+function GPUArrays.derive(::Type{T}, N::Int, a::oneArray, dims::Dims, offset::Int) where {T}
+  offset = (a.offset * Base.elsize(a)) ÷ sizeof(T) + offset
+  oneArray{T,N}(a.data, dims; a.maxsize, offset)
+end
+
+
 ## views
 
 device(a::SubArray) = device(parent(a))
 context(a::SubArray) = context(parent(a))
 
-# we don't really want an array, so don't call `adapt(Array, ...)`,
-# but just want oneArray indices to get downloaded back to the CPU.
-# this makes sure we preserve array-like containers, like Base.Slice.
-struct BackToCPU end
-Adapt.adapt_storage(::BackToCPU, xs::oneArray) = convert(Array, xs)
-
-@inline function Base.view(A::oneArray, I::Vararg{Any,N}) where {N}
-  J = to_indices(A, I)
-  @boundscheck begin
-    # Base's boundscheck accesses the indices, so make sure they reside on the CPU.
-    # this is expensive, but it's a bounds check after all.
-    J_cpu = map(j->adapt(BackToCPU(), j), J)
-    checkbounds(A, J_cpu...)
-  end
-  J_gpu = map(j->adapt(oneArray, j), J)
-  Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J_gpu...)), J_gpu...)
-end
-
 # pointer conversions
-## contiguous
 function Base.unsafe_convert(::Type{ZePtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Base.RangeIndex}}}) where {T,N,P}
   return Base.unsafe_convert(ZePtr{T}, parent(V)) +
          Base._memory_offset(V.parent, map(first, V.indices)...)
 end
-
-## reshaped
 function Base.unsafe_convert(::Type{ZePtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
   return Base.unsafe_convert(ZePtr{T}, parent(V)) +
          (Base.first_index(V)-1)*sizeof(T)
 end
@@ -410,24 +406,6 @@ Base.unsafe_convert(::Type{ZePtr{T}}, A::PermutedDimsArray) where {T} =
   Base.unsafe_convert(ZePtr{T}, parent(A))
 
 
-## reshape
-
-device(a::Base.ReshapedArray) = device(parent(a))
-context(a::Base.ReshapedArray) = context(parent(a))
-
-Base.unsafe_convert(::Type{ZePtr{T}}, a::Base.ReshapedArray{T}) where {T} =
-  Base.unsafe_convert(ZePtr{T}, parent(a))
-
-
-## reinterpret
-
-device(a::Base.ReinterpretArray) = device(parent(a))
-context(a::Base.ReinterpretArray) = context(parent(a))
-
-Base.unsafe_convert(::Type{ZePtr{T}}, a::Base.ReinterpretArray{T,N,S} where N) where {T,S} =
-  ZePtr{T}(Base.unsafe_convert(ZePtr{S}, parent(a)))
-
-
 ## unsafe_wrap
 
 """
diff --git a/src/device/array.jl b/src/device/array.jl
index 941c6ba0..8ad97ca0 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -166,7 +166,7 @@ Base.IndexStyle(::Type{<:oneDeviceArray}) = Base.IndexLinear()
 
 Base.@propagate_inbounds Base.getindex(A::oneDeviceArray{T}, i1::Integer) where {T} =
     arrayref(A, i1)
-Base.@propagate_inbounds Base.setindex!(A::oneDeviceArray{T}, x, i1::Int) where {T} =
+Base.@propagate_inbounds Base.setindex!(A::oneDeviceArray{T}, x, i1::Integer) where {T} =
    arrayset(A, convert(T,x)::T, i1)
 
 # preserve the specific integer type when indexing device arrays,
diff --git a/test/array.jl b/test/array.jl
index 6cfe2c46..39d88ef3 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -4,18 +4,18 @@ import Adapt
 
 @testset "constructors" begin
   xs = oneArray{Int}(undef, 2, 3)
   @test collect(oneArray([1 2; 3 4])) == [1 2; 3 4]
-  @test testf(vec, rand(5,3))
+  @test testf(vec, rand(Float32, 5,3))
   @test Base.elsize(xs) == sizeof(Int)
   @test oneArray{Int, 2}(xs) === xs
   @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs)
   @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs)
 
-  @test collect(oneAPI.zeros(2, 2)) == zeros(2, 2)
-  @test collect(oneAPI.ones(2, 2)) == ones(2, 2)
+  @test collect(oneAPI.zeros(Float32, 2, 2)) == zeros(Float32, 2, 2)
+  @test collect(oneAPI.ones(Float32, 2, 2)) == ones(Float32, 2, 2)
 
-  @test collect(oneAPI.fill(0, 2, 2)) == zeros(2, 2)
-  @test collect(oneAPI.fill(1, 2, 2)) == ones(2, 2)
+  @test collect(oneAPI.fill(0, 2, 2)) == zeros(Int, 2, 2)
+  @test collect(oneAPI.fill(1, 2, 2)) == ones(Int, 2, 2)
 end
 
 @testset "adapt" begin