Inference failure when multiple structs are broadcasted via tuples #2623
Comments
What is the failing GPU version of that simple reproducer? Switching the inputs to GPU arrays works here:

julia> gb = cu(b)
5×5 CuArray{Float32, 2, CUDA.DeviceMemory}:
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
julia> ga = cu(a)
5×5 CuArray{Float32, 2, CUDA.DeviceMemory}:
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
julia> gbc = instantiate(broadcasted(foo, gb, p1, p2));
julia> materialize!(ga, gbc)
5×5 CuArray{Float32, 2, CUDA.DeviceMemory}:
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0

In any case, the inference failure can manifest in the CPU case as well; it just executes with dynamic calls.
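For reference, a minimal sketch of how to surface those dynamic calls on the CPU (assuming JET.jl is available, and using the `a`/`bc` names from the reproducer below):

import JET
# Reports any runtime-dispatch (dynamic call) sites JET finds in the compiled code:
JET.@report_opt materialize!(a, bc)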
I'll copy it from the gist here:

#=
using Revise; include("cuda_broadcast_inference_reproducer.jl")
julia --project=.buildkite
julia --project=.buildkite cuda_broadcast_inference_reproducer.jl
julia +1.11 --project=.buildkite cuda_broadcast_inference_reproducer.jl
=#
@show VERSION
@static if !(VERSION ≥ v"1.11.0-beta")
using JET;
end
import CUDA # comment to run without CUDA
using Test
import Adapt
import Base
import Base.Broadcast: BroadcastStyle,
Broadcasted, instantiate, broadcasted, materialize, materialize!
struct VF{S <: AbstractFloat, Nv, A}
array::A
end
struct VFStyle{Nv, A} <: Base.BroadcastStyle end
function VF{S, Nv}(array::AbstractArray{T, 2}) where {S, Nv, T}
@assert size(array, 1) == Nv
@assert size(array, 2) == typesize(T, S)
VF{S, Nv, typeof(array)}(array)
end
function VF{S}(
::Type{ArrayType};
Nv::Integer,
) where {S, ArrayType}
Nf = typesize(eltype(ArrayType), S)
array = similar(ArrayType, Nv, Nf)
fill!(array, 0)
VF{S, Nv}(array)
end
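# How many elements of eltype T are needed to store one value of type S: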
typesize(::Type{T}, ::Type{S}) where {T, S} = div(sizeof(S), sizeof(T))
parent_array_type(::Type{<:Array{T}}) where {T} = Array{T}
Base.eltype(::Type{<:VF{S}}) where {S} = S
Base.parent(data::VF) = getfield(data, :array)
Base.similar(data::VF{S}) where {S} = similar(data, S)
@inline Base.size(data::VF, i::Integer) = size(data)[i]
@inline Base.size(data::VF{S, Nv}) where {S, Nv} = (1, 1, 1, Nv, 1)
Base.length(data::VF{S, Nv}) where {S, Nv} = Nv
Base.lastindex(data::VF) = length(data)
Base.copy(data::VF{S, NV}) where {S, NV} = VF{S, NV}(copy(parent(data)))
Base.Broadcast.BroadcastStyle(::Type{VF{S, Nv, A}}) where {S, Nv, A} = VFStyle{Nv, parent_array_type(A)}()
Base.Broadcast.BroadcastStyle(::Base.Broadcast.Style{<:Tuple}, ds::VFStyle) = ds
Base.Broadcast.broadcastable(data::VF) = data
Adapt.adapt_structure(to, data::VF{S, NV}) where {S, NV} = VF{S, NV}(Adapt.adapt(to, parent(data)))
@inline parent_array_type(::Type{VF{S, Nv, A}}) where {S, Nv, A} = A
Base.ndims(data::VF) = Base.ndims(typeof(data))
Base.ndims(::Type{T}) where {T <: VF} = Base.ndims(parent_array_type(T))
function Base.similar(
bc::Union{Base.Broadcast.Broadcasted{VFStyle{Nv, A}}, VF{S, Nv, A}},
::Type{S},
) where {Nv, A, S}
PA = parent_array_type(A)
array = similar(PA, (Nv, typesize(eltype(A), S)))
return VF{S, Nv}(array)
end
@inline function Base.getindex(
data::VF{S, Nv},
I::CartesianIndex,
) where {S, Nv}
@boundscheck 1 <= I.I[4] <= Nv || throw(BoundsError(data, I))
return parent(data)[I.I[4], 1]
end
@inline function Base.setindex!(
data::VF{S, Nv},
val,
I::CartesianIndex,
) where {S, Nv}
@boundscheck 1 <= I.I[4] <= Nv || throw(BoundsError(data, I))
parent(data)[I.I[4], 1] = val
end
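# Dispatch on the type of the parent array: the third argument selects the CPU (::Array) or CUDA copy path below.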
function Base.copyto!(
dest::VF{S},
bc::Union{VF, Base.Broadcast.Broadcasted},
) where {S}
Base.copyto!(dest, bc, parent(dest))
dest
end
function Base.copyto!(
dest::VF{S, Nv},
bc::Union{Base.Broadcast.Broadcasted{VFStyle{Nv, A}}, VF{S, Nv, A}},
::Array,
) where {S, Nv, A}
@inbounds for v in 1:Nv
idx = CartesianIndex(1, 1, 1, v, 1)
dest[idx] = convert(S, bc[idx])
end
return dest
end
# Extension
@static if @isdefined(CUDA)
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} = CUDA.CuArray{T, N, B} where {N}
Base.similar(
::Type{CUDA.CuArray{T, N′, B} where {N′}},
dims::Dims{N},
) where {T, N, B} = similar(CUDA.CuArray{T, N, B}, dims)
function knl_copyto!(dest::VF{S, Nv}, src) where {S, Nv}
(tv,) = CUDA.threadIdx()
(bv,) = CUDA.blockIdx()
v = tv + (bv - 1) * CUDA.blockDim().x
I = CartesianIndex((1, 1, 1, v, 1))
if 1 ≤ I.I[4] ≤ Nv
@inbounds dest[I] = src[I]
end
return nothing
end
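# GPU path: compile the kernel first, then choose a 1D launch configuration covering the Nv levels.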
function Base.copyto!(dest::VF{S, Nv}, bc, to::CUDA.CuArray) where {S, Nv}
kernel = CUDA.@cuda always_inline = true launch = false knl_copyto!(dest, bc)
config = CUDA.launch_configuration(kernel.fun)
n_max_threads = min(config.threads, Nv)
Nvt = fld(n_max_threads, Nv)
Nv_thread = Nvt == 0 ? n_max_threads : min(Int(Nvt), Nv)
Nv_blocks = cld(Nv, Nv_thread)
@assert Nv_thread ≤ n_max_threads "threads,n_max_threads=($(Nv_thread),$n_max_threads)"
p = (; threads = (Nv_thread,), blocks = (Nv_blocks,))
kernel(dest, bc; threads = p.threads, blocks = p.blocks)
return dest
end
end
struct MyParams1{A}
a::A
end;
struct MyParams2{B}
b::B
end;
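# Treat each params struct as a broadcast scalar by wrapping it in a 1-tuple: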
Base.Broadcast.broadcastable(x::MyParams1) = tuple(x);
Base.Broadcast.broadcastable(x::MyParams2) = tuple(x);
foo(f, p1, p2) = f + p1.a - p2.b;
bar(p1, p2, f) = f + p1.a - p2.b;
FT = Float64;
p1 = MyParams1{FT}(1);
p2 = MyParams2{FT}(2);
@testset "Broken test" begin
b = zeros(FT, 5,5); # Ordinary CPU array works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
@static if !(VERSION ≥ v"1.11.0-beta")
@test_opt materialize!(a, bc) # also passes inference
end
b = VF{FT}(Array{FT}; Nv=4); # VF with CPU array works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
# @code_warntype materialize!(a, bc) # looks fine
@static if !(VERSION ≥ v"1.11.0-beta")
@test_opt materialize!(a, bc) # also passes inference
end
@static if @isdefined(CUDA)
b = CUDA.zeros(FT, 5,5); # CUDA.CuArray works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
b = VF{FT}(CUDA.CuArray{FT}; Nv=4); # VF with CUDA.CuArray fails
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
@test_throws CUDA.InvalidIRError materialize!(a, bc) # fails to compile
# CUDA.@device_code_warntype materialize!(a, bc)
end
end
#=
# re-run the last, breaking, part:
b = VF{FT}(CUDA.CuArray{FT}; Nv=4); # VF with CUDA.CuArray fails
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc) # fails to compile
=#
nothing
I suppose that's possible, but I don't think it is, because it passes
I'm not sure if this is the best place for this issue, so please let me know and I can move it if it belongs somewhere else.
I'm running into an inference failure when multiple structs are broadcasted via tuples. The CPU, ordinary-array version of this looks like the following:
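# (minimal CPU version, reconstructed from the full reproducer below)
import Base.Broadcast: instantiate, broadcasted, materialize!

struct MyParams1{A}
    a::A
end
struct MyParams2{B}
    b::B
end
Base.Broadcast.broadcastable(x::MyParams1) = tuple(x)
Base.Broadcast.broadcastable(x::MyParams2) = tuple(x)
foo(f, p1, p2) = f + p1.a - p2.b

FT = Float64
p1 = MyParams1{FT}(1)
p2 = MyParams2{FT}(2)
b = zeros(FT, 5, 5)
a = similar(b)
bc = instantiate(broadcasted(foo, b, p1, p2))
materialize!(a, bc)  # works on the CPU, and passes inference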
Here is a reproducer that has all 4 cases I'm looking at.
AFAICT, the actual error/issue seems to be an inference failure due to the tuple recursion depth limit in the recursive broadcast `getindex`, but it's kind of surprising because the tuple that is being indexed is `((MyParams1,), (MyParams2,))`.
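To see where that nested tuple comes from (a quick check using the definitions above): `broadcastable` wraps each params struct in a 1-tuple, so the `Broadcasted` object carries them as tuple arguments.

bc = Base.Broadcast.broadcasted(foo, b, p1, p2)
typeof(bc.args)
# Tuple{Matrix{Float64}, Tuple{MyParams1{Float64}}, Tuple{MyParams2{Float64}}}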
In summary, this is what is working / not working:
- ordinary CPU array: works
- `VF` with a CPU array: works
- ordinary `CUDA.CuArray`: works
- `VF` with a `CUDA.CuArray`: fails to compile

`CUDA.@device_code_warntype` does seem to detect the issue:
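The dump can be regenerated from the failing case at the end of the reproducer, e.g.:

b = VF{FT}(CUDA.CuArray{FT}; Nv = 4)
a = similar(b)
bc = instantiate(broadcasted(foo, b, p1, p2))
CUDA.@device_code_warntype materialize!(a, bc)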