The following basic code using `UnitRange` works and shows that ranges are supported inside CUDA kernels:
```julia
using CUDA
A = CUDA.ones(2,2)
function f(X, range::UnitRange)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    for i in range
        X[ix,iy] = 2*X[ix,iy]
    end
    return
end
@cuda threads=(2,2) f(A, 1:3)
```
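For reference, each of the 2×2 threads doubles its own element once per iteration over `1:3`, so every element of `A` should end up as `1.0 * 2^3 = 8.0f0`. This can be verified on the host after the launch:

```julia
# host-side check: every element was doubled three times by the kernel above
@assert all(Array(A) .== 8.0f0)
```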
However, `size(range,1)`, `length(range)` and `range[end]` fail inside CUDA kernels; the following two snippets, for example, fail:
```julia
using CUDA
A = CUDA.ones(2,2)
function f(X, range::UnitRange)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    if (size(range,1) > 10) return; end
    for i in range
        X[ix,iy] = 2*X[ix,iy]
    end
    return
end
@cuda threads=(2,2) f(A, 1:3)
```
```julia
using CUDA
A = CUDA.ones(2,2)
function f(X, range::UnitRange)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    if (range[end] > 10) return; end
    for i in range
        X[ix,iy] = 2*X[ix,iy]
    end
    return
end
@cuda threads=(2,2) f(A, 1:3)
```
The error for `size(range,1)` is the following:
```
julia> @cuda threads=(2,2) f(A, 1:3)
ERROR: InvalidIRError: compiling kernel f(CuDeviceArray{Float32,2,CUDA.AS.Global}, UnitRange{Int64}) resulted in invalid LLVM IR
Reason: unsupported call to the Julia runtime (call to jl_f__apply_latest)
Stacktrace:
 [1] #invokelatest#1 at essentials.jl:712
 [2] invokelatest at essentials.jl:711
 [3] throw_overflowerr_binaryop at checked.jl:154
 [4] multiple call sites at unknown:0
Stacktrace:
 [1] check_ir(::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget,CUDA.CUDACompilerParams}, ::LLVM.Module) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/validation.jl:123
 [2] macro expansion at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/driver.jl:241 [inlined]
 [3] macro expansion at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/TimerOutputs/dVnaw/src/TimerOutput.jl:206 [inlined]
 [4] codegen(::Symbol, ::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget,CUDA.CUDACompilerParams}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/driver.jl:239
 [5] compile(::Symbol, ::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget,CUDA.CUDACompilerParams}; libraries::Bool, deferred_codegen::Bool, optimize::Bool, strip::Bool, validate::Bool, only_entry::Bool) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/driver.jl:39
 [6] compile at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/driver.jl:35 [inlined]
 [7] _cufunction(::GPUCompiler.FunctionSpec{typeof(f),Tuple{CuDeviceArray{Float32,2,CUDA.AS.Global},UnitRange{Int64}}}; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/CUDA/sjcZt/src/compiler/execution.jl:308
 [8] _cufunction at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/CUDA/sjcZt/src/compiler/execution.jl:302 [inlined]
 [9] check_cache(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(f),Tuple{CuDeviceArray{Float32,2,CUDA.AS.Global},UnitRange{Int64}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/cache.jl:24
 [10] f at ./none:2 [inlined]
 [11] cached_compilation(::typeof(CUDA._cufunction), ::GPUCompiler.FunctionSpec{typeof(f),Tuple{CuDeviceArray{Float32,2,CUDA.AS.Global},UnitRange{Int64}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/cache.jl:0
 [12] cached_compilation(::Function, ::GPUCompiler.FunctionSpec{typeof(f),Tuple{CuDeviceArray{Float32,2,CUDA.AS.Global},UnitRange{Int64}}}, ::UInt64) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/GPUCompiler/pCBTA/src/cache.jl:44
 [13] cufunction(::Function, ::Type{T} where T; name::Nothing, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/CUDA/sjcZt/src/compiler/execution.jl:296
 [14] cufunction(::Function, ::Type{T} where T) at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/CUDA/sjcZt/src/compiler/execution.jl:291
 [15] top-level scope at /home/omlins/.juliapro/JuliaPro_v1.4.1-1/packages/CUDA/sjcZt/src/compiler/execution.jl:108
```
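Both failures appear to share the same root cause: the trace lands in `throw_overflowerr_binaryop` from `checked.jl`, i.e. in Base's checked integer arithmetic, whose error path calls `invokelatest`, a runtime call (`jl_f__apply_latest`) that the GPU compiler rejects. Presumably `length(::UnitRange{Int64})`, which both `size(range,1)` and `range[end]` rely on, is implemented with checked arithmetic. Assuming that diagnosis, a minimal device-side workaround is to avoid calling `length`/`size`/`getindex` on the range and to derive what is needed from `first` and `last`, which are plain field accesses:

```julia
using CUDA
A = CUDA.ones(2,2)
function f(X, range::UnitRange)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # first/last only read the range's fields; the arithmetic below is
    # plain (unchecked) Int arithmetic, so no runtime calls are generated
    len = last(range) - first(range) + 1
    if (len > 10) return; end
    for i in range
        X[ix,iy] = 2*X[ix,iy]
    end
    return
end
@cuda threads=(2,2) f(A, 1:3)
```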
The same error is produced for `range[end]`:

```
julia> @cuda threads=(2,2) f(A, 1:3)
ERROR: InvalidIRError: compiling kernel f(CuDeviceArray{Float32,2,CUDA.AS.Global}, UnitRange{Int64}) resulted in invalid LLVM IR
Reason: unsupported call to the Julia runtime (call to jl_f__apply_latest)
Stacktrace:
 [1] #invokelatest#1 at essentials.jl:712
 [2] invokelatest at essentials.jl:711
 [3] throw_overflowerr_binaryop at checked.jl:154
 [4] multiple call sites at unknown:0
```

with the remainder of the stack trace identical to the one above.
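Alternatively, the needed quantity can be computed on the host and passed as an extra kernel argument, so that the kernel never queries the range at all (a sketch assuming the range is known at launch time):

```julia
using CUDA
A = CUDA.ones(2,2)
function f(X, range::UnitRange, n::Int)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
    if (n > 10) return; end  # n plays the role of size(range,1)
    for i in range
        X[ix,iy] = 2*X[ix,iy]
    end
    return
end
r = 1:3
@cuda threads=(2,2) f(A, r, length(r))
```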
The CUDA.jl version used is v1.1.0.