Describe the bug

`CuArray` reduction operations that eventually land in `mapreducedim!` may segfault when called from multiple host threads simultaneously.
To reproduce
```
$ julia --threads=2

julia> using CUDA

julia> function _sum(x::CuVector)  # My own sum function, see below for the actual definition
           # [...]
       end

julia> function sums(n, sum_f)
           xs = [CUDA.randn(1024) for _ in 1:n]
           for x in xs
               sum_f(x)
           end
       end
sums (generic function with 1 method)

julia> function sums_threaded(n, sum_f)
           xs = [CUDA.randn(1024) for _ in 1:n]
           Threads.@threads for x in xs
               sum_f(x)
           end
       end
sums_threaded (generic function with 1 method)

julia> sums(200000, _sum)

julia> sums(200000, sum)

julia> sums_threaded(200000, _sum)

julia> sums_threaded(200000, sum)
signal (11): Segmentation fault
signal (11): Segmentation fault
in expression starting at REPL[70]:1
in expression starting at REPL[70]:1
jl_get_cfunction_trampoline at ~/local/julia-1.5.3/bin/../lib/libjulia.so.1 (unknown line)
#launch_configuration#606 at ~/.julia/packages/CUDA/YeS8q/lib/cudadrv/occupancy.jl:61 [inlined]
launch_configuration##kw at ~/.julia/packages/CUDA/YeS8q/lib/cudadrv/occupancy.jl:55 [inlined]
#mapreducedim!#900 at ~/.julia/packages/CUDA/YeS8q/src/mapreduce.jl:197
mapreducedim!##kw at ~/.julia/packages/CUDA/YeS8q/src/mapreduce.jl:143 [inlined]
#mapreducedim!#900 at ~/.julia/packages/CUDA/YeS8q/src/mapreduce.jl:239
mapreducedim!##kw at ~/.julia/packages/CUDA/YeS8q/src/mapreduce.jl:143 [inlined]
#_mapreduce#17 at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:62
_mapreduce##kw at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:34 [inlined]
#mapreduce#15 at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:28 [inlined]
mapreduce at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:28 [inlined]
_sum at ./reducedim.jl:727 [inlined]
_sum at ./reducedim.jl:726 [inlined]
#sum#627 at ./reducedim.jl:722 [inlined]
sum at ./reducedim.jl:722 [inlined]
macro expansion at ./REPL[66]:4 [inlined]
#11#threadsfor_fun at ./threadingconstructs.jl:81
#11#threadsfor_fun at ./threadingconstructs.jl:48
unknown function (ip: 0x7fec602a018c)
trampoline_alloc at /buildworker/worker/package_linux64/build/src/runtime_ccall.cpp:228 [inlined]
jl_get_cfunction_trampoline at /buildworker/worker/package_linux64/build/src/runtime_ccall.cpp:334
#launch_configuration#606 at ~/.julia/packages/CUDA/YeS8q/lib/cudadrv/occupancy.jl:61 [inlined]
launch_configuration##kw at ~/.julia/packages/CUDA/YeS8q/lib/cudadrv/occupancy.jl:55 [inlined]
#mapreducedim!#900 at ~/.julia/packages/CUDA/YeS8q/src/mapreduce.jl:197
mapreducedim!##kw at ~/.julia/packages/CUDA/YeS8q/src/mapreduce.jl:143 [inlined]
#_mapreduce#17 at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:62
_mapreduce##kw at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:34 [inlined]
#mapreduce#15 at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:28 [inlined]
mapreduce at ~/.julia/packages/GPUArrays/jhRU7/src/host/mapreduce.jl:28 [inlined]
_sum at ./reducedim.jl:727 [inlined]
_sum at ./reducedim.jl:726 [inlined]
#sum#627 at ./reducedim.jl:722 [inlined]
sum at ./reducedim.jl:722 [inlined]
macro expansion at ./REPL[66]:4 [inlined]
#11#threadsfor_fun at ./threadingconstructs.jl:81
#11#threadsfor_fun at ./threadingconstructs.jl:48
unknown function (ip: 0x7fec602a018c)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2231 [inlined]
jl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2398
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2231 [inlined]
jl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2398
jl_apply at /buildworker/worker/package_linux64/build/src/julia.h:1690 [inlined]
start_task at /buildworker/worker/package_linux64/build/src/task.c:705
unknown function (ip: (nil))
Allocations: 98869074 (Pool: 98853314; Big: 15760); GC: 61
Segmentation fault (core dumped)
```

Note that both backtraces end in `jl_get_cfunction_trampoline` (via `trampoline_alloc` in `runtime_ccall.cpp`), reached from `launch_configuration` in `occupancy.jl`, i.e. both threads appear to crash while allocating a cfunction trampoline concurrently.
Expected behavior

No segfault.
My own sum function

For completeness, here's my own sum function, which does not seem to have thread-safety issues. It's a vanilla tree reduction; most of the code is kernel boilerplate.
```julia
using CUDA: @allowscalar, unsafe_free!

function _sum(x::CuVector{T}) where {T}
    # Per-block kernel: each thread accumulates a strided partial sum into
    # shared memory, then the block cooperatively tree-reduces it.
    function _sum_kernel!(out::CuDeviceVector{T}, x) where {T}
        ti, nt = threadIdx().x, blockDim().x
        shared = @cuDynamicSharedMem(T, nt)
        shared[ti] = 0
        @inbounds for i in ti:nt:length(x)
            shared[ti] += x[i]
        end
        reduce_sum(shared, nt, ti)
        if ti == 1
            out[1] = shared[1]
        end
        return nothing
    end

    N = length(x)
    if N == 1  # single element: no kernel needed
        return @allowscalar x[1]
    end
    threads, next = 1, 2
    while next < N  # Find largest 2^k < N
        threads = next
        next <<= 1
    end
    out = CuVector{T}(undef, 1)
    shmem = threads * sizeof(T)
    @cuda threads=threads shmem=shmem _sum_kernel!(out, x)
    synchronize(CuDefaultStream())
    s = @allowscalar out[1]
    unsafe_free!(out)
    return s
end

# In-block tree reduction over the first n elements of x (n a power of two),
# called by thread t of the block.
function reduce_sum(x::CuDeviceVector, n, t)
    while n > 1
        n >>= 1
        sync_threads()
        if t <= n
            x[t] += x[t + n]
        end
    end
    return nothing
end
```
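Not part of the original report, but until the underlying race is fixed, serializing the reductions behind a lock should sidestep the concurrent entry into `mapreducedim!`. A minimal sketch, assuming the crash only occurs under concurrent calls; `reduce_lock`, `locked_sum`, and `sums_threaded_locked` are illustrative names, not CUDA.jl API:

```julia
using CUDA

# A global lock so that only one host thread performs a GPU reduction
# (and thus reaches mapreducedim!) at a time.
const reduce_lock = ReentrantLock()

# Serialized wrapper around the built-in sum.
locked_sum(x::CuVector) = lock(() -> sum(x), reduce_lock)

# Same stress loop as in the reproducer above, with the calls serialized.
function sums_threaded_locked(n)
    xs = [CUDA.randn(1024) for _ in 1:n]
    Threads.@threads for x in xs
        locked_sum(x)
    end
end
```

This of course gives up host-side parallelism for the reductions themselves, so it is a stopgap rather than a fix.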