Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit time held by CUDNN locks. #1491

Merged
merged 3 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lib/cudadrv/memory.jl
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ function __pin(ptr::Ptr{Nothing}, sz::Int)
ctx = context()
key = (ctx,ptr)

@lock __pin_lock begin
Base.@lock __pin_lock begin
pin_count = if haskey(__pin_count, key)
__pin_count[key] += 1
else
Expand All @@ -687,7 +687,7 @@ end
function __unpin(ptr::Ptr{Nothing}, ctx::CuContext)
key = (ctx,ptr)

@spinlock __pin_lock begin
Base.@lock __pin_lock begin
@assert haskey(__pin_count, key) "Cannot unpin unmanaged pointer $ptr."
pin_count = __pin_count[key] -= 1
@assert pin_count >= 0 "Double unpin for $ptr"
Expand Down
4 changes: 2 additions & 2 deletions lib/cudnn/CUDNN.jl
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ function log_message(sev, udata, dbg_ptr, ptr)
str = unsafe_string(ptr, len) # XXX: can this yield?

# print asynchronously
@spinlock log_lock begin
Base.@lock log_lock begin
push!(log_messages, (; sev, dbg, str))
end
ccall(:uv_async_send, Cint, (Ptr{Cvoid},), udata)
Expand Down Expand Up @@ -153,7 +153,7 @@ function __runtime_init__()
if (isdebug(:init, CUDNN) || Base.JLOptions().debug_level >= 2) &&
version() >= v"8.2" # NVIDIA bug #3256123
log_cond[] = Base.AsyncCondition() do async_cond
message = @lock log_lock popfirst!(log_messages)
message = Base.@lock log_lock popfirst!(log_messages)
_log_message(message...)
end

Expand Down
78 changes: 48 additions & 30 deletions lib/cudnn/convolution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -172,52 +172,70 @@ end
const cudnnConvolutionFwdAlgoPerfCache = Dict{Tuple,cudnnConvolutionFwdAlgoPerf_t}()
const cudnnConvolutionFwdAlgoPerfCacheLock = ReentrantLock()
function cudnnConvolutionFwdAlgoPerf(xDesc, x, wDesc, w, convDesc, yDesc, y, biasDesc, activation)
lock(cudnnConvolutionFwdAlgoPerfCacheLock) do
get!(cudnnConvolutionFwdAlgoPerfCache, (xDesc, wDesc, convDesc, biasDesc, activation)) do
requestedAlgoCount = Int(CUDNN_CONVOLUTION_FWD_ALGO_COUNT)
returnedAlgoCount = Cint[0]
perfResults = Array{cudnnConvolutionFwdAlgoPerf_t}(undef,requestedAlgoCount)
workspaceSize() = cudnnFindConvolutionAlgorithmWorkspaceSize(x)
with_workspace(workspaceSize) do workspace
cudnnFindConvolutionForwardAlgorithmEx(handle(),xDesc,x,wDesc,w,convDesc,yDesc,y,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace))
end
cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1])
key = (xDesc, wDesc, convDesc, biasDesc, activation)
val = lock(cudnnConvolutionFwdAlgoPerfCacheLock) do
get(cudnnConvolutionFwdAlgoPerfCache, key, nothing)
end
if val === nothing
requestedAlgoCount = Int(CUDNN_CONVOLUTION_FWD_ALGO_COUNT)
returnedAlgoCount = Cint[0]
perfResults = Array{cudnnConvolutionFwdAlgoPerf_t}(undef,requestedAlgoCount)
workspaceSize() = cudnnFindConvolutionAlgorithmWorkspaceSize(x)
with_workspace(workspaceSize) do workspace
cudnnFindConvolutionForwardAlgorithmEx(handle(),xDesc,x,wDesc,w,convDesc,yDesc,y,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace))
end
val = cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1])
lock(cudnnConvolutionFwdAlgoPerfCacheLock) do
cudnnConvolutionFwdAlgoPerfCache[key] = val
end
end
return val
end

const cudnnConvolutionBwdDataAlgoPerfCache = Dict{Tuple,cudnnConvolutionBwdDataAlgoPerf_t}()
const cudnnConvolutionBwdDataAlgoPerfCacheLock = ReentrantLock()
function cudnnConvolutionBwdDataAlgoPerf(wDesc, w, dyDesc, dy, convDesc, dxDesc, dx)
lock(cudnnConvolutionBwdDataAlgoPerfCacheLock) do
get!(cudnnConvolutionBwdDataAlgoPerfCache, (wDesc, dyDesc, convDesc)) do
requestedAlgoCount = Int(CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT)
returnedAlgoCount = Cint[0]
perfResults = Array{cudnnConvolutionBwdDataAlgoPerf_t}(undef,requestedAlgoCount)
workspaceSize() = cudnnFindConvolutionAlgorithmWorkspaceSize(dx)
with_workspace(workspaceSize) do workspace
cudnnFindConvolutionBackwardDataAlgorithmEx(handle(),wDesc,w,dyDesc,dy,convDesc,dxDesc,dx,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace))
end
cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1])
key = (wDesc, dyDesc, convDesc)
val = lock(cudnnConvolutionBwdDataAlgoPerfCacheLock) do
get(cudnnConvolutionBwdDataAlgoPerfCache, key, nothing)
end
if val === nothing
requestedAlgoCount = Int(CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT)
returnedAlgoCount = Cint[0]
perfResults = Array{cudnnConvolutionBwdDataAlgoPerf_t}(undef,requestedAlgoCount)
workspaceSize() = cudnnFindConvolutionAlgorithmWorkspaceSize(dx)
with_workspace(workspaceSize) do workspace
cudnnFindConvolutionBackwardDataAlgorithmEx(handle(),wDesc,w,dyDesc,dy,convDesc,dxDesc,dx,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace))
end
val = cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1])
lock(cudnnConvolutionBwdDataAlgoPerfCacheLock) do
cudnnConvolutionBwdDataAlgoPerfCache[key] = val
end
end
val
end

const cudnnConvolutionBwdFilterAlgoPerfCache = Dict{Tuple,cudnnConvolutionBwdFilterAlgoPerf_t}()
const cudnnConvolutionBwdFilterAlgoPerfCacheLock = ReentrantLock()
function cudnnConvolutionBwdFilterAlgoPerf(xDesc, x, dyDesc, dy, convDesc, dwDesc, dw)
lock(cudnnConvolutionBwdFilterAlgoPerfCacheLock) do
get!(cudnnConvolutionBwdFilterAlgoPerfCache, (xDesc, dyDesc, convDesc)) do
requestedAlgoCount = Int(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT)
returnedAlgoCount = Cint[0]
perfResults = Array{cudnnConvolutionBwdFilterAlgoPerf_t}(undef,requestedAlgoCount)
workspaceSize() = cudnnFindConvolutionAlgorithmWorkspaceSize(x)
with_workspace(workspaceSize) do workspace
cudnnFindConvolutionBackwardFilterAlgorithmEx(handle(),xDesc,x,dyDesc,dy,convDesc,dwDesc,dw,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace))
end
cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1])
key = (xDesc, dyDesc, convDesc)
val = lock(cudnnConvolutionBwdFilterAlgoPerfCacheLock) do
get(cudnnConvolutionBwdFilterAlgoPerfCache, (xDesc, dyDesc, convDesc), nothing)
end
if val === nothing
requestedAlgoCount = Int(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT)
returnedAlgoCount = Cint[0]
perfResults = Array{cudnnConvolutionBwdFilterAlgoPerf_t}(undef,requestedAlgoCount)
workspaceSize() = cudnnFindConvolutionAlgorithmWorkspaceSize(x)
with_workspace(workspaceSize) do workspace
cudnnFindConvolutionBackwardFilterAlgorithmEx(handle(),xDesc,x,dyDesc,dy,convDesc,dwDesc,dw,requestedAlgoCount,returnedAlgoCount,perfResults,workspace,sizeof(workspace))
end
val = cudnnConvolutionAlgoPerfChoose(perfResults, returnedAlgoCount[1])
lock(cudnnConvolutionBwdFilterAlgoPerfCacheLock) do
cudnnConvolutionBwdFilterAlgoPerfCache[key] = val
end
end
val
end


Expand Down
26 changes: 15 additions & 11 deletions lib/cudnn/descriptors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,27 @@ macro cudnnDescriptor(x, set = Symbol("cudnnSet$(x)Descriptor"))
const $cache = Dict{Tuple,$sname}() # Dict is 3x faster than IdDict!
const $cache_lock = ReentrantLock()
function $sname(args...)
lock($cache_lock) do
get!($cache, args) do
ptr = $tname[C_NULL]
$create(ptr)
$set(ptr[1], args...)
d = $sname(ptr[1])
finalizer(x->$destroy(x.ptr), d)
return d
d = lock($cache_lock) do
get($cache, args, nothing)
end
if d === nothing
ptr = $tname[C_NULL]
$create(ptr)
$set(ptr[1], args...)
d = $sname(ptr[1])
finalizer(x->$destroy(x.ptr), d)
lock($cache_lock) do
$cache[args] = d
end
end
return d
end
end |> esc
end


"""
cudnnActivationDescriptor(mode::cudnnActivationMode_t,
cudnnActivationDescriptor(mode::cudnnActivationMode_t,
reluNanOpt::cudnnNanPropagation_t,
coef::Cfloat)
"""
Expand Down Expand Up @@ -116,8 +120,8 @@ cudnnConvolutionDescriptor(pad::Vector{Cint},

"""
cudnnLRNDescriptor(lrnN::Cuint,
lrnAlpha::Cdouble,
lrnBeta::Cdouble,
lrnAlpha::Cdouble,
lrnBeta::Cdouble,
lrnK::Cdouble)
"""
@cudnnDescriptor(LRN)
Expand Down
28 changes: 11 additions & 17 deletions lib/utils/cache.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,18 @@ end
# remove a handle from the cache, or create a new one
function Base.pop!(f::Function, cache::HandleCache{K,V}, key) where {K,V}
function check_cache(f::Function=()->nothing)
try
GC.enable_finalizers(false)
lock(cache.lock) do
handle = if !haskey(cache.idle_handles, key) || isempty(cache.idle_handles[key])
f()
else
pop!(cache.idle_handles[key])
end

if handle !== nothing
push!(cache.active_handles, key=>handle)
end
lock(cache.lock) do
handle = if !haskey(cache.idle_handles, key) || isempty(cache.idle_handles[key])
f()
else
pop!(cache.idle_handles[key])
end

return handle
if handle !== nothing
push!(cache.active_handles, key=>handle)
end
finally
GC.enable_finalizers(true)

return handle
end
end

Expand All @@ -51,8 +46,7 @@ end

# put a handle in the cache, or destroy it if it doesn't fit
function Base.push!(f::Function, cache::HandleCache{K,V}, key::K, handle::V) where {K,V}
# XXX: take this lock in a normal way once we have JuliaLang/julia#35689
@spinlock cache.lock begin
lock(cache.lock) do
delete!(cache.active_handles, key=>handle)

if haskey(cache.idle_handles, key)
Expand Down
23 changes: 1 addition & 22 deletions lib/utils/threading.jl
Original file line number Diff line number Diff line change
@@ -1,25 +1,4 @@
export @spinlock, @lock, LazyInitialized

const var"@lock" = Base.var"@lock"

# a safe way to acquire locks from finalizers, where we can't wait (which switches tasks)
macro spinlock(l, ex)
quote
temp = $(esc(l))
while !trylock(temp)
ccall(:jl_cpu_pause, Cvoid, ())
# Temporary solution before we have gc transition support in codegen.
ccall(:jl_gc_safepoint, Cvoid, ())
# we can't yield here
end
try
$(esc(ex))
finally
unlock(temp)
end
end
end

export LazyInitialized

"""
LazyInitialized{T}()
Expand Down