Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add GPUCompiler precompilation caching #425

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions src/GPUCompiler.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ include("cache.jl")
include("execution.jl")
include("reflection.jl")


include("precompile.jl")
include("precompilation_cache.jl")

_precompile_()

function __init__()
Expand Down
20 changes: 20 additions & 0 deletions src/jlgen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,28 @@ struct CodeCache
dict::IdDict{MethodInstance,Vector{CodeInstance}}

CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}())
CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict))
end

"""
    copyAndFilter(dict::IdDict) -> IdDict

Return a copy of `dict` keeping only the entries whose code instances are all
valid for an unbounded world range (`max_world == typemax`). Used when
snapshotting the global caches so that only entries that can never be
invalidated are persisted.
"""
function copyAndFilter(dict::IdDict)
    # NOTE(review): each value is a Vector of code instances; the whole key is
    # dropped if *any* entry has a bounded max_world. TODO: confirm whether more
    # than one code instance per MethodInstance can actually occur here.
    return filter(dict) do (key, civ)
        all(ci -> ci.max_world == typemax(typeof(ci.max_world)), civ)
    end
end


function Base.show(io::IO, ::MIME"text/plain", cc::CodeCache)
print(io, "CodeCache with $(mapreduce(length, +, values(cc.dict); init=0)) entries")
if !isempty(cc.dict)
Expand Down
120 changes: 120 additions & 0 deletions src/precompilation_cache.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Unique constant symbol identifying the persistent cache (a const, not a variable).
const CACHE_NAME = gensym(:CACHE)

# Return `true` while Julia is generating precompilation output.
function is_precompiling()
    return ccall(:jl_generating_output, Cint, ()) != 0
end

export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler

"""
    ci_cache_snapshot() -> IdDict

Take a snapshot of `GPUCompiler.GLOBAL_CI_CACHES`. This is copy-plus-filter in
a single pass: the `CodeCache(cache)` copy constructor keeps only code
instances valid for an unbounded world range, so the snapshot holds only
entries that cannot be invalidated later.
"""
function ci_cache_snapshot()
    snapshot = IdDict()
    for (key, cache) in GPUCompiler.GLOBAL_CI_CACHES
        snapshot[key] = GPUCompiler.CodeCache(cache)
    end
    return snapshot
end

"""
    ci_cache_delta(previous_snapshot) -> IdDict

Compute the difference between the current state of the global caches and
`previous_snapshot` (as returned by [`ci_cache_snapshot`](@ref)): every code
instance present now that was not present in the snapshot.
"""
function ci_cache_delta(previous_snapshot)
    current_snapshot = ci_cache_snapshot()
    delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
    for (cachekey, codecache) in current_snapshot
        if !haskey(previous_snapshot, cachekey)
            # This whole cache is absent from the previous snapshot: take it wholesale.
            delta_snapshot[cachekey] = codecache
            continue
        end
        prev_cache = previous_snapshot[cachekey]
        for (mi, civ) in codecache.dict
            if !haskey(prev_cache.dict, mi)
                # New method instance: all of its code instances belong to the delta.
                target = get!(() -> GPUCompiler.CodeCache(), delta_snapshot, cachekey)
                target.dict[mi] = civ
            else
                prev_civ = prev_cache.dict[mi]
                for ci in civ
                    if !(ci in prev_civ)
                        # Lazily create the per-cache and per-MethodInstance slots.
                        target = get!(() -> GPUCompiler.CodeCache(), delta_snapshot, cachekey)
                        out_civ = get!(() -> Vector{CodeInstance}(), target.dict, mi)
                        push!(out_civ, ci)
                    end
                end
            end
        end
    end
    return delta_snapshot
end

"""
    ci_cache_insert(cache)

Merge a persisted cache (as produced by [`ci_cache_delta`](@ref)) into
`GPUCompiler.GLOBAL_CI_CACHES`. Does nothing while precompiling.
"""
function ci_cache_insert(cache)
    is_precompiling() && return nothing

    # First clean the incoming cache: drop code instances whose world range is
    # empty (min_world > max_world), i.e. entries invalidated since being saved.
    cleaned_cache = IdDict()
    for (key, c) in cache
        newCodeCache = GPUCompiler.CodeCache()
        for (mi, civ) in c.dict
            new_civ = filter(ci -> ci.min_world <= ci.max_world, civ)
            if !isempty(new_civ)
                newCodeCache.dict[mi] = new_civ
            end
        end
        if !isempty(newCodeCache.dict)
            cleaned_cache[key] = newCodeCache
        end
    end

    # Merge the cleaned caches into the global caches at the code-instance level.
    for (key, local_cache) in cleaned_cache
        if !haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
            # No conflict at the cache level. NOTE: insert the *cleaned* cache;
            # the previous code inserted the uncleaned `cache[key]` here,
            # bypassing the invalidation filtering above.
            GPUCompiler.GLOBAL_CI_CACHES[key] = local_cache
            continue
        end
        global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
        for (mi, civ) in local_cache.dict
            # Expected to be exactly one entry, since only one world range can
            # be unbounded per MethodInstance.
            @assert length(civ) == 1
            ci = civ[1]
            if haskey(global_cache.dict, mi)
                gciv = global_cache.dict[mi]
                # Sort in place by min world age so the newest entry comes last.
                # (Previously `sort` was used and its return value discarded,
                # leaving `gciv` unsorted.)
                sort!(gciv, by = x -> x.min_world)
                if ci.min_world > gciv[end].min_world
                    # Truncate the currently-live entry, then store the new one.
                    invalidate_code_cache(global_cache, mi, ci.min_world - 1)
                    Core.Compiler.setindex!(global_cache, ci, mi)
                else
                    # Previously `println` + `@assert false`; raise a real error
                    # so the failure cannot be compiled away with asserts off.
                    error("ci_cache_insert: incoming code instance is not newer than the existing entries")
                end
            else
                # Occurs if everything for this MethodInstance was invalidated
                # in the parent and we now store into the child.
                Core.Compiler.setindex!(global_cache, ci, mi)
            end
        end
    end
    return nothing
end

"""
Given a function and param types caches the function to the global cache
"""
function precompile_gpucompiler(job)
# populate the cache
cache = GPUCompiler.ci_cache(job)
mt = GPUCompiler.method_table(job)
interp = GPUCompiler.get_interpreter(job)
if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing
GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
end
end
26 changes: 26 additions & 0 deletions test/ExamplePersistentCache/GPUKernel.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module GPUKernel

using GPUCompiler
using TestRuntime

# Record the state of the global caches before any compilation happens.
snapshot = GPUCompiler.ci_cache_snapshot()

struct TestCompilerParams <: AbstractCompilerParams end
GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime

kernel() = nothing

function main()
    config = CompilerConfig(NativeCompilerTarget(), TestCompilerParams())
    job = CompilerJob(methodinstance(typeof(kernel), Tuple{}), config)
    println(GPUCompiler.compile(:asm, job)[1])
end

main()

# Everything compiled since the snapshot becomes the persistent cache.
const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)

function __init__()
    # Re-inject the persisted entries when the module is loaded.
    GPUCompiler.ci_cache_insert(persistent_cache)
end

end # module GPUKernel
20 changes: 20 additions & 0 deletions test/ExamplePersistentCache/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Persistent cache API:

GPUCompiler.ci_cache_snapshot() -> cache: returns a filtered snapshot of GLOBAL_CI_CACHES
(keeping only entries with an unbounded world range), used as the base point for
what will be persistently cached.

GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: returns the entries added to
GLOBAL_CI_CACHES since `snapshot` was taken (i.e. current GLOBAL_CI_CACHES minus snapshot)

GPUCompiler.ci_cache_insert(snapshot::cache): merges the given cache into GLOBAL_CI_CACHES,
dropping any entries that have since been invalidated


Usage:
snapshot = GPUCompiler.ci_cache_snapshot()
... precompile work ...
const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)

function __init__()
GPUCompiler.ci_cache_insert(persistent_cache)
... rest of init logic ...
end
8 changes: 8 additions & 0 deletions test/ExamplePersistentCache/TestRuntime.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Minimal stub runtime: every hook is a no-op so kernels can be compiled
# without a real device runtime backing them.
module TestRuntime

signal_exception() = nothing
malloc(sz) = C_NULL
report_oom(sz) = nothing
report_exception(ex) = nothing
report_exception_name(ex) = nothing
report_exception_frame(idx, func, file, line) = nothing

end # module TestRuntime
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[deps]
Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
Metal_LLVM_Tools_jll = "0418c028-ff8c-56b8-a53e-0f9676ed36fc"
Expand Down