diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 34f3fbd6..8d477705 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -40,7 +40,10 @@ include("cache.jl") include("execution.jl") include("reflection.jl") + include("precompile.jl") +include("precompilation_cache.jl") + _precompile_() function __init__() diff --git a/src/cache.jl b/src/cache.jl index fa71ab19..7a38a88e 100644 --- a/src/cache.jl +++ b/src/cache.jl @@ -26,7 +26,6 @@ function cached_compilation(cache::AbstractDict{UInt,V}, key = hash(tt, key) key = hash(world, key) key = hash(cfg, key) - # NOTE: no use of lock(::Function)/@lock/get! to avoid try/catch and closure overhead lock(cache_lock) obj = get(cache, key, nothing) @@ -36,6 +35,7 @@ function cached_compilation(cache::AbstractDict{UInt,V}, if obj === nothing || compile_hook[] !== nothing obj = actual_compilation(cache, key, cfg, ft, tt, compiler, linker)::V end + return obj::V end @@ -45,10 +45,14 @@ end src = methodinstance(ft, tt) job = CompilerJob(src, cfg) + global_cache = ci_cache(job) asm = nothing - # TODO: consider loading the assembly from an on-disk cache here - # compile + # read asm from persistent offline cache + if haskey(global_cache.asm, src) + asm = global_cache.asm[src] + end + if asm === nothing asm = compiler(job) end @@ -57,7 +61,7 @@ end # in which case the cache will already be populated) lock(cache_lock) do haskey(cache, key) && return cache[key] - + global_cache.asm[src] = asm obj = linker(job, asm) cache[key] = obj obj diff --git a/src/jlgen.jl b/src/jlgen.jl index 5a5304b8..44e95a0a 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -255,10 +255,33 @@ using Core.Compiler: CodeInstance, MethodInstance, InferenceParams, Optimization struct CodeCache dict::IdDict{MethodInstance,Vector{CodeInstance}} + asm::IdDict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}} - CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}()) + CodeCache() = 
new(Dict{MethodInstance,Vector{CodeInstance}}(), + Dict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}}()) + + CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict), cache.asm) end +function copyAndFilter(dict::IdDict) + out= IdDict() + for key in keys(dict) + useKey = true + + for ci in dict[key] + if ci.max_world < typemax(typeof(ci.max_world)) + useKey = false + break + end + end + if useKey + out[key] = dict[key] + end + end + return out +end + + function Base.show(io::IO, ::MIME"text/plain", cc::CodeCache) print(io, "CodeCache with $(mapreduce(length, +, values(cc.dict); init=0)) entries") if !isempty(cc.dict) @@ -570,7 +593,6 @@ end function ci_cache_populate(interp, cache, mt, mi, min_world, max_world) src = Core.Compiler.typeinf_ext_toplevel(interp, mi) - # inference populates the cache, so we don't need to jl_get_method_inferred wvc = WorldView(cache, min_world, max_world) @assert Core.Compiler.haskey(wvc, mi) @@ -602,7 +624,6 @@ function ci_cache_lookup(cache, mi, min_world, max_world) return ci end - ## interface # for platforms without @cfunction-with-closure support diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl new file mode 100644 index 00000000..138d850e --- /dev/null +++ b/src/precompilation_cache.jl @@ -0,0 +1,124 @@ +const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) +is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 + +export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler + +function ci_cache_snapshot() + cleaned_cache_to_save = IdDict() + for key in keys(GPUCompiler.GLOBAL_CI_CACHES) + # Will only keep those elements with infinite ranges + # copy constructor + cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) + end + + return cleaned_cache_to_save +end + +function ci_cache_delta(previous_snapshot) + current_snapshot = ci_cache_snapshot() + 
delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}() + for (cachekey, codecache) in current_snapshot # iterate through all caches + if cachekey in keys(previous_snapshot) + for (mi, civ) in codecache.dict # iterate through all mi + if mi in keys(previous_snapshot[cachekey].dict) + for ci in civ + if !(ci in previous_snapshot[cachekey].dict[mi]) + if !(cachekey in keys(delta_snapshot)) + delta_snapshot[cachekey] = GPUCompiler.CodeCache() + delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end + elseif !(mi in keys(delta_snapshot[cachekey].dict)) + delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end + end + + push!(delta_snapshot[cachekey].dict[mi], ci) + end + end + else + # this whole cache is not present in the previous snapshot, can add all + if !(cachekey in keys(delta_snapshot)) + delta_snapshot[cachekey] = GPUCompiler.CodeCache() + end + + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end + delta_snapshot[cachekey].dict[mi] = civ + end + end + else + delta_snapshot[cachekey] = current_snapshot[cachekey] + end + end + + return delta_snapshot +end + +function print_keys(caches) + println("************") + for (key, cache) in caches + for (mi, civ) in cache.dict + println("$mi -> $(length(civ))") + end + end + println("************") +end +function ci_cache_insert(cache) + if !is_precompiling() + # need to merge caches at the code instance level + for (key, local_cache) in cache + if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) + global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] + for (mi, civ) in (local_cache.dict) + # this should be one since there is only one range that is infinite + @assert length(civ) == 1 + # add all code instances to global cache + # 
could move truncating code to set index + Core.Compiler.setindex!(global_cache, civ[1], mi) + #@assert haskey(local_cache.asm, mi) + if haskey(local_cache.asm, mi) + global_cache.asm[mi] = local_cache.asm[mi] + end + end + else + # no conflict at cache level + GPUCompiler.GLOBAL_CI_CACHES[key] = cache[key] + end + end + end +end + +""" +Given a function and param types caches the function to the global cache +""" +function precompile_gpucompiler(job) + # populate the cache + cache = GPUCompiler.ci_cache(job) + mt = GPUCompiler.method_table(job) + interp = GPUCompiler.get_interpreter(job) + if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing + GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) + end +end + +""" +Generate a precompile file for the current state of the cache +""" +function generate_precompilation_file(snapshot, filename, precompilation_function) + method_instances = [] + for (cachekey, cache) in snapshot + for (mi, civ) in cache.dict + push!(method_instances, mi) + end + end + + precompile_statements = join(["$precompilation_function($(mi.specTypes.parameters[1]), Core.$(mi.specTypes.parameters[2:length(mi.specTypes.parameters)]))" for mi in method_instances], '\n') + open(filename, "w") do file + write(file, precompile_statements) + end +end diff --git a/test/ExamplePersistentCache/GPUKernel.jl b/test/ExamplePersistentCache/GPUKernel.jl new file mode 100644 index 00000000..628e50f1 --- /dev/null +++ b/test/ExamplePersistentCache/GPUKernel.jl @@ -0,0 +1,26 @@ +module GPUKernel +using GPUCompiler +using TestRuntime +snapshot = GPUCompiler.ci_cache_snapshot() + +struct TestCompilerParams <: AbstractCompilerParams end +GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime + +kernel() = nothing +function main() + source = methodinstance(typeof(kernel), Tuple{}) + target = NativeCompilerTarget() + params = TestCompilerParams() + config = 
CompilerConfig(target, params) + job = CompilerJob(source, config) + + println(GPUCompiler.compile(:asm, job)[1]) +end + +main() +const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) + +function __init__() + GPUCompiler.ci_cache_insert(persistent_cache) +end +end # module GPUKernel diff --git a/test/ExamplePersistentCache/README.txt b/test/ExamplePersistentCache/README.txt new file mode 100644 index 00000000..80462fd5 --- /dev/null +++ b/test/ExamplePersistentCache/README.txt @@ -0,0 +1,20 @@ +Persistent Cache API: + +GPUCompiler.ci_cache_snapshot() -> cache: returns a snapshot of GLOBAL_CI_CACHES used +as a base point for what will be persistently cached. + +GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: takes a snapshot and returns +the cache that represents the difference (current GLOBAL_CI_CACHES - snapshot). + +GPUCompiler.ci_cache_insert(snapshot::cache): inserts snapshot into GLOBAL_CI_CACHES. + + +Usage: +snapshot = GPUCompiler.ci_cache_snapshot() +... precompile work ... +const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) + +function __init__() + GPUCompiler.ci_cache_insert(persistent_cache) + ... rest of init logic 
+end diff --git a/test/ExamplePersistentCache/TestRuntime.jl b/test/ExamplePersistentCache/TestRuntime.jl new file mode 100644 index 00000000..1d29e4ba --- /dev/null +++ b/test/ExamplePersistentCache/TestRuntime.jl @@ -0,0 +1,8 @@ +module TestRuntime + signal_exception() = return + malloc(sz) = C_NULL + report_oom(sz) = return + report_exception(ex) = return + report_exception_name(ex) = return + report_exception_frame(idx, func, file, line) = return +end # module TestRuntime diff --git a/test/Project.toml b/test/Project.toml index e602d235..002ebb1c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Metal_LLVM_Tools_jll = "0418c028-ff8c-56b8-a53e-0f9676ed36fc"