diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 34f3fbd6..8d477705 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -40,7 +40,10 @@ include("cache.jl") include("execution.jl") include("reflection.jl") + include("precompile.jl") +include("precompilation_cache.jl") + _precompile_() function __init__() diff --git a/src/cache.jl b/src/cache.jl index fa71ab19..7a38a88e 100644 --- a/src/cache.jl +++ b/src/cache.jl @@ -26,7 +26,6 @@ function cached_compilation(cache::AbstractDict{UInt,V}, key = hash(tt, key) key = hash(world, key) key = hash(cfg, key) - # NOTE: no use of lock(::Function)/@lock/get! to avoid try/catch and closure overhead lock(cache_lock) obj = get(cache, key, nothing) @@ -36,6 +35,7 @@ function cached_compilation(cache::AbstractDict{UInt,V}, if obj === nothing || compile_hook[] !== nothing obj = actual_compilation(cache, key, cfg, ft, tt, compiler, linker)::V end + return obj::V end @@ -45,10 +45,14 @@ end src = methodinstance(ft, tt) job = CompilerJob(src, cfg) + global_cache = ci_cache(job) asm = nothing - # TODO: consider loading the assembly from an on-disk cache here - # compile + # read asm from persistent offline cache + if haskey(global_cache.asm, src) + asm = global_cache.asm[src] + end + if asm === nothing asm = compiler(job) end @@ -57,7 +61,7 @@ end # in which case the cache will already be populated) lock(cache_lock) do haskey(cache, key) && return cache[key] - + global_cache.asm[src] = asm obj = linker(job, asm) cache[key] = obj obj diff --git a/src/jlgen.jl b/src/jlgen.jl index 5a5304b8..44e95a0a 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -255,10 +255,33 @@ using Core.Compiler: CodeInstance, MethodInstance, InferenceParams, Optimization struct CodeCache dict::IdDict{MethodInstance,Vector{CodeInstance}} + asm::IdDict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}} - CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}()) + CodeCache() = 
new(Dict{MethodInstance,Vector{CodeInstance}}(), + Dict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}}()) + + CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict), cache.asm) end +function copyAndFilter(dict::IdDict) + out= IdDict() + for key in keys(dict) + useKey = true + + for ci in dict[key] + if ci.max_world < typemax(typeof(ci.max_world)) + useKey = false + break + end + end + if useKey + out[key] = dict[key] + end + end + return out +end + + function Base.show(io::IO, ::MIME"text/plain", cc::CodeCache) print(io, "CodeCache with $(mapreduce(length, +, values(cc.dict); init=0)) entries") if !isempty(cc.dict) @@ -570,7 +593,6 @@ end function ci_cache_populate(interp, cache, mt, mi, min_world, max_world) src = Core.Compiler.typeinf_ext_toplevel(interp, mi) - # inference populates the cache, so we don't need to jl_get_method_inferred wvc = WorldView(cache, min_world, max_world) @assert Core.Compiler.haskey(wvc, mi) @@ -602,7 +624,6 @@ function ci_cache_lookup(cache, mi, min_world, max_world) return ci end - ## interface # for platforms without @cfunction-with-closure support diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl new file mode 100644 index 00000000..138d850e --- /dev/null +++ b/src/precompilation_cache.jl @@ -0,0 +1,124 @@ +const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) +is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 + +export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler + +function ci_cache_snapshot() + cleaned_cache_to_save = IdDict() + for key in keys(GPUCompiler.GLOBAL_CI_CACHES) + # Will only keep those elements with infinite ranges + # copy constructor + cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) + end + + return cleaned_cache_to_save +end + +function ci_cache_delta(previous_snapshot) + current_snapshot = ci_cache_snapshot() + 
delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}() + for (cachekey, codecache) in current_snapshot # iterate through all caches + if cachekey in keys(previous_snapshot) + for (mi, civ) in codecache.dict # iterate through all mi + if mi in keys(previous_snapshot[cachekey].dict) + for ci in civ + if !(ci in previous_snapshot[cachekey].dict[mi]) + if !(cachekey in keys(delta_snapshot)) + delta_snapshot[cachekey] = GPUCompiler.CodeCache() + delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end + elseif !(mi in keys(delta_snapshot[cachekey].dict)) + delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end + end + + push!(delta_snapshot[cachekey].dict[mi], ci) + end + end + else + # this whole cache is not present in the previous snapshot, can add all + if !(cachekey in keys(delta_snapshot)) + delta_snapshot[cachekey] = GPUCompiler.CodeCache() + end + + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end + delta_snapshot[cachekey].dict[mi] = civ + end + end + else + delta_snapshot[cachekey] = current_snapshot[cachekey] + end + end + + return delta_snapshot +end + +function print_keys(caches) + println("************") + for (key, cache) in caches + for (mi, civ) in cache.dict + println("$mi -> $(length(civ))") + end + end + println("************") +end +function ci_cache_insert(cache) + if !is_precompiling() + # need to merge caches at the code instance level + for (key, local_cache) in cache + if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) + global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] + for (mi, civ) in (local_cache.dict) + # this should be one since there is only one range that is infinite + @assert length(civ) == 1 + # add all code instances to global cache + # 
could move truncating code to set index + Core.Compiler.setindex!(global_cache, civ[1], mi) + #@assert haskey(local_cache.asm, mi) + if haskey(local_cache.asm, mi) + global_cache.asm[mi] = local_cache.asm[mi] + end + end + else + # no conflict at cache level + GPUCompiler.GLOBAL_CI_CACHES[key] = cache[key] + end + end + end +end + +""" +Given a function and param types caches the function to the global cache +""" +function precompile_gpucompiler(job) + # populate the cache + cache = GPUCompiler.ci_cache(job) + mt = GPUCompiler.method_table(job) + interp = GPUCompiler.get_interpreter(job) + if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing + GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) + end +end + +""" +Generate a precompile file for the current state of the cache +""" +function generate_precompilation_file(snapshot, filename, precompilation_function) + method_instances = [] + for (cachekey, cache) in snapshot + for (mi, civ) in cache.dict + push!(method_instances, mi) + end + end + + precompile_statements = join(["$precompilation_function($(mi.specTypes.parameters[1]), Core.$(mi.specTypes.parameters[2:length(mi.specTypes.parameters)]))" for mi in method_instances], '\n') + open(filename, "w") do file + write(file, precompile_statements) + end +end diff --git a/test/ExamplePersistentCache/GPUKernel.jl b/test/ExamplePersistentCache/GPUKernel.jl new file mode 100644 index 00000000..628e50f1 --- /dev/null +++ b/test/ExamplePersistentCache/GPUKernel.jl @@ -0,0 +1,26 @@ +module GPUKernel +using GPUCompiler +using TestRuntime +snapshot = GPUCompiler.ci_cache_snapshot() + +struct TestCompilerParams <: AbstractCompilerParams end +GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime + +kernel() = nothing +function main() + source = methodinstance(typeof(kernel), Tuple{}) + target = NativeCompilerTarget() + params = TestCompilerParams() + config = 
CompilerConfig(target, params) + job = CompilerJob(source, config) + + println(GPUCompiler.compile(:asm, job)[1]) +end + +main() +const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) + +function __init__() + GPUCompiler.ci_cache_insert(persistent_cache) +end +end # module GPUKernel diff --git a/test/ExamplePersistentCache/README.txt b/test/ExamplePersistentCache/README.txt new file mode 100644 index 00000000..80462fd5 --- /dev/null +++ b/test/ExamplePersistentCache/README.txt @@ -0,0 +1,20 @@ +Persistent Cache API: + +GPUCompiler.ci_cache_snapshot() -> cache: returns a snapshot of GLOBAL_CI_CACHES used +as a base point for what will be persistently cached. + +GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: takes a snapshot and returns +the cache that represents the difference (current GLOBAL_CI_CACHES - snapshot). + +GPUCompiler.ci_cache_insert(snapshot::cache): inserts snapshot into GLOBAL_CI_CACHES. + + +Usage: +snapshot = GPUCompiler.ci_cache_snapshot() +... precompile work ... +const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) + +function __init__() + GPUCompiler.ci_cache_insert(persistent_cache) + ... rest of init logic 
+end diff --git a/test/ExamplePersistentCache/TestRuntime.jl b/test/ExamplePersistentCache/TestRuntime.jl new file mode 100644 index 00000000..1d29e4ba --- /dev/null +++ b/test/ExamplePersistentCache/TestRuntime.jl @@ -0,0 +1,8 @@ +module TestRuntime + signal_exception() = return + malloc(sz) = C_NULL + report_oom(sz) = return + report_exception(ex) = return + report_exception_name(ex) = return + report_exception_frame(idx, func, file, line) = return +end # module TestRuntime diff --git a/test/Project.toml b/test/Project.toml index e602d235..002ebb1c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Metal_LLVM_Tools_jll = "0418c028-ff8c-56b8-a53e-0f9676ed36fc"