diff --git a/src/cache.jl b/src/cache.jl
index fa71ab19..7a38a88e 100644
--- a/src/cache.jl
+++ b/src/cache.jl
@@ -26,7 +26,6 @@ function cached_compilation(cache::AbstractDict{UInt,V},
     key = hash(tt, key)
     key = hash(world, key)
     key = hash(cfg, key)
-
     # NOTE: no use of lock(::Function)/@lock/get! to avoid try/catch and closure overhead
     lock(cache_lock)
     obj = get(cache, key, nothing)
@@ -36,6 +35,7 @@ function cached_compilation(cache::AbstractDict{UInt,V},
     if obj === nothing || compile_hook[] !== nothing
         obj = actual_compilation(cache, key, cfg, ft, tt, compiler, linker)::V
     end
+
     return obj::V
 end
 
@@ -45,10 +45,14 @@ end
     src = methodinstance(ft, tt)
     job = CompilerJob(src, cfg)
 
+    global_cache = ci_cache(job)
     asm = nothing
-    # TODO: consider loading the assembly from an on-disk cache here
 
-    # compile
+    # read asm from persistent offline cache
+    if haskey(global_cache.asm, src)
+        asm = global_cache.asm[src]
+    end
+
     if asm === nothing
         asm = compiler(job)
     end
@@ -57,7 +61,7 @@ end
     # in which case the cache will already be populated)
     lock(cache_lock) do
         haskey(cache, key) && return cache[key]
-
+        global_cache.asm[src] = asm
         obj = linker(job, asm)
         cache[key] = obj
         obj
diff --git a/src/jlgen.jl b/src/jlgen.jl
index 74c0fd4e..44e95a0a 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -255,16 +255,19 @@ using Core.Compiler: CodeInstance, MethodInstance, InferenceParams, Optimization
 
 struct CodeCache
     dict::IdDict{MethodInstance,Vector{CodeInstance}}
+    asm::IdDict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}} # compiler output per method instance
 
-    CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}())
-    CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict))
+    CodeCache() = new(IdDict{MethodInstance,Vector{CodeInstance}}(),
+        IdDict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}}())
+    # copy constructor: filters `dict` and copies `asm`, so the new cache does not alias the source's mutable asm table
+    CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict), copy(cache.asm))
 end
 
 function copyAndFilter(dict::IdDict)
     out= IdDict()
     for key in keys(dict)
         useKey = true
-        # why is it an array of code instances, can there be more than 1?
+
         for ci in dict[key]
             if ci.max_world < typemax(typeof(ci.max_world))
                 useKey = false
@@ -590,7 +593,6 @@ end
 
 function ci_cache_populate(interp, cache, mt, mi, min_world, max_world)
     src = Core.Compiler.typeinf_ext_toplevel(interp, mi)
-
     # inference populates the cache, so we don't need to jl_get_method_inferred
     wvc = WorldView(cache, min_world, max_world)
     @assert Core.Compiler.haskey(wvc, mi)
@@ -622,7 +624,6 @@ function ci_cache_lookup(cache, mi, min_world, max_world)
     return ci
 end
 
-
 ## interface
 
 # for platforms without @cfunction-with-closure support
diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index 1bc5c19a..138d850e 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -7,25 +7,33 @@ function ci_cache_snapshot()
     cleaned_cache_to_save = IdDict()
     for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
         # Will only keep those elements with infinite ranges
+        # copy constructor
         cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
     end
+
     return cleaned_cache_to_save
 end
 
 function ci_cache_delta(previous_snapshot)
     current_snapshot = ci_cache_snapshot()
     delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
-    for (cachekey, codecache) in current_snapshot
+    for (cachekey, codecache) in current_snapshot # iterate through all caches
         if cachekey in keys(previous_snapshot)
-            for (mi, civ) in codecache.dict
+            for (mi, civ) in codecache.dict # iterate through all mi
                 if mi in keys(previous_snapshot[cachekey].dict)
                     for ci in civ
                         if !(ci in previous_snapshot[cachekey].dict[mi])
                             if !(cachekey in keys(delta_snapshot))
                                 delta_snapshot[cachekey] = GPUCompiler.CodeCache()
                                 delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}()
+                                if haskey(codecache.asm, mi)
+                                    delta_snapshot[cachekey].asm[mi] = codecache.asm[mi]
+                                end
                             elseif !(mi in keys(delta_snapshot[cachekey].dict))
                                 delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}()
+                                if haskey(codecache.asm, mi)
+                                    delta_snapshot[cachekey].asm[mi] = codecache.asm[mi]
+                                end
                             end
 
                             push!(delta_snapshot[cachekey].dict[mi], ci)
@@ -36,6 +44,10 @@ function ci_cache_delta(previous_snapshot)
                     if !(cachekey in keys(delta_snapshot))
                         delta_snapshot[cachekey] = GPUCompiler.CodeCache()
                     end
+                    
+                    if haskey(codecache.asm, mi)
+                        delta_snapshot[cachekey].asm[mi] = codecache.asm[mi]
+                    end
                     delta_snapshot[cachekey].dict[mi] = civ
                 end
             end
@@ -43,59 +55,34 @@ function ci_cache_delta(previous_snapshot)
             delta_snapshot[cachekey] = current_snapshot[cachekey]
         end
     end
+
     return delta_snapshot
 end
 
+# Debug helper: print each method instance in `caches` with its code-instance count.
+function print_keys(caches)
+    banner = "************"
+    println(banner)
+    for (_, codecache) in caches, (mi, instances) in codecache.dict
+        println("$mi -> $(length(instances))")
+    end
+    println(banner)
+end
 function ci_cache_insert(cache)
     if !is_precompiling()
-        #first clean the cache
-        cleaned_cache = IdDict()
-        for (key, c) in cache
-            usedCache = false
-            newCodeCache = GPUCompiler.CodeCache()
-            for (mi, civ) in c.dict
-                new_civ = Vector()
-                for ci in civ
-                    if ci.min_world <= ci.max_world
-                        push!(new_civ, ci)
-                    end
-                end
-                if length(new_civ) > 0
-                    usedCache = true
-                    newCodeCache.dict[mi] = new_civ
-                end
-            end
-            if usedCache
-                cleaned_cache[key] = newCodeCache
-            end
-        end
-
         # need to merge caches at the code instance level
-        for (key, local_cache) in cleaned_cache
+        for (key, local_cache) in cache
             if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
                 global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
-                #local_cache = cache[key]
                 for (mi, civ) in (local_cache.dict)
                     # this should be one since there is only one range that is infinite
                     @assert length(civ) == 1
                     # add all code instances to global cache
                     # could move truncating code to set index
-                    ci = civ[1]
-                    if haskey(global_cache.dict, mi)
-                        gciv = global_cache.dict[mi]
-                        # truncation cod3
-                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
-                        sort(gciv, by=x->x.min_world)
-                        if ci.min_world > gciv[length(gciv)].min_world
-                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
-                            Core.Compiler.setindex!(global_cache, ci, mi)
-                        else
-                            println("Should not get here?")
-                            @assert false
-                        end
-                    else
-                        # occurs if we kill everything in the parent and then need to store in child
-                        Core.Compiler.setindex!(global_cache, ci, mi)
+                    Core.Compiler.setindex!(global_cache, civ[1], mi)
+                    #@assert haskey(local_cache.asm, mi)
+                    if haskey(local_cache.asm, mi)
+                        global_cache.asm[mi] = local_cache.asm[mi]
                     end
                 end
             else
@@ -118,3 +105,20 @@ function precompile_gpucompiler(job)
         GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
     end
 end
+
+"""
+    generate_precompilation_file(snapshot, filename, precompilation_function)
+
+Write to `filename` one `precompilation_function(...)` call per method instance in `snapshot`,
+built from the first and the remaining parameters of its `specTypes`, one call per line.
+"""
+function generate_precompilation_file(snapshot, filename, precompilation_function)
+    statements = String[]
+    for (_, codecache) in snapshot, mi in keys(codecache.dict)
+        sig = mi.specTypes.parameters
+        push!(statements, "$precompilation_function($(sig[1]), Core.$(sig[2:length(sig)]))")
+    end
+    open(filename, "w") do io
+        write(io, join(statements, '\n'))
+    end
+end