diff --git a/src/driver.jl b/src/driver.jl
index 728e2763..e5d8b2c8 100644
--- a/src/driver.jl
+++ b/src/driver.jl
@@ -42,7 +42,7 @@ end
 ## deferred compilation
 
 """
-    var"gpuc.deferred"(f, args...)::Ptr{Cvoid}
+    var"gpuc.deferred"(meta, f, args...)::Ptr{Cvoid}
 
 As if we were to call `f(args...)`, but instead we are putting down
 a marker and returning a function pointer to later
@@ -154,10 +154,11 @@ const __llvm_initialized = Ref(false)
 
     @timeit_debug to "IR generation" begin
         ir, compiled = irgen(job)
+        edge = Edge(inference_metadata(job), job.source)
         if job.config.entry_abi === :specfunc
-            entry_fn = compiled[job.source].specfunc
+            entry_fn = compiled[edge].specfunc
         else
-            entry_fn = compiled[job.source].func
+            entry_fn = compiled[edge].func
         end
         entry = functions(ir)[entry_fn]
     end
@@ -198,24 +199,28 @@ const __llvm_initialized = Ref(false)
             return val
         end
 
-        worklist = Dict{Any, Vector{LLVM.CallInst}}()
+        worklist = Dict{Edge, Vector{LLVM.CallInst}}()
         for use in uses(dyn_marker)
             # decode the call
             call = user(use)::LLVM.CallInst
-            dyn_mi_inst = find_base_object(operands(call)[1])
+            dyn_meta_inst = find_base_object(operands(call)[1])
+            @compiler_assert isa(dyn_meta_inst, LLVM.ConstantInt) job
+            dyn_mi_inst = find_base_object(operands(call)[2])
             @compiler_assert isa(dyn_mi_inst, LLVM.ConstantInt) job
+            dyn_meta = Base.unsafe_pointer_to_objref(
+                convert(Ptr{Cvoid}, convert(Int, dyn_meta_inst)))
             dyn_mi = Base.unsafe_pointer_to_objref(
-                convert(Ptr{Cvoid}, convert(Int, dyn_mi_inst)))
-            push!(get!(worklist, dyn_mi, LLVM.CallInst[]), call)
+                convert(Ptr{Cvoid}, convert(Int, dyn_mi_inst)))::MethodInstance
+            push!(get!(worklist, Edge(dyn_meta, dyn_mi), LLVM.CallInst[]), call)
         end
 
-        for dyn_mi in keys(worklist)
-            dyn_fn_name = compiled[dyn_mi].specfunc
+        for dyn_edge in keys(worklist)
+            dyn_fn_name = compiled[dyn_edge].specfunc
             dyn_fn = functions(ir)[dyn_fn_name]
 
             # insert a pointer to the function everywhere the entry is used
             T_ptr = convert(LLVMType, Ptr{Cvoid})
-            for call in worklist[dyn_mi]
+            for call in worklist[dyn_edge]
                 @dispose builder=IRBuilder() begin
                     position!(builder, call)
                     fptr = if LLVM.version() >= v"17"
diff --git a/src/interface.jl b/src/interface.jl
index 1346c858..a8af2d9c 100644
--- a/src/interface.jl
+++ b/src/interface.jl
@@ -89,6 +89,7 @@ Several keyword arguments can be used to customize the compilation process:
 struct CompilerConfig{T,P}
     target::T
     params::P
+    meta
 
     kernel::Bool
     name::Union{Nothing,String}
@@ -98,6 +99,7 @@ struct CompilerConfig{T,P}
 
     function CompilerConfig(target::AbstractCompilerTarget,
                             params::AbstractCompilerParams;
+                            meta = nothing,
                             kernel=true,
                             name=nothing,
                             entry_abi=:specfunc,
@@ -106,16 +108,16 @@ struct CompilerConfig{T,P}
         if entry_abi ∉ (:specfunc, :func)
             error("Unknown entry_abi=$entry_abi")
         end
-        new{typeof(target), typeof(params)}(target, params, kernel, name, entry_abi,
+        new{typeof(target), typeof(params)}(target, params, meta, kernel, name, entry_abi,
                                             always_inline, opt_level)
     end
 end
 
 # copy constructor
-CompilerConfig(cfg::CompilerConfig; target=cfg.target, params=cfg.params,
+CompilerConfig(cfg::CompilerConfig; target=cfg.target, params=cfg.params, meta=cfg.meta,
               kernel=cfg.kernel, name=cfg.name, entry_abi=cfg.entry_abi,
               always_inline=cfg.always_inline, opt_level=cfg.opt_level) =
-    CompilerConfig(target, params; kernel, entry_abi, name, always_inline, opt_level)
+    CompilerConfig(target, params; meta, kernel, entry_abi, name, always_inline, opt_level)
 
 function Base.show(io::IO, @nospecialize(cfg::CompilerConfig{T})) where {T}
     print(io, "CompilerConfig for ", T)
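The only user-visible change to `CompilerConfig` is the new `meta` keyword (defaulting to `nothing`), which the copy constructor preserves. A minimal sketch of the intended usage — `MyMeta`, `target`, and `params` are hypothetical stand-ins:

```julia
struct MyMeta end  # hypothetical plugin metadata

cfg = CompilerConfig(target, params; kernel=true, meta=MyMeta())
cfg.meta                                  # MyMeta()

cfg2 = CompilerConfig(cfg; kernel=false)  # the copy keeps the metadata
cfg3 = CompilerConfig(cfg; meta=nothing)  # explicitly drop it
```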
"CompilerConfig for ", T) @@ -124,6 +126,7 @@ end function Base.hash(cfg::CompilerConfig, h::UInt) h = hash(cfg.target, h) h = hash(cfg.params, h) + h = hash(cfg.meta, h)::UInt h = hash(cfg.kernel, h) h = hash(cfg.name, h) @@ -178,15 +181,17 @@ runtime_module(@nospecialize(job::CompilerJob)) = error("Not implemented") # check if a function is an intrinsic that can assumed to be always available isintrinsic(@nospecialize(job::CompilerJob), fn::String) = false +inference_metadata(@nospecialize(job::CompilerJob)) = job.config.meta + # provide a specific interpreter to use. if VERSION >= v"1.11.0-DEV.1552" get_interpreter(@nospecialize(job::CompilerJob)) = - GPUInterpreter(job.world; method_table=method_table(job), + GPUInterpreter(job.world; meta=inference_metadata(job), method_table=method_table(job), token=ci_cache_token(job), inf_params=inference_params(job), opt_params=optimization_params(job)) else get_interpreter(@nospecialize(job::CompilerJob)) = - GPUInterpreter(job.world; method_table=method_table(job), + GPUInterpreter(job.world; meta=inference_metadata(job), method_table=method_table(job), code_cache=ci_cache(job), inf_params=inference_params(job), opt_params=optimization_params(job)) end @@ -227,10 +232,11 @@ struct GPUCompilerCacheToken target_type::Type always_inline::Bool method_table::Core.MethodTable + metadata end ci_cache_token(@nospecialize(job::CompilerJob)) = - GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job)) + GPUCompilerCacheToken(typeof(job.config.target), job.config.always_inline, method_table(job), inference_metadata(job)) # the codeinstance cache to use -- should only be used for the constructor if VERSION >= v"1.11.0-DEV.1552" diff --git a/src/irgen.jl b/src/irgen.jl index 874ed961..cdeceb3c 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -2,10 +2,11 @@ function irgen(@nospecialize(job::CompilerJob)) mod, compiled = @timeit_debug to "emission" compile_method_instance(job) + edge = Edge(inference_metadata(job), job.source) if job.config.entry_abi === :specfunc - entry_fn = compiled[job.source].specfunc + entry_fn = compiled[edge].specfunc else - entry_fn = compiled[job.source].func + entry_fn = compiled[edge].func end @assert entry_fn !== nothing entry = functions(mod)[entry_fn] @@ -70,25 +71,25 @@ function irgen(@nospecialize(job::CompilerJob)) entry = deprecation_marker end if job.config.entry_abi === :specfunc - func = compiled[job.source].func + func = compiled[edge].func specfunc = LLVM.name(entry) else func = LLVM.name(entry) - specfunc = compiled[job.source].specfunc + specfunc = compiled[edge].specfunc end - compiled[job.source] = - (; compiled[job.source].ci, func, specfunc) + compiled[edge] = + (; compiled[edge].ci, func, specfunc) # Earlier we sanitize global names, this invalidates the # func, specfunc names safed in compiled. Update the names now, # such that when when use the compiled mappings to lookup the # llvm function for a methodinstance (deferred codegen) we have # valid targets. - for mi in keys(compiled) - mi == job.source && continue - ci, func, specfunc = compiled[mi] - compiled[mi] = (; ci, func=safe_name(func), specfunc=safe_name(specfunc)) + for other in keys(compiled) + other == edge && continue + ci, func, specfunc = compiled[other] + compiled[other] = (; ci, func=safe_name(func), specfunc=safe_name(specfunc)) end # TODO: Should we rewrite gpuc.lookup here? @@ -111,11 +112,11 @@ function irgen(@nospecialize(job::CompilerJob)) # internalize all functions, but keep exported global variables. 
@@ -111,11 +112,11 @@ function irgen(@nospecialize(job::CompilerJob))
     # internalize all functions, but keep exported global variables.
     linkage!(entry, LLVM.API.LLVMExternalLinkage)
     preserved_gvs = String[LLVM.name(entry)]
-    for mi in keys(compiled)
+    for other in keys(compiled)
         # delay internalizing of deferred calls since
         # gpuc.lookup is not yet rewritten.
-        mi == job.source && continue
-        _, _, specfunc = compiled[mi]
+        other == edge && continue
+        _, _, specfunc = compiled[other]
         push!(preserved_gvs, specfunc) # this could be deleted if we rewrite gpuc.lookup earlier
     end
     for gvar in globals(mod)
diff --git a/src/jlgen.jl b/src/jlgen.jl
index c6be8c94..213b45c4 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -320,6 +320,7 @@ else
 end
 
 struct GPUInterpreter <: CC.AbstractInterpreter
+    meta::Any
     world::UInt
     method_table::GPUMethodTableView
 
@@ -336,6 +337,7 @@ end
 
 @static if HAS_INTEGRATED_CACHE
 function GPUInterpreter(world::UInt=Base.get_world_counter();
+                        meta = nothing,
                         method_table::MTType,
                         token::Any,
                         inf_params::CC.InferenceParams,
@@ -345,19 +347,20 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
     method_table = get_method_table_view(world, method_table)
     inf_cache = Vector{CC.InferenceResult}()
 
-    return GPUInterpreter(world, method_table,
+    return GPUInterpreter(meta, world, method_table,
                           token, inf_cache,
                           inf_params, opt_params)
 end
 
 function GPUInterpreter(interp::GPUInterpreter;
+                        meta=interp.meta,
                         world::UInt=interp.world,
                         method_table::GPUMethodTableView=interp.method_table,
                         token::Any=interp.token,
                         inf_cache::Vector{CC.InferenceResult}=interp.inf_cache,
                         inf_params::CC.InferenceParams=interp.inf_params,
                         opt_params::CC.OptimizationParams=interp.opt_params)
-    return GPUInterpreter(world, method_table,
+    return GPUInterpreter(meta, world, method_table,
                           token, inf_cache,
                           inf_params, opt_params)
 end
@@ -365,6 +368,7 @@ end
 else
 
 function GPUInterpreter(world::UInt=Base.get_world_counter();
+                        meta=nothing,
                         method_table::MTType,
                         code_cache::CodeCache,
                         inf_params::CC.InferenceParams,
@@ -374,19 +378,20 @@ function GPUInterpreter(world::UInt=Base.get_world_counter();
     method_table = get_method_table_view(world, method_table)
     inf_cache = Vector{CC.InferenceResult}()
 
-    return GPUInterpreter(world, method_table,
+    return GPUInterpreter(meta, world, method_table,
                           code_cache, inf_cache,
                           inf_params, opt_params)
 end
 
 function GPUInterpreter(interp::GPUInterpreter;
+                        meta=interp.meta,
                         world::UInt=interp.world,
                         method_table::GPUMethodTableView=interp.method_table,
                         code_cache::CodeCache=interp.code_cache,
                         inf_cache::Vector{CC.InferenceResult}=interp.inf_cache,
                         inf_params::CC.InferenceParams=interp.inf_params,
                         opt_params::CC.OptimizationParams=interp.opt_params)
-    return GPUInterpreter(world, method_table,
+    return GPUInterpreter(meta, world, method_table,
                           code_cache, inf_cache,
                           inf_params, opt_params)
 end
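Since `meta` feeds into both `Base.hash(::CompilerConfig)` and the cache token, jobs that differ only in their metadata never share inferred code. Roughly (hypothetical `MyMeta`; `source` and `cfg` assumed from context):

```julia
job1 = CompilerJob(source, CompilerConfig(cfg; meta=nothing))
job2 = CompilerJob(source, CompilerConfig(cfg; meta=MyMeta()))

GPUCompiler.ci_cache_token(job1) == GPUCompiler.ci_cache_token(job2)  # false
hash(job1.config) == hash(job2.config)  # false, barring hash collisions
```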
@@ -437,28 +442,76 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter,
 end
 
+within_gpucompiler() = false
+
 ## deferred compilation
 
 struct DeferredCallInfo <: CC.CallInfo
+    meta::Any
     rt::DataType
     info::CC.CallInfo
 end
 
 # recognize calls to gpuc.deferred and save DeferredCallInfo metadata
-function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f),
-                                arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState,
-                                max_methods::Int = CC.get_max_methods(interp, f, sv))
+# default implementation, extensible through the `meta` argument.
+# XXX: or should we dispatch on `f`?
+function abstract_call_known(meta::Nothing, interp::GPUInterpreter, @nospecialize(f),
+                             arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState,
+                             max_methods::Int = CC.get_max_methods(interp, f, sv))
     (; fargs, argtypes) = arginfo
 
     if f === var"gpuc.deferred"
-        argvec = argtypes[2:end]
+        argvec = argtypes[3:end]
         call = CC.abstract_call(interp, CC.ArgInfo(nothing, argvec), si, sv, max_methods)
-        callinfo = DeferredCallInfo(call.rt, call.info)
+        metaT = argtypes[2]
+        meta = CC.singleton_type(metaT)
+        if meta === nothing
+            if metaT isa Core.Const
+                meta = metaT.val
+            else
+                # `meta` is not a singleton type; the result may depend on runtime configuration
+                add_remark!(interp, sv, "Skipped gpuc.deferred since meta not constant")
+                @static if VERSION < v"1.11.0-"
+                    return CC.CallMeta(Union{}, CC.Effects(), CC.NoCallInfo())
+                else
+                    return CC.CallMeta(Union{}, Union{}, CC.Effects(), CC.NoCallInfo())
+                end
+            end
+        end
+
+        callinfo = DeferredCallInfo(meta, call.rt, call.info)
         @static if VERSION < v"1.11.0-"
             return CC.CallMeta(Ptr{Cvoid}, CC.Effects(), callinfo)
         else
             return CC.CallMeta(Ptr{Cvoid}, Union{}, CC.Effects(), callinfo)
         end
+    elseif f === within_gpucompiler
+        if length(argtypes) != 1
+            @static if VERSION < v"1.11.0-"
+                return CC.CallMeta(Union{}, CC.Effects(), CC.NoCallInfo())
+            else
+                return CC.CallMeta(Union{}, Union{}, CC.Effects(), CC.NoCallInfo())
+            end
+        end
+        @static if VERSION < v"1.11.0-"
+            return CC.CallMeta(Core.Const(true), CC.EFFECTS_TOTAL, CC.MethodResultPure())
+        else
+            return CC.CallMeta(Core.Const(true), Union{}, CC.EFFECTS_TOTAL, CC.MethodResultPure())
+        end
     end
+    return nothing
+end
+
+function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f),
+                                arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState,
+                                max_methods::Int = CC.get_max_methods(interp, f, sv))
+    candidate = abstract_call_known(interp.meta, interp, f, arginfo, si, sv, max_methods)
+    if candidate === nothing && interp.meta !== nothing
+        # fall back to the default handler when the plugin declined the call
+        candidate = abstract_call_known(nothing, interp, f, arginfo, si, sv, max_methods)
+    end
+    if candidate !== nothing
+        return candidate
+    end
+    return @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f,
                                          arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState,
                                          max_methods::Int)
+end
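Plugins hook in by adding `abstract_call_known` methods for their own metadata type and returning `nothing` to decline a call; the `CC.abstract_call_known` wrapper above then falls back to the `meta::Nothing` defaults. A minimal sketch of that contract, mirroring `test/plugin_testsetup.jl` further down — `MyMeta` and `my_marker` are hypothetical, and the four-argument `CallMeta` is the Julia 1.11+ form:

```julia
import GPUCompiler: abstract_call_known, GPUInterpreter
import Core.Compiler as CC

struct MyMeta end      # hypothetical plugin metadata
my_marker() = nothing  # folded to a constant during GPU inference below

function abstract_call_known(meta::MyMeta, interp::GPUInterpreter, @nospecialize(f),
                             arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState,
                             max_methods::Int)
    if f === my_marker
        # treat the call as pure and constant-fold it to the metadata value
        return CC.CallMeta(Core.Const(meta), Union{}, CC.EFFECTS_TOTAL,
                           CC.MethodResultPure())
    end
    return nothing  # decline: fall back to the default handlers
end
```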
@@ -485,23 +538,29 @@ function CC.handle_call!(todo::Vector{Pair{Int,Any}}, ir::CC.IRCode, idx::CC.Int
     args = Any[
         "extern gpuc.lookup",
         Ptr{Cvoid},
-        Core.svec(Any, Any, match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype
+        Core.svec(Any, Any, Any, match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype
         0,
         QuoteNode(:llvmcall),
+        info.meta,
         case.invoke,
-        stmt.args[2:end]...
+        stmt.args[3:end]...
     ]
     stmt.head = :foreigncall
     stmt.args = args
     return nothing
 end
 
+struct Edge
+    meta::Any
+    mi::MethodInstance
+end
+
 struct DeferredEdges
-    edges::Vector{MethodInstance}
+    edges::Vector{Edge}
 end
 
 function find_deferred_edges(ir::CC.IRCode)
-    edges = MethodInstance[]
+    edges = Edge[]
     # XXX: can we add this instead in handle_call?
     for stmt in ir.stmts
         inst = stmt[:inst]
@@ -509,8 +568,9 @@ function find_deferred_edges(ir::CC.IRCode)
         expr = inst::Expr
         if expr.head === :foreigncall &&
            expr.args[1] == "extern gpuc.lookup"
-            deferred_mi = expr.args[6]
-            push!(edges, deferred_mi)
+            deferred_meta = expr.args[6]
+            deferred_mi = expr.args[7]
+            push!(edges, Edge(deferred_meta, deferred_mi))
         end
     end
     unique!(edges)
@@ -542,6 +602,116 @@ function CC.finish(interp::GPUInterpreter, opt::CC.OptimizationState, ir::CC.IRC
     end
 end
 
+import .CC: CallInfo
+struct NoInlineCallInfo <: CallInfo
+    info::CallInfo # wrapped call
+    tt::Any # ::Type
+    kind::Symbol
+    NoInlineCallInfo(@nospecialize(info::CallInfo), @nospecialize(tt), kind::Symbol) =
+        new(info, tt, kind)
+end
+
+CC.nsplit_impl(info::NoInlineCallInfo) = CC.nsplit(info.info)
+CC.getsplit_impl(info::NoInlineCallInfo, idx::Int) = CC.getsplit(info.info, idx)
+CC.getresult_impl(info::NoInlineCallInfo, idx::Int) = CC.getresult(info.info, idx)
+
+struct AlwaysInlineCallInfo <: CallInfo
+    info::CallInfo # wrapped call
+    tt::Any # ::Type
+    AlwaysInlineCallInfo(@nospecialize(info::CallInfo), @nospecialize(tt)) = new(info, tt)
+end
+
+CC.nsplit_impl(info::AlwaysInlineCallInfo) = CC.nsplit(info.info)
+CC.getsplit_impl(info::AlwaysInlineCallInfo, idx::Int) = CC.getsplit(info.info, idx)
+CC.getresult_impl(info::AlwaysInlineCallInfo, idx::Int) = CC.getresult(info.info, idx)
+
+function inlining_handler(meta::Nothing, interp::GPUInterpreter, @nospecialize(atype), callinfo)
+    return nothing
+end
+
+using Core.Compiler: ArgInfo, StmtInfo, AbsIntState
+function CC.abstract_call_gf_by_type(interp::GPUInterpreter, @nospecialize(f), arginfo::ArgInfo,
+                                     si::StmtInfo, @nospecialize(atype), sv::AbsIntState, max_methods::Int)
+    ret = @invoke CC.abstract_call_gf_by_type(interp::CC.AbstractInterpreter, f::Any, arginfo::ArgInfo,
+                                              si::StmtInfo, atype::Any, sv::AbsIntState, max_methods::Int)
+
+    callinfo = nothing
+    if interp.meta !== nothing
+        callinfo = inlining_handler(interp.meta, interp, atype, ret.info)
+    end
+    if callinfo === nothing
+        callinfo = inlining_handler(nothing, interp, atype, ret.info)
+    end
+    if callinfo === nothing
+        callinfo = ret.info
+    end
+
+    @static if VERSION ≥ v"1.11-"
+        return CC.CallMeta(ret.rt, ret.exct, ret.effects, callinfo)
+    else
+        return CC.CallMeta(ret.rt, ret.effects, callinfo)
+    end
+end
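`inlining_handler` is the second extension point: dispatch on your metadata type and wrap the inferred `CallInfo` to block (or force) inlining, returning `nothing` to keep the default behaviour. Note that a plugin metadata type also needs an `abstract_call_known` method, since the wrapper above calls it unconditionally. A sketch with a hypothetical `MyNeverInline`:

```julia
import GPUCompiler: abstract_call_known, inlining_handler,
                    NoInlineCallInfo, GPUInterpreter

struct MyNeverInline end  # hypothetical metadata

# decline all known-call handling; the defaults still apply
abstract_call_known(::MyNeverInline, interp::GPUInterpreter, @nospecialize(f),
                    arginfo, si, sv, max_methods::Int) = nothing

function inlining_handler(::MyNeverInline, interp::GPUInterpreter,
                          @nospecialize(atype), callinfo)
    # wrap every call site; (src_)inlining_policy below then refuses to inline it
    return NoInlineCallInfo(callinfo, atype, :default)
end
```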
"Blocking inlining" info.tt info.kind + return false + elseif info isa AlwaysInlineCallInfo + @safe_debug "Forcing inlining for" info.tt + return true + end + return @invoke CC.src_inlining_policy(interp::CC.AbstractInterpreter, src, info::CC.CallInfo, stmt_flag::UInt32) +end +end + ## world view of the cache using Core.Compiler: WorldView @@ -697,14 +867,16 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) # generate for the same mi multiple LLVM functions. # `outstanding` are the missing edges that were not compiled by `compile_method_instance` # Currently these edges are generated through deferred codegen. - compiled = IdDict() + compiled = IdDict{Edge, Any}() llvm_mod, outstanding = compile_method_instance(job, compiled) worklist = outstanding while !isempty(worklist) - source = pop!(worklist) - haskey(compiled, source) && continue # We have fulfilled the request already + edge = pop!(worklist) + haskey(compiled, edge) && continue # We have fulfilled the request already + source = edge.mi + meta = edge.meta # Create a new compiler job for this edge, reusing the config settings from the inital one - job2 = CompilerJob(source, job.config) + job2 = CompilerJob(source, CompilerConfig(job.config; meta)) llvm_mod2, outstanding = compile_method_instance(job2, compiled) append!(worklist, outstanding) # merge worklist with new outstanding edges @assert context(llvm_mod) == context(llvm_mod2) @@ -714,7 +886,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) return llvm_mod, compiled end -function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDict{Any, Any}) +function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDict{Edge, Any}) # populate the cache interp = get_interpreter(job) cache = CC.code_cache(interp) @@ -790,6 +962,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi end # process all compiled method instances + meta = inference_metadata(job) for mi in method_instances ci = ci_cache_lookup(cache, mi, job.world, job.world) ci === nothing && continue @@ -825,14 +998,15 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi # removed or renamed during optimization, so we store their name instead. # FIXME: Enable this assert when we have a fully featured worklist # @assert !haskey(compiled, mi) - compiled[mi] = (; ci, func=llvm_func, specfunc=llvm_specfunc) + compiled[Edge(meta, mi)] = (; ci, func=llvm_func, specfunc=llvm_specfunc) end # Collect the deferred edges - outstanding = Any[] + outstanding = Edge[] for mi in method_instances - !haskey(compiled, mi) && continue # Equivalent to ci_cache_lookup == nothing - ci = compiled[mi].ci + edge = Edge(meta, mi) + !haskey(compiled, edge) && continue # Equivalent to ci_cache_lookup == nothing + ci = compiled[edge].ci @static if VERSION >= v"1.11.0-" edges = CC.traverse_analysis_results(ci) do @nospecialize result return result isa DeferredEdges ? 
@@ -825,14 +998,15 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi
         # removed or renamed during optimization, so we store their name instead.
         # FIXME: Enable this assert when we have a fully featured worklist
         # @assert !haskey(compiled, mi)
-        compiled[mi] = (; ci, func=llvm_func, specfunc=llvm_specfunc)
+        compiled[Edge(meta, mi)] = (; ci, func=llvm_func, specfunc=llvm_specfunc)
     end
 
     # Collect the deferred edges
-    outstanding = Any[]
+    outstanding = Edge[]
     for mi in method_instances
-        !haskey(compiled, mi) && continue # Equivalent to ci_cache_lookup == nothing
-        ci = compiled[mi].ci
+        edge = Edge(meta, mi)
+        !haskey(compiled, edge) && continue # Equivalent to ci_cache_lookup == nothing
+        ci = compiled[edge].ci
         @static if VERSION >= v"1.11.0-"
             edges = CC.traverse_analysis_results(ci) do @nospecialize result
                 return result isa DeferredEdges ? result : return
@@ -844,16 +1018,16 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi
             end
         end
         if edges !== nothing
-            for deferred_mi in (edges::DeferredEdges).edges
-                if !haskey(compiled, deferred_mi)
-                    push!(outstanding, deferred_mi)
+            for other in (edges::DeferredEdges).edges
+                if !haskey(compiled, other)
+                    push!(outstanding, other)
                 end
             end
         end
     end
 
     # ensure that the requested method instance was compiled
-    @assert haskey(compiled, job.source)
+    @assert haskey(compiled, Edge(meta, job.source))
 
     return llvm_mod, outstanding
 end
diff --git a/test/bpf_testsetup.jl b/test/bpf_testsetup.jl
index 0570a6e0..b7fc7dfc 100644
--- a/test/bpf_testsetup.jl
+++ b/test/bpf_testsetup.jl
@@ -10,7 +10,8 @@ struct CompilerParams <: AbstractCompilerParams end
 GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime
 
 function create_job(@nospecialize(func), @nospecialize(types);
-                    kernel::Bool=false, always_inline=false, kwargs...)
+                    kernel::Bool=false, always_inline=false,
+                    meta=nothing, kwargs...)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = BPFCompilerTarget()
     params = CompilerParams()
diff --git a/test/gcn_testsetup.jl b/test/gcn_testsetup.jl
index 846db4b6..d7a6b88d 100644
--- a/test/gcn_testsetup.jl
+++ b/test/gcn_testsetup.jl
@@ -10,11 +10,12 @@ struct CompilerParams <: AbstractCompilerParams end
 GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime
 
 function create_job(@nospecialize(func), @nospecialize(types);
-                    kernel::Bool=false, always_inline=false, kwargs...)
+                    kernel::Bool=false, always_inline=false,
+                    meta=nothing, kwargs...)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = GCNCompilerTarget(dev_isa="gfx900")
     params = CompilerParams()
-    config = CompilerConfig(target, params; kernel, always_inline)
+    config = CompilerConfig(target, params; kernel, always_inline, meta)
     CompilerJob(source, config), kwargs
 end
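Every test setup grows the same pass-through `meta` keyword, so individual tests can compile under plugin metadata. For example (values from the plugin test setup below):

```julia
# compile under plugin metadata through the test harness:
job, kwargs = Native.create_job(identity, (Int,); meta=Plugin.NeverInlineMeta())

# the reflection helpers forward it the same way, as used in the PTX tests:
# PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}}; meta=Plugin.AlwaysInlineMeta())
```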
diff --git a/test/metal_testsetup.jl b/test/metal_testsetup.jl
index 0055cb18..2fbd956f 100644
--- a/test/metal_testsetup.jl
+++ b/test/metal_testsetup.jl
@@ -10,11 +10,12 @@ struct CompilerParams <: AbstractCompilerParams end
 GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime
 
 function create_job(@nospecialize(func), @nospecialize(types);
-                    kernel::Bool=false, always_inline=false, kwargs...)
+                    kernel::Bool=false, always_inline=false,
+                    meta=nothing, kwargs...)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = MetalCompilerTarget(; macos=v"12.2", metal=v"3.0", air=v"3.0")
     params = CompilerParams()
-    config = CompilerConfig(target, params; kernel, always_inline)
+    config = CompilerConfig(target, params; kernel, always_inline, meta)
     CompilerJob(source, config), kwargs
 end
diff --git a/test/native_tests.jl b/test/native_tests.jl
index cd4a20c0..d07e930c 100644
--- a/test/native_tests.jl
+++ b/test/native_tests.jl
@@ -43,12 +43,12 @@ end
 
         meth = only(methods(outer, (Int,)))
 
-        mis = filter(mi->mi.def == meth, keys(meta.compiled))
+        mis = filter(edge->edge.mi.def == meth, keys(meta.compiled))
         @test length(mis) == 1
 
-        other_mis = filter(mi->mi.def != meth, keys(meta.compiled))
+        other_mis = filter(edge->edge.mi.def != meth, keys(meta.compiled))
         @test length(other_mis) == 1
-        @test only(other_mis).def in methods(inner)
+        @test only(other_mis).mi.def in methods(inner)
     end
 end
@@ -63,11 +63,11 @@ end
 
         meth = only(methods(foo, (Float64,)))
 
-        mis = filter(mi->mi.def == meth, keys(meta.compiled))
+        mis = filter(edge->edge.mi.def == meth, keys(meta.compiled))
         @test length(mis) == 1
 
-        inner_methods = filter(keys(meta.compiled)) do mi
-            mi.def in methods(inner) && mi.specTypes == Tuple{typeof(inner), Float64}
+        inner_methods = filter(keys(meta.compiled)) do edge
+            edge.mi.def in methods(inner) && edge.mi.specTypes == Tuple{typeof(inner), Float64}
         end
         @test length(inner_methods) == 1
     end
@@ -166,7 +166,7 @@ end
 @testset "deferred" begin
     @gensym child kernel unrelated
     @eval @noinline $child(i) = i
-    @eval $kernel(i) = GPUCompiler.var"gpuc.deferred"($child, i)
+    @eval $kernel(i) = GPUCompiler.var"gpuc.deferred"(nothing, $child, i)
 
     # smoke test
     job, _ = Native.create_job(eval(kernel), (Int64,))
diff --git a/test/native_testsetup.jl b/test/native_testsetup.jl
index 3406276c..2f674470 100644
--- a/test/native_testsetup.jl
+++ b/test/native_testsetup.jl
@@ -26,11 +26,12 @@ GPUCompiler.can_safepoint(@nospecialize(job::NativeCompilerJob)) = job.config.pa
 
 function create_job(@nospecialize(func), @nospecialize(types);
                     kernel::Bool=false, entry_abi=:specfunc, entry_safepoint::Bool=false,
                     always_inline=false,
-                    method_table=test_method_table, kwargs...)
+                    method_table=test_method_table,
+                    meta=nothing, kwargs...)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = NativeCompilerTarget()
     params = CompilerParams(entry_safepoint, method_table)
-    config = CompilerConfig(target, params; kernel, entry_abi, always_inline)
+    config = CompilerConfig(target, params; kernel, entry_abi, always_inline, meta)
     CompilerJob(source, config), kwargs
 end
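With `keys(meta.compiled)` now yielding `Edge` values instead of `MethodInstance`s, test code reaches the method through the `.mi` field, as in the `native_tests.jl` changes above:

```julia
# sketch: selecting compiled entries for a given method, Edge-style
mis = filter(edge -> edge.mi.def == meth, keys(meta.compiled))
other = only(filter(edge -> edge.mi.def != meth, keys(meta.compiled)))
other.mi.specTypes  # e.g. Tuple{typeof(inner), Float64}
```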
diff --git a/test/plugin_testsetup.jl b/test/plugin_testsetup.jl
index fdab7ee5..35a4abdc 100644
--- a/test/plugin_testsetup.jl
+++ b/test/plugin_testsetup.jl
@@ -27,4 +27,47 @@ end
 GPUCompiler.register_plugin!("gpucompiler.mark", remove_mark!)
 
+current_inlinestate() = nothing
+
+abstract type InlineStateMeta end
+struct AlwaysInlineMeta <: InlineStateMeta end
+struct NeverInlineMeta <: InlineStateMeta end
+
+import GPUCompiler: abstract_call_known, GPUInterpreter
+import Core.Compiler: CallMeta, Effects, NoCallInfo, ArgInfo,
+                      StmtInfo, AbsIntState, EFFECTS_TOTAL,
+                      MethodResultPure
+
+function abstract_call_known(meta::InlineStateMeta, interp::GPUInterpreter, @nospecialize(f),
+                             arginfo::ArgInfo, si::StmtInfo, sv::AbsIntState, max_methods::Int)
+    (; fargs, argtypes) = arginfo
+
+    if f === current_inlinestate
+        if length(argtypes) != 1
+            @static if VERSION < v"1.11.0-"
+                return CallMeta(Union{}, Effects(), NoCallInfo())
+            else
+                return CallMeta(Union{}, Union{}, Effects(), NoCallInfo())
+            end
+        end
+        @static if VERSION < v"1.11.0-"
+            return CallMeta(Core.Const(meta), EFFECTS_TOTAL, MethodResultPure())
+        else
+            return CallMeta(Core.Const(meta), Union{}, EFFECTS_TOTAL, MethodResultPure())
+        end
+    end
+    return nothing
+end
+
+import GPUCompiler: inlining_handler, NoInlineCallInfo, AlwaysInlineCallInfo
+function inlining_handler(meta::InlineStateMeta, interp::GPUInterpreter, @nospecialize(atype), callinfo)
+    if meta isa NeverInlineMeta
+        return NoInlineCallInfo(callinfo, atype, :default)
+    elseif meta isa AlwaysInlineMeta
+        return AlwaysInlineCallInfo(callinfo, atype)
+    end
+    return nothing
+end
+
 end
\ No newline at end of file
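The plugin wires both hooks to one metadata family: `abstract_call_known` folds `current_inlinestate()` into a compile-time constant, and `inlining_handler` applies the corresponding inlining policy globally. Informally, compiling the PTX test kernel below with `meta = Plugin.NeverInlineMeta()` behaves as if:

```julia
state = Plugin.current_inlinestate()  # infers as Core.Const(NeverInlineMeta())
# the `nothing` and AlwaysInlineMeta() branches are proven dead and dropped,
# while the surviving unsafe_store!(a, 1) stays an outlined call because every
# call site is wrapped in NoInlineCallInfo.
```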
diff --git a/test/ptx_tests.jl b/test/ptx_tests.jl
index 600561f5..6476afde 100644
--- a/test/ptx_tests.jl
+++ b/test/ptx_tests.jl
@@ -277,6 +277,19 @@ end
     @test "We did not crash!" != ""
 end
 
+@testset "within_gpucompiler" begin
+    function kernel(a)
+        unsafe_store!(a, GPUCompiler.within_gpucompiler())
+    end
+    ir = sprint(io->InteractiveUtils.code_llvm(io, kernel, Tuple{Ptr{Bool}}))
+    @test occursin("store i8 0,", ir)
+    @test !occursin("store i8 1,", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Bool}}))
+    @test !occursin("store i8 0,", ir)
+    @test occursin("store i8 1,", ir)
+end
+
 @testset "exception arguments" begin
     function kernel(a)
         unsafe_store!(a, trunc(Int, unsafe_load(a)))
@@ -425,4 +438,70 @@ import InteractiveUtils
     ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Int}))
     @test !occursin("gpucompiler.mark", ir)
 end
+
+@testset "current_inlinestate" begin
+    function kernel(a)
+        state = Plugin.current_inlinestate()
+        if state === nothing
+            unsafe_store!(a, 0)
+        elseif state === Plugin.NeverInlineMeta()
+            unsafe_store!(a, 1)
+        elseif state === Plugin.AlwaysInlineMeta()
+            unsafe_store!(a, 2)
+        end
+        return nothing
+    end
+    ir = sprint(io->InteractiveUtils.code_llvm(io, kernel, Tuple{Ptr{Int64}}))
+    @test occursin("store i64 0,", ir)
+    @test !occursin("store i64 1,", ir)
+    @test !occursin("store i64 2,", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}}))
+    @test occursin("store i64 0,", ir)
+    @test !occursin("store i64 1,", ir)
+    @test !occursin("store i64 2,", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}}, meta=Plugin.NeverInlineMeta()))
+    @test !occursin("store i64 0,", ir)
+    @test occursin("call fastcc void @julia_unsafe_store", ir) # call fastcc void @julia_unsafe_store__1397(i64 zeroext %0, i64 signext 1)
+    @test !occursin("store i64 2,", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel, Tuple{Ptr{Int64}}, meta=Plugin.AlwaysInlineMeta()))
+    @test !occursin("store i64 0,", ir)
+    @test !occursin("store i64 1,", ir)
+    @test occursin("store i64 2,", ir)
+end
+
+@testset "InlineStateMeta" begin
+
+    @noinline function noinline(x)
+        x^2
+    end
+
+    @inline function inline(x)
+        x^2
+    end
+
+    function kernel_noinline(a, x)
+        unsafe_store!(a, noinline(x))
+        nothing
+    end
+
+    function kernel_inline(a, x)
+        unsafe_store!(a, inline(x))
+        nothing
+    end
+
+    ir = sprint(io->PTX.code_llvm(io, kernel_noinline, Tuple{Ptr{Int64}, Int64}))
+    @test occursin("call fastcc i64 @julia_noinline", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel_noinline, Tuple{Ptr{Int64}, Int64}, meta=Plugin.AlwaysInlineMeta()))
+    @test !occursin("call fastcc i64 @julia_noinline", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel_inline, Tuple{Ptr{Int64}, Int64}))
+    @test !occursin("call fastcc i64 @julia_inline", ir)
+
+    ir = sprint(io->PTX.code_llvm(io, kernel_inline, Tuple{Ptr{Int64}, Int64}, meta=Plugin.NeverInlineMeta()))
+    @test occursin("call fastcc i64 @julia_inline", ir)
+end
 end #testitem
diff --git a/test/ptx_testsetup.jl b/test/ptx_testsetup.jl
index 89516283..33505756 100644
--- a/test/ptx_testsetup.jl
+++ b/test/ptx_testsetup.jl
@@ -39,13 +39,13 @@ GPUCompiler.runtime_module(::PTXCompilerJob) = PTXTestRuntime
 
 function create_job(@nospecialize(func), @nospecialize(types);
                     kernel::Bool=false, minthreads=nothing, maxthreads=nothing,
                     blocks_per_sm=nothing,
-                    maxregs=nothing, always_inline=false, kwargs...)
+                    maxregs=nothing, always_inline=false, meta=nothing, kwargs...)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = PTXCompilerTarget(;cap=v"7.0",
                                minthreads, maxthreads,
                                blocks_per_sm, maxregs)
     params = CompilerParams()
-    config = CompilerConfig(target, params; kernel, always_inline)
+    config = CompilerConfig(target, params; kernel, always_inline, meta)
     CompilerJob(source, config), kwargs
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 199e641f..66c0fff6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -22,7 +22,7 @@ runtests(GPUCompiler; nworkers=min(Sys.CPU_THREADS,4), nworker_threads=1,
             end
         end
 
-        if ti.name in ["PTX", "GCN", "PTX precompile"] && Sys.isapple()
+        if ti.name in ["PTX", "GCN", "PTX precompile", "PTX plugin"] && Sys.isapple()
             # support for AMDGPU and NVPTX on macOS has been removed from Julia's LLVM build
             return false
         end
diff --git a/test/spirv_testsetup.jl b/test/spirv_testsetup.jl
index f1221545..f1fd9325 100644
--- a/test/spirv_testsetup.jl
+++ b/test/spirv_testsetup.jl
@@ -11,11 +11,12 @@ GPUCompiler.runtime_module(::CompilerJob{<:Any,CompilerParams}) = TestRuntime
 
 function create_job(@nospecialize(func), @nospecialize(types);
                     kernel::Bool=false, always_inline=false,
-                    supports_fp16=true, supports_fp64=true, kwargs...)
+                    supports_fp16=true, supports_fp64=true,
+                    meta=nothing, kwargs...)
     source = methodinstance(typeof(func), Base.to_tuple_type(types), Base.get_world_counter())
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64)
     params = CompilerParams()
-    config = CompilerConfig(target, params; kernel, always_inline)
+    config = CompilerConfig(target, params; kernel, always_inline, meta)
     CompilerJob(source, config), kwargs
 end
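Putting the pieces together, a downstream user of this change would define a metadata type, extend both hooks, and pass the metadata wherever a `CompilerConfig` is built. A condensed, hypothetical sketch (names are illustrative; `target`, `params`, and `source` as elsewhere in the tests):

```julia
import GPUCompiler: abstract_call_known, inlining_handler,
                    AlwaysInlineCallInfo, GPUInterpreter,
                    CompilerConfig, CompilerJob

struct AggressiveInline end  # hypothetical: force-inline everything

abstract_call_known(::AggressiveInline, ::GPUInterpreter, @nospecialize(f),
                    arginfo, si, sv, max_methods::Int) = nothing
inlining_handler(::AggressiveInline, ::GPUInterpreter, @nospecialize(atype), callinfo) =
    AlwaysInlineCallInfo(callinfo, atype)

# config = CompilerConfig(target, params; kernel=true, meta=AggressiveInline())
# job    = CompilerJob(source, config)  # the GPUInterpreter now carries the metadata
```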