Inline singleton splats
As noted in #36087 and #29114, splatting integers currently carries an
unexpected performance penalty. For tuples and SimpleVectors, we have
special-purpose inliners that simply inline the tuple/SimpleVector into
the call being splatted. For everything else, however, we would have to
run the iteration protocol to find out which values to substitute. This
change does just that, limited to the case of length-1 (and empty)
iterables. Benchmark:

```
f(x) = (x...,)
@code_typed f(1)
@benchmark f(1)
```

Before:
```
julia> @code_typed f(1)
CodeInfo(
1 ─ %1 = Core._apply_iterate(Base.iterate, Core.tuple, x)::Tuple{Int64}
└──      return %1
) => Tuple{Int64}

julia> @benchmark f(1)
BenchmarkTools.Trial:
  memory estimate:  32 bytes
  allocs estimate:  2
  --------------
  minimum time:     209.357 ns (0.00% GC)
  median time:      213.404 ns (0.00% GC)
  mean time:        218.674 ns (0.16% GC)
  maximum time:     1.922 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     540
```

After:
```
julia> @code_typed f(1)
CodeInfo(
1 ─ %1 = invoke Base.iterate(_2::Int64)::Tuple{Int64,Nothing}
│   %2 = (getfield)(%1, 1)::Int64
│   %3 = (getfield)(%1, 2)::Nothing
│        invoke Base.iterate(_2::Int64, %3::Nothing)::Nothing
│   %5 = Core.tuple(%2)::Tuple{Int64}
└──      return %5
) => Tuple{Int64}

julia> @benchmark f(1)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.044 ns (0.00% GC)
  median time:      3.047 ns (0.00% GC)
  mean time:        3.049 ns (0.00% GC)
  maximum time:     7.700 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000
```

This is obviously not fully optimal yet, because the `iterate` calls themselves
do not get inlined, but it is a large improvement. Inlining the `iterate` calls
is left for a follow-up commit.
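
For reference, here is a minimal sketch of the runtime behaviour that the new
analysis proves statically. The `Base.iterate` calls on an `Int` are standard
Base behaviour; the `Empty` struct and the function `g` are hypothetical
examples added here to illustrate the empty-iterable branch, not part of this
commit:

```
# Length-1 case: for a Number, the first `iterate` yields the value with a
# `nothing` state, and the second `iterate` infers as `Nothing`, so the
# compiler can substitute the single element directly into the call.
Base.iterate(1)             # (1, nothing)
Base.iterate(1, nothing)    # nothing

# Empty case: if the *first* `iterate` already infers as `Nothing`, the
# splatted argument contributes no elements at all.
struct Empty end
Base.iterate(::Empty) = nothing

g(y) = (y..., 1)
g(Empty())                  # (1,)
```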
Keno committed Jun 6, 2020
1 parent d825de8 commit 0eb1f0f
Showing 2 changed files with 160 additions and 72 deletions.
229 changes: 157 additions & 72 deletions base/compiler/ssair/inlining.jl
@@ -594,50 +594,81 @@ function spec_lambda(@nospecialize(atype), sv::OptimizationState, @nospecialize(
return mi
end

struct ExpandTuple; end
struct InlineIterate
first::Tuple{MethodInstance, Any, UInt, UInt}
second::Union{Nothing, Tuple{MethodInstance, Any, UInt, UInt}}
end

function add_backedges!(rewrite::InlineIterate, sv::OptimizationState)
add_backedge!(rewrite.first[1], sv)
update_valid_age!(rewrite.first[3], rewrite.first[4], sv)
if rewrite.second !== nothing
add_backedge!(rewrite.second[1], sv)
update_valid_age!(rewrite.second[3], rewrite.second[4], sv)
end
end

# This assumes the caller has verified that all arguments to the _apply call are Tuples.
function rewrite_apply_exprargs!(ir::IRCode, idx::Int, argexprs::Vector{Any}, atypes::Vector{Any}, arg_start::Int)
function rewrite_apply_exprargs!(ir::IRCode, idx::Int, argexprs::Vector{Any}, atypes::Vector{Any}, rewrites::Vector{Any}, arg_start::Int, sv::OptimizationState)
new_argexprs = Any[argexprs[arg_start]]
new_atypes = Any[atypes[arg_start]]
# loop over original arguments and flatten any known iterators
for i in (arg_start+1):length(argexprs)
for j in 1:length(rewrites)
rewrite = rewrites[j]
i = arg_start+j
def = argexprs[i]
def_type = atypes[i]
if def_type isa PartialStruct
# def_type.typ <: Tuple is assumed
def_atypes = def_type.fields
else
def_atypes = Any[]
if isa(def_type, Const) # && isa(def_type.val, Union{Tuple, SimpleVector}) is implied
for p in def_type.val
push!(def_atypes, Const(p))
end
if isa(rewrite, ExpandTuple)
if def_type isa PartialStruct
# def_type.typ <: Tuple is assumed
def_atypes = def_type.fields
else
ti = widenconst(def_type)
if ti.name === NamedTuple_typename
ti = ti.parameters[2]
end
for p in ti.parameters
if isa(p, DataType) && isdefined(p, :instance)
# replace singleton types with their equivalent Const object
p = Const(p.instance)
elseif isconstType(p)
p = Const(p.parameters[1])
def_atypes = Any[]
if isa(def_type, Const) # && isa(def_type.val, Union{Tuple, SimpleVector}) is implied
for p in def_type.val
push!(def_atypes, Const(p))
end
else
ti = widenconst(def_type)
if ti.name === NamedTuple_typename
ti = ti.parameters[2]
end
for p in ti.parameters
if isa(p, DataType) && isdefined(p, :instance)
# replace singleton types with their equivalent Const object
p = Const(p.instance)
elseif isconstType(p)
p = Const(p.parameters[1])
end
push!(def_atypes, p)
end
push!(def_atypes, p)
end
end
end
# now push flattened types into new_atypes and getfield exprs into new_argexprs
for j in 1:length(def_atypes)
def_atype = def_atypes[j]
if isa(def_atype, Const) && is_inlineable_constant(def_atype.val)
new_argexpr = quoted(def_atype.val)
else
new_call = Expr(:call, Core.getfield, def, j)
new_argexpr = insert_node!(ir, idx, def_atype, new_call)
# now push flattened types into new_atypes and getfield exprs into new_argexprs
for j in 1:length(def_atypes)
def_atype = def_atypes[j]
if isa(def_atype, Const) && is_inlineable_constant(def_atype.val)
new_argexpr = quoted(def_atype.val)
else
new_call = Expr(:call, Core.getfield, def, j)
new_argexpr = insert_node!(ir, idx, def_atype, new_call)
end
push!(new_argexprs, new_argexpr)
push!(new_atypes, def_atype)
end
elseif isa(rewrite, InlineIterate)
add_backedges!(rewrite, sv)
T = rewrite.first[2]
it1 = insert_node!(ir, idx, T, Expr(:invoke, rewrite.first[1], argexprs[arg_start-1], def))
if rewrite.second !== nothing
valT = getfield_tfunc(T, Const(1))
val = insert_node!(ir, idx, valT, Expr(:call, Core.getfield, it1, 1))
state = insert_node!(ir, idx, getfield_tfunc(T, Const(2)), Expr(:call, Core.getfield, it1, 2))
insert_node!(ir, idx, Nothing, Expr(:invoke, rewrite.second[1], argexprs[arg_start-1], def, state))
push!(new_argexprs, val)
push!(new_atypes, valT)
end
push!(new_argexprs, new_argexpr)
push!(new_atypes, def_atype)
end
end
return new_argexprs, new_atypes
@@ -817,13 +848,26 @@ function handle_single_case!(ir::IRCode, stmt::Expr, idx::Int, @nospecialize(cas
nothing
end

function is_valid_type_for_apply_rewrite(@nospecialize(typ), params::OptimizationParams)
function method_lookup_iterate(atypes, sv)
(meth, min_valid, max_valid) = method_lookup_inlining(Tuple{atypes...}, sv)
if meth == false || length(meth) != 1
return nothing
end
mi = specialize_method(meth[1], true) # Union{Nothing, MethodInstance}
if !isa(mi, MethodInstance)
return nothing
end
rt = find_inferred_rettype(mi, atypes, sv, Any)
(mi, rt, min_valid, max_valid)
end

function analyze_type_for_apply_rewrite(@nospecialize(typ), @nospecialize(inlinet), params::OptimizationParams, sv::OptimizationState)
if isa(typ, Const) && isa(typ.val, SimpleVector)
length(typ.val) > params.MAX_TUPLE_SPLAT && return false
length(typ.val) > params.MAX_TUPLE_SPLAT && return nothing
for p in typ.val
is_inlineable_constant(p) || return false
is_inlineable_constant(p) || return nothing
end
return true
return ExpandTuple()
end
typ = widenconst(typ)
if isa(typ, DataType) && typ.name === NamedTuple_typename
@@ -832,12 +876,31 @@ function is_valid_type_for_apply_rewrite(@nospecialize(typ), params::Optimizatio
typ = typ.ub
end
end
isa(typ, DataType) || return false
isa(typ, DataType) || return nothing
if typ.name === Tuple.name
return !isvatuple(typ) && length(typ.parameters) <= params.MAX_TUPLE_SPLAT
else
return false
if !isvatuple(typ) && length(typ.parameters) <= params.MAX_TUPLE_SPLAT
return ExpandTuple()
else
return nothing
end
end
inlinet = widenconst(inlinet)
isa(inlinet, DataType) || return nothing
# Simulate iteration protocol for two steps, see if this is just a
# singleton
iterate1 = method_lookup_iterate([inlinet, typ], sv)
iterate1 === nothing && return nothing
rt = iterate1[2]
if rt === Nothing
return InlineIterate(iterate1, nothing)
end
if !isa(rt, DataType) || !(rt <: Tuple) || isvatuple(rt) || length(rt.parameters) != 2
return nothing
end
iterate2 = method_lookup_iterate([inlinet, typ, rt.parameters[2]], sv)
iterate2 === nothing && return nothing
iterate2[2] === Nothing || return nothing
return InlineIterate(iterate1, iterate2)
end

function inline_splatnew!(ir::IRCode, idx::Int)
@@ -887,11 +950,11 @@ function call_sig(ir::IRCode, stmt::Expr)
Signature(f, ft, atypes)
end

function inline_apply!(ir::IRCode, idx::Int, sig::Signature, params::OptimizationParams)
function inline_apply!(ir::IRCode, idx::Int, sig::Signature, params::OptimizationParams, sv::OptimizationState)
stmt = ir.stmts[idx]
while sig.f === Core._apply || sig.f === Core._apply_iterate
arg_start = sig.f === Core._apply ? 2 : 3
atypes = sig.atypes
arg_start = sig.f === Core._apply ? 2 : 3
if arg_start > length(atypes)
return nothing
end
@@ -917,15 +980,16 @@ function inline_apply!(ir::IRCode, idx::Int, sig::Signature, params::Optimizatio
end
# Try to figure out the signature of the function being called
# and if rewrite_apply_exprargs can deal with this form
inlinet = sig.f === Core._apply ? Union{} : atypes[2]
rewrites = Any[]
for i = (arg_start + 1):length(atypes)
# TODO: We could basically run the iteration protocol here
if !is_valid_type_for_apply_rewrite(atypes[i], params)
return nothing
end
rewrite = analyze_type_for_apply_rewrite(atypes[i], inlinet, params, sv)
rewrite === nothing && return nothing
push!(rewrites, rewrite)
end
# Independent of whether we can inline, the above analysis allows us to rewrite
# this apply call to a regular call
stmt.args, atypes = rewrite_apply_exprargs!(ir, idx, stmt.args, atypes, arg_start)
stmt.args, atypes = rewrite_apply_exprargs!(ir, idx, stmt.args, atypes, rewrites, arg_start, sv)
has_free_typevars(ft) && return nothing
f = singleton_type(ft)
sig = Signature(f, ft, atypes)
@@ -957,7 +1021,7 @@ end
# Handles all analysis and inlining of intrinsics and builtins. In particular,
# this method does not access the method table or otherwise process generic
# functions.
function process_simple!(ir::IRCode, idx::Int, params::OptimizationParams, world::UInt)
function process_simple!(ir::IRCode, idx::Int, params::OptimizationParams, world::UInt, sv::OptimizationState)
stmt = ir.stmts[idx]
stmt isa Expr || return nothing
if stmt.head === :splatnew
@@ -971,7 +1035,7 @@ function process_simple!(ir::IRCode, idx::Int, params::OptimizationParams, world
sig === nothing && return nothing

# Handle _apply
sig = inline_apply!(ir, idx, sig, params)
sig = inline_apply!(ir, idx, sig, params, sv)
sig === nothing && return nothing

# Check if we match any of the early inliners
@@ -1006,11 +1070,26 @@ function process_simple!(ir::IRCode, idx::Int, params::OptimizationParams, world
return (sig, invoke_data)
end

function method_lookup_inlining(atype, sv)
get(sv.matching_methods_cache, atype) do
# World age does not need to be taken into account in the cache
# because it is forwarded from type inference through `sv.params`
# in the case that the cache is nonempty, so it should be unchanged
# The max number of methods should be the same as in inference most
# of the time, and should not affect correctness otherwise.
min_val = UInt[typemin(UInt)]
max_val = UInt[typemax(UInt)]
ms = _methods_by_ftype(atype, sv.params.MAX_METHODS,
sv.world, min_val, max_val)
return (ms, min_val[1], max_val[1])
end
end

function assemble_inline_todo!(ir::IRCode, sv::OptimizationState)
# todo = (inline_idx, (isva, isinvoke, na), method, spvals, inline_linetable, inline_ir, lie)
todo = Any[]
for idx in 1:length(ir.stmts)
r = process_simple!(ir, idx, sv.params, sv.world)
r = process_simple!(ir, idx, sv.params, sv.world, sv)
r === nothing && continue

stmt = ir.stmts[idx]
@@ -1024,20 +1103,8 @@ function assemble_inline_todo!(ir::IRCode, sv::OptimizationState)
end

# Regular case: Retrieve matching methods from cache (or compute them)
(meth, min_valid, max_valid) = get(sv.matching_methods_cache, sig.atype) do
# World age does not need to be taken into account in the cache
# because it is forwarded from type inference through `sv.params`
# in the case that the cache is nonempty, so it should be unchanged
# The max number of methods should be the same as in inference most
# of the time, and should not affect correctness otherwise.
min_val = UInt[typemin(UInt)]
max_val = UInt[typemax(UInt)]
ms = _methods_by_ftype(sig.atype, sv.params.MAX_METHODS,
sv.world, min_val, max_val)
return (ms, min_val[1], max_val[1])
end
(meth, min_valid, max_valid) = method_lookup_inlining(sig.atype, sv)
if meth === false || length(meth) == 0
# No applicable method, or too many applicable methods
continue
end
update_valid_age!(min_valid, max_valid, sv)
@@ -1304,7 +1371,7 @@ function ssa_substitute_op!(@nospecialize(val), arg_replacements::Vector{Any},
return urs[]
end

function find_inferred(mi::MethodInstance, @nospecialize(atypes), sv::OptimizationState, @nospecialize(rettype))
function _find_inferred(mi::MethodInstance, @nospecialize(atypes), sv::OptimizationState, @nospecialize(rettype))::Union{InferenceResult, CodeInstance, Nothing}
# see if the method has a InferenceResult in the current cache
# or an existing inferred code info store in `.inferred`
haveconst = false
@@ -1321,24 +1388,42 @@ function find_inferred(mi::MethodInstance, @nospecialize(atypes), sv::Optimizati
inf_result = nothing
end
#XXX: update_valid_age!(min_valid[1], max_valid[1], sv)
if isa(inf_result, InferenceResult)
let inferred_src = inf_result.src
inf_result !== nothing && return inf_result
linfo = inf_for_methodinstance(sv.interp, mi, sv.world)
isa(linfo, CodeInstance) && return linfo
return nothing
end

function find_inferred(mi::MethodInstance, @nospecialize(atypes), sv::OptimizationState, @nospecialize(rettype))
result = _find_inferred(mi, atypes, sv, rettype)
result === nothing && return (false, nothing)
if isa(result, InferenceResult)
let inferred_src = result.src
if isa(inferred_src, CodeInfo)
return svec(false, inferred_src)
end
if isa(inferred_src, Const) && is_inlineable_constant(inferred_src.val)
return svec(true, quoted(inferred_src.val),)
end
end
end

linfo = inf_for_methodinstance(sv.interp, mi, sv.world)
if linfo isa CodeInstance
return (false, nothing)
else
linfo = result::CodeInstance
if invoke_api(linfo) == 2
# in this case function can be inlined to a constant
return svec(true, quoted(linfo.rettype_const))
end
return svec(false, linfo.inferred)
end
return svec(false, nothing)
end

function find_inferred_rettype(mi, atypes, sv, @nospecialize(rettype))
result = _find_inferred(mi, atypes, sv, rettype)
result === nothing && return Any
if isa(result, InferenceResult)
isa(result.result, Type) && return result.result
return Any
else
return (result::CodeInstance).rettype
end
end
3 changes: 3 additions & 0 deletions base/compiler/utilities.jl
@@ -133,6 +133,9 @@ function specialize_method(method::Method, @nospecialize(atypes), sparams::Simpl
return ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, (Any, Any, Any), method, atypes, sparams)
end

specialize_method(lookup::SimpleVector, preexisting::Bool=false) =
specialize_method(lookup[3], lookup[1], lookup[2], preexisting)

# This function is used for computing alternate limit heuristics
function method_for_inference_heuristics(method::Method, @nospecialize(sig), sparams::SimpleVector)
if isdefined(method, :generator) && method.generator.expand_early && may_invoke_generator(method, sig, sparams)