Skip to content

Commit

Permalink
some updates and fixes to inlining cost
Browse files Browse the repository at this point in the history
- Instead of always inlining functions marked at-inline, increase
  the cost threshold 20x
- Don't inline functions inferred not to return
- statement_cost no longer needs to look at nested Exprs in general
- Fix cost of `:copyast`
  • Loading branch information
JeffBezanson committed Jul 7, 2018
1 parent 30bf7ac commit 9277d3a
Show file tree
Hide file tree
Showing 10 changed files with 91 additions and 70 deletions.
8 changes: 6 additions & 2 deletions base/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1666,7 +1666,9 @@ julia> findnext(isodd, A, CartesianIndex(1, 1))
CartesianIndex(1, 1)
```
"""
function findnext(testf::Function, A, start)
@inline findnext(testf::Function, A, start) = findnext_internal(testf, A, start)

function findnext_internal(testf::Function, A, start)
l = last(keys(A))
i = start
while i <= l
Expand Down Expand Up @@ -1854,7 +1856,9 @@ julia> findprev(isodd, A, CartesianIndex(1, 2))
CartesianIndex(2, 1)
```
"""
function findprev(testf::Function, A, start)
@inline findprev(testf::Function, A, start) = findprev_internal(testf, A, start)

function findprev_internal(testf::Function, A, start)
i = start
while i >= first(keys(A))
testf(A[i]) && return i
Expand Down
5 changes: 3 additions & 2 deletions base/compiler/abstractinterpretation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,9 @@ function abstract_call_method_with_const_args(@nospecialize(f), argtypes::Vector
code === nothing && return Any
code = code::MethodInstance
# decide if it's likely to be worthwhile
cache_inlineable = false
if isdefined(code, :inferred)
declared_inline = isdefined(method, :source) && ccall(:jl_ast_flag_inlineable, Bool, (Any,), method.source)
cache_inlineable = declared_inline
if isdefined(code, :inferred) && !cache_inlineable
cache_inf = code.inferred
if !(cache_inf === nothing)
cache_src_inferred = ccall(:jl_ast_flag_inferred, Bool, (Any,), cache_inf)
Expand Down
124 changes: 66 additions & 58 deletions base/compiler/optimize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -212,14 +212,21 @@ function optimize(opt::OptimizationState, @nospecialize(result))
else
force_noinline = true
end
if !opt.src.inlineable && result === Union{}
force_noinline = true
end
end
if force_noinline
opt.src.inlineable = false
elseif !opt.src.inlineable && isa(def, Method)
elseif isa(def, Method)
bonus = 0
if result Tuple && !isbitstype(widenconst(result))
bonus = opt.params.inline_tupleret_bonus
end
if opt.src.inlineable
# For functions declared @inline, increase the cost threshold 20x
bonus += opt.params.inline_cost_threshold*19
end
opt.src.inlineable = isinlineable(def, opt, bonus)
end
nothing
Expand Down Expand Up @@ -270,86 +277,87 @@ isknowntype(@nospecialize T) = (T == Union{}) || isconcretetype(T)

function statement_cost(ex::Expr, line::Int, src::CodeInfo, spvals::SimpleVector, slottypes::Vector{Any}, params::Params)
head = ex.head
if is_meta_expr_head(head) || head == :copyast # not sure if copyast is right
if is_meta_expr_head(head)
return 0
end
argcost = 0
for a in ex.args
if a isa Expr
argcost = plus_saturate(argcost, statement_cost(a, -1, src, spvals, slottypes, params))
end
end
if head == :return || head == :(=)
return argcost
end
extyp = line == -1 ? Any : src.ssavaluetypes[line]
if head == :call
elseif head === :call
ftyp = argextype(ex.args[1], src, spvals, slottypes)
if isa(ftyp, Type)
return argcost
f = singleton_type(ftyp)
if isa(f, IntrinsicFunction)
iidx = Int(reinterpret(Int32, f::IntrinsicFunction)) + 1
if !isassigned(T_IFUNC_COST, iidx)
# unknown/unhandled intrinsic
return params.inline_nonleaf_penalty
end
return T_IFUNC_COST[iidx]
end
if isa(ftyp, Const)
f = (ftyp::Const).val
if isa(f, IntrinsicFunction)
iidx = Int(reinterpret(Int32, f::IntrinsicFunction)) + 1
if !isassigned(T_IFUNC_COST, iidx)
# unknown/unhandled intrinsic
return plus_saturate(argcost, params.inline_nonleaf_penalty)
end
return plus_saturate(argcost, T_IFUNC_COST[iidx])
if isa(f, Builtin)
# The efficiency of operations like a[i] and s.b
# depend strongly on whether the result can be
# inferred, so check the type of ex
if f === Main.Core.getfield || f === Main.Core.tuple
# we might like to penalize non-inferrability, but
# tuple iteration/destructuring makes that impossible
# return plus_saturate(argcost, isknowntype(extyp) ? 1 : params.inline_nonleaf_penalty)
return 0
elseif f === Main.Core.arrayref && length(ex.args) >= 3
atyp = argextype(ex.args[3], src, spvals, slottypes)
return isknowntype(atyp) ? 4 : params.inline_nonleaf_penalty
end
if isa(f, Builtin)
# The efficiency of operations like a[i] and s.b
# depend strongly on whether the result can be
# inferred, so check the type of ex
if f == Main.Core.getfield || f == Main.Core.tuple
# we might like to penalize non-inferrability, but
# tuple iteration/destructuring makes that
# impossible
# return plus_saturate(argcost, isknowntype(extyp) ? 1 : params.inline_nonleaf_penalty)
return argcost
elseif f == Main.Core.arrayref && length(ex.args) >= 3
atyp = argextype(ex.args[3], src, spvals, slottypes)
return plus_saturate(argcost, isknowntype(atyp) ? 4 : params.inline_nonleaf_penalty)
end
fidx = findfirst(x->x===f, T_FFUNC_KEY)
if fidx === nothing
# unknown/unhandled builtin or anonymous function
# Use the generic cost of a direct function call
return plus_saturate(argcost, 20)
end
return plus_saturate(argcost, T_FFUNC_COST[fidx])
fidx = findfirst(x->x===f, T_FFUNC_KEY)
if fidx === nothing
# unknown/unhandled builtin or anonymous function
# Use the generic cost of a direct function call
return 20
end
return T_FFUNC_COST[fidx]
end
return plus_saturate(argcost, params.inline_nonleaf_penalty)
elseif head == :foreigncall || head == :invoke
return params.inline_nonleaf_penalty
elseif head === :foreigncall || head === :invoke
# Calls whose "return type" is Union{} do not actually return:
# they are errors. Since these are not part of the typical
# run-time of the function, we omit them from
# consideration. This way, non-inlined error branches do not
# prevent inlining.
return extyp == Union{} ? 0 : plus_saturate(20, argcost)
elseif head == :llvmcall
return plus_saturate(10, argcost) # a wild guess at typical cost
elseif head == :enter
extyp = line == -1 ? Any : src.ssavaluetypes[line]
return extyp === Union{} ? 0 : 20
elseif head === :return
a = ex.args[1]
if a isa Expr
return statement_cost(a, -1, src, spvals, slottypes, params)
end
return 0
elseif head === :(=)
if ex.args[1] isa GlobalRef
cost = 20
else
cost = 0
end
a = ex.args[2]
if a isa Expr
cost = plus_saturate(cost, statement_cost(a, -1, src, spvals, slottypes, params))
end
return cost
elseif head === :copyast
return 100
elseif head === :enter
# try/catch is a couple function calls,
# but don't inline functions with try/catch
# since these aren't usually performance-sensitive functions,
# and llvm is more likely to miscompile them when these functions get large
return typemax(Int)
elseif head == :gotoifnot
elseif head === :gotoifnot
target = ex.args[2]::Int
# loops are generally always expensive
# but assume that forward jumps are already counted for from
# summing the cost of the not-taken branch
return target < line ? plus_saturate(40, argcost) : argcost
return target < line ? 40 : 0
end
return argcost
return 0
end

function inline_worthy(body::Array{Any,1}, src::CodeInfo, spvals::SimpleVector, slottypes::Vector{Any},
params::Params, cost_threshold::Integer=params.inline_cost_threshold)
bodycost = 0
bodycost::Int = 0
for line = 1:length(body)
stmt = body[line]
if stmt isa Expr
Expand All @@ -363,9 +371,9 @@ function inline_worthy(body::Array{Any,1}, src::CodeInfo, spvals::SimpleVector,
continue
end
bodycost = plus_saturate(bodycost, thiscost)
bodycost == typemax(Int) && return false
bodycost > cost_threshold && return false
end
return bodycost <= cost_threshold
return true
end

function is_known_call(e::Expr, @nospecialize(func), src, spvals::SimpleVector, slottypes::Vector{Any} = empty_slottypes)
Expand Down
2 changes: 2 additions & 0 deletions base/compiler/ssair/inlining.jl
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,8 @@ end
function singleton_type(@nospecialize(ft))
if isa(ft, Const)
return ft.val
elseif ft isa DataType && isdefined(ft, :instance)
return ft.instance
end
return nothing
end
Expand Down
4 changes: 3 additions & 1 deletion base/compiler/tfuncs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,9 @@ add_tfunc(checked_smul_int, 2, 2, chk_tfunc, 10)
add_tfunc(checked_umul_int, 2, 2, chk_tfunc, 10)
## other, misc intrinsics ##
add_tfunc(Core.Intrinsics.llvmcall, 3, INT_INF,
(@nospecialize(fptr), @nospecialize(rt), @nospecialize(at), a...) -> instanceof_tfunc(rt)[1], 10)
# TODO: Lower this inlining cost. We currently need to prevent inlining llvmcall
# to avoid issues with its IR.
(@nospecialize(fptr), @nospecialize(rt), @nospecialize(at), a...) -> instanceof_tfunc(rt)[1], 1000)
cglobal_tfunc(@nospecialize(fptr)) = Ptr{Cvoid}
cglobal_tfunc(@nospecialize(fptr), @nospecialize(t)) = (isType(t) ? Ptr{t.parameters[1]} : Ptr)
cglobal_tfunc(@nospecialize(fptr), t::Const) = (isa(t.val, Type) ? Ptr{t.val} : Ptr)
Expand Down
4 changes: 2 additions & 2 deletions base/compiler/validation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ is_valid_lvalue(@nospecialize(x)) = isa(x, Slot) || isa(x, GlobalRef)

function is_valid_argument(@nospecialize(x))
if isa(x, Slot) || isa(x, SSAValue) || isa(x, GlobalRef) || isa(x, QuoteNode) ||
(isa(x,Expr) && (x.head in (:static_parameter, :boundscheck, :copyast))) ||
(isa(x,Expr) && (x.head in (:static_parameter, :boundscheck))) ||
isa(x, Number) || isa(x, AbstractString) || isa(x, AbstractChar) || isa(x, Tuple) ||
isa(x, Type) || isa(x, Core.Box) || isa(x, Module) || x === nothing
return true
Expand All @@ -223,7 +223,7 @@ end

function is_valid_rvalue(@nospecialize(x))
is_valid_argument(x) && return true
if isa(x, Expr) && x.head in (:new, :the_exception, :isdefined, :call, :invoke, :foreigncall, :cfunction, :gc_preserve_begin)
if isa(x, Expr) && x.head in (:new, :the_exception, :isdefined, :call, :invoke, :foreigncall, :cfunction, :gc_preserve_begin, :copyast)
return true
end
return false
Expand Down
5 changes: 3 additions & 2 deletions base/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -735,8 +735,9 @@ end
@deprecate findprev(A, v, i::Integer) something(findprev(isequal(v), A, i), 0)
@deprecate findlast(A, v) something(findlast(isequal(v), A), 0)
# to fix ambiguities introduced by deprecations
findnext(pred::Function, A, i::Integer) = invoke(findnext, Tuple{Function, Any, Any}, pred, A, i)
findprev(pred::Function, A, i::Integer) = invoke(findprev, Tuple{Function, Any, Any}, pred, A, i)
# TODO: also remove find*_internal in array.jl
findnext(pred::Function, A, i::Integer) = findnext_internal(pred, A, i)
findprev(pred::Function, A, i::Integer) = findprev_internal(pred, A, i)
# also remove deprecation warnings in find* functions in array.jl, sparse/sparsematrix.jl,
# and sparse/sparsevector.jl.

Expand Down
1 change: 1 addition & 0 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
pointer(s::String, i::Integer) = pointer(s)+(i-1)

ncodeunits(s::String) = Core.sizeof(s)
sizeof(s::String) = Core.sizeof(s)
codeunit(s::String) = UInt8

@inline function codeunit(s::String, i::Integer)
Expand Down
2 changes: 1 addition & 1 deletion test/compiler/compiler.jl
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ function break_21369()
local fr
while true
fr = Base.StackTraces.lookup(bt[i])[end]
if !fr.from_c
if !fr.from_c && fr.func !== :error
break
end
i += 1
Expand Down
6 changes: 4 additions & 2 deletions test/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1269,10 +1269,12 @@ function compute_annotations(f, types)
join((strip(string(a, " "^(max_loc_method-length(a)), b)) for (a, b) in zip(la, lb)), '\n')
end

g_line() = leaf() # Deliberately not implemented to end up as a leaf after inlining
@noinline leaffunc() = print()

@inline g_line() = leaffunc()

# Test that separate instances of the same function do not get merged
function f_line()
@inline function f_line()
g_line()
g_line()
g_line()
Expand Down

3 comments on commit 9277d3a

@chriselrod
Copy link
Contributor

@chriselrod chriselrod commented on 9277d3a Jul 13, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit causes a severe regression for SIMD.jl.
Before this commit:

julia> @benchmark smul!($D32x6, $A32x32, $X32x6)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     116.412 ns (0.00% GC)
  median time:      116.647 ns (0.00% GC)
  mean time:        118.251 ns (0.00% GC)
  maximum time:     170.740 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     916

julia> @benchmark smul!($D96x96, $A96x128, $X128x96)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     19.515 μs (0.00% GC)
  median time:      19.634 μs (0.00% GC)
  mean time:        19.822 μs (0.00% GC)
  maximum time:     63.956 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

After:

julia> @benchmark smul!($D32x6, $A32x32, $X32x6)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.475 μs (0.00% GC)
  median time:      1.501 μs (0.00% GC)
  mean time:        1.526 μs (0.00% GC)
  maximum time:     3.671 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark smul!($D96x96, $A96x128, $X128x96)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     267.302 μs (0.00% GC)
  median time:      268.539 μs (0.00% GC)
  mean time:        271.203 μs (0.00% GC)
  maximum time:     395.263 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

While not as bad, the example from SIMD.jl's README is enough to reproduce (I'd be happy to share the code for the above examples as well, but this briefer example is enough to reproduce):

julia> using SIMD, BenchmarkTools

julia> const V = Vec{8,Float64}
Vec{8,Float64}

julia> function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N,T}
           @assert length(ys) == length(xs)
           @assert length(xs) % N == 0
           @inbounds for i in 1:N:length(xs)
               xv = vload(Vec{N,T}, xs, i)
               yv = vload(Vec{N,T}, ys, i)
               xv += yv
               vstore(xv, xs, i)
           end
       end
vadd! (generic function with 1 method)

julia> xs = randn(800);

julia> ys = randn(800);

julia> @btime vadd!($xs, $ys, V); # Commit 30bf7ac736
  139.569 ns (0 allocations: 0 bytes)
julia> @btime vadd!($xs, $ys, V); # Commit 9277d3af2d
  253.899 ns (0 allocations: 0 bytes)

Looking at @code_native for these examples shows code that looks like this prior to the commit:

; Location: SIMD.jl:116
	vbroadcastsd	768(%rdx,%rsi,8), %zmm28
;}}
; Function fma; {
; Location: SIMD.jl:1010
; Function llvmwrap; {
; Location: SIMD.jl:666
; Function llvmwrap; {
; Location: SIMD.jl:666
; Function macro expansion; {
; Location: SIMD.jl:688
	vfmadd231pd	%zmm28, %zmm21, %zmm8
	vfmadd231pd	%zmm28, %zmm22, %zmm9
	vfmadd231pd	%zmm28, %zmm23, %zmm10
	vfmadd231pd	%zmm28, %zmm27, %zmm11
;}}}}

vs this after:

; Location: SIMD.jl:1010
	leaq	1216(%rsp), %rdi
	movq	%r14, %rsi
	movq	%r13, %rdx
	leaq	3904(%rsp), %r12
	movq	%r12, %rcx
	leaq	4672(%rsp), %r8
	vzeroupper
	callq	*%r15
	leaq	1152(%rsp), %rdi
	movq	%r14, %rsi
	leaq	512(%rsp), %rdx
	movq	%r12, %rcx
	leaq	4608(%rsp), %r8
	callq	*%r15
	leaq	1088(%rsp), %rdi
	movq	%r14, %rsi
	leaq	256(%rsp), %rdx
	movq	%r12, %rcx
	leaq	4544(%rsp), %r8
	callq	*%r15
	leaq	1024(%rsp), %rdi
	movq	%r14, %rsi
	leaq	448(%rsp), %rdx
	movq	%r12, %rcx
	leaq	4480(%rsp), %r8
	callq	*%r15
;}

Difference is less extreme in the vadd! example.

Looking at the source code for SIMD.jl, it looks like it makes extensive use of @inline and $(Expr(:meta, :inline)) hints, that are now ignored:
https://github.com/eschnett/SIMD.jl/blob/master/src/SIMD.jl

@JeffBezanson
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing it's the same issue as #28078?

@chriselrod
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, sorry!

Please sign in to comment.