
Codegen regression with llvmcall/inlining #28078

Closed · maleadt opened this issue Jul 12, 2018 · 5 comments

Labels: compiler:codegen (Generation of LLVM IR and native code), gpu (Affects running Julia on a GPU)

maleadt (Member) commented Jul 12, 2018

MWE:

@inline use(i::Int64) =
    Base.llvmcall("""%slot = alloca i64
                     store volatile i64 %0, i64* %slot
                     %value = load volatile i64, i64* %slot
                     ret i64 %value""", Int64, Tuple{Int64}, i)


struct Wrapper{F} <: Core.Function
    f::F
end

function (k::Wrapper)(arg1, arg2)
    (k.f)(arg1, arg2)
    return nothing
end


foobar(unused, used) = use(used)

code_llvm(Wrapper(foobar), Tuple{Type{Int64}, Int64})

Before #27857:

; Function Wrapper
; Location: /home/tbesard/Julia/CUDAnative/wip.jl:13
define void @julia_Wrapper_32376(%jl_value_t addrspace(10)*, i64) {
top:
  %slot.i = alloca i64, align 8
; Function foobar; {
; Location: /home/tbesard/Julia/CUDAnative/wip.jl:18
; Function use; {
; Location: /home/tbesard/Julia/CUDAnative/wip.jl:1
  %slot.i.0.slot.i.0..sroa_cast = bitcast i64* %slot.i to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %slot.i.0.slot.i.0..sroa_cast)
  store volatile i64 %1, i64* %slot.i, align 8
  %slot.i.0.slot.i.0.slot.0.slot.0.value.i = load volatile i64, i64* %slot.i, align 8
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %slot.i.0.slot.i.0..sroa_cast)
;}}
; Location: /home/tbesard/Julia/CUDAnative/wip.jl:14
  ret void
}

After:

; Function Wrapper
; Location: /home/tbesard/Julia/CUDAnative/wip.jl:13
define void @julia_Wrapper_32797(%jl_value_t addrspace(10)*, i64) {
top:
  %2 = alloca %jl_value_t addrspace(10)*, i32 3
  %gcframe = alloca %jl_value_t addrspace(10)*, i32 3
  %3 = bitcast %jl_value_t addrspace(10)** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* %3, i8 0, i32 24, i32 0, i1 false)
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"()
  %ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -10920
  %ptls = bitcast i8* %ptls_i8 to %jl_value_t***
  %4 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 0
  %5 = bitcast %jl_value_t addrspace(10)** %4 to i64*
  store i64 2, i64* %5
  %6 = getelementptr %jl_value_t**, %jl_value_t*** %ptls, i32 0
  %7 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 1
  %8 = bitcast %jl_value_t addrspace(10)** %7 to %jl_value_t***
  %9 = load %jl_value_t**, %jl_value_t*** %6
  store %jl_value_t** %9, %jl_value_t*** %8
  %10 = bitcast %jl_value_t*** %6 to %jl_value_t addrspace(10)***
  store %jl_value_t addrspace(10)** %gcframe, %jl_value_t addrspace(10)*** %10
  %11 = call %jl_value_t addrspace(10)* @jl_box_int64(i64 signext %1)
  %12 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 2
  store %jl_value_t addrspace(10)* %11, %jl_value_t addrspace(10)** %12
  %13 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %2, i32 0
  store %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140506765508720 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** %13
  %14 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %2, i32 1
  store %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140506765445104 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** %14
  %15 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %2, i32 2
  store %jl_value_t addrspace(10)* %11, %jl_value_t addrspace(10)** %15
  %16 = call nonnull %jl_value_t addrspace(10)* @jl_invoke(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140506779001488 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** %2, i32 3)
; Location: /home/tbesard/Julia/CUDAnative/wip.jl:14
  %17 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 1
  %18 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %17
  %19 = getelementptr %jl_value_t**, %jl_value_t*** %ptls, i32 0
  %20 = bitcast %jl_value_t*** %19 to %jl_value_t addrspace(10)**
  store %jl_value_t addrspace(10)* %18, %jl_value_t addrspace(10)** %20
  ret void
}

Bisected to 9277d3a.
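
The regression can also be checked programmatically by capturing the wrapper's IR as a string and grepping for a runtime dispatch (`jl_invoke`) or boxing (`jl_box_int64`). A minimal sketch, using a plain Julia callee in place of the `llvmcall` body so it runs on any build (`inner`/`outer` are hypothetical stand-ins for `use`/`foobar`):

```julia
using InteractiveUtils  # provides code_llvm

# Stand-ins for `use`/`foobar` above; a correctly inlined chain should
# produce straight-line IR with no runtime dispatch.
@inline inner(i::Int64) = i + 1
outer(unused, used) = inner(used)

ir = sprint(io -> code_llvm(io, outer, Tuple{Type{Int64}, Int64}))
# On a build affected by the regression this would print "regressed".
println(occursin("jl_invoke", ir) ? "regressed" : "inlined")
```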

@maleadt maleadt added regression Regression in behavior compared to a previous version compiler:codegen Generation of LLVM IR and native code gpu Affects running Julia on a GPU labels Jul 12, 2018

maleadt added a commit to JuliaGPU/CUDAnative.jl that referenced this issue Jul 12, 2018
@vtjnash vtjnash removed the regression Regression in behavior compared to a previous version label Jul 12, 2018
JeffBezanson (Member) commented:

Basically a duplicate of #27694; llvmcall needs to be overhauled for the new IR.

JeffBezanson referenced this issue Jul 13, 2018
- Instead of always inlining functions marked at-inline, increase
  the cost threshold 20x
- Don't inline functions inferred not to return
- statement_cost no longer needs to look at nested Exprs in general
- Fix cost of `:copyast`
chriselrod (Contributor) commented:

Here's an example of another regression, this time without using llvmcall:

julia> using StaticArrays, BenchmarkTools
julia> A = @SMatrix randn(8,8);
julia> B = @SMatrix randn(8,8);
julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     16.310 ns (0.00% GC)
  median time:      17.290 ns (0.00% GC)
  mean time:        17.885 ns (0.00% GC)
  maximum time:     63.709 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     998

vs

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     28.014 ns (0.00% GC)
  median time:      28.112 ns (0.00% GC)
  mean time:        28.664 ns (0.00% GC)
  maximum time:     59.865 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     995

I think it might be copying big tuples when it doesn't inline.
Structs get copied on non-inlined function calls, right? Is that factored into the cost of not inlining?
FWIW, prior to that commit `@code_native` shows a bunch of math, and after:

@code_native A * B
	.text
; Function * {
; Location: matrix_multiply.jl:9
	pushq	%r14
	pushq	%rbx
	subq	$520, %rsp              # imm = 0x208
	movq	%rdi, %rbx
; Function _mul; {
; Location: matrix_multiply.jl:75
; Function macro expansion; {
; Location: matrix_multiply.jl:78
	movabsq	$"*;", %rax
	leaq	8(%rsp), %r14
	movq	%r14, %rdi
	callq	*%rax
	movabsq	$__memcpy_avx_unaligned_erms, %rax
;}}
	movl	$512, %edx              # imm = 0x200
	movq	%rbx, %rdi
	movq	%r14, %rsi
	callq	*%rax
	movq	%rbx, %rax
	addq	$520, %rsp              # imm = 0x208
	popq	%rbx
	popq	%r14
	retq
	nopw	%cs:(%rax,%rax)
;}
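
If the large-struct copy is indeed the cost being mispriced, forcing the callee inline again should recover allocation-free straight-line code. A minimal sketch with a plain 64-element tuple standing in for the 8×8 `SMatrix` so it needs no packages (the `Mat`/`addall`/`combine` names are hypothetical):

```julia
struct Mat
    data::NTuple{64, Float64}  # 512 bytes, same size as an 8x8 SMatrix{Float64}
end

# Element-wise sum; @inline asks the compiler to inline the call so the
# 512-byte argument structs are not spilled and memcpy'd at a call boundary.
@inline addall(a::Mat, b::Mat) = Mat(ntuple(i -> a.data[i] + b.data[i], Val(64)))

combine(a, b) = addall(a, b)

a = Mat(ntuple(Float64, Val(64)))
b = Mat(ntuple(Float64, Val(64)))
combine(a, b)  # warm up so compilation doesn't count against the check
@assert @allocated(combine(a, b)) == 0  # isbits result, no heap traffic
```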

JeffBezanson (Member) commented:

Ok, I think something can be done about that.

JeffBezanson (Member) commented:

That issue is captured as #28168. Closing this one as a duplicate of #27694.

JeffBezanson added a commit that referenced this issue Jul 19, 2018
JeffBezanson added a commit that referenced this issue Jul 19, 2018