Skip to content

Commit

Permalink
Remove legacy pass manager pipelines. (#605)
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt authored Jul 18, 2024
1 parent 4f63a1f commit c9391bf
Showing 1 changed file with 1 addition and 283 deletions.
284 changes: 1 addition & 283 deletions src/optim.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,6 @@
# LLVM IR optimization

# Optimize the LLVM IR in `mod` for the compilation `job`.
#
# This is only a forwarding entry point: the actual work is done by
# `optimize_newpm!`, which receives `opt_level` unchanged (default: 2).
# Always returns `nothing`.
function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=2)
    # TODO: clean up
    optimize_newpm!(job, mod; opt_level=opt_level)
    return nothing
end


## new pm

function optimize_newpm!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level)
function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=1)
tm = llvm_machine(job.config.target)

global current_job
Expand Down Expand Up @@ -292,279 +283,6 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level)
end


## legacy pm

# Optimize `mod` for `job` using `ModulePassManager`-based ("legacy pm") pipelines.
#
# Three separate pass managers are run over `mod`, in this order:
#   1. target passes + the general optimization pipeline at `opt_level`;
#   2. lowering of Julia intrinsics (GC frames, kernel state, PTLS, address spaces),
#      followed by a few late optimizations;
#   3. after the target-specific `optimize_module!` hook: dead argument elimination.
#
# Arguments:
#   job       - the compiler job; its `config.target` and `config.kernel` are read here
#   mod       - the LLVM module, passed to every `run!` call below
#   opt_level - forwarded as-is to `addOptimizationPasses!`
#
# Side effect: assigns `job` to the global `current_job`.
# Returns `nothing`.
function optimize_legacypm!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level)
# both the triple and the target machine are derived from the job's target configuration
triple = llvm_triple(job.config.target)
tm = llvm_machine(job.config.target)

# publish the job through a global
# NOTE(review): presumably consumed by custom passes defined elsewhere — confirm
global current_job
current_job = job

# stage 1: general optimization pipeline
@dispose pm=ModulePassManager() begin
addTargetPasses!(pm, tm, triple)
addOptimizationPasses!(pm, opt_level)
run!(pm, mod)
end

# NOTE: we need to use multiple distinct pass managers to force pass ordering;
# intrinsics should never get lowered before Julia has optimized them.
# XXX: why doesn't the barrier noop pass work here?

# stage 2: lower intrinsics
@dispose pm=ModulePassManager() begin
addTargetPasses!(pm, tm, triple)

# without the Julia runtime, GC frames are lowered by `lower_gc_frame!`
if !uses_julia_runtime(job)
lower_gc_frame!(pm)
end

if job.config.kernel
# GC lowering is the last pass that may introduce calls to the runtime library,
# and thus additional uses of the kernel state intrinsic.
# TODO: now that all kernel state-related passes are being run here, merge some?
add_kernel_state!(pm)
lower_kernel_state!(pm)
cleanup_kernel_state!(pm)
end

if !uses_julia_runtime(job)
# remove dead uses of ptls
aggressive_dce!(pm)
lower_ptls!(pm)
end

# exception handler lowering is only added when the Julia runtime is used
if uses_julia_runtime(job)
lower_exc_handlers!(pm)
end
# the Julia GC lowering pass also has some clean-up that is required
# (hence added unconditionally, unlike the final GC lowering below)
late_lower_gc_frame!(pm)
if uses_julia_runtime(job)
final_lower_gc!(pm)
end

remove_ni!(pm)
remove_julia_addrspaces!(pm)

if uses_julia_runtime(job)
# We need these two passes and the instcombine below
# after GC lowering to let LLVM do some constant propagation on the tags.
# and remove some unnecessary write barrier checks.
gvn!(pm)
sccp!(pm)
# Remove dead use of ptls
dce!(pm)
# note: this path uses LLVM.Interop's PTLS lowering, taking `dump_native(job)` as an
# extra argument, rather than the `lower_ptls!` used for the runtime-less case above
LLVM.Interop.lower_ptls!(pm, dump_native(job))
instruction_combining!(pm)
# Clean up write barrier and ptls lowering
cfgsimplification!(pm)
end

# Julia's operand bundles confuse the inliner, so repeat here now they are gone.
# FIXME: we should fix the inliner so that inlined code gets optimized early-on
always_inliner!(pm)

# some of Julia's optimization passes happen _after_ lowering intrinsics
combine_mul_add!(pm)
div_rem_pairs!(pm)

run!(pm, mod)
end

# target-specific optimizations
optimize_module!(job, mod)

# stage 3:
# we compile a module containing the entire call graph,
# so perform some interprocedural optimizations.
#
# for some reason, these passes need to be distinct from the regular optimization chain,
# or certain values (such as the constant arrays used to populate llvm.compiler.used as
# part of the LateLowerGCFrame pass) aren't collected properly.
#
# these might not always be safe, as Julia's IR metadata isn't designed for IPO.
@dispose pm=ModulePassManager() begin
addTargetPasses!(pm, tm, triple)

# simplify function calls that don't use the returned value
dead_arg_elimination!(pm)

run!(pm, mod)
end

return
end

# Add the target-dependent analysis passes to the pass manager `pm`:
# library information for `triple`, then transform information for the
# target machine `tm`. Evaluates to whatever `add_transform_info!` returns.
function addTargetPasses!(pm, tm, triple)
    add_library_info!(pm, triple)
    return add_transform_info!(pm, tm)
end

# Populate the pass manager `pm` with a general optimization pipeline for `opt_level`.
#
# Based on Julia's optimization pipeline, described as "minus the SLP and loop
# vectorizers" (note that `loop_vectorize!` is nevertheless added near the end).
#
# For `opt_level < 2` only a short pipeline is added and the function returns `nothing`;
# otherwise the full pipeline is added and the result of the final `aggressive_dce!`
# call is returned.
#
# To compare with using Julia's optimization pipeline directly, one could instead do:
#
#   ccall(:jl_add_optimization_passes, Cvoid,
#         (LLVM.API.LLVMPassManagerRef, Cint, Cint),
#         pm, opt_level, #=lower_intrinsics=# 0)
#   return
function addOptimizationPasses!(pm, opt_level)
    # NOTE: LLVM 12 disabled the hoisting of common instruction
    #       before loop vectorization (https://reviews.llvm.org/D84108).
    #
    # The helper below re-enables it where CFG simplification is requested, in order
    # to merge allocations and sometimes eliminate them, since AllocOpt does not handle
    # PhiNodes. Enabled because of this and Union benchmarks.
    function hoisting_cfgsimplification!()
        LLVM.version() >= v"12" || return cfgsimplification!(pm)
        return cfgsimplification!(pm; hoist_common_insts=true)
    end

    constant_merge!(pm)

    ## short pipeline for low optimization levels

    if opt_level < 2
        light_opts = opt_level == 1

        cpu_features!(pm)
        light_opts && instruction_simplify!(pm)
        hoisting_cfgsimplification!()
        if light_opts
            scalar_repl_aggregates!(pm)
            instruction_combining!(pm)
            early_cse!(pm)
            # maybe add GVN?
            # also try GVNHoist and GVNSink
        end
        mem_cpy_opt!(pm)
        always_inliner!(pm)     # Respect always_inline
        lower_simdloop!(pm)     # Annotate loop marked with "loopinfo" as LLVM parallel loop
        return nothing
    end

    ## full pipeline

    # alias analyses
    propagate_julia_addrsp!(pm)
    scoped_no_alias_aa!(pm)
    type_based_alias_analysis!(pm)
    opt_level >= 3 && basic_alias_analysis!(pm)

    hoisting_cfgsimplification!()
    dce!(pm)
    scalar_repl_aggregates!(pm)

    #mem_cpy_opt!(pm)

    always_inliner!(pm)         # Respect always_inline

    # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard
    # time merging the `alloca` for the unboxed data and the `alloca` created by
    # the `alloc_opt` pass.

    alloc_opt!(pm)
    # consider AggressiveInstCombinePass at optlevel > 2
    instruction_combining!(pm)
    hoisting_cfgsimplification!()
    cpu_features!(pm)
    scalar_repl_aggregates!(pm)
    # SROA can duplicate PHI nodes which can block LowerSIMD
    instruction_combining!(pm)
    jump_threading!(pm)
    correlated_value_propagation!(pm)

    reassociate!(pm)

    early_cse!(pm)

    # Load forwarding above can expose allocations that aren't actually used
    # remove those before optimizing loops.
    alloc_opt!(pm)
    loop_rotate!(pm)
    # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1)

    # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards
    lower_simdloop!(pm)         # Annotate loop marked with "loopinfo" as LLVM parallel loop
    licm!(pm)
    julia_licm!(pm)
    # XXX: simple loop unswitch is available on older versions of LLVM too,
    #      but using this pass instead of the old one breaks Metal.jl.
    LLVM.version() < v"15" ? loop_unswitch!(pm) : simple_loop_unswitch_legacy!(pm)
    licm!(pm)
    julia_licm!(pm)
    inductive_range_check_elimination!(pm)
    # Subsequent passes not stripping metadata from terminator
    instruction_simplify!(pm)
    loop_idiom!(pm)
    ind_var_simplify!(pm)
    loop_deletion!(pm)
    loop_unroll!(pm)            # TODO: in Julia createSimpleLoopUnroll

    # Run our own SROA on heap objects before LLVM's
    alloc_opt!(pm)
    # Re-run SROA after loop-unrolling (useful for small loops that operate,
    # over the structure of an aggregate)
    scalar_repl_aggregates!(pm)
    # might not be necessary:
    instruction_simplify!(pm)

    gvn!(pm)
    mem_cpy_opt!(pm)
    sccp!(pm)

    # These next two passes must come before IRCE to eliminate the bounds check in #43308
    correlated_value_propagation!(pm)
    dce!(pm)

    inductive_range_check_elimination!(pm)  # Must come between the two GVN passes

    # Run instcombine after redundancy elimination to exploit opportunities
    # opened up by them.
    # This needs to be InstCombine instead of InstSimplify to allow
    # loops over Union-typed arrays to vectorize.
    instruction_combining!(pm)
    jump_threading!(pm)
    # Must come after JumpThreading and before LoopVectorize
    opt_level >= 3 && gvn!(pm)
    dead_store_elimination!(pm)

    # More dead allocation (store) deletion before loop optimization
    # consider removing this:
    alloc_opt!(pm)
    # see if all of the constant folding has exposed more loops
    # to simplification and deletion
    # this helps significantly with cleaning up iteration
    cfgsimplification!(pm)      # See note above, don't hoist instructions before LV
    loop_deletion!(pm)
    instruction_combining!(pm)
    loop_vectorize!(pm)
    loop_load_elimination!(pm)
    # Cleanup after LV pass
    instruction_combining!(pm)
    if LLVM.version() < v"12"
        cfgsimplification!(pm)
    else
        # Aggressive CFG simplification
        cfgsimplification!(pm;
                           forward_switch_cond_to_phi=true,
                           convert_switch_to_lookup_table=true,
                           need_canonical_loop=true,
                           hoist_common_insts=true,
                           #sink_common_insts=true # FIXME: Causes assertion in llvm-late-lowering
                          )
    end

    return aggressive_dce!(pm)
end


## custom passes

Expand Down

0 comments on commit c9391bf

Please sign in to comment.