Skip to content

Commit

Permalink
Auto merge of #112448 - nnethercote:no-tiny-cgus, r=wesleywiser
Browse files Browse the repository at this point in the history
Introduce a minimum CGU size in non-incremental builds.

Because tiny CGUs slow down compilation *and* result in worse generated code.

r? `@wesleywiser`
  • Loading branch information
bors committed Jun 14, 2023
2 parents 6330daa + 7c3ce02 commit fa8762b
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 46 deletions.
2 changes: 1 addition & 1 deletion compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>(
let trait_def_id = trait_ref_self.def_id();
let trait_vis = cx.tcx.visibility(trait_def_id);

let cgus = cx.sess().codegen_units();
let cgus = cx.sess().codegen_units().as_usize();
let single_cgu = cgus == 1;

let lto = cx.sess().lto();
Expand Down
6 changes: 3 additions & 3 deletions compiler/rustc_codegen_ssa/src/back/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -646,10 +646,10 @@ fn produce_final_output_artifacts(
// rlib.
let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe);

let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1;
let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1;

let keep_numbered_objects =
needs_crate_object || (user_wants_objects && sess.codegen_units() > 1);
needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1);

for module in compiled_modules.modules.iter() {
if let Some(ref path) = module.object {
Expand Down Expand Up @@ -1923,7 +1923,7 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {

// FIXME: time_llvm_passes support - does this use a global context or
// something?
if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes {
if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes {
self.backend.print_pass_timings()
}

Expand Down
202 changes: 168 additions & 34 deletions compiler/rustc_monomorphize/src/partitioning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ use rustc_middle::query::Providers;
use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt};
use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
use rustc_session::CodegenUnits;
use rustc_span::symbol::Symbol;

use crate::collector::UsageMap;
Expand All @@ -121,7 +122,6 @@ use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollec

struct PartitioningCx<'a, 'tcx> {
tcx: TyCtxt<'tcx>,
target_cgu_count: usize,
usage_map: &'a UsageMap<'tcx>,
}

Expand All @@ -130,26 +130,30 @@ struct PlacedRootMonoItems<'tcx> {
codegen_units: Vec<CodegenUnit<'tcx>>,

internalization_candidates: FxHashSet<MonoItem<'tcx>>,

/// These must be obtained when the iterator in `partition` runs. They
/// can't be obtained later because some inlined functions might not be
/// reachable.
unique_inlined_stats: (usize, usize),
}

// The output CGUs are sorted by name.
fn partition<'tcx, I>(
tcx: TyCtxt<'tcx>,
mono_items: I,
max_cgu_count: usize,
usage_map: &UsageMap<'tcx>,
) -> Vec<CodegenUnit<'tcx>>
where
I: Iterator<Item = MonoItem<'tcx>>,
{
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning");

let cx = &PartitioningCx { tcx, target_cgu_count: max_cgu_count, usage_map };
let cx = &PartitioningCx { tcx, usage_map };

// In the first step, we place all regular monomorphizations into their
// respective 'home' codegen unit. Regular monomorphizations are all
// functions and statics defined in the local crate.
let PlacedRootMonoItems { mut codegen_units, internalization_candidates } = {
let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = {
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots");
place_root_mono_items(cx, mono_items)
};
Expand All @@ -158,15 +162,15 @@ where
cgu.create_size_estimate(tcx);
}

debug_dump(tcx, "INITIAL PARTITIONING", &codegen_units);
debug_dump(tcx, "ROOTS", &codegen_units, unique_inlined_stats);

// Merge until we have at most `tcx.sess.codegen_units()` codegen units.
// `merge_codegen_units` is responsible for updating the CGU size
// estimates.
{
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
merge_codegen_units(cx, &mut codegen_units);
debug_dump(tcx, "POST MERGING", &codegen_units);
debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
}

// In the next step, we use the inlining map to determine which additional
Expand All @@ -182,7 +186,7 @@ where
cgu.create_size_estimate(tcx);
}

debug_dump(tcx, "POST INLINING", &codegen_units);
debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);

// Next we try to make as many symbols "internal" as possible, so LLVM has
// more freedom to optimize.
Expand Down Expand Up @@ -226,7 +230,7 @@ where
// Ensure CGUs are sorted by name, so that we get deterministic results.
assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));

debug_dump(tcx, "FINAL", &codegen_units);
debug_dump(tcx, "FINAL", &codegen_units, unique_inlined_stats);

codegen_units
}
Expand All @@ -252,10 +256,16 @@ where
let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
let cgu_name_cache = &mut FxHashMap::default();

let mut num_unique_inlined_items = 0;
let mut unique_inlined_items_size = 0;
for mono_item in mono_items {
match mono_item.instantiation_mode(cx.tcx) {
InstantiationMode::GloballyShared { .. } => {}
InstantiationMode::LocalCopy => continue,
InstantiationMode::LocalCopy => {
num_unique_inlined_items += 1;
unique_inlined_items_size += mono_item.size_estimate(cx.tcx);
continue;
}
}

let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item);
Expand Down Expand Up @@ -300,7 +310,11 @@ where
let mut codegen_units: Vec<_> = codegen_units.into_values().collect();
codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str()));

PlacedRootMonoItems { codegen_units, internalization_candidates }
PlacedRootMonoItems {
codegen_units,
internalization_candidates,
unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size),
}
}

// This function requires the CGUs to be sorted by name on input, and ensures
Expand All @@ -309,7 +323,7 @@ fn merge_codegen_units<'tcx>(
cx: &PartitioningCx<'_, 'tcx>,
codegen_units: &mut Vec<CodegenUnit<'tcx>>,
) {
assert!(cx.target_cgu_count >= 1);
assert!(cx.tcx.sess.codegen_units().as_usize() >= 1);

// A sorted order here ensures merging is deterministic.
assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
Expand All @@ -318,11 +332,32 @@ fn merge_codegen_units<'tcx>(
let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();

// Merge the two smallest codegen units until the target size is
// reached.
while codegen_units.len() > cx.target_cgu_count {
// Sort small cgus to the back
// Having multiple CGUs can drastically speed up compilation. But for
// non-incremental builds, tiny CGUs slow down compilation *and* result in
// worse generated code. So we don't allow CGUs smaller than this (unless
// there is just one CGU, of course). Note that CGU sizes of 100,000+ are
// common in larger programs, so this isn't all that large.
const NON_INCR_MIN_CGU_SIZE: usize = 1000;

// Repeatedly merge the two smallest codegen units as long as:
// - we have more CGUs than the upper limit, or
// - (Non-incremental builds only) the user didn't specify a CGU count, and
// there are multiple CGUs, and some are below the minimum size.
//
// The "didn't specify a CGU count" condition is because when an explicit
// count is requested we observe it as closely as possible. For example,
// the `compiler_builtins` crate sets `codegen-units = 10000` and it's
// critical they aren't merged. Also, some tests use explicit small values
// and likewise won't work if small CGUs are merged.
while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
|| (cx.tcx.sess.opts.incremental.is_none()
&& matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
&& codegen_units.len() > 1
&& codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
{
// Sort small cgus to the back.
codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));

let mut smallest = codegen_units.pop().unwrap();
let second_smallest = codegen_units.last_mut().unwrap();

Expand Down Expand Up @@ -814,47 +849,147 @@ fn default_visibility(tcx: TyCtxt<'_>, id: DefId, is_generic: bool) -> Visibilit
}
}

fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<'tcx>]) {
fn debug_dump<'a, 'tcx: 'a>(
tcx: TyCtxt<'tcx>,
label: &str,
cgus: &[CodegenUnit<'tcx>],
(unique_inlined_items, unique_inlined_size): (usize, usize),
) {
let dump = move || {
use std::fmt::Write;

let num_cgus = cgus.len();
let num_items: usize = cgus.iter().map(|cgu| cgu.items().len()).sum();
let total_size: usize = cgus.iter().map(|cgu| cgu.size_estimate()).sum();
let max_size = cgus.iter().map(|cgu| cgu.size_estimate()).max().unwrap();
let min_size = cgus.iter().map(|cgu| cgu.size_estimate()).min().unwrap();
let max_min_size_ratio = max_size as f64 / min_size as f64;
let mut num_cgus = 0;
let mut all_cgu_sizes = Vec::new();

// Note: every unique root item is placed exactly once, so the number
// of unique root items always equals the number of placed root items.

let mut root_items = 0;
// unique_inlined_items is passed in above.
let mut placed_inlined_items = 0;

let mut root_size = 0;
// unique_inlined_size is passed in above.
let mut placed_inlined_size = 0;

for cgu in cgus.iter() {
num_cgus += 1;
all_cgu_sizes.push(cgu.size_estimate());

for (item, _) in cgu.items() {
match item.instantiation_mode(tcx) {
InstantiationMode::GloballyShared { .. } => {
root_items += 1;
root_size += item.size_estimate(tcx);
}
InstantiationMode::LocalCopy => {
placed_inlined_items += 1;
placed_inlined_size += item.size_estimate(tcx);
}
}
}
}

all_cgu_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));

let unique_items = root_items + unique_inlined_items;
let placed_items = root_items + placed_inlined_items;
let items_ratio = placed_items as f64 / unique_items as f64;

let unique_size = root_size + unique_inlined_size;
let placed_size = root_size + placed_inlined_size;
let size_ratio = placed_size as f64 / unique_size as f64;

let mean_cgu_size = placed_size as f64 / num_cgus as f64;

assert_eq!(placed_size, all_cgu_sizes.iter().sum::<usize>());

let s = &mut String::new();
let _ = writeln!(s, "{label}");
let _ = writeln!(
s,
"{label} ({num_items} items, total_size={total_size}; {num_cgus} CGUs, \
max_size={max_size}, min_size={min_size}, max_size/min_size={max_min_size_ratio:.1}):"
"- unique items: {unique_items} ({root_items} root + {unique_inlined_items} inlined), \
unique size: {unique_size} ({root_size} root + {unique_inlined_size} inlined)\n\
- placed items: {placed_items} ({root_items} root + {placed_inlined_items} inlined), \
placed size: {placed_size} ({root_size} root + {placed_inlined_size} inlined)\n\
- placed/unique items ratio: {items_ratio:.2}, \
placed/unique size ratio: {size_ratio:.2}\n\
- CGUs: {num_cgus}, mean size: {mean_cgu_size:.1}, sizes: {}",
list(&all_cgu_sizes),
);
let _ = writeln!(s);

for (i, cgu) in cgus.iter().enumerate() {
let name = cgu.name();
let size = cgu.size_estimate();
let num_items = cgu.items().len();
let _ = writeln!(
s,
"- CGU[{i}] {} ({num_items} items, size={}):",
cgu.name(),
cgu.size_estimate()
);
let mean_size = size as f64 / num_items as f64;

let mut placed_item_sizes: Vec<_> =
cgu.items().iter().map(|(item, _)| item.size_estimate(tcx)).collect();
placed_item_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
let sizes = list(&placed_item_sizes);

let _ = writeln!(s, "- CGU[{i}]");
let _ = writeln!(s, " - {name}, size: {size}");
let _ =
writeln!(s, " - items: {num_items}, mean size: {mean_size:.1}, sizes: {sizes}",);

for (item, linkage) in cgu.items_in_deterministic_order(tcx) {
let symbol_name = item.symbol_name(tcx).name;
let symbol_hash_start = symbol_name.rfind('h');
let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
let size = item.size_estimate(tcx);
let kind = match item.instantiation_mode(tcx) {
InstantiationMode::GloballyShared { .. } => "root",
InstantiationMode::LocalCopy => "inlined",
};
let _ = with_no_trimmed_paths!(writeln!(
s,
" - {item} [{linkage:?}] [{symbol_hash}] (size={size})"
" - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})"
));
}

let _ = writeln!(s);
}

std::mem::take(s)
return std::mem::take(s);

// Converts a slice to a string, capturing repetitions to save space.
// E.g. `[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]` -> "[4 (x3), 3, 2, 1 (x5)]".
//
// Only runs of *adjacent* equal values are collapsed (`[1, 2, 1]` stays
// "[1, 2, 1]"), so pass a sorted slice to get one entry per distinct value.
// An empty slice yields "[]".
fn list(ns: &[usize]) -> String {
    use std::fmt::Write;

    let mut out = String::from("[");
    let mut rest = ns;

    // Peel one run of equal values off the front of `rest` per iteration.
    while let Some((&value, tail)) = rest.split_first() {
        let run_len = 1 + tail.iter().take_while(|&&n| n == value).count();

        // `out` holds more than the opening bracket once an entry was written.
        if out.len() > 1 {
            out.push_str(", ");
        }

        // Writing into a `String` cannot fail, so the result is ignored.
        let _ = if run_len == 1 {
            write!(out, "{value}")
        } else {
            write!(out, "{value} (x{run_len})")
        };

        rest = &rest[run_len..];
    }

    out.push(']');
    out
}
};

debug!("{}", dump());
Expand Down Expand Up @@ -922,8 +1057,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
let (codegen_units, _) = tcx.sess.time("partition_and_assert_distinct_symbols", || {
sync::join(
|| {
let mut codegen_units =
partition(tcx, items.iter().copied(), tcx.sess.codegen_units(), &usage_map);
let mut codegen_units = partition(tcx, items.iter().copied(), &usage_map);
codegen_units[0].make_primary();
&*tcx.arena.alloc_from_iter(codegen_units)
},
Expand Down
Loading

0 comments on commit fa8762b

Please sign in to comment.