From 51821515b3ccd7dd8f42ffd6a2eee536dcf7ddb0 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Sun, 11 Jun 2023 20:45:04 +1000
Subject: [PATCH 1/3] Remove `PartitioningCx::target_cgu_count`.

Because that value can be easily obtained from `PartitioningCx::tcx`.
---
 compiler/rustc_monomorphize/src/partitioning.rs | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 1d9c8ded349c0..fe869038393a2 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -121,7 +121,6 @@ use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollec
 
 struct PartitioningCx<'a, 'tcx> {
     tcx: TyCtxt<'tcx>,
-    target_cgu_count: usize,
     usage_map: &'a UsageMap<'tcx>,
 }
 
@@ -136,7 +135,6 @@ struct PlacedRootMonoItems<'tcx> {
 fn partition<'tcx, I>(
     tcx: TyCtxt<'tcx>,
     mono_items: I,
-    max_cgu_count: usize,
     usage_map: &UsageMap<'tcx>,
 ) -> Vec<CodegenUnit<'tcx>>
 where
@@ -144,7 +142,7 @@ where
 {
     let _prof_timer = tcx.prof.generic_activity("cgu_partitioning");
 
-    let cx = &PartitioningCx { tcx, target_cgu_count: max_cgu_count, usage_map };
+    let cx = &PartitioningCx { tcx, usage_map };
 
     // In the first step, we place all regular monomorphizations into their
     // respective 'home' codegen unit. Regular monomorphizations are all
@@ -309,7 +307,7 @@ fn merge_codegen_units<'tcx>(
     cx: &PartitioningCx<'_, 'tcx>,
     codegen_units: &mut Vec<CodegenUnit<'tcx>>,
 ) {
-    assert!(cx.target_cgu_count >= 1);
+    assert!(cx.tcx.sess.codegen_units() >= 1);
 
     // A sorted order here ensures merging is deterministic.
     assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
@@ -320,7 +318,7 @@ fn merge_codegen_units<'tcx>(
 
     // Merge the two smallest codegen units until the target size is
     // reached.
-    while codegen_units.len() > cx.target_cgu_count {
+    while codegen_units.len() > cx.tcx.sess.codegen_units() {
         // Sort small cgus to the back
         codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
         let mut smallest = codegen_units.pop().unwrap();
@@ -922,8 +920,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
     let (codegen_units, _) = tcx.sess.time("partition_and_assert_distinct_symbols", || {
         sync::join(
             || {
-                let mut codegen_units =
-                    partition(tcx, items.iter().copied(), tcx.sess.codegen_units(), &usage_map);
+                let mut codegen_units = partition(tcx, items.iter().copied(), &usage_map);
                 codegen_units[0].make_primary();
                 &*tcx.arena.alloc_from_iter(codegen_units)
             },

From 95d85899ce7ffca741c180c9f2ef0d29f5ab9e40 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 13 Jun 2023 11:07:38 +1000
Subject: [PATCH 2/3] Add more measurements to the CGU debug printing.

For example, we go from this:
```
FINAL (4059 items, total_size=232342; 16 CGUs, max_size=39608,
min_size=5468, max_size/min_size=7.2):
- CGU[0] regex.f2ff11e98f8b05c7-cgu.0 (318 items, size=39608):
  - fn ...
  - fn ...
```
to this:
```
FINAL
- unique items: 2726 (1459 root + 1267 inlined), unique size: 201214 (146046 root + 55168 inlined)
- placed items: 4059 (1459 root + 2600 inlined), placed size: 232342 (146046 root + 86296 inlined)
- placed/unique items ratio: 1.49, placed/unique size ratio: 1.15
- CGUs: 16, mean size: 14521.4, sizes: [39608, 31122, 20318, 20236, 16268, 13777, 12310, 10531, 10205, 9810, 9250, 9065 (x2), 7785, 7524, 5468]

- CGU[0]
  - regex.f2ff11e98f8b05c7-cgu.0, size: 39608
  - items: 318, mean size: 124.6, sizes: [28395, 3418, 558, 485, 259, 228, 176, 166, 146, 118, 117 (x3), 114 (x5), 113 (x3), 101, 84, 82, 77, 76, 72, 71 (x2), 66, 65, 62, 61, 59 (x2), 57, 55, 54 (x2), 53 (x4), 52 (x5), 51 (x4), 50, 48, 47, 46, 45 (x3), 44, 43 (x5), 42, 40, 38 (x4), 37, 35, 34 (x2), 32 (x2), 31, 30, 28 (x2), 27 (x2), 26 (x3), 24 (x2), 23 (x3), 22 (x2), 21, 20, 16 (x4), 15, 13 (x7), 12 (x3), 11 (x6), 10, 9 (x2), 8 (x4), 7 (x8), 6 (x38), 5 (x21), 4 (x7), 3 (x45), 2 (x63), 1 (x13)]
  - fn ...
  - fn ...
```
This is a lot more information: it distinguishes between root items and
inlined items, shows how much duplication there is of inlined items, and
gives the full range of sizes for CGUs and for items within CGUs. All of
this is really helpful when analyzing CGU partitioning and trying
different CGU formation algorithms.
---
 .../rustc_monomorphize/src/partitioning.rs    | 157 +++++++++++++++---
 1 file changed, 134 insertions(+), 23 deletions(-)

diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index fe869038393a2..f0c9605da1d13 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -129,6 +129,11 @@ struct PlacedRootMonoItems<'tcx> {
     codegen_units: Vec<CodegenUnit<'tcx>>,
 
     internalization_candidates: FxHashSet<MonoItem<'tcx>>,
+
+    /// These must be obtained when the iterator in `partition` runs. They
+    /// can't be obtained later because some inlined functions might not be
+    /// reachable.
+    unique_inlined_stats: (usize, usize),
 }
 
 // The output CGUs are sorted by name.
@@ -147,7 +152,7 @@ where
     // In the first step, we place all regular monomorphizations into their
     // respective 'home' codegen unit. Regular monomorphizations are all
     // functions and statics defined in the local crate.
-    let PlacedRootMonoItems { mut codegen_units, internalization_candidates } = {
+    let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots");
         place_root_mono_items(cx, mono_items)
     };
@@ -156,7 +161,7 @@ where
         cgu.create_size_estimate(tcx);
     }
 
-    debug_dump(tcx, "INITIAL PARTITIONING", &codegen_units);
+    debug_dump(tcx, "ROOTS", &codegen_units, unique_inlined_stats);
 
     // Merge until we have at most `max_cgu_count` codegen units.
     // `merge_codegen_units` is responsible for updating the CGU size
@@ -164,7 +169,7 @@ where
     {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
         merge_codegen_units(cx, &mut codegen_units);
-        debug_dump(tcx, "POST MERGING", &codegen_units);
+        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
     }
 
     // In the next step, we use the inlining map to determine which additional
@@ -180,7 +185,7 @@ where
         cgu.create_size_estimate(tcx);
     }
 
-    debug_dump(tcx, "POST INLINING", &codegen_units);
+    debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);
 
     // Next we try to make as many symbols "internal" as possible, so LLVM has
     // more freedom to optimize.
@@ -224,7 +229,7 @@ where
     // Ensure CGUs are sorted by name, so that we get deterministic results.
     assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
 
-    debug_dump(tcx, "FINAL", &codegen_units);
+    debug_dump(tcx, "FINAL", &codegen_units, unique_inlined_stats);
 
     codegen_units
 }
@@ -250,10 +255,16 @@ where
     let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
     let cgu_name_cache = &mut FxHashMap::default();
 
+    let mut num_unique_inlined_items = 0;
+    let mut unique_inlined_items_size = 0;
     for mono_item in mono_items {
         match mono_item.instantiation_mode(cx.tcx) {
             InstantiationMode::GloballyShared { .. } => {}
-            InstantiationMode::LocalCopy => continue,
+            InstantiationMode::LocalCopy => {
+                num_unique_inlined_items += 1;
+                unique_inlined_items_size += mono_item.size_estimate(cx.tcx);
+                continue;
+            }
         }
 
         let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item);
@@ -298,7 +309,11 @@ where
     let mut codegen_units: Vec<_> = codegen_units.into_values().collect();
     codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str()));
 
-    PlacedRootMonoItems { codegen_units, internalization_candidates }
+    PlacedRootMonoItems {
+        codegen_units,
+        internalization_candidates,
+        unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size),
+    }
 }
 
 // This function requires the CGUs to be sorted by name on input, and ensures
@@ -812,31 +827,91 @@ fn default_visibility(tcx: TyCtxt<'_>, id: DefId, is_generic: bool) -> Visibilit
     }
 }
 
-fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<'tcx>]) {
+fn debug_dump<'a, 'tcx: 'a>(
+    tcx: TyCtxt<'tcx>,
+    label: &str,
+    cgus: &[CodegenUnit<'tcx>],
+    (unique_inlined_items, unique_inlined_size): (usize, usize),
+) {
     let dump = move || {
         use std::fmt::Write;
 
-        let num_cgus = cgus.len();
-        let num_items: usize = cgus.iter().map(|cgu| cgu.items().len()).sum();
-        let total_size: usize = cgus.iter().map(|cgu| cgu.size_estimate()).sum();
-        let max_size = cgus.iter().map(|cgu| cgu.size_estimate()).max().unwrap();
-        let min_size = cgus.iter().map(|cgu| cgu.size_estimate()).min().unwrap();
-        let max_min_size_ratio = max_size as f64 / min_size as f64;
+        let mut num_cgus = 0;
+        let mut all_cgu_sizes = Vec::new();
+
+        // Note: every unique root item is placed exactly once, so the number
+        // of unique root items always equals the number of placed root items.
+
+        let mut root_items = 0;
+        // unique_inlined_items is passed in above.
+        let mut placed_inlined_items = 0;
+
+        let mut root_size = 0;
+        // unique_inlined_size is passed in above.
+        let mut placed_inlined_size = 0;
+
+        for cgu in cgus.iter() {
+            num_cgus += 1;
+            all_cgu_sizes.push(cgu.size_estimate());
+
+            for (item, _) in cgu.items() {
+                match item.instantiation_mode(tcx) {
+                    InstantiationMode::GloballyShared { .. } => {
+                        root_items += 1;
+                        root_size += item.size_estimate(tcx);
+                    }
+                    InstantiationMode::LocalCopy => {
+                        placed_inlined_items += 1;
+                        placed_inlined_size += item.size_estimate(tcx);
+                    }
+                }
+            }
+        }
+
+        all_cgu_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
+
+        let unique_items = root_items + unique_inlined_items;
+        let placed_items = root_items + placed_inlined_items;
+        let items_ratio = placed_items as f64 / unique_items as f64;
+
+        let unique_size = root_size + unique_inlined_size;
+        let placed_size = root_size + placed_inlined_size;
+        let size_ratio = placed_size as f64 / unique_size as f64;
+
+        let mean_cgu_size = placed_size as f64 / num_cgus as f64;
+
+        assert_eq!(placed_size, all_cgu_sizes.iter().sum::<usize>());
 
         let s = &mut String::new();
+        let _ = writeln!(s, "{label}");
         let _ = writeln!(
             s,
-            "{label} ({num_items} items, total_size={total_size}; {num_cgus} CGUs, \
-             max_size={max_size}, min_size={min_size}, max_size/min_size={max_min_size_ratio:.1}):"
+            "- unique items: {unique_items} ({root_items} root + {unique_inlined_items} inlined), \
+               unique size: {unique_size} ({root_size} root + {unique_inlined_size} inlined)\n\
+             - placed items: {placed_items} ({root_items} root + {placed_inlined_items} inlined), \
+               placed size: {placed_size} ({root_size} root + {placed_inlined_size} inlined)\n\
+             - placed/unique items ratio: {items_ratio:.2}, \
+               placed/unique size ratio: {size_ratio:.2}\n\
+             - CGUs: {num_cgus}, mean size: {mean_cgu_size:.1}, sizes: {}",
+            list(&all_cgu_sizes),
         );
+        let _ = writeln!(s);
+
         for (i, cgu) in cgus.iter().enumerate() {
+            let name = cgu.name();
+            let size = cgu.size_estimate();
             let num_items = cgu.items().len();
-            let _ = writeln!(
-                s,
-                "- CGU[{i}] {} ({num_items} items, size={}):",
-                cgu.name(),
-                cgu.size_estimate()
-            );
+            let mean_size = size as f64 / num_items as f64;
+
+            let mut placed_item_sizes: Vec<_> =
+                cgu.items().iter().map(|(item, _)| item.size_estimate(tcx)).collect();
+            placed_item_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
+            let sizes = list(&placed_item_sizes);
+
+            let _ = writeln!(s, "- CGU[{i}]");
+            let _ = writeln!(s, "  - {name}, size: {size}");
+            let _ =
+                writeln!(s, "  - items: {num_items}, mean size: {mean_size:.1}, sizes: {sizes}",);
 
             for (item, linkage) in cgu.items_in_deterministic_order(tcx) {
                 let symbol_name = item.symbol_name(tcx).name;
@@ -852,7 +927,43 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<
             let _ = writeln!(s);
         }
 
-        std::mem::take(s)
+        return std::mem::take(s);
+
+        // Converts a slice to a string, capturing repetitions to save space.
+        // E.g. `[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]` -> "[4 (x3), 3, 2, 1 (x5)]".
+        fn list(ns: &[usize]) -> String {
+            let mut v = Vec::new();
+            if ns.is_empty() {
+                return "[]".to_string();
+            }
+
+            let mut elem = |curr, curr_count| {
+                if curr_count == 1 {
+                    v.push(format!("{curr}"));
+                } else {
+                    v.push(format!("{curr} (x{curr_count})"));
+                }
+            };
+
+            let mut curr = ns[0];
+            let mut curr_count = 1;
+
+            for &n in &ns[1..] {
+                if n != curr {
+                    elem(curr, curr_count);
+                    curr = n;
+                    curr_count = 1;
+                } else {
+                    curr_count += 1;
+                }
+            }
+            elem(curr, curr_count);
+
+            let mut s = "[".to_string();
+            s.push_str(&v.join(", "));
+            s.push_str("]");
+            s
+        }
     };
 
     debug!("{}", dump());

From 7c3ce02a1124b008addb9764173ad441f4741c00 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Fri, 9 Jun 2023 14:39:13 +1000
Subject: [PATCH 3/3] Introduce a minimum CGU size in non-incremental builds.

Because tiny CGUs make compilation less efficient *and* result in worse
generated code.

We don't do this when the number of CGUs is explicitly given, because
there are times when the requested number is very important, as
described in some comments within the commit. So the commit also
introduces a `CodegenUnits` type that distinguishes between default
values and user-specified values.

This change has a roughly neutral effect on walltimes across the
rustc-perf benchmarks; there are some speedups and some slowdowns. But
it has significant wins for most other metrics on numerous benchmarks,
including instruction counts, cycles, binary size, and max-rss. It also
reduces parallelism, which is good for reducing jobserver competition
when multiple rustc processes are running at the same time. It's smaller
benchmarks that benefit the most; larger benchmarks already have CGUs
that are all larger than the minimum size.

Here are some example before/after CGU sizes for opt builds.

- html5ever
  - CGUs: 16, mean size: 1196.1, sizes: [3908, 2992, 1706, 1652, 1572,
    1136, 1045, 948, 946, 938, 579, 471, 443, 327, 286, 189]
  - CGUs: 4, mean size: 4396.0, sizes: [6706, 3908, 3490, 3480]

- libc
  - CGUs: 12, mean size: 35.3, sizes: [163, 93, 58, 53, 37, 8, 2 (x6)]
  - CGUs: 1, mean size: 424.0, sizes: [424]

- tt-muncher
  - CGUs: 5, mean size: 1819.4, sizes: [8508, 350, 198, 34, 7]
  - CGUs: 1, mean size: 9075.0, sizes: [9075]

Note that CGUs of size 100,000+ aren't unusual in larger programs.
---
 .../src/debuginfo/metadata.rs                 |  2 +-
 compiler/rustc_codegen_ssa/src/back/write.rs  |  6 +--
 .../rustc_monomorphize/src/partitioning.rs    | 38 ++++++++++++++++---
 compiler/rustc_session/src/session.rs         | 33 +++++++++++++---
 src/doc/rustc/src/codegen-options/index.md    |  4 +-
 5 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
index 166454d3ae74c..fa67a1b331011 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
@@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>(
     let trait_def_id = trait_ref_self.def_id();
     let trait_vis = cx.tcx.visibility(trait_def_id);
 
-    let cgus = cx.sess().codegen_units();
+    let cgus = cx.sess().codegen_units().as_usize();
     let single_cgu = cgus == 1;
 
     let lto = cx.sess().lto();
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index 9be69e560e888..109e9959aeac8 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -646,10 +646,10 @@ fn produce_final_output_artifacts(
         // rlib.
         let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe);
 
-        let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1;
+        let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1;
 
         let keep_numbered_objects =
-            needs_crate_object || (user_wants_objects && sess.codegen_units() > 1);
+            needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1);
 
         for module in compiled_modules.modules.iter() {
             if let Some(ref path) = module.object {
@@ -1923,7 +1923,7 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
 
         // FIXME: time_llvm_passes support - does this use a global context or
         // something?
-        if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes {
+        if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes {
             self.backend.print_pass_timings()
         }
 
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index f0c9605da1d13..ebcc3b0399973 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -113,6 +113,7 @@ use rustc_middle::query::Providers;
 use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
 use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt};
 use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
+use rustc_session::CodegenUnits;
 use rustc_span::symbol::Symbol;
 
 use crate::collector::UsageMap;
@@ -322,7 +323,7 @@ fn merge_codegen_units<'tcx>(
     cx: &PartitioningCx<'_, 'tcx>,
     codegen_units: &mut Vec<CodegenUnit<'tcx>>,
 ) {
-    assert!(cx.tcx.sess.codegen_units() >= 1);
+    assert!(cx.tcx.sess.codegen_units().as_usize() >= 1);
 
     // A sorted order here ensures merging is deterministic.
     assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
@@ -331,11 +332,32 @@ fn merge_codegen_units<'tcx>(
     let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
         codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();
 
-    // Merge the two smallest codegen units until the target size is
-    // reached.
-    while codegen_units.len() > cx.tcx.sess.codegen_units() {
-        // Sort small cgus to the back
+    // Having multiple CGUs can drastically speed up compilation. But for
+    // non-incremental builds, tiny CGUs slow down compilation *and* result in
+    // worse generated code. So we don't allow CGUs smaller than this (unless
+    // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
+    // common in larger programs, so this isn't all that large.
+    const NON_INCR_MIN_CGU_SIZE: usize = 1000;
+
+    // Repeatedly merge the two smallest codegen units as long as:
+    // - we have more CGUs than the upper limit, or
+    // - (Non-incremental builds only) the user didn't specify a CGU count, and
+    //   there are multiple CGUs, and some are below the minimum size.
+    //
+    // The "didn't specify a CGU count" condition is because when an explicit
+    // count is requested we observe it as closely as possible. For example,
+    // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
+    // critical they aren't merged. Also, some tests use explicit small values
+    // and likewise won't work if small CGUs are merged.
+    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
+        || (cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() > 1
+            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
+    {
+        // Sort small cgus to the back.
         codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
@@ -918,9 +940,13 @@ fn debug_dump<'a, 'tcx: 'a>(
                 let symbol_hash_start = symbol_name.rfind('h');
                 let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
                 let size = item.size_estimate(tcx);
+                let kind = match item.instantiation_mode(tcx) {
+                    InstantiationMode::GloballyShared { .. } => "root",
+                    InstantiationMode::LocalCopy => "inlined",
+                };
                 let _ = with_no_trimmed_paths!(writeln!(
                     s,
-                    "  - {item} [{linkage:?}] [{symbol_hash}] (size={size})"
+                    "  - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})"
                 ));
             }
 
diff --git a/compiler/rustc_session/src/session.rs b/compiler/rustc_session/src/session.rs
index 2cc02003218ee..5feea83edb6a3 100644
--- a/compiler/rustc_session/src/session.rs
+++ b/compiler/rustc_session/src/session.rs
@@ -234,6 +234,27 @@ pub enum MetadataKind {
     Compressed,
 }
 
+#[derive(Clone, Copy)]
+pub enum CodegenUnits {
+    /// Specified by the user. In this case we try fairly hard to produce the
+    /// number of CGUs requested.
+    User(usize),
+
+    /// A default value, i.e. not specified by the user. In this case we take
+    /// more liberties about CGU formation, e.g. avoid producing very small
+    /// CGUs.
+    Default(usize),
+}
+
+impl CodegenUnits {
+    pub fn as_usize(self) -> usize {
+        match self {
+            CodegenUnits::User(n) => n,
+            CodegenUnits::Default(n) => n,
+        }
+    }
+}
+
 impl Session {
     pub fn miri_unleashed_feature(&self, span: Span, feature_gate: Option<Symbol>) {
         self.miri_unleashed_features.lock().push((span, feature_gate));
@@ -1104,7 +1125,7 @@ impl Session {
 
         // If there's only one codegen unit and LTO isn't enabled then there's
         // no need for ThinLTO so just return false.
-        if self.codegen_units() == 1 {
+        if self.codegen_units().as_usize() == 1 {
             return config::Lto::No;
         }
 
@@ -1206,19 +1227,19 @@ impl Session {
 
     /// Returns the number of codegen units that should be used for this
     /// compilation
-    pub fn codegen_units(&self) -> usize {
+    pub fn codegen_units(&self) -> CodegenUnits {
         if let Some(n) = self.opts.cli_forced_codegen_units {
-            return n;
+            return CodegenUnits::User(n);
         }
         if let Some(n) = self.target.default_codegen_units {
-            return n as usize;
+            return CodegenUnits::Default(n as usize);
         }
 
         // If incremental compilation is turned on, we default to a high number
         // codegen units in order to reduce the "collateral damage" small
         // changes cause.
         if self.opts.incremental.is_some() {
-            return 256;
+            return CodegenUnits::Default(256);
         }
 
         // Why is 16 codegen units the default all the time?
@@ -1271,7 +1292,7 @@ impl Session {
         // As a result 16 was chosen here! Mostly because it was a power of 2
         // and most benchmarks agreed it was roughly a local optimum. Not very
         // scientific.
-        16
+        CodegenUnits::Default(16)
     }
 
     pub fn teach(&self, code: &DiagnosticId) -> bool {
diff --git a/src/doc/rustc/src/codegen-options/index.md b/src/doc/rustc/src/codegen-options/index.md
index 1041d5026690f..8de638dde4fbf 100644
--- a/src/doc/rustc/src/codegen-options/index.md
+++ b/src/doc/rustc/src/codegen-options/index.md
@@ -31,8 +31,8 @@ Supported values can also be discovered by running `rustc --print code-models`.
 
 ## codegen-units
 
-This flag controls how many code generation units the crate is split into. It
-takes an integer greater than 0.
+This flag controls the maximum number of code generation units the crate is
+split into. It takes an integer greater than 0.
 
 When a crate is split into multiple codegen units, LLVM is able to process
 them in parallel. Increasing parallelism may speed up compile times, but may