Skip to content

Commit

Permalink
Auto merge of #123441 - saethlin:fixed-len-file-names, r=<try>
Browse files Browse the repository at this point in the history
Stabilize the size of incr comp object file names

The current implementation does not produce stable-length paths, and we create the paths in a way that makes our allocation behavior is nondeterministic. I think `@eddyb` fixed a number of other cases like this in the past, and this PR fixes another one. Whether that actually matters I have no idea, but we still have bimodal behavior in rustc-perf and the non-uniformity in `find` and `ls` was bothering me.

I've also removed the truncation of the mangled CGU names. Before this PR incr comp paths look like this:
```
target/debug/incremental/scratch-38izrrq90cex7/s-gux6gz0ow8-1ph76gg-ewe1xj434l26w9up5bedsojpd/261xgo1oqnd90ry5.o
```
And after, they look like this:
```
target/debug/incremental/scratch-035omutqbfkbw/s-gux6borni0-16r3v1j-6n64tmwqzchtgqzwwim5amuga/55v2re42sztc8je9bva6g8ft3.o
```

On the one hand, I'm sure this will break some people's builds because they're on Windows and only a few bytes from the path length limit. But if we're that seriously worried about the length of our file names, I have some other ideas on how to make them smaller. And last time I deleted some hash truncations from the compiler, there was a huge drop in the number if incremental compilation ICEs that were reported: #110367
  • Loading branch information
bors committed Apr 4, 2024
2 parents b4acbe4 + a6397f0 commit 0912e5f
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 26 deletions.
50 changes: 50 additions & 0 deletions compiler/rustc_data_structures/src/base_n.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::iter;
/// Converts unsigned integers into a string representation with some base.
/// Bases up to and including 36 can be used for case-insensitive things.
use std::str;
Expand Down Expand Up @@ -42,3 +43,52 @@ pub fn encode(n: u128, base: usize) -> String {
push_str(n, base, &mut s);
s
}

// This trait just lets us reserve the exact right amount of space when doing fixed-length
// case-insensitve encoding. Add any impls you need.
pub trait Base36Encodable: Copy + Into<u128> {
fn encoded_len() -> usize;
}

impl Base36Encodable for u128 {
fn encoded_len() -> usize {
25
}
}

impl Base36Encodable for u64 {
fn encoded_len() -> usize {
13
}
}

impl Base36Encodable for u32 {
fn encoded_len() -> usize {
7
}
}

pub fn push_case_insensitive<N: Base36Encodable>(n: N, output: &mut String) {
// SAFETY: We will only append ASCII bytes.
let output = unsafe { output.as_mut_vec() };

// Add encoded_len '0's to the end of the String, that's the area we're going to write to.
let prev_len = output.len();
output.extend(iter::repeat(b'0').take(N::encoded_len()));
let output = &mut output[prev_len..];

let base = CASE_INSENSITIVE as u128;
let mut n: u128 = n.into();

for out in output.iter_mut().rev() {
*out = BASE_64[(n % base) as usize];
n /= base;
}
assert_eq!(n, 0);
}

pub fn case_insensitive<N: Base36Encodable>(n: N) -> String {
let mut output = String::new();
push_case_insensitive(n, &mut output);
output
}
8 changes: 8 additions & 0 deletions compiler/rustc_data_structures/src/base_n/tests.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
use super::*;

#[test]
fn limits() {
assert_eq!(Ok(u128::MAX), u128::from_str_radix(&case_insensitive(u128::MAX), 36));
assert_eq!(Ok(u64::MAX), u64::from_str_radix(&case_insensitive(u64::MAX), 36));
assert_eq!(Ok(u32::MAX), u32::from_str_radix(&case_insensitive(u32::MAX), 36));
}

#[test]
fn test_encode() {
fn test(n: u128, base: usize) {
assert_eq!(Ok(n), u128::from_str_radix(&encode(n, base), base as u32));
assert_eq!(Ok(n), u128::from_str_radix(&case_insensitive(n), 36));
}

for base in 2..37 {
Expand Down
37 changes: 15 additions & 22 deletions compiler/rustc_incremental/src/persist/fs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
//! implemented.

use crate::errors;
use rustc_data_structures::base_n::Base36Encodable;
use rustc_data_structures::fx::{FxHashSet, FxIndexSet};
use rustc_data_structures::svh::Svh;
use rustc_data_structures::unord::{UnordMap, UnordSet};
Expand Down Expand Up @@ -329,28 +330,21 @@ pub fn finalize_session_directory(sess: &Session, svh: Option<Svh>) {

debug!("finalize_session_directory() - session directory: {}", incr_comp_session_dir.display());

let old_sub_dir_name = incr_comp_session_dir.file_name().unwrap().to_string_lossy();
assert_no_characters_lost(&old_sub_dir_name);
let mut sub_dir_name = incr_comp_session_dir.file_name().unwrap().to_string_lossy().to_string();
assert_no_characters_lost(&sub_dir_name);

// Keep the 's-{timestamp}-{random-number}' prefix, but replace the
// '-working' part with the SVH of the crate
let dash_indices: Vec<_> = old_sub_dir_name.match_indices('-').map(|(idx, _)| idx).collect();
if dash_indices.len() != 3 {
bug!(
"Encountered incremental compilation session directory with \
malformed name: {}",
incr_comp_session_dir.display()
)
}

// State: "s-{timestamp}-{random-number}-"
let mut new_sub_dir_name = String::from(&old_sub_dir_name[..=dash_indices[2]]);
// We want to keep this: "s-{timestamp}-{random-number}-"
sub_dir_name.truncate(2 + (u64::encoded_len() - 3) + 1 + u32::encoded_len() + 1);
assert!(sub_dir_name.ends_with('-'));
assert!(sub_dir_name.as_bytes().iter().filter(|b| **b == b'-').count() == 3);

// Append the svh
base_n::push_str(svh.as_u128(), INT_ENCODE_BASE, &mut new_sub_dir_name);
base_n::push_case_insensitive(svh.as_u128(), &mut sub_dir_name);

// Create the full path
let new_path = incr_comp_session_dir.parent().unwrap().join(new_sub_dir_name);
let new_path = incr_comp_session_dir.parent().unwrap().join(&*sub_dir_name);
debug!("finalize_session_directory() - new path: {}", new_path.display());

match rename_path_with_retry(&*incr_comp_session_dir, &new_path, 3) {
Expand Down Expand Up @@ -446,11 +440,10 @@ fn generate_session_dir_path(crate_dir: &Path) -> PathBuf {
let random_number = thread_rng().next_u32();
debug!("generate_session_dir_path: random_number = {}", random_number);

let directory_name = format!(
"s-{}-{}-working",
timestamp,
base_n::encode(random_number as u128, INT_ENCODE_BASE)
);
// Chop the first 3 characters off the timestamp. Those 3 bytes will be zero for a while.
assert_eq!(&timestamp[..3], "000");
let directory_name =
format!("s-{}-{}-working", &timestamp[3..], base_n::case_insensitive(random_number));
debug!("generate_session_dir_path: directory_name = {}", directory_name);
let directory_path = crate_dir.join(directory_name);
debug!("generate_session_dir_path: directory_path = {}", directory_path.display());
Expand Down Expand Up @@ -582,7 +575,7 @@ fn extract_timestamp_from_session_dir(directory_name: &str) -> Result<SystemTime
fn timestamp_to_string(timestamp: SystemTime) -> String {
let duration = timestamp.duration_since(UNIX_EPOCH).unwrap();
let micros = duration.as_secs() * 1_000_000 + (duration.subsec_nanos() as u64) / 1000;
base_n::encode(micros as u128, INT_ENCODE_BASE)
base_n::case_insensitive(micros)
}

fn string_to_timestamp(s: &str) -> Result<SystemTime, &'static str> {
Expand Down Expand Up @@ -613,7 +606,7 @@ fn crate_path(sess: &Session) -> PathBuf {
sess.cfg_version,
);

let stable_crate_id = base_n::encode(stable_crate_id.as_u64() as u128, INT_ENCODE_BASE);
let stable_crate_id = base_n::case_insensitive(stable_crate_id.as_u64());

let crate_name = format!("{crate_name}-{stable_crate_id}");
incr_dir.join(crate_name)
Expand Down
5 changes: 1 addition & 4 deletions compiler/rustc_middle/src/mir/mono.rs
Original file line number Diff line number Diff line change
Expand Up @@ -335,13 +335,10 @@ impl<'tcx> CodegenUnit<'tcx> {
}

pub fn mangle_name(human_readable_name: &str) -> String {
// We generate a 80 bit hash from the name. This should be enough to
// avoid collisions and is still reasonably short for filenames.
let mut hasher = StableHasher::new();
human_readable_name.hash(&mut hasher);
let hash: Hash128 = hasher.finish();
let hash = hash.as_u128() & ((1u128 << 80) - 1);
base_n::encode(hash, base_n::CASE_INSENSITIVE)
base_n::case_insensitive(hash.as_u128())
}

pub fn compute_size_estimate(&mut self) {
Expand Down

0 comments on commit 0912e5f

Please sign in to comment.