From f312b39eb903a481f6a38eb0b2c525a259e29c96 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 19 Dec 2024 12:25:14 -0500 Subject: [PATCH 1/4] fix(cargo-package): sort dirty file report To make error message more deterministic --- src/cargo/ops/cargo_package.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index bc619c1b86a..2c7b6eb44cf 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -824,7 +824,7 @@ fn check_repo_state( // Find the intersection of dirty in git, and the src_files that would // be packaged. This is a lazy n^2 check, but seems fine with // thousands of files. - let dirty_src_files: Vec = src_files + let mut dirty_src_files: Vec<_> = src_files .iter() .filter(|src_file| dirty_files.iter().any(|path| src_file.starts_with(path))) .map(|path| { @@ -847,6 +847,7 @@ fn check_repo_state( dirty, })) } else { + dirty_src_files.sort_unstable(); anyhow::bail!( "{} files in the working directory contain changes that were \ not yet committed into git:\n\n{}\n\n\ From 238610454c63f3031691a2224f6ded47a9ffa0e2 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 19 Dec 2024 10:40:46 -0500 Subject: [PATCH 2/4] test(cargo-package): track dirty on workspace member granularity --- tests/testsuite/package.rs | 135 +++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index 5a27cc34920..72a7a2c0f6b 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -1156,6 +1156,141 @@ src/lib.rs .run(); } +#[cargo_test] +fn dirty_workspace_package() { + // Cargo see them as clean if not in any local package's directory. + let (p, repo) = git::new_repo("foo", |p| { + p.file( + "Cargo.toml", + r#" + [workspace] + members = ["isengard", "mordor"] + "#, + ) + .file("not-belong-to-any-mortal-man", "...") + .file( + "isengard/Cargo.toml", + r#" + [package] + name = "isengard" + edition = "2015" + homepage = "saruman" + description = "saruman" + license = "MIT" + "#, + ) + .file("isengard/src/lib.rs", "") + .file( + "mordor/Cargo.toml", + r#" + [package] + name = "mordor" + edition = "2015" + homepage = "sauron" + description = "sauron" + license = "MIT" + "#, + ) + .file("mordor/src/lib.rs", "") + }); + git::commit(&repo); + + p.change_file( + "Cargo.toml", + r#" + [workspace] + members = ["isengard", "mordor"] + [workspace.package] + edition = "2021" + "#, + ); + // Dirty file outside won't affect packaging. + p.change_file("not-belong-to-any-mortal-man", "changed!"); + p.change_file("mordor/src/lib.rs", "changed!"); + p.change_file("mordor/src/main.rs", "fn main() {}"); + + // Ensure dirty files be reported. + p.cargo("package --workspace --no-verify") + .with_status(101) + .with_stderr_data(str![[r#" +[PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard) +[PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed) +[ERROR] 2 files in the working directory contain changes that were not yet committed into git: + +src/lib.rs +src/main.rs + +to proceed despite this and include the uncommitted changes, pass the `--allow-dirty` flag + +"#]]) + .run(); + + // Ensure only dirty package mordor be dirty. + p.cargo("package --workspace --no-verify --allow-dirty") + .with_stderr_data(str![[r#" +[PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard) +[PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed) +[PACKAGING] mordor v0.0.0 ([ROOT]/foo/mordor) +[PACKAGED] 6 files, [FILE_SIZE]B ([FILE_SIZE]B compressed) + +"#]]) + .run(); + + let f = File::open(&p.root().join("target/package/isengard-0.0.0.crate")).unwrap(); + validate_crate_contents( + f, + "isengard-0.0.0.crate", + &[ + ".cargo_vcs_info.json", + "Cargo.toml", + "Cargo.toml.orig", + "src/lib.rs", + "Cargo.lock", + ], + [( + ".cargo_vcs_info.json", + // No change within `isengard/`, so not dirty at all. + str![[r#" +{ + "git": { + "sha1": "[..]" + }, + "path_in_vcs": "isengard" +} +"#]] + .is_json(), + )], + ); + + let f = File::open(&p.root().join("target/package/mordor-0.0.0.crate")).unwrap(); + validate_crate_contents( + f, + "mordor-0.0.0.crate", + &[ + ".cargo_vcs_info.json", + "Cargo.toml", + "Cargo.toml.orig", + "src/lib.rs", + "src/main.rs", + "Cargo.lock", + ], + [( + ".cargo_vcs_info.json", + // Dirty bit is recorded. + str![[r#" +{ + "git": { + "dirty": true, + "sha1": "[..]" + }, + "path_in_vcs": "mordor" +} +"#]] + .is_json(), + )], + ); +} + #[cargo_test] fn issue_13695_allow_dirty_vcs_info() { let p = project() From ca1060b7e02aac5c884572c340d24749316995ca Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Thu, 19 Dec 2024 11:13:00 -0500 Subject: [PATCH 3/4] fix(cargo-package): show dirty filepaths relative to git workdir --- src/cargo/ops/cargo_package.rs | 5 +++-- tests/testsuite/package.rs | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index 2c7b6eb44cf..508afa4234a 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -805,7 +805,7 @@ fn check_repo_state( return Ok(Some(VcsInfo { git, path_in_vcs })); fn git( - pkg: &Package, + _pkg: &Package, src_files: &[PathBuf], repo: &git2::Repository, opts: &PackageOpts<'_>, @@ -824,11 +824,12 @@ fn check_repo_state( // Find the intersection of dirty in git, and the src_files that would // be packaged. This is a lazy n^2 check, but seems fine with // thousands of files. + let workdir = repo.workdir().unwrap(); let mut dirty_src_files: Vec<_> = src_files .iter() .filter(|src_file| dirty_files.iter().any(|path| src_file.starts_with(path))) .map(|path| { - path.strip_prefix(pkg.root()) + path.strip_prefix(workdir) .unwrap_or(path) .display() .to_string() diff --git a/tests/testsuite/package.rs b/tests/testsuite/package.rs index 72a7a2c0f6b..4a0506c0363 100644 --- a/tests/testsuite/package.rs +++ b/tests/testsuite/package.rs @@ -1217,8 +1217,8 @@ fn dirty_workspace_package() { [PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed) [ERROR] 2 files in the working directory contain changes that were not yet committed into git: -src/lib.rs -src/main.rs +mordor/src/lib.rs +mordor/src/main.rs to proceed despite this and include the uncommitted changes, pass the `--allow-dirty` flag From 2bc27e1b191ba9343e9609ed9378d291e949f4d4 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Wed, 18 Dec 2024 23:47:31 -0500 Subject: [PATCH 4/4] perf(cargo-package): match certain path prefix with pathspec `check_repo_state` checks the entire git repo status. This is usually fine if you have only a few packages in a workspace. For huge monorepos, it may hit performance issues. For example, on awslabs/aws-sdk-rust@2cbd34db53389aed1cafe4c56d23186ef5ff304a the workspace has roughly 434 members to publish. `git ls-files` reported us 204379 files in this Git repository. That means git may need to check status of all files 434 times. That would be `204379 * 434 = 88,700,486` checks! Moreover, the current algorithm is finding the intersection of `PathSource::list_files` and `git status`. It is an `O(n^2)` check. Let's assume files are evenly distributed into each package, so roughly 470 files per package. If we're unlucky to have some dirty files, say 100 files. We will have to do `470 * 100 = 47,000` times of path comparisons. Even worse, because we `git status` everything in the repo, we'll have to it for all members, even when those dirty files are not part of the current package in question. So it becomes `470 * 100 * 434 = 20,398,000`! Instead of comparing with the status of the entire repository, this patch use the magic pathspec[1] to tell git only reports paths that match a certain path prefix. This wouldn't help the `O(n^2)` algorithm, but at least it won't check dirty files outside the current package. [1]: https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec --- src/cargo/ops/cargo_package.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/cargo/ops/cargo_package.rs b/src/cargo/ops/cargo_package.rs index 508afa4234a..c8d9887e167 100644 --- a/src/cargo/ops/cargo_package.rs +++ b/src/cargo/ops/cargo_package.rs @@ -805,7 +805,7 @@ fn check_repo_state( return Ok(Some(VcsInfo { git, path_in_vcs })); fn git( - _pkg: &Package, + pkg: &Package, src_files: &[PathBuf], repo: &git2::Repository, opts: &PackageOpts<'_>, @@ -816,7 +816,8 @@ fn check_repo_state( // - ignored (in case the user has an `include` directive that // conflicts with .gitignore). let mut dirty_files = Vec::new(); - collect_statuses(repo, &mut dirty_files)?; + let pathspec = relative_pathspec(repo, pkg.root()); + collect_statuses(repo, &[pathspec.as_str()], &mut dirty_files)?; // Include each submodule so that the error message can provide // specifically *which* files in a submodule are modified. status_submodules(repo, &mut dirty_files)?; @@ -859,16 +860,27 @@ fn check_repo_state( } } + /// Use pathspec so git only matches a certain path prefix + fn relative_pathspec(repo: &git2::Repository, pkg_root: &Path) -> String { + let workdir = repo.workdir().unwrap(); + let relpath = pkg_root.strip_prefix(workdir).unwrap_or(Path::new("")); + // to unix separators + relpath.to_str().unwrap().replace('\\', "/") + } + // Helper to collect dirty statuses for a single repo. fn collect_statuses( repo: &git2::Repository, + pathspecs: &[&str], dirty_files: &mut Vec, ) -> CargoResult<()> { let mut status_opts = git2::StatusOptions::new(); // Exclude submodules, as they are being handled manually by recursing // into each one so that details about specific files can be // retrieved. - status_opts + pathspecs + .iter() + .fold(&mut status_opts, git2::StatusOptions::pathspec) .exclude_submodules(true) .include_ignored(true) .include_untracked(true); @@ -903,7 +915,7 @@ fn check_repo_state( // If its files are required, then the verification step should fail. if let Ok(sub_repo) = submodule.open() { status_submodules(&sub_repo, dirty_files)?; - collect_statuses(&sub_repo, dirty_files)?; + collect_statuses(&sub_repo, &[], dirty_files)?; } } Ok(())