From 3dabdcdd20d6382e7efa2c01833efa20944f3a79 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 31 Dec 2024 21:55:45 -0500 Subject: [PATCH] perf(cargo-package): match certain path prefix with pathspec `check_repo_state` checks the entire git repo status. This is usually fine if you have only a few packages in a workspace. For huge monorepos, it may hit performance issues. For example, on awslabs/aws-sdk-rust@2cbd34d the workspace has roughly 434 members to publish. `git ls-files` reported us 204379 files in this Git repository. That means git may need to check status of all files 434 times. That would be `204379 * 434 = 88,700,486` checks! Moreover, the current algorithm is finding the intersection of `PathSource::list_files` and `git status`. It is an `O(n^2)` check. Let's assume files are evenly distributed into each package, so roughly 470 files per package. If we're unlucky to have some dirty files, say 100 files. We will have to do `470 * 100 = 47,000` times of path comparisons. Even worse, because we `git status` everything in the repo, we'll have to it for all members, even when those dirty files are not part of the current package in question. So it becomes `470 * 100 * 434 = 20,398,000`! Instead of comparing with the status of the entire repository, this patch use the magic pathspec[1] to tell git only reports paths that match a certain path prefix. This wouldn't help the `O(n^2)` algorithm, but at least it won't check dirty files outside the current package. Also, we don't `git status` against entire git worktree/index anymore. [1]: https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec --- src/cargo/ops/cargo_package/vcs.rs | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/cargo/ops/cargo_package/vcs.rs b/src/cargo/ops/cargo_package/vcs.rs index 9f2f57284..04d7a9d5a 100644 --- a/src/cargo/ops/cargo_package/vcs.rs +++ b/src/cargo/ops/cargo_package/vcs.rs @@ -1,6 +1,7 @@ //! Helpers to gather the VCS information for `cargo package`. use std::collections::HashSet; +use std::path::Path; use std::path::PathBuf; use anyhow::Context as _; @@ -173,7 +174,9 @@ fn git( // - ignored (in case the user has an `include` directive that // conflicts with .gitignore). let mut dirty_files = Vec::new(); - collect_statuses(repo, &mut dirty_files)?; + let pathspec = relative_pathspec(repo, pkg.root()); + collect_statuses(repo, &[pathspec.as_str()], &mut dirty_files)?; + // Include each submodule so that the error message can provide // specifically *which* files in a submodule are modified. status_submodules(repo, &mut dirty_files)?; @@ -263,12 +266,18 @@ fn dirty_files_outside_pkg_root( } /// Helper to collect dirty statuses for a single repo. -fn collect_statuses(repo: &git2::Repository, dirty_files: &mut Vec) -> CargoResult<()> { +fn collect_statuses( + repo: &git2::Repository, + pathspecs: &[&str], + dirty_files: &mut Vec, +) -> CargoResult<()> { let mut status_opts = git2::StatusOptions::new(); // Exclude submodules, as they are being handled manually by recursing // into each one so that details about specific files can be // retrieved. - status_opts + pathspecs + .iter() + .fold(&mut status_opts, git2::StatusOptions::pathspec) .exclude_submodules(true) .include_ignored(true) .include_untracked(true); @@ -300,8 +309,16 @@ fn status_submodules(repo: &git2::Repository, dirty_files: &mut Vec) -> // If its files are required, then the verification step should fail. if let Ok(sub_repo) = submodule.open() { status_submodules(&sub_repo, dirty_files)?; - collect_statuses(&sub_repo, dirty_files)?; + collect_statuses(&sub_repo, &[], dirty_files)?; } } Ok(()) } + +/// Use pathspec so git only matches a certain path prefix +fn relative_pathspec(repo: &git2::Repository, pkg_root: &Path) -> String { + let workdir = repo.workdir().unwrap(); + let relpath = pkg_root.strip_prefix(workdir).unwrap_or(Path::new("")); + // to unix separators + relpath.to_str().unwrap().replace('\\', "/") +}