Optimize performance of cargo package.

This is achieved by allowing `gix status` to only run in the
package root, while running it another time just on the few files
that are interesting outside of the package root.

This saves a lot of time compared to the previous implementation,
which ran the status on the entire repository.
This commit is contained in:
Sebastian Thiel 2025-07-18 05:43:03 +02:00
parent d3b85cd96a
commit dfe3737634
No known key found for this signature in database
GPG Key ID: 9CB5EE7895E8268B

View File

@ -1,4 +1,5 @@
//! Helpers to gather the VCS information for `cargo package`. //! Helpers to gather the VCS information for `cargo package`.
use crate::core::{Package, Workspace}; use crate::core::{Package, Workspace};
use crate::ops::PackageOpts; use crate::ops::PackageOpts;
use crate::sources::PathEntry; use crate::sources::PathEntry;
@ -7,11 +8,11 @@ use anyhow::Context;
use cargo_util::paths; use cargo_util::paths;
use gix::bstr::ByteSlice; use gix::bstr::ByteSlice;
use gix::dir::walk::EmissionMode; use gix::dir::walk::EmissionMode;
use gix::dirwalk::Options;
use gix::index::entry::Mode; use gix::index::entry::Mode;
use gix::status::tree_index::TrackRenames; use gix::status::tree_index::TrackRenames;
use gix::worktree::stack::state::ignore::Source; use gix::worktree::stack::state::ignore::Source;
use serde::Serialize; use serde::Serialize;
use std::collections::HashSet;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use tracing::debug; use tracing::debug;
@ -47,7 +48,7 @@ pub fn check_repo_state(
opts: &PackageOpts<'_>, opts: &PackageOpts<'_>,
) -> CargoResult<Option<VcsInfo>> { ) -> CargoResult<Option<VcsInfo>> {
let gctx = ws.gctx(); let gctx = ws.gctx();
let Ok(repo) = gix::discover(p.root()) else { let Ok(mut repo) = gix::discover(p.root()) else {
gctx.shell().verbose(|shell| { gctx.shell().verbose(|shell| {
shell.warn(format_args!( shell.warn(format_args!(
"no (git) VCS found for `{}`", "no (git) VCS found for `{}`",
@ -115,7 +116,7 @@ pub fn check_repo_state(
path.display(), path.display(),
workdir.display(), workdir.display(),
); );
let Some(git) = git(ws, p, src_files, &repo, &opts)? else { let Some(git) = git(ws, p, src_files, &mut repo, &opts)? else {
// If the git repo lacks essential field like `sha1`, and since this field exists from the beginning, // If the git repo lacks essential field like `sha1`, and since this field exists from the beginning,
// then don't generate the corresponding file in order to maintain consistency with past behavior. // then don't generate the corresponding file in order to maintain consistency with past behavior.
return Ok(None); return Ok(None);
@ -181,31 +182,32 @@ fn git(
ws: &Workspace<'_>, ws: &Workspace<'_>,
pkg: &Package, pkg: &Package,
src_files: &[PathEntry], src_files: &[PathEntry],
repo: &gix::Repository, repo: &mut gix::Repository,
opts: &PackageOpts<'_>, opts: &PackageOpts<'_>,
) -> CargoResult<Option<GitVcsInfo>> { ) -> CargoResult<Option<GitVcsInfo>> {
{
let mut config = repo.config_snapshot_mut();
// This currently is only a very minor speedup for the biggest repositories,
// but might trigger creating many threads.
config.set_value(&gix::config::tree::Index::THREADS, "false")?;
}
// This is a collection of any dirty or untracked files. This covers: // This is a collection of any dirty or untracked files. This covers:
// - new/modified/deleted/renamed/type change (index or worktree) // - new/modified/deleted/renamed/type change (index or worktree)
// - untracked files (which are "new" worktree files) // - untracked files (which are "new" worktree files)
// - ignored (in case the user has an `include` directive that // - ignored (in case the user has an `include` directive that
// conflicts with .gitignore). // conflicts with .gitignore).
let (mut dirty_files, mut dirty_files_outside_package_root) = (Vec::new(), Vec::new()); let mut dirty_files = Vec::new();
let workdir = repo.workdir().unwrap(); let workdir = repo.workdir().unwrap();
collect_statuses( collect_statuses(
repo, repo,
workdir, workdir,
relative_package_root(repo, pkg.root()).as_deref(), relative_package_root(repo, pkg.root()).as_deref(),
&mut dirty_files, &mut dirty_files,
&mut dirty_files_outside_package_root,
)?; )?;
// Include each submodule so that the error message can provide // Include each submodule so that the error message can provide
// specifically *which* files in a submodule are modified. // specifically *which* files in a submodule are modified.
status_submodules( status_submodules(repo, &mut dirty_files)?;
repo,
&mut dirty_files,
&mut dirty_files_outside_package_root,
)?;
// Find the intersection of dirty in git, and the src_files that would // Find the intersection of dirty in git, and the src_files that would
// be packaged. This is a lazy n^2 check, but seems fine with // be packaged. This is a lazy n^2 check, but seems fine with
@ -230,10 +232,7 @@ fn git(
} }
}) })
.map(|p| p.as_ref()) .map(|p| p.as_ref())
.chain( .chain(dirty_files_outside_pkg_root(ws, pkg, repo, src_files)?.iter())
dirty_files_outside_pkg_root(ws, pkg, &dirty_files_outside_package_root, src_files)?
.iter(),
)
.map(|path| { .map(|path| {
pathdiff::diff_paths(path, cwd) pathdiff::diff_paths(path, cwd)
.as_ref() .as_ref()
@ -271,25 +270,17 @@ fn collect_statuses(
workdir: &Path, workdir: &Path,
relative_package_root: Option<&Path>, relative_package_root: Option<&Path>,
dirty_files: &mut Vec<PathBuf>, dirty_files: &mut Vec<PathBuf>,
dirty_files_outside_package_root: &mut Vec<PathBuf>,
) -> CargoResult<()> { ) -> CargoResult<()> {
let statuses = repo let statuses = repo
.status(gix::progress::Discard)? .status(gix::progress::Discard)?
.dirwalk_options(|opts| { .dirwalk_options(configure_dirwalk)
opts.emit_untracked(gix::dir::walk::EmissionMode::Matching)
// Also pick up ignored files or whole directories
// to specifically catch overzealously ignored source files.
// Later we will match these dirs by prefix, which is why collapsing
// them is desirable here.
.emit_ignored(Some(EmissionMode::CollapseDirectory))
.emit_tracked(false)
.recurse_repositories(false)
.symlinks_to_directories_are_ignored_like_directories(true)
.emit_empty_directories(false)
})
.tree_index_track_renames(TrackRenames::Disabled) .tree_index_track_renames(TrackRenames::Disabled)
.index_worktree_submodules(None) .index_worktree_submodules(None)
.into_iter(None /* pathspec patterns */) .into_iter(
relative_package_root.map(|rela_pkg_root| {
gix::path::into_bstr(rela_pkg_root).into_owned()
}), /* pathspec patterns */
)
.with_context(|| { .with_context(|| {
format!( format!(
"failed to begin git status for repo {}", "failed to begin git status for repo {}",
@ -307,11 +298,6 @@ fn collect_statuses(
let rel_path = gix::path::from_bstr(status.location()); let rel_path = gix::path::from_bstr(status.location());
let path = workdir.join(&rel_path); let path = workdir.join(&rel_path);
if relative_package_root.is_some_and(|pkg_root| !rel_path.starts_with(pkg_root)) {
dirty_files_outside_package_root.push(path);
continue;
}
// It is OK to include Cargo.lock even if it is ignored. // It is OK to include Cargo.lock even if it is ignored.
if path.ends_with("Cargo.lock") if path.ends_with("Cargo.lock")
&& matches!( && matches!(
@ -330,11 +316,7 @@ fn collect_statuses(
} }
/// Helper to collect dirty statuses while recursing into submodules. /// Helper to collect dirty statuses while recursing into submodules.
fn status_submodules( fn status_submodules(repo: &gix::Repository, dirty_files: &mut Vec<PathBuf>) -> CargoResult<()> {
repo: &gix::Repository,
dirty_files: &mut Vec<PathBuf>,
dirty_files_outside_package_root: &mut Vec<PathBuf>,
) -> CargoResult<()> {
let Some(submodules) = repo.submodules()? else { let Some(submodules) = repo.submodules()? else {
return Ok(()); return Ok(());
}; };
@ -345,14 +327,8 @@ fn status_submodules(
let Some(workdir) = sub_repo.workdir() else { let Some(workdir) = sub_repo.workdir() else {
continue; continue;
}; };
status_submodules(&sub_repo, dirty_files, dirty_files_outside_package_root)?; status_submodules(&sub_repo, dirty_files)?;
collect_statuses( collect_statuses(&sub_repo, workdir, None, dirty_files)?;
&sub_repo,
workdir,
None,
dirty_files,
dirty_files_outside_package_root,
)?;
} }
} }
Ok(()) Ok(())
@ -374,7 +350,7 @@ fn relative_package_root(repo: &gix::Repository, pkg_root: &Path) -> Option<Path
/// This currently looks at /// This currently looks at
/// ///
/// * `package.readme` and `package.license-file` pointing to paths outside package root /// * `package.readme` and `package.license-file` pointing to paths outside package root
/// * symlinks targets reside outside package root /// * symlinks targets residing outside package root
/// * Any change in the root workspace manifest, regardless of what has changed. /// * Any change in the root workspace manifest, regardless of what has changed.
/// ///
/// This is required because those paths may link to a file outside the /// This is required because those paths may link to a file outside the
@ -383,10 +359,12 @@ fn relative_package_root(repo: &gix::Repository, pkg_root: &Path) -> Option<Path
fn dirty_files_outside_pkg_root( fn dirty_files_outside_pkg_root(
ws: &Workspace<'_>, ws: &Workspace<'_>,
pkg: &Package, pkg: &Package,
dirty_files_outside_of_package_root: &[PathBuf], repo: &gix::Repository,
src_files: &[PathEntry], src_files: &[PathEntry],
) -> CargoResult<HashSet<PathBuf>> { ) -> CargoResult<Vec<PathBuf>> {
let pkg_root = pkg.root(); let pkg_root = pkg.root();
let workdir = repo.workdir().unwrap();
let meta = pkg.manifest().metadata(); let meta = pkg.manifest().metadata();
let metadata_paths: Vec<_> = [&meta.license_file, &meta.readme] let metadata_paths: Vec<_> = [&meta.license_file, &meta.readme]
.into_iter() .into_iter()
@ -394,7 +372,7 @@ fn dirty_files_outside_pkg_root(
.map(|path| paths::normalize_path(&pkg_root.join(path))) .map(|path| paths::normalize_path(&pkg_root.join(path)))
.collect(); .collect();
let dirty_files = src_files let linked_files_outside_package_root: Vec<_> = src_files
.iter() .iter()
.filter(|p| p.is_symlink_or_under_symlink()) .filter(|p| p.is_symlink_or_under_symlink())
.map(|p| p.as_ref().as_path()) .map(|p| p.as_ref().as_path())
@ -403,19 +381,58 @@ fn dirty_files_outside_pkg_root(
// If inside package root. Don't bother checking git status. // If inside package root. Don't bother checking git status.
.filter(|p| paths::strip_prefix_canonical(p, pkg_root).is_err()) .filter(|p| paths::strip_prefix_canonical(p, pkg_root).is_err())
// Handle files outside package root but under git workdir, // Handle files outside package root but under git workdir,
.filter_map(|src_file| { .filter_map(|p| paths::strip_prefix_canonical(p, workdir).ok())
let canon_src_path = gix::path::realpath_opts(
src_file,
ws.gctx().cwd(),
gix::path::realpath::MAX_SYMLINKS,
)
.unwrap_or_else(|_| src_file.to_owned());
dirty_files_outside_of_package_root
.iter()
.any(|p| canon_src_path.starts_with(p))
.then_some(canon_src_path)
})
.collect(); .collect();
if linked_files_outside_package_root.is_empty() {
return Ok(Vec::new());
}
let statuses = repo
.status(gix::progress::Discard)?
.dirwalk_options(configure_dirwalk)
// Limit the amount of threads for used for the worktree status, as the pathspec will
// prevent most paths from being visited anyway there is not much work.
.index_worktree_options_mut(|opts| opts.thread_limit = Some(1))
.tree_index_track_renames(TrackRenames::Disabled)
.index_worktree_submodules(None)
.into_iter(
linked_files_outside_package_root
.into_iter()
.map(|p| gix::path::into_bstr(p).into_owned()),
)
.with_context(|| {
format!(
"failed to begin git status for outfor repo {}",
repo.path().display()
)
})?;
let mut dirty_files = Vec::new();
for status in statuses {
let status = status.with_context(|| {
format!(
"failed to retrieve git status from repo {}",
repo.path().display()
)
})?;
let rel_path = gix::path::from_bstr(status.location());
let path = workdir.join(&rel_path);
dirty_files.push(path);
}
Ok(dirty_files) Ok(dirty_files)
} }
fn configure_dirwalk(opts: Options) -> Options {
opts.emit_untracked(gix::dir::walk::EmissionMode::Matching)
// Also pick up ignored files or whole directories
// to specifically catch overzealously ignored source files.
// Later we will match these dirs by prefix, which is why collapsing
// them is desirable here.
.emit_ignored(Some(EmissionMode::CollapseDirectory))
.emit_tracked(false)
.recurse_repositories(false)
.symlinks_to_directories_are_ignored_like_directories(true)
.emit_empty_directories(false)
}