From da3ca05677fff6d82e1b1eddfd5c0c95f9ff0ed7 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Wed, 6 Sep 2023 21:34:05 -0700 Subject: [PATCH] Add a global cache garbage collector. This adds a garbage collector which will remove old files from cargo's global cache. A general overview of the changes here: - `cargo::core::global_cache_tracker` contains the `GlobalCacheTracker` which handles the interface to a sqlite database which stores timestamps of the last time a file was used. - `DeferredGlobalLastUse` is a type that implements an optimization for collecting last-use timestamps so that they can be flushed to disk all at once. - `cargo::core::gc` contains the `Gc` type which is the interface for performing garbage collection. It coordinates with the `GlobalCacheTracker` for determining what to delete. - Garbage collection can either be automatic or manual. The automatic garbage collection supports some config options for defining when it runs and how much it deletes. - Manual garbage collection can be performed via options to `cargo clean`. - `cargo clean` uses the new package cache locking system to coordinate access to the package cache to prevent interference with other cargo commands running concurrently. --- Cargo.lock | 20 +- Cargo.toml | 2 + crates/cargo-test-support/Cargo.toml | 1 + crates/cargo-test-support/src/paths.rs | 13 + src/bin/cargo/commands/clean.rs | 222 ++- src/cargo/core/gc.rs | 550 ++++++ src/cargo/core/global_cache_tracker.rs | 1754 +++++++++++++++++++ src/cargo/core/mod.rs | 2 + src/cargo/core/package.rs | 4 + src/cargo/ops/cargo_clean.rs | 97 +- src/cargo/ops/cargo_compile/mod.rs | 1 + src/cargo/ops/cargo_fetch.rs | 1 + src/cargo/ops/mod.rs | 2 +- src/cargo/ops/resolve.rs | 3 + src/cargo/sources/git/source.rs | 34 +- src/cargo/sources/registry/download.rs | 18 + src/cargo/sources/registry/http_remote.rs | 19 +- src/cargo/sources/registry/mod.rs | 20 + src/cargo/sources/registry/remote.rs | 18 +- src/cargo/util/config/mod.rs | 24 + tests/testsuite/clean.rs | 14 +- tests/testsuite/global_cache_tracker.rs | 1890 +++++++++++++++++++++ tests/testsuite/main.rs | 1 + 23 files changed, 4652 insertions(+), 58 deletions(-) create mode 100644 src/cargo/core/gc.rs create mode 100644 src/cargo/core/global_cache_tracker.rs create mode 100644 tests/testsuite/global_cache_tracker.rs diff --git a/Cargo.lock b/Cargo.lock index 9cf756194..0cd1bbbe6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,6 +287,7 @@ dependencies = [ "pathdiff", "pulldown-cmark", "rand", + "regex", "rusqlite", "rustfix", "same-file", @@ -407,6 +408,7 @@ dependencies = [ "time", "toml", "url", + "walkdir", "windows-sys", ] @@ -2669,7 +2671,7 @@ dependencies = [ "rand", "rand_chacha", "rand_xorshift", - "regex-syntax 0.7.2", + "regex-syntax 0.7.5", "rusty-fork", "tempfile", "unarray", @@ -2797,13 +2799,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.8.4" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.2", + "regex-automata 0.3.8", + "regex-syntax 0.7.5", ] [[package]] @@ -2820,6 +2823,11 @@ name = "regex-automata" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.7.5", 
+]

[[package]]
name = "regex-syntax"
@@ -2829,9 +2837,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"

[[package]]
name = "regex-syntax"
-version = "0.7.2"
+version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
+checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"

[[package]]
name = "resolver-tests"
diff --git a/Cargo.toml b/Cargo.toml
index 4c3a4bde6..60ffec21b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -73,6 +73,7 @@ pretty_assertions = "1.4.0"
proptest = "1.3.1"
pulldown-cmark = { version = "0.9.3", default-features = false }
rand = "0.8.5"
+regex = "1.9.3"
rusqlite = { version = "0.29.0", features = ["bundled"] }
rustfix = "0.6.1"
same-file = "1.0.6"
@@ -163,6 +164,7 @@ pasetors.workspace = true
pathdiff.workspace = true
pulldown-cmark.workspace = true
rand.workspace = true
+regex.workspace = true
rusqlite.workspace = true
rustfix.workspace = true
semver.workspace = true
diff --git a/crates/cargo-test-support/Cargo.toml b/crates/cargo-test-support/Cargo.toml
index fc32e1c9c..42f8c2af9 100644
--- a/crates/cargo-test-support/Cargo.toml
+++ b/crates/cargo-test-support/Cargo.toml
@@ -29,6 +29,7 @@ tar.workspace = true
time.workspace = true
toml.workspace = true
url.workspace = true
+walkdir.workspace = true

[target.'cfg(windows)'.dependencies]
windows-sys = { workspace = true, features = ["Win32_Storage_FileSystem"] }
diff --git a/crates/cargo-test-support/src/paths.rs b/crates/cargo-test-support/src/paths.rs
index 50040e1d4..8e2909963 100644
--- a/crates/cargo-test-support/src/paths.rs
+++ b/crates/cargo-test-support/src/paths.rs
@@ -114,6 +114,10 @@ pub trait CargoPathExt {
    fn rm_rf(&self);
    fn mkdir_p(&self);

+    /// Returns a list of all files and directories underneath the given
+    /// directory, recursively, including the starting path.
+    fn ls_r(&self) -> Vec<PathBuf>;
+
    fn move_into_the_past(&self) {
        self.move_in_time(|sec, nsec| (sec - 3600, nsec))
    }
@@ -155,6 +159,15 @@ impl CargoPathExt for Path {
            .unwrap_or_else(|e| panic!("failed to mkdir_p {}: {}", self.display(), e))
    }

+    fn ls_r(&self) -> Vec<PathBuf> {
+        let mut file_list: Vec<_> = walkdir::WalkDir::new(self)
+            .into_iter()
+            .filter_map(|e| e.map(|e| e.path().to_owned()).ok())
+            .collect();
+        file_list.sort();
+        file_list
+    }
+
    fn move_in_time<F>(&self, travel_amount: F)
    where
        F: Fn(i64, u32) -> (i64, u32),
diff --git a/src/bin/cargo/commands/clean.rs b/src/bin/cargo/commands/clean.rs
index 8596561c9..c51de5650 100644
--- a/src/bin/cargo/commands/clean.rs
+++ b/src/bin/cargo/commands/clean.rs
@@ -1,7 +1,11 @@
use crate::command_prelude::*;
-
+use cargo::core::gc::{parse_human_size, parse_time_span};
+use cargo::core::gc::{AutoGcKind, GcOpts};
use cargo::ops::{self, CleanOptions};
use cargo::util::print_available_packages;
+use cargo::CargoResult;
+use clap::builder::{PossibleValuesParser, TypedValueParser};
+use std::time::Duration;

pub fn cli() -> Command {
    subcommand("clean")
@@ -15,18 +19,227 @@ pub fn cli() -> Command {
        .arg_target_dir()
        .arg_manifest_path()
        .arg_dry_run("Display what would be deleted without deleting anything")
+
+        // NOTE: Not all of these options may get stabilized. Some of them are
+        // very low-level details, and may not be something typical users need.
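+        //
+        // The options below fall into three groups: `--gc` selects which
+        // kinds of cache data to clean, the `--max-*-age` options delete
+        // entries that have not been used within a given duration (values
+        // like "3 days" or "1 month", parsed by `parse_time_span`), and the
+        // `--max-*-size` options delete entries until the cache is under a
+        // given size (values like "500MB", parsed by `parse_human_size`).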
+        .arg(
+            optional_opt(
+                "gc",
+                "Delete old and unused files (unstable) (comma separated): all, download, target, shared-target",
+            )
+            .hide(true)
+            .value_name("KINDS")
+            .value_parser(
+                PossibleValuesParser::new(["all", "download", "target", "shared-target"]).map(|x|
+                    match x.as_str() {
+                        "all" => AutoGcKind::All,
+                        "download" => AutoGcKind::Download,
+                        "target" => panic!("target is not yet implemented"),
+                        "shared-target" => panic!("shared-target is not yet implemented"),
+                        x => panic!("possible value out of sync with `{x}`"),
+                    }
+                ))
+            .require_equals(true),
+        )
+        .arg(
+            opt(
+                "max-src-age",
+                "Deletes source cache files that have not been used since the given age (unstable)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            opt(
+                "max-crate-age",
+                "Deletes crate cache files that have not been used since the given age (unstable)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            opt(
+                "max-index-age",
+                "Deletes registry indexes that have not been used since the given age (unstable)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            opt(
+                "max-git-co-age",
+                "Deletes git dependency checkouts that have not been used since the given age (unstable)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            opt(
+                "max-git-db-age",
+                "Deletes git dependency clones that have not been used since the given age (unstable)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            opt(
+                "max-download-age",
+                "Deletes any downloaded cache data that has not been used since the given age (unstable)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+
+        .arg(
+            opt(
+                "max-src-size",
+                "Deletes source cache files until the cache is under the given size (unstable)",
+            )
+            .hide(true)
+            .value_name("SIZE"),
+        )
+        .arg(
+            opt(
+                "max-crate-size",
+                "Deletes crate cache files until the cache is under the given size (unstable)",
+            )
+            .hide(true)
+            .value_name("SIZE"),
+        )
+        .arg(
+            opt("max-git-size",
+                "Deletes git dependency caches until the cache is under the given size (unstable)")
+            .hide(true)
+            .value_name("SIZE"))
+        .arg(
+            opt(
+                "max-download-size",
+                "Deletes downloaded cache data until the cache is under the given size (unstable)",
+            )
+            .hide(true)
+            .value_name("SIZE"),
+        )
+
+        // These are unimplemented. Leaving here as a guide for how this is
+        // intended to evolve. These will likely change; this is just a sketch
+        // of ideas.
+        .arg(
+            opt(
+                "max-target-age",
+                "Deletes any build artifact files that have not been used since the given age (unstable) (UNIMPLEMENTED)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            // TODO: come up with something less wordy?
+            opt(
+                "max-shared-target-age",
+                "Deletes any shared build artifact files that have not been used since the given age (unstable) (UNIMPLEMENTED)",
+            )
+            .hide(true)
+            .value_name("DURATION"),
+        )
+        .arg(
+            opt(
+                "max-target-size",
+                "Deletes build artifact files until the cache is under the given size (unstable) (UNIMPLEMENTED)",
+            )
+            .hide(true)
+            .value_name("SIZE"),
+        )
+        .arg(
+            // TODO: come up with something less wordy?
+            opt(
+                "max-shared-target-size",
+                "Deletes shared build artifact files until the cache is under the given size (unstable) (UNIMPLEMENTED)",
+            )
+            .hide(true)
+            .value_name("SIZE"),
+        )
+
        .after_help(color_print::cstr!(
            "Run `cargo help clean` for more detailed information.\n"
        ))
}

pub fn exec(config: &mut Config, args: &ArgMatches) -> CliResult {
-    let ws = args.workspace(config)?;
+    let ws = args.workspace(config);

    if args.is_present_with_zero_values("package") {
-        print_available_packages(&ws)?;
+        print_available_packages(&ws?)?;
+        return Ok(());
    }

+    let unstable_gc = |opt| {
+        // TODO: issue number
+        config
+            .cli_unstable()
+            .fail_if_stable_opt_custom_z(opt, 0, "gc", config.cli_unstable().gc)
+    };
+    let unstable_cache_opt = |opt| -> CargoResult<Option<&str>> {
+        let arg = args.get_one::<String>(opt).map(String::as_str);
+        if arg.is_some() {
+            unstable_gc(opt)?;
+        }
+        Ok(arg)
+    };
+    let unstable_size_opt = |opt| -> CargoResult<Option<u64>> {
+        unstable_cache_opt(opt)?
+            .map(|s| parse_human_size(s))
+            .transpose()
+    };
+    let unstable_duration_opt = |opt| -> CargoResult<Option<Duration>> {
+        unstable_cache_opt(opt)?
+            .map(|s| parse_time_span(s))
+            .transpose()
+    };
+    let unimplemented_opt = |opt| -> CargoResult<Option<&str>> {
+        let arg = args.get_one::<String>(opt).map(String::as_str);
+        if arg.is_some() {
+            anyhow::bail!("option --{opt} is not yet implemented");
+        }
+        Ok(None)
+    };
+    let unimplemented_size_opt = |opt| -> CargoResult<Option<u64>> {
+        unimplemented_opt(opt)?;
+        Ok(None)
+    };
+    let unimplemented_duration_opt = |opt| -> CargoResult<Option<Duration>> {
+        unimplemented_opt(opt)?;
+        Ok(None)
+    };
+
+    let mut gc: Vec<_> = args
+        .get_many::<AutoGcKind>("gc")
+        .unwrap_or_default()
+        .cloned()
+        .collect();
+    if gc.is_empty() && args.contains_id("gc") {
+        gc.push(AutoGcKind::All);
+    }
+    if !gc.is_empty() {
+        unstable_gc("gc")?;
+    }
+
+    let mut gc_opts = GcOpts {
+        max_src_age: unstable_duration_opt("max-src-age")?,
+        max_crate_age: unstable_duration_opt("max-crate-age")?,
+        max_index_age: unstable_duration_opt("max-index-age")?,
+        max_git_co_age: unstable_duration_opt("max-git-co-age")?,
+        max_git_db_age: unstable_duration_opt("max-git-db-age")?,
+        max_src_size: unstable_size_opt("max-src-size")?,
+        max_crate_size: unstable_size_opt("max-crate-size")?,
+        max_git_size: unstable_size_opt("max-git-size")?,
+        max_download_size: unstable_size_opt("max-download-size")?,
+        max_target_age: unimplemented_duration_opt("max-target-age")?,
+        max_shared_target_age: unimplemented_duration_opt("max-shared-target-age")?,
+        max_target_size: unimplemented_size_opt("max-target-size")?,
+        max_shared_target_size: unimplemented_size_opt("max-shared-target-size")?,
+    };
+    let max_download_age = unstable_duration_opt("max-download-age")?;
+    gc_opts.update_for_auto_gc(config, &gc, max_download_age)?;
+
    let opts = CleanOptions {
        config,
        spec: values(args, "package"),
@@ -35,7 +248,8 @@ pub fn exec(config: &mut Config, args: &ArgMatches) -> CliResult {
        profile_specified: args.contains_id("profile") || args.flag("release"),
        doc: args.flag("doc"),
        dry_run: args.dry_run(),
+        gc_opts,
    };
-    ops::clean(&ws, &opts)?;
+    ops::clean(ws, &opts)?;
    Ok(())
}
diff --git a/src/cargo/core/gc.rs b/src/cargo/core/gc.rs
new file mode 100644
index 000000000..f70aee584
--- /dev/null
+++ b/src/cargo/core/gc.rs
@@ -0,0 +1,550 @@
+//! Support for garbage collecting unused downloaded files and unused
+//! artifacts from the target directory.
+//!
+//! Garbage collection can be done "automatically" by cargo, which it does by
+//! default once a day when running any command that does a lot of work (like
+//! `cargo build`).
+//!
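+//! As a sketch of how this is configured (based on the `AutoConfig` struct
+//! below; the values shown mirror the built-in defaults applied in
+//! `update_for_auto_gc_config`), the `gc.auto` config table looks like:
+//!
+//! ```toml
+//! [gc.auto]
+//! frequency = "1 day"
+//! max-src-age = "1 month"
+//! max-crate-age = "3 months"
+//! max-index-age = "3 months"
+//! max-git-co-age = "1 month"
+//! max-git-db-age = "3 months"
+//! ```
+//!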
+//! Garbage collection can also be done manually via the `cargo clean` command
+//! by passing any option that requests deleting unused files.
+//!
+//! Garbage collection is guided by the last-use tracking implemented in the
+//! [`crate::core::global_cache_tracker`] module.

+use crate::core::global_cache_tracker::{self, GlobalCacheTracker};
+use crate::core::Verbosity;
+use crate::ops::CleanContext;
+use crate::util::cache_lock::{CacheLock, CacheLockMode};
+use crate::{CargoResult, Config};
+use anyhow::format_err;
+use anyhow::{bail, Context};
+use serde::Deserialize;
+use std::time::Duration;
+
+/// Garbage collector.
+pub struct Gc<'a, 'config> {
+    config: &'config Config,
+    global_cache_tracker: &'a mut GlobalCacheTracker,
+    /// A lock on the package cache.
+    ///
+    /// This must be held while the garbage collector exists, since we don't
+    /// want multiple cargos to be allowed to write to the cache at the same
+    /// time, or for others to read while we are modifying the cache.
+    #[allow(dead_code)] // Held for drop.
+    lock: CacheLock<'config>,
+}
+
+/// Automatic garbage collection settings from the `gc.auto` config table.
+///
+/// NOTE: Not all of these options may get stabilized. Some of them are very
+/// low-level details, and may not be something typical users need.
+///
+/// If any of these options are `None`, the built-in default is used.
+#[derive(Deserialize, Default)]
+#[serde(rename_all = "kebab-case")]
+struct AutoConfig {
+    /// The maximum frequency that automatic garbage collection happens.
+    frequency: Option<String>,
+    /// Anything older than this duration will be deleted in the source cache.
+    max_src_age: Option<String>,
+    /// Anything older than this duration will be deleted in the compressed crate cache.
+    max_crate_age: Option<String>,
+    /// Any index older than this duration will be deleted from the index cache.
+    max_index_age: Option<String>,
+    /// Any git checkout older than this duration will be deleted from the checkout cache.
+    max_git_co_age: Option<String>,
+    /// Any git clone older than this duration will be deleted from the git cache.
+    max_git_db_age: Option<String>,
+}
+
+/// Options to use for garbage collection.
+#[derive(Clone, Debug, Default)]
+pub struct GcOpts {
+    /// The `--max-src-age` CLI option.
+    pub max_src_age: Option<Duration>,
+    /// The `--max-crate-age` CLI option.
+    pub max_crate_age: Option<Duration>,
+    /// The `--max-index-age` CLI option.
+    pub max_index_age: Option<Duration>,
+    /// The `--max-git-co-age` CLI option.
+    pub max_git_co_age: Option<Duration>,
+    /// The `--max-git-db-age` CLI option.
+    pub max_git_db_age: Option<Duration>,
+    /// The `--max-src-size` CLI option.
+    pub max_src_size: Option<u64>,
+    /// The `--max-crate-size` CLI option.
+    pub max_crate_size: Option<u64>,
+    /// The `--max-git-size` CLI option.
+    pub max_git_size: Option<u64>,
+    /// The `--max-download-size` CLI option.
+    pub max_download_size: Option<u64>,
+
+    /// The `--max-target-age` CLI option (UNIMPLEMENTED).
+    pub max_target_age: Option<Duration>,
+    /// The `--max-shared-target-age` CLI option (UNIMPLEMENTED).
+    pub max_shared_target_age: Option<Duration>,
+    /// The `--max-target-size` CLI option (UNIMPLEMENTED).
+    pub max_target_size: Option<u64>,
+    /// The `--max-shared-target-size` CLI option (UNIMPLEMENTED).
+    pub max_shared_target_size: Option<u64>,
+}
+
+impl GcOpts {
+    /// Returns whether any download cache cleaning options are set.
+    pub fn is_download_cache_opt_set(&self) -> bool {
+        self.max_src_age.is_some()
+            || self.max_crate_age.is_some()
+            || self.max_index_age.is_some()
+            || self.max_git_co_age.is_some()
+            || self.max_git_db_age.is_some()
+            || self.max_src_size.is_some()
+            || self.max_crate_size.is_some()
+            || self.max_git_size.is_some()
+            || self.max_download_size.is_some()
+    }
+
+    /// Returns whether any download cache cleaning options based on size are set.
+    pub fn is_download_cache_size_set(&self) -> bool {
+        self.max_src_size.is_some()
+            || self.max_crate_size.is_some()
+            || self.max_git_size.is_some()
+            || self.max_download_size.is_some()
+    }
+
+    /// Returns whether any target directory cleaning options are set.
+    pub fn is_target_opt_set(&self) -> bool {
+        self.max_target_size.is_some()
+            || self.max_target_age.is_some()
+            || self.max_shared_target_age.is_some()
+            || self.max_shared_target_size.is_some()
+    }
+
+    /// Updates the configuration of this [`GcOpts`] to incorporate the
+    /// settings from config and the given CLI options.
+    ///
+    /// * `kinds` is a list of [`AutoGcKind`] specifying which kinds of
+    ///   cleaning are being requested. This corresponds to the `cargo clean
+    ///   --gc` flag. If empty, no config options are incorporated.
+    /// * `max_download_age` is the `--max-download-age` CLI option which
+    ///   requires special handling since it implicitly overlaps several of
+    ///   the other age options. For each of those, the newer (shorter) of
+    ///   this value or the explicit value is used.
+    ///
+    /// The `kinds` list is used in a few different ways:
+    ///
+    /// * If empty, uses only the options the user specified on the
+    ///   command-line, like `cargo clean --max-crate-size=…`.
+    /// * If the user specified a `cargo clean --gc` option, then the `kinds`
+    ///   list is filled in with whatever `--gc` option the user picked, and
+    ///   then this function *merges* the settings between the requested
+    ///   `--gc` option and any options that were explicitly specified.
+    /// * [`AutoGcKind::All`] is used in `cargo clean` when no options are
+    ///   specified.
+    pub fn update_for_auto_gc(
+        &mut self,
+        config: &Config,
+        kinds: &[AutoGcKind],
+        max_download_age: Option<Duration>,
+    ) -> CargoResult<()> {
+        let auto_config = config
+            .get::<Option<AutoConfig>>("gc.auto")?
+            .unwrap_or_default();
+        self.update_for_auto_gc_config(&auto_config, kinds, max_download_age)
+    }
+
+    fn update_for_auto_gc_config(
+        &mut self,
+        auto_config: &AutoConfig,
+        kinds: &[AutoGcKind],
+        max_download_age: Option<Duration>,
+    ) -> CargoResult<()> {
+        for kind in kinds {
+            if matches!(kind, AutoGcKind::All | AutoGcKind::Download) {
+                self.max_src_age = newer_time_span_for_config(
+                    self.max_src_age,
+                    "gc.auto.max-src-age",
+                    auto_config.max_src_age.as_deref().unwrap_or("1 month"),
+                )?;
+                self.max_crate_age = newer_time_span_for_config(
+                    self.max_crate_age,
+                    "gc.auto.max-crate-age",
+                    auto_config.max_crate_age.as_deref().unwrap_or("3 months"),
+                )?;
+                self.max_index_age = newer_time_span_for_config(
+                    self.max_index_age,
+                    "gc.auto.max-index-age",
+                    auto_config.max_index_age.as_deref().unwrap_or("3 months"),
+                )?;
+                self.max_git_co_age = newer_time_span_for_config(
+                    self.max_git_co_age,
+                    "gc.auto.max-git-co-age",
+                    auto_config.max_git_co_age.as_deref().unwrap_or("1 month"),
+                )?;
+                self.max_git_db_age = newer_time_span_for_config(
+                    self.max_git_db_age,
+                    "gc.auto.max-git-db-age",
+                    auto_config.max_git_db_age.as_deref().unwrap_or("3 months"),
+                )?;
+            }
+            if matches!(kind, AutoGcKind::Target | AutoGcKind::SharedTarget) {
+                bail!("target is unimplemented");
+            }
+        }
+        if let Some(max_download_age) = max_download_age {
+            self.max_src_age = Some(maybe_newer_span(max_download_age, self.max_src_age));
+            self.max_crate_age = Some(maybe_newer_span(max_download_age, self.max_crate_age));
+            self.max_index_age = Some(maybe_newer_span(max_download_age, self.max_index_age));
+            self.max_git_co_age = Some(maybe_newer_span(max_download_age, self.max_git_co_age));
+            self.max_git_db_age = Some(maybe_newer_span(max_download_age, self.max_git_db_age));
+        }
+        Ok(())
+    }
+}
+
+/// The kind of automatic garbage collection to perform.
+///
+/// "Automatic" is the kind of gc performed automatically by Cargo in any
+/// command that is already doing a bunch of work. See [`auto_gc`] for more.
+#[derive(Clone, Debug)]
+pub enum AutoGcKind {
+    /// Automatically clean up the downloaded files *and* the target directory.
+    ///
+    /// This is the mode used by default.
+    All,
+    /// Automatically clean only downloaded files.
+    ///
+    /// This corresponds to `cargo clean --gc=download`.
+    Download,
+    /// Automatically clean only the target directory.
+    ///
+    /// THIS IS NOT IMPLEMENTED.
+    ///
+    /// This corresponds to `cargo clean --gc=target`.
+    Target,
+    /// Automatically clean only the shared target directory.
+    ///
+    /// THIS IS NOT IMPLEMENTED.
+    ///
+    /// This corresponds to `cargo clean --gc=shared-target`.
+    SharedTarget,
+}
+
+impl<'a, 'config> Gc<'a, 'config> {
+    pub fn new(
+        config: &'config Config,
+        global_cache_tracker: &'a mut GlobalCacheTracker,
+    ) -> CargoResult<Gc<'a, 'config>> {
+        let lock = config.acquire_package_cache_lock(CacheLockMode::MutateExclusive)?;
+        Ok(Gc {
+            config,
+            global_cache_tracker,
+            lock,
+        })
+    }
+
+    /// Performs automatic garbage collection.
+    ///
+    /// This returns immediately without doing work if garbage collection has
+    /// been performed recently (since `gc.auto.frequency`).
+    fn auto(&mut self, clean_ctx: &mut CleanContext<'config>) -> CargoResult<()> {
+        if !self.config.cli_unstable().gc {
+            return Ok(());
+        }
+        let auto_config = self
+            .config
+            .get::<Option<AutoConfig>>("gc.auto")?
+            .unwrap_or_default();
+        let Some(freq) = parse_frequency(auto_config.frequency.as_deref().unwrap_or("1 day"))?
+        else {
+            tracing::trace!("auto gc disabled");
+            return Ok(());
+        };
+        if !self.global_cache_tracker.should_run_auto_gc(freq)? {
+            return Ok(());
+        }
+        let mut gc_opts = GcOpts::default();
+        gc_opts.update_for_auto_gc_config(&auto_config, &[AutoGcKind::All], None)?;
+        self.gc(clean_ctx, &gc_opts)?;
+        if !clean_ctx.dry_run {
+            self.global_cache_tracker.set_last_auto_gc()?;
+        }
+        Ok(())
+    }
+
+    /// Performs garbage collection based on the given options.
+    pub fn gc(
+        &mut self,
+        clean_ctx: &mut CleanContext<'config>,
+        gc_opts: &GcOpts,
+    ) -> CargoResult<()> {
+        self.global_cache_tracker.clean(clean_ctx, gc_opts)?;
+        // In the future, other gc operations go here, such as target cleaning.
+        Ok(())
+    }
+}
+
+/// Returns the shorter duration of `cur_span` versus `config_span`.
+///
+/// This is used because the user may specify multiple options which overlap,
+/// and this will pick whichever one is shorter.
+///
+/// * `cur_span` is the span we are comparing against (the value from the CLI
+///   option). If None, just returns the config duration.
+/// * `config_name` is the name of the config option the span is loaded from.
+/// * `config_span` is the span value loaded from config.
+fn newer_time_span_for_config(
+    cur_span: Option<Duration>,
+    config_name: &str,
+    config_span: &str,
+) -> CargoResult<Option<Duration>> {
+    let config_span = parse_time_span_for_config(config_name, config_span)?;
+    Ok(Some(maybe_newer_span(config_span, cur_span)))
+}
+
+/// Returns whichever [`Duration`] is shorter.
+fn maybe_newer_span(a: Duration, b: Option<Duration>) -> Duration {
+    match b {
+        Some(b) => {
+            if b < a {
+                b
+            } else {
+                a
+            }
+        }
+        None => a,
+    }
+}
+
+/// Parses a frequency string.
+///
+/// Returns `Ok(None)` if the frequency is "never".
+fn parse_frequency(frequency: &str) -> CargoResult<Option<Duration>> {
+    if frequency == "always" {
+        return Ok(Some(Duration::new(0, 0)));
+    } else if frequency == "never" {
+        return Ok(None);
+    }
+    let duration = maybe_parse_time_span(frequency).ok_or_else(|| {
+        format_err!(
+            "config option `gc.auto.frequency` expected a value of \"always\", \"never\", \
+             or \"N seconds/minutes/days/weeks/months\", got: {frequency:?}"
+        )
+    })?;
+    Ok(Some(duration))
+}
+
+/// Parses a time span value fetched from config.
+///
+/// This is here to provide better error messages specific to reading from
+/// config.
+fn parse_time_span_for_config(config_name: &str, span: &str) -> CargoResult<Duration> {
+    maybe_parse_time_span(span).ok_or_else(|| {
+        format_err!(
+            "config option `{config_name}` expected a value of the form \
+             \"N seconds/minutes/days/weeks/months\", got: {span:?}"
+        )
+    })
+}
+
+/// Parses a time span string.
+///
+/// Returns None if the value is not valid. See [`parse_time_span`] if you
+/// need a variant that generates an error message.
+fn maybe_parse_time_span(span: &str) -> Option<Duration> {
+    let Some(right_i) = span.find(|c: char| !c.is_ascii_digit()) else {
+        return None;
+    };
+    let left = &span[..right_i];
+    let mut right = &span[right_i..];
+    if right.starts_with(' ') {
+        right = &right[1..];
+    }
+    let count: u64 = left.parse().ok()?;
+    let factor = match right {
+        "second" | "seconds" => 1,
+        "minute" | "minutes" => 60,
+        "hour" | "hours" => 60 * 60,
+        "day" | "days" => 24 * 60 * 60,
+        "week" | "weeks" => 7 * 24 * 60 * 60,
+        "month" | "months" => 30 * 24 * 60 * 60,
+        _ => return None,
+    };
+    Some(Duration::from_secs(factor * count))
+}
+
+/// Parses a time span string.
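+///
+/// A sketch of the accepted format, mirroring the unit table in
+/// [`maybe_parse_time_span`]:
+///
+/// ```ignore
+/// use std::time::Duration;
+/// assert_eq!(parse_time_span("90 seconds").unwrap(), Duration::from_secs(90));
+/// assert_eq!(parse_time_span("2 weeks").unwrap(), Duration::from_secs(2 * 7 * 24 * 60 * 60));
+/// assert!(parse_time_span("2 fortnights").is_err()); // unknown unit
+/// ```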
+pub fn parse_time_span(span: &str) -> CargoResult<Duration> {
+    maybe_parse_time_span(span).ok_or_else(|| {
+        format_err!(
+            "expected a value of the form \
+             \"N seconds/minutes/days/weeks/months\", got: {span:?}"
+        )
+    })
+}
+
+/// Parses a file size using metric or IEC units.
+pub fn parse_human_size(input: &str) -> CargoResult<u64> {
+    let re = regex::Regex::new(r"(?i)^([0-9]+(\.[0-9])?) ?(b|kb|mb|gb|kib|mib|gib)?$").unwrap();
+    let cap = re.captures(input).ok_or_else(|| {
+        format_err!(
+            "invalid size `{input}`, \
+             expected a number with an optional B, kB, MB, GB, kiB, MiB, or GiB suffix"
+        )
+    })?;
+    let factor = match cap.get(3) {
+        Some(suffix) => match suffix.as_str().to_lowercase().as_str() {
+            "b" => 1.0,
+            "kb" => 1_000.0,
+            "mb" => 1_000_000.0,
+            "gb" => 1_000_000_000.0,
+            "kib" => 1024.0,
+            "mib" => 1024.0 * 1024.0,
+            "gib" => 1024.0 * 1024.0 * 1024.0,
+            s => panic!("suffix `{s}` out of sync with regex"),
+        },
+        None => {
+            return cap[1]
+                .parse()
+                .with_context(|| format!("expected an integer size, got `{}`", &cap[1]))
+        }
+    };
+    let num = cap[1]
+        .parse::<f64>()
+        .with_context(|| format!("expected an integer or float, found `{}`", &cap[1]))?;
+    Ok((num * factor) as u64)
+}
+
+/// Performs automatic garbage collection.
+///
+/// This is called in various places in Cargo where garbage collection should
+/// be performed automatically based on the config settings. The default
+/// behavior is to only clean once a day.
+///
+/// This should only be called in code paths for commands that are already
+/// doing a lot of work. It should only be called *after* crates are
+/// downloaded so that the last-use data is updated first.
+///
+/// It should be cheap to call this multiple times (subsequent calls are
+/// ignored), but try not to abuse that.
+pub fn auto_gc(config: &Config) {
+    if !config.cli_unstable().gc {
+        return;
+    }
+    if !config.network_allowed() {
+        // As a conservative choice, auto-gc is disabled when offline. If the
+        // user is indefinitely offline, we don't want to delete things they
+        // may later depend on.
+        return;
+    }
+
+    if let Err(e) = auto_gc_inner(config) {
+        if global_cache_tracker::is_silent_error(&e)
+            && config.shell().verbosity() != Verbosity::Verbose
+        {
+            tracing::warn!("failed to auto-clean cache data: {e:?}");
+        } else {
+            crate::display_warning_with_error(
+                "failed to auto-clean cache data",
+                &e,
+                &mut config.shell(),
+            );
+        }
+    }
+}
+
+fn auto_gc_inner(config: &Config) -> CargoResult<()> {
+    let _lock = match config.try_acquire_package_cache_lock(CacheLockMode::MutateExclusive)? {
+        Some(lock) => lock,
+        None => {
+            tracing::debug!("unable to acquire mutate lock, auto gc disabled");
+            return Ok(());
+        }
+    };
+    // This should not be called when there are pending deferred entries, so check that.
+    let deferred = config.deferred_global_last_use()?;
+    debug_assert!(deferred.is_empty());
+    let mut global_cache_tracker = config.global_cache_tracker()?;
+    let mut gc = Gc::new(config, &mut global_cache_tracker)?;
+    let mut clean_ctx = CleanContext::new(config);
+    gc.auto(&mut clean_ctx)?;
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn time_spans() {
+        let d = |x| Some(Duration::from_secs(x));
+        assert_eq!(maybe_parse_time_span("0 seconds"), d(0));
+        assert_eq!(maybe_parse_time_span("1second"), d(1));
+        assert_eq!(maybe_parse_time_span("23 seconds"), d(23));
+        assert_eq!(maybe_parse_time_span("5 minutes"), d(60 * 5));
+        assert_eq!(maybe_parse_time_span("2 hours"), d(60 * 60 * 2));
+        assert_eq!(maybe_parse_time_span("1 day"), d(60 * 60 * 24));
+        assert_eq!(maybe_parse_time_span("2 weeks"), d(60 * 60 * 24 * 14));
+        assert_eq!(maybe_parse_time_span("6 months"), d(60 * 60 * 24 * 30 * 6));
+
+        assert_eq!(parse_frequency("5 seconds").unwrap(), d(5));
+        assert_eq!(parse_frequency("always").unwrap(), d(0));
+        assert_eq!(parse_frequency("never").unwrap(), None);
+    }
+
+    #[test]
+    fn time_span_errors() {
+        assert_eq!(maybe_parse_time_span(""), None);
+        assert_eq!(maybe_parse_time_span("1"), None);
+        assert_eq!(maybe_parse_time_span("second"), None);
+        assert_eq!(maybe_parse_time_span("+2 seconds"), None);
+        assert_eq!(maybe_parse_time_span("day"), None);
+        assert_eq!(maybe_parse_time_span("-1 days"), None);
+        assert_eq!(maybe_parse_time_span("1.5 days"), None);
+        assert_eq!(maybe_parse_time_span("1 dayz"), None);
+        assert_eq!(maybe_parse_time_span("always"), None);
+        assert_eq!(maybe_parse_time_span("never"), None);
+        assert_eq!(maybe_parse_time_span("1 day "), None);
+        assert_eq!(maybe_parse_time_span(" 1 day"), None);
+        assert_eq!(maybe_parse_time_span("1  second"), None);
+
+        let e = parse_time_span_for_config("gc.auto.max-src-age", "-1 days").unwrap_err();
+        assert_eq!(
+            e.to_string(),
+            "config option `gc.auto.max-src-age` \
+             expected a value of the form \"N seconds/minutes/days/weeks/months\", \
+             got: \"-1 days\""
+        );
+        let e = parse_frequency("abc").unwrap_err();
+        assert_eq!(
+            e.to_string(),
+            "config option `gc.auto.frequency` \
+             expected a value of \"always\", \"never\", or \"N seconds/minutes/days/weeks/months\", \
+             got: \"abc\""
+        );
+    }
+
+    #[test]
+    fn human_sizes() {
+        assert_eq!(parse_human_size("0").unwrap(), 0);
+        assert_eq!(parse_human_size("123").unwrap(), 123);
+        assert_eq!(parse_human_size("123b").unwrap(), 123);
+        assert_eq!(parse_human_size("123B").unwrap(), 123);
+        assert_eq!(parse_human_size("123 b").unwrap(), 123);
+        assert_eq!(parse_human_size("123 B").unwrap(), 123);
+        assert_eq!(parse_human_size("1kb").unwrap(), 1_000);
+        assert_eq!(parse_human_size("5kb").unwrap(), 5_000);
+        assert_eq!(parse_human_size("1mb").unwrap(), 1_000_000);
+        assert_eq!(parse_human_size("1gb").unwrap(), 1_000_000_000);
+        assert_eq!(parse_human_size("1kib").unwrap(), 1_024);
+        assert_eq!(parse_human_size("1mib").unwrap(), 1_048_576);
+        assert_eq!(parse_human_size("1gib").unwrap(), 1_073_741_824);
+        assert_eq!(parse_human_size("1.5kb").unwrap(), 1_500);
+        assert_eq!(parse_human_size("1.7b").unwrap(), 1);
+
+        assert!(parse_human_size("").is_err());
+        assert!(parse_human_size("x").is_err());
+        assert!(parse_human_size("1x").is_err());
+        assert!(parse_human_size("1 2").is_err());
+        assert!(parse_human_size("1.5").is_err());
+        assert!(parse_human_size("+1").is_err());
+        assert!(parse_human_size("123  b").is_err());
+    }
+}
diff --git a/src/cargo/core/global_cache_tracker.rs b/src/cargo/core/global_cache_tracker.rs
new file mode 100644
index 000000000..ee4024eda
--- /dev/null
+++ b/src/cargo/core/global_cache_tracker.rs
@@ -0,0 +1,1754 @@
+//! Support for tracking the last time files were used to assist with cleaning
+//! up those files if they haven't been used in a while.
+//!
+//! Tracking of cache files is stored in a sqlite database which contains a
+//! timestamp of the last time the file was used, as well as the size of the
+//! file.
+//!
+//! While cargo is running, when it detects a use of a cache file, it adds a
+//! timestamp to [`DeferredGlobalLastUse`]. This batches up a set of changes
+//! that are then flushed to the database all at once (via
+//! [`DeferredGlobalLastUse::save`]). Ideally saving would only be done once
+//! per run for performance reasons, but that is not really possible given
+//! the many different ways cargo can be used (`cargo generate-lockfile`,
+//! `cargo fetch`, and `cargo build` all exercise this code differently).
+//!
+//! All of the database interaction is done through the [`GlobalCacheTracker`]
+//! type.
+//!
+//! There is a single global [`GlobalCacheTracker`] and
+//! [`DeferredGlobalLastUse`] stored in [`Config`].
+//!
+//! ## Automatic gc
+//!
+//! Some commands (primarily the build commands) will trigger an automatic
+//! deletion of files that haven't been used in a while. The interface for
+//! this is in the [`crate::core::gc`] module. The database tracks the last
+//! time an automatic gc was performed so that it is only done once per day
+//! for performance reasons.
+//!
+//! ## Manual gc
+//!
+//! The user can perform a manual garbage collection with the `cargo clean`
+//! command. That command has a variety of options to specify what to delete.
+//! Manual gc supports deleting based on age or size or both.
+//!
+//! ## Locking
+//!
+//! Usage of the database requires that the package cache is locked to prevent
+//! concurrent access. Although sqlite has built-in locking support, we want
+//! to use cargo's locking so that the "Blocking" message gets displayed, and
+//! so that locks can block indefinitely for long-running build commands.
+//! [`rusqlite`] has a default timeout of 5 seconds, though that is
+//! configurable.
+//!
+//! When garbage collection is being performed, the package cache lock must be
+//! in [`CacheLockMode::MutateExclusive`] to ensure no other cargo process is
+//! running. See [`crate::util::cache_lock`] for more detail on locking.
+//!
+//! ## Compatibility
+//!
+//! The database must retain both forwards and backwards compatibility between
+//! different versions of cargo. For the most part, this shouldn't be too
+//! difficult to maintain. Generally sqlite doesn't change on-disk formats
+//! between versions (the introduction of WAL is one of the few examples where
+//! version 3 had a format change, but we wouldn't use it anyway since it has
+//! shared-memory requirements cargo can't depend on due to things like
+//! network mounts).
+//!
+//! Schema changes must be managed through [`migrations`] by adding new
+//! entries that make a change to the database. Changes must not break older
+//! versions of cargo. Generally, adding columns should be fine (either with a
+//! default value, or NULL). Adding tables should also be fine. Just don't do
+//! destructive things like removing a column, or changing the semantics of an
+//! existing column.
+//!
+//! ## Performance
+//!
+//! Much of the design of this system focuses on minimizing its performance
+//! impact. Every build command needs to save updates, and we try to keep
+//! that from having a noticeable impact on build times. Systems like
+//! Windows, particularly with a magnetic hard disk, can experience a fairly
+//! large impact from cargo's overhead. Cargo's benchsuite has some
+//! benchmarks to help compare different environments, or changes to the code
+//! here. Please try to keep performance in mind if making any major changes.
+//!
+//! Performance of `cargo clean` is not quite as important since it is not
+//! expected to be run often. However, it is still courteous to the user to
+//! try to not impact it too much. One part that has a performance concern is
+//! that the clean command will synchronize the database with whatever is on
+//! disk if needed (in case files were added by older versions of cargo that
+//! don't do cache tracking, or if the user manually deleted some files). This
+//! can potentially be very slow, especially if the two are very out of sync.
+//!
+//! ## Filesystems
+//!
+//! Everything here is sensitive to the kind of filesystem it is running on.
+//! People tend to run cargo in all sorts of strange environments that have
+//! limited capabilities, or on things like read-only mounts. The code here
+//! needs to gracefully handle as many situations as possible.
+//!
+//! The sections above about performance and locking are very relevant when
+//! considering different filesystems.
+//!
+//! There are checks for read-only filesystems, which are generally ignored.

+use crate::core::gc::GcOpts;
+use crate::core::Verbosity;
+use crate::ops::CleanContext;
+use crate::util::cache_lock::CacheLockMode;
+use crate::util::interning::InternedString;
+use crate::util::sqlite::{self, basic_migration, Migration};
+use crate::util::{Filesystem, Progress, ProgressStyle};
+use crate::{CargoResult, Config};
+use anyhow::{bail, Context};
+use cargo_util::paths;
+use rusqlite::{params, Connection, ErrorCode};
+use std::collections::{hash_map, HashMap};
+use std::path::{Path, PathBuf};
+use std::time::{Duration, SystemTime};
+use tracing::{debug, trace};
+
+/// The filename of the database.
+const GLOBAL_CACHE_FILENAME: &str = ".global-cache";
+
+const REGISTRY_INDEX_TABLE: &str = "registry_index";
+const REGISTRY_CRATE_TABLE: &str = "registry_crate";
+const REGISTRY_SRC_TABLE: &str = "registry_src";
+const GIT_DB_TABLE: &str = "git_db";
+const GIT_CO_TABLE: &str = "git_checkout";
+
+/// How often timestamps will be updated.
+///
+/// As an optimization timestamps are not updated unless they are older than
+/// the given number of seconds. This helps reduce the amount of disk I/O when
+/// running cargo multiple times within a short window.
+const UPDATE_RESOLUTION: u64 = 60 * 5;
+
+/// Type for timestamps as stored in the database.
+///
+/// These are seconds since the Unix epoch.
+type Timestamp = u64;
+
+/// Tracking for the global shared cache (registry files, etc.).
+#[derive(Debug)]
+pub struct GlobalCacheTracker {
+    /// Connection to the SQLite database.
+    conn: Connection,
+    auto_gc_checked_this_session: bool,
+}
+
+/// This is a cache of modifications that will be saved to disk all at once
+/// via the [`DeferredGlobalLastUse::save`] method.
+///
+/// This is here to improve performance.
+#[derive(Debug)]
+pub struct DeferredGlobalLastUse {
+    /// Cache of registry keys, used for faster fetching.
+    ///
+    /// The key is the registry name (which is its directory name) and the
+    /// value is the `id` in the `registry_index` table.
+    registry_keys: HashMap<InternedString, i64>,
+    /// Cache of git keys, used for faster fetching.
+    ///
+    /// The key is the git db name (which is its directory name) and the value
+    /// is the `id` in the `git_db` table.
+    git_keys: HashMap<InternedString, i64>,
+
+    /// New registry index entries to insert.
+    registry_index_timestamps: HashMap<RegistryIndex, Timestamp>,
+    /// New registry `.crate` entries to insert.
+    registry_crate_timestamps: HashMap<RegistryCrate, Timestamp>,
+    /// New registry src directory entries to insert.
+    registry_src_timestamps: HashMap<RegistrySrc, Timestamp>,
+    /// New git db entries to insert.
+    git_db_timestamps: HashMap<GitDb, Timestamp>,
+    /// New git checkout entries to insert.
+    git_checkout_timestamps: HashMap<GitCheckout, Timestamp>,
+    /// This is used so that a warning about failing to update the database is
+    /// only displayed once.
+    save_err_has_warned: bool,
+    /// The current time, used to improve performance to avoid accessing the
+    /// clock hundreds of times.
+    now: Timestamp,
+}
+
+/// The key for a registry index entry stored in the database.
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+pub struct RegistryIndex {
+    pub encoded_registry_name: InternedString,
+}
+
+/// The key for a registry `.crate` entry stored in the database.
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+pub struct RegistryCrate {
+    pub encoded_registry_name: InternedString,
+    pub crate_filename: InternedString,
+    pub size: u64,
+}
+
+/// The key for a registry src directory entry stored in the database.
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+pub struct RegistrySrc {
+    pub encoded_registry_name: InternedString,
+    pub package_dir: InternedString,
+    /// Total size of the src directory in bytes.
+    ///
+    /// This can be None when the size is unknown. For example, when the src
+    /// directory already exists on disk, and we just want to update the
+    /// last-use timestamp. We don't want to take the expense of computing disk
+    /// usage unless necessary. `populate_untracked_src` will handle any actual
+    /// NULL values in the database, which can happen when the src directory is
+    /// created by an older version of cargo that did not track sizes.
+    pub size: Option<u64>,
+}
+
+/// The key for a git db entry stored in the database.
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+pub struct GitDb {
+    pub encoded_git_name: InternedString,
+}
+
+/// The key for a git checkout entry stored in the database.
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+pub struct GitCheckout {
+    pub encoded_git_name: InternedString,
+    pub short_name: InternedString,
+    /// Total size of the checkout directory.
+    ///
+    /// This can be None when the size is unknown. See [`RegistrySrc::size`]
+    /// for an explanation.
+    pub size: Option<u64>,
+}
+
+/// Paths in the global cache.
+///
+/// Accessing these assumes a lock has already been acquired.
+struct BasePaths {
+    /// Root path to the index caches.
+    index: PathBuf,
+    /// Root path to the git DBs.
+    git_db: PathBuf,
+    /// Root path to the git checkouts.
+    git_co: PathBuf,
+    /// Root path to the `.crate` files.
+    crate_dir: PathBuf,
+    /// Root path to the `src` directories.
+    src: PathBuf,
+}
+
+/// Migrations which initialize the database, and can be used to evolve it over time.
+///
+/// See [`Migration`] for more detail.
+///
+/// **Be sure to not change the order or entries here!**
+fn migrations() -> Vec<Migration> {
+    vec![
+        // registry_index tracks the overall usage of an index cache, and tracks a
+        // numeric ID to refer to that index that is used in other tables.
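+        //
+        // For orientation, the tables created below relate as follows
+        // (children reference their parent's numeric id, and are removed
+        // via ON DELETE CASCADE):
+        //
+        //     registry_index.id <- registry_crate.registry_id
+        //                       <- registry_src.registry_id
+        //     git_db.id         <- git_checkout.git_id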
+        basic_migration(
+            "CREATE TABLE registry_index (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                name TEXT UNIQUE NOT NULL,
+                timestamp INTEGER NOT NULL
+            )",
+        ),
+        // .crate files
+        basic_migration(
+            "CREATE TABLE registry_crate (
+                registry_id INTEGER NOT NULL,
+                name TEXT NOT NULL,
+                size INTEGER NOT NULL,
+                timestamp INTEGER NOT NULL,
+                PRIMARY KEY (registry_id, name),
+                FOREIGN KEY (registry_id) REFERENCES registry_index (id) ON DELETE CASCADE
+            )",
+        ),
+        // Extracted src directories
+        //
+        // Note that `size` can be NULL. This will happen when marking a src
+        // directory as used that was created by an older version of cargo
+        // that didn't do size tracking.
+        basic_migration(
+            "CREATE TABLE registry_src (
+                registry_id INTEGER NOT NULL,
+                name TEXT NOT NULL,
+                size INTEGER,
+                timestamp INTEGER NOT NULL,
+                PRIMARY KEY (registry_id, name),
+                FOREIGN KEY (registry_id) REFERENCES registry_index (id) ON DELETE CASCADE
+            )",
+        ),
+        // Git db directories
+        basic_migration(
+            "CREATE TABLE git_db (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                name TEXT UNIQUE NOT NULL,
+                timestamp INTEGER NOT NULL
+            )",
+        ),
+        // Git checkout directories
+        basic_migration(
+            "CREATE TABLE git_checkout (
+                git_id INTEGER NOT NULL,
+                name TEXT NOT NULL,
+                size INTEGER,
+                timestamp INTEGER NOT NULL,
+                PRIMARY KEY (git_id, name),
+                FOREIGN KEY (git_id) REFERENCES git_db (id) ON DELETE CASCADE
+            )",
+        ),
+        // This is a general-purpose single-row table that can store arbitrary
+        // data. Feel free to add columns (with ALTER TABLE) if necessary.
+        basic_migration(
+            "CREATE TABLE global_data (
+                last_auto_gc INTEGER NOT NULL
+            )",
+        ),
+        // last_auto_gc tracks the last time auto-gc was run (so that it only
+        // runs roughly once a day for performance reasons). Prime it with the
+        // current time to establish a baseline.
+        Box::new(|conn| {
+            conn.execute(
+                "INSERT INTO global_data (last_auto_gc) VALUES (?1)",
+                [now()],
+            )?;
+            Ok(())
+        }),
+    ]
+}
+
+impl GlobalCacheTracker {
+    /// Creates a new [`GlobalCacheTracker`].
+    ///
+    /// The caller is responsible for locking the package cache with
+    /// [`CacheLockMode::DownloadExclusive`] before calling this.
+    pub fn new(config: &Config) -> CargoResult<GlobalCacheTracker> {
+        let mut conn = if config.cli_unstable().gc {
+            let db_path = Self::db_path(config);
+            // A package cache lock is required to ensure only one cargo is
+            // accessing it at the same time. If there is concurrent access, we
+            // want to rely on cargo's own "Blocking" system (which can
+            // provide user feedback) rather than blocking inside sqlite
+            // (which by default has a short timeout).
+            let db_path =
+                config.assert_package_cache_locked(CacheLockMode::DownloadExclusive, &db_path);
+            Connection::open(db_path)?
+        } else {
+            // To simplify things (so there aren't checks everywhere for being
+            // enabled), just process everything in memory.
+            Connection::open_in_memory()?
+        };
+        conn.pragma_update(None, "foreign_keys", true)?;
+        sqlite::migrate(&mut conn, &migrations())?;
+        Ok(GlobalCacheTracker {
+            conn,
+            auto_gc_checked_this_session: false,
+        })
+    }
+
+    /// The path to the database.
+    pub fn db_path(config: &Config) -> Filesystem {
+        config.home().join(GLOBAL_CACHE_FILENAME)
+    }
+
+    /// Given an encoded registry name, returns its ID.
+    ///
+    /// Returns None if the given name isn't in the database.
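+    ///
+    /// A sketch of a lookup, using the encoded-name form shown in
+    /// [`Self::get_id_map`] (the directory name here is illustrative):
+    ///
+    /// ```ignore
+    /// let id = Self::id_from_name(conn, REGISTRY_INDEX_TABLE, "index.crates.io-6f17d22bba15001f")?;
+    /// assert!(id.is_some());
+    /// ```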
+    fn id_from_name(
+        conn: &Connection,
+        table_name: &str,
+        encoded_name: &str,
+    ) -> CargoResult<Option<i64>> {
+        let mut stmt =
+            conn.prepare_cached(&format!("SELECT id FROM {table_name} WHERE name = ?"))?;
+        match stmt.query_row([encoded_name], |row| row.get(0)) {
+            Ok(id) => Ok(Some(id)),
+            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
+            Err(e) => Err(e.into()),
+        }
+    }
+
+    /// Returns a map of ID to path for the given ids in the given table.
+    ///
+    /// For example, given `registry_index` IDs, it returns filenames of the
+    /// form "index.crates.io-6f17d22bba15001f".
+    fn get_id_map(
+        conn: &Connection,
+        table_name: &str,
+        ids: &[i64],
+    ) -> CargoResult<HashMap<i64, PathBuf>> {
+        let mut stmt =
+            conn.prepare_cached(&format!("SELECT name FROM {table_name} WHERE id = ?1"))?;
+        ids.iter()
+            .map(|id| {
+                let name = stmt.query_row(params![id], |row| {
+                    Ok(PathBuf::from(row.get::<_, String>(0)?))
+                })?;
+                Ok((*id, name))
+            })
+            .collect()
+    }
+
+    /// Returns all index cache timestamps.
+    pub fn registry_index_all(&self) -> CargoResult<Vec<(RegistryIndex, Timestamp)>> {
+        let mut stmt = self
+            .conn
+            .prepare_cached("SELECT name, timestamp FROM registry_index")?;
+        let rows = stmt
+            .query_map([], |row| {
+                let encoded_registry_name = row.get_unwrap(0);
+                let timestamp = row.get_unwrap(1);
+                let kind = RegistryIndex {
+                    encoded_registry_name,
+                };
+                Ok((kind, timestamp))
+            })?
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(rows)
+    }
+
+    /// Returns all registry crate cache timestamps.
+    pub fn registry_crate_all(&self) -> CargoResult<Vec<(RegistryCrate, Timestamp)>> {
+        let mut stmt = self.conn.prepare_cached(
+            "SELECT registry_index.name, registry_crate.name, registry_crate.size, registry_crate.timestamp
+             FROM registry_index, registry_crate
+             WHERE registry_crate.registry_id = registry_index.id",
+        )?;
+        let rows = stmt
+            .query_map([], |row| {
+                let encoded_registry_name = row.get_unwrap(0);
+                let crate_filename = row.get_unwrap(1);
+                let size = row.get_unwrap(2);
+                let timestamp = row.get_unwrap(3);
+                let kind = RegistryCrate {
+                    encoded_registry_name,
+                    crate_filename,
+                    size,
+                };
+                Ok((kind, timestamp))
+            })?
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(rows)
+    }
+
+    /// Returns all registry source cache timestamps.
+    pub fn registry_src_all(&self) -> CargoResult<Vec<(RegistrySrc, Timestamp)>> {
+        let mut stmt = self.conn.prepare_cached(
+            "SELECT registry_index.name, registry_src.name, registry_src.size, registry_src.timestamp
+             FROM registry_index, registry_src
+             WHERE registry_src.registry_id = registry_index.id",
+        )?;
+        let rows = stmt
+            .query_map([], |row| {
+                let encoded_registry_name = row.get_unwrap(0);
+                let package_dir = row.get_unwrap(1);
+                let size = row.get_unwrap(2);
+                let timestamp = row.get_unwrap(3);
+                let kind = RegistrySrc {
+                    encoded_registry_name,
+                    package_dir,
+                    size,
+                };
+                Ok((kind, timestamp))
+            })?
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(rows)
+    }
+
+    /// Returns all git db timestamps.
+    pub fn git_db_all(&self) -> CargoResult<Vec<(GitDb, Timestamp)>> {
+        let mut stmt = self
+            .conn
+            .prepare_cached("SELECT name, timestamp FROM git_db")?;
+        let rows = stmt
+            .query_map([], |row| {
+                let encoded_git_name = row.get_unwrap(0);
+                let timestamp = row.get_unwrap(1);
+                let kind = GitDb { encoded_git_name };
+                Ok((kind, timestamp))
+            })?
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(rows)
+    }
+
+    /// Returns all git checkout timestamps.
+    pub fn git_checkout_all(&self) -> CargoResult<Vec<(GitCheckout, Timestamp)>> {
+        let mut stmt = self.conn.prepare_cached(
+            "SELECT git_db.name, git_checkout.name, git_checkout.size, git_checkout.timestamp
+             FROM git_db, git_checkout
+             WHERE git_checkout.git_id = git_db.id",
+        )?;
+        let rows = stmt
+            .query_map([], |row| {
+                let encoded_git_name = row.get_unwrap(0);
+                let short_name = row.get_unwrap(1);
+                let size = row.get_unwrap(2);
+                let timestamp = row.get_unwrap(3);
+                let kind = GitCheckout {
+                    encoded_git_name,
+                    short_name,
+                    size,
+                };
+                Ok((kind, timestamp))
+            })?
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(rows)
+    }
+
+    /// Returns whether or not an automatic gc should be performed, based on
+    /// the last time it ran as recorded in the database.
+    pub fn should_run_auto_gc(&mut self, frequency: Duration) -> CargoResult<bool> {
+        trace!("should_run_auto_gc");
+        if self.auto_gc_checked_this_session {
+            return Ok(false);
+        }
+        let last_auto_gc: Timestamp =
+            self.conn
+                .query_row("SELECT last_auto_gc FROM global_data", [], |row| row.get(0))?;
+        let should_run = last_auto_gc + frequency.as_secs() < now();
+        trace!(
+            "last auto gc was {}, {}",
+            last_auto_gc,
+            if should_run { "running" } else { "skipping" }
+        );
+        self.auto_gc_checked_this_session = true;
+        Ok(should_run)
+    }
+
+    /// Writes to the database to indicate that an automatic GC has just been
+    /// completed.
+    pub fn set_last_auto_gc(&self) -> CargoResult<()> {
+        self.conn
+            .execute("UPDATE global_data SET last_auto_gc = ?1", [now()])?;
+        Ok(())
+    }
+
+    /// Deletes files from the global cache based on the given options.
+    pub fn clean(&mut self, clean_ctx: &mut CleanContext<'_>, gc_opts: &GcOpts) -> CargoResult<()> {
+        self.clean_inner(clean_ctx, gc_opts)
+            .with_context(|| "failed to clean entries from the global cache")
+    }
+
+    fn clean_inner(
+        &mut self,
+        clean_ctx: &mut CleanContext<'_>,
+        gc_opts: &GcOpts,
+    ) -> CargoResult<()> {
+        let _p = crate::util::profile::start("cleaning global cache files");
+        let config = clean_ctx.config;
+        let base_git_path = config.git_path().into_path_unlocked();
+        let base = BasePaths {
+            index: config.registry_index_path().into_path_unlocked(),
+            git_db: base_git_path.join("db"),
+            git_co: base_git_path.join("checkouts"),
+            crate_dir: config.registry_cache_path().into_path_unlocked(),
+            src: config.registry_source_path().into_path_unlocked(),
+        };
+        let now = now();
+        trace!("cleaning {gc_opts:?}");
+        let tx = self.conn.transaction()?;
+        let mut delete_paths = Vec::new();
+        // This can be an expensive operation, so only perform it if necessary.
+        if gc_opts.is_download_cache_opt_set() {
+            // TODO: Investigate how slow this might be.
+            Self::sync_db_with_files(
+                &tx,
+                config,
+                &base,
+                gc_opts.is_download_cache_size_set(),
+                &mut delete_paths,
+            )
+            .with_context(|| "failed to sync tracking database")?
+        }
+        if let Some(max_age) = gc_opts.max_index_age {
+            let max_age = now - max_age.as_secs();
+            Self::get_registry_index_to_clean(&tx, max_age, &base, &mut delete_paths)?;
+        }
+        if let Some(max_age) = gc_opts.max_src_age {
+            let max_age = now - max_age.as_secs();
+            Self::get_registry_items_to_clean_age(
+                &tx,
+                max_age,
+                REGISTRY_SRC_TABLE,
+                &base.src,
+                &mut delete_paths,
+            )?;
+        }
+        if let Some(max_age) = gc_opts.max_crate_age {
+            let max_age = now - max_age.as_secs();
+            Self::get_registry_items_to_clean_age(
+                &tx,
+                max_age,
+                REGISTRY_CRATE_TABLE,
+                &base.crate_dir,
+                &mut delete_paths,
+            )?;
+        }
+        if let Some(max_age) = gc_opts.max_git_db_age {
+            let max_age = now - max_age.as_secs();
+            Self::get_git_db_items_to_clean(&tx, max_age, &base, &mut delete_paths)?;
+        }
+        if let Some(max_age) = gc_opts.max_git_co_age {
+            let max_age = now - max_age.as_secs();
+            Self::get_git_co_items_to_clean(&tx, max_age, &base.git_co, &mut delete_paths)?;
+        }
+        // Size collection must happen after date collection so that dates
+        // have precedence, since size constraints are a more blunt
+        // instrument.
+        //
+        // These are also complicated by the `--max-download-size` option
+        // overlapping with `--max-crate-size` and `--max-src-size`, which
+        // requires some coordination between those options which isn't
+        // necessary with the age-based options. An item is either older than
+        // an age cutoff or it isn't, but contrast that with size, which is
+        // based on the sum of all tracked items. Also, `--max-download-size`
+        // is summed against both the crate and src tracking, which requires
+        // combining them to compute the size, and then separating them to
+        // calculate the correct paths.
+        if let Some(max_size) = gc_opts.max_crate_size {
+            Self::get_registry_items_to_clean_size(
+                &tx,
+                max_size,
+                REGISTRY_CRATE_TABLE,
+                &base.crate_dir,
+                &mut delete_paths,
+            )?;
+        }
+        if let Some(max_size) = gc_opts.max_src_size {
+            Self::get_registry_items_to_clean_size(
+                &tx,
+                max_size,
+                REGISTRY_SRC_TABLE,
+                &base.src,
+                &mut delete_paths,
+            )?;
+        }
+        if let Some(max_size) = gc_opts.max_git_size {
+            Self::get_git_items_to_clean_size(&tx, max_size, &base, &mut delete_paths)?;
+        }
+        if let Some(max_size) = gc_opts.max_download_size {
+            Self::get_registry_items_to_clean_size_both(&tx, max_size, &base, &mut delete_paths)?;
+        }
+
+        clean_ctx.remove_paths(&delete_paths)?;
+
+        if clean_ctx.dry_run {
+            tx.rollback()?;
+        } else {
+            tx.commit()?;
+        }
+        Ok(())
+    }
+
+    /// Returns a list of directory entries in the given path.
+    fn names_from(path: &Path) -> CargoResult<Vec<String>> {
+        let entries = match path.read_dir() {
+            Ok(e) => e,
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    return Ok(Vec::new());
+                } else {
+                    return Err(
+                        anyhow::Error::new(e).context(format!("failed to read path `{path:?}`"))
+                    );
+                }
+            }
+        };
+        let names = entries
+            .filter_map(|entry| entry.ok()?.file_name().into_string().ok())
+            .collect();
+        Ok(names)
+    }
+
+    /// Synchronizes the database to match the files on disk.
+    ///
+    /// This performs the following cleanups:
+    ///
+    /// 1. Removes entries from the database that are missing on disk.
+    /// 2. Adds missing entries to the database that are on disk (such as when
+    ///    files are added by older versions of cargo).
+    /// 3. Fills in the `size` column where it is NULL (such as when something
+    ///    is added to disk by an older version of cargo, and one of the mark
+    ///    functions marked it without knowing the size).
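+    ///
+    /// (Steps 2 and 3 exist to handle files created by versions of cargo
+    /// that predate this tracking; see the module docs on compatibility.)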
+    ///
+    /// This is only called by `cargo clean` when needed since it is an
+    /// expensive operation. Size computations are only done if `sync_size` is
+    /// set since that adds an even larger expense.
+    ///
+    /// Adds paths to `delete_paths` that should be removed since they are
+    /// orphaned (for example, deleting `.crate` files if the corresponding
+    /// index doesn't exist).
+    fn sync_db_with_files(
+        conn: &Connection,
+        config: &Config,
+        base: &BasePaths,
+        sync_size: bool,
+        delete_paths: &mut Vec<PathBuf>,
+    ) -> CargoResult<()> {
+        let _p = crate::util::profile::start("global cache db sync");
+        debug!("starting db sync");
+        // For registry_index and git_db, add anything that is missing in the db.
+        Self::update_parent_for_missing_from_db(conn, REGISTRY_INDEX_TABLE, &base.index)?;
+        Self::update_parent_for_missing_from_db(conn, GIT_DB_TABLE, &base.git_db)?;
+
+        // For registry_crate, registry_src, and git_checkout, remove anything
+        // from the db that isn't on disk.
+        Self::update_db_for_removed(
+            conn,
+            REGISTRY_INDEX_TABLE,
+            "registry_id",
+            REGISTRY_CRATE_TABLE,
+            &base.crate_dir,
+        )?;
+        Self::update_db_for_removed(
+            conn,
+            REGISTRY_INDEX_TABLE,
+            "registry_id",
+            REGISTRY_SRC_TABLE,
+            &base.src,
+        )?;
+        Self::update_db_for_removed(conn, GIT_DB_TABLE, "git_id", GIT_CO_TABLE, &base.git_co)?;
+
+        // For registry_index and git_db, remove anything from the db that
+        // isn't on disk.
+        //
+        // This also collects paths for any child files that don't have their
+        // respective parent on disk.
+        Self::update_db_parent_for_removed_from_disk(
+            conn,
+            REGISTRY_INDEX_TABLE,
+            &base.index,
+            &[&base.crate_dir, &base.src],
+            delete_paths,
+        )?;
+        Self::update_db_parent_for_removed_from_disk(
+            conn,
+            GIT_DB_TABLE,
+            &base.git_db,
+            &[&base.git_co],
+            delete_paths,
+        )?;
+
+        // For registry_crate, registry_src, and git_checkout, add anything
+        // that is missing in the db.
+        Self::populate_untracked_crate(conn, &base.crate_dir)?;
+        Self::populate_untracked(
+            conn,
+            config,
+            REGISTRY_INDEX_TABLE,
+            "registry_id",
+            REGISTRY_SRC_TABLE,
+            &base.src,
+            sync_size,
+        )?;
+        Self::populate_untracked(
+            conn,
+            config,
+            GIT_DB_TABLE,
+            "git_id",
+            GIT_CO_TABLE,
+            &base.git_co,
+            sync_size,
+        )?;
+
+        // Update any NULL sizes if needed.
+        if sync_size {
+            Self::update_null_sizes(
+                conn,
+                config,
+                REGISTRY_INDEX_TABLE,
+                "registry_id",
+                REGISTRY_SRC_TABLE,
+                &base.src,
+            )?;
+            Self::update_null_sizes(
+                conn,
+                config,
+                GIT_DB_TABLE,
+                "git_id",
+                GIT_CO_TABLE,
+                &base.git_co,
+            )?;
+        }
+        Ok(())
+    }
+
+    /// For parent tables, add any entries that are on disk but aren't tracked in the db.
+    fn update_parent_for_missing_from_db(
+        conn: &Connection,
+        parent_table_name: &str,
+        base_path: &Path,
+    ) -> CargoResult<()> {
+        let _p = crate::util::profile::start(format!(
+            "update parent db for missing from db {parent_table_name}"
+        ));
+        trace!("checking for untracked parent to add to {parent_table_name}");
+        let names = Self::names_from(base_path)?;
+
+        let mut stmt = conn.prepare_cached(&format!(
+            "INSERT INTO {parent_table_name} (name, timestamp)
+             VALUES (?1, ?2)
+             ON CONFLICT DO NOTHING",
+        ))?;
+        let now = now();
+        for name in names {
+            stmt.execute(params![name, now])?;
+        }
+        Ok(())
+    }
+
+    /// Removes database entries for any files that are not on disk for the child tables.
+    ///
+    /// This could happen for example if the user manually deleted the file or
+    /// any such scenario where the filesystem and db are out of sync.
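+    ///
+    /// For example (matching the call sites in [`Self::sync_db_with_files`]),
+    /// stale `.crate` entries are removed with:
+    ///
+    /// ```ignore
+    /// Self::update_db_for_removed(
+    ///     conn,
+    ///     REGISTRY_INDEX_TABLE,
+    ///     "registry_id",
+    ///     REGISTRY_CRATE_TABLE,
+    ///     &base.crate_dir,
+    /// )?;
+    /// ```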
+ /// Removes database entries for any files that are not on disk for the child tables.
+ ///
+ /// This can happen, for example, if the user manually deleted the file,
+ /// or in any other scenario where the filesystem and db are out of sync.
+ fn update_db_for_removed(
+ conn: &Connection,
+ parent_table_name: &str,
+ id_column_name: &str,
+ table_name: &str,
+ base_path: &Path,
+ ) -> CargoResult<()> {
+ let _p = crate::util::profile::start(format!("update db for removed {table_name}"));
+ trace!("checking for db entries to remove from {table_name}");
+ let mut select_stmt = conn.prepare_cached(&format!(
+ "SELECT {table_name}.rowid, {parent_table_name}.name, {table_name}.name
+ FROM {parent_table_name}, {table_name}
+ WHERE {table_name}.{id_column_name} = {parent_table_name}.id",
+ ))?;
+ let mut delete_stmt =
+ conn.prepare_cached(&format!("DELETE FROM {table_name} WHERE rowid = ?1"))?;
+ let mut rows = select_stmt.query([])?;
+ while let Some(row) = rows.next()? {
+ let rowid: i64 = row.get_unwrap(0);
+ let id_name: String = row.get_unwrap(1);
+ let name: String = row.get_unwrap(2);
+ if !base_path.join(id_name).join(name).exists() {
+ delete_stmt.execute([rowid])?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Removes database entries for any files that are not on disk for the parent tables.
+ fn update_db_parent_for_removed_from_disk(
+ conn: &Connection,
+ parent_table_name: &str,
+ base_path: &Path,
+ child_base_paths: &[&Path],
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ let _p = crate::util::profile::start(format!(
+ "update db parent for removed from disk {parent_table_name}"
+ ));
+ trace!("checking for db entries to remove from {parent_table_name}");
+ let mut select_stmt =
+ conn.prepare_cached(&format!("SELECT rowid, name FROM {parent_table_name}"))?;
+ let mut delete_stmt =
+ conn.prepare_cached(&format!("DELETE FROM {parent_table_name} WHERE rowid = ?1"))?;
+ let mut rows = select_stmt.query([])?;
+ while let Some(row) = rows.next()? {
+ let rowid: i64 = row.get_unwrap(0);
+ let id_name: String = row.get_unwrap(1);
+ if !base_path.join(&id_name).exists() {
+ delete_stmt.execute([rowid])?;
+ // Make sure any child data is also cleaned up.
+ for child_base in child_base_paths {
+ let child_path = child_base.join(&id_name);
+ if child_path.exists() {
+ debug!("removing orphaned path {child_path:?}");
+ delete_paths.push(child_path);
+ }
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Updates the database to add any `.crate` files that are currently
+ /// not tracked (such as when they are downloaded by an older version of
+ /// cargo).
+ fn populate_untracked_crate(conn: &Connection, base_path: &Path) -> CargoResult<()> {
+ let _p = crate::util::profile::start("populate untracked crate");
+ trace!("populating untracked crate files");
+ let mut insert_stmt = conn.prepare_cached(
+ "INSERT INTO registry_crate (registry_id, name, size, timestamp)
+ VALUES (?1, ?2, ?3, ?4)
+ ON CONFLICT DO NOTHING",
+ )?;
+ let now = now();
+ let index_names = Self::names_from(&base_path)?;
+ for index_name in index_names {
+ let Some(id) = Self::id_from_name(conn, REGISTRY_INDEX_TABLE, &index_name)? else {
+ // The id is missing from the database. This should be resolved
+ // via update_db_parent_for_removed_from_disk.
+ continue;
+ };
+ let index_path = base_path.join(index_name);
+ for crate_name in Self::names_from(&index_path)? {
+ if crate_name.ends_with(".crate") {
+ // Missing files should have already been taken care of by
+ // update_db_for_removed.
+ let size = paths::metadata(index_path.join(&crate_name))?.len();
+ insert_stmt.execute(params![id, crate_name, size, now])?;
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Updates the database to add any files that are currently not tracked
+ /// (such as when they are downloaded by an older version of cargo).
+ fn populate_untracked( + conn: &Connection, + config: &Config, + id_table_name: &str, + id_column_name: &str, + table_name: &str, + base_path: &Path, + populate_size: bool, + ) -> CargoResult<()> { + let _p = crate::util::profile::start(format!("populate untracked {table_name}")); + trace!("populating untracked files for {table_name}"); + // Gather names (and make sure they are in the database). + let id_names = Self::names_from(&base_path)?; + + // This SELECT is used to determine if the directory is already + // tracked. We don't want to do the expensive size computation unless + // necessary. + let mut select_stmt = conn.prepare_cached(&format!( + "SELECT 1 FROM {table_name} + WHERE {id_column_name} = ?1 AND name = ?2", + ))?; + let mut insert_stmt = conn.prepare_cached(&format!( + "INSERT INTO {table_name} ({id_column_name}, name, size, timestamp) + VALUES (?1, ?2, ?3, ?4) + ON CONFLICT DO NOTHING", + ))?; + let mut progress = Progress::with_style("Scanning", ProgressStyle::Ratio, config); + let now = now(); + // Compute the size of any directory not in the database. + for id_name in id_names { + let Some(id) = Self::id_from_name(conn, id_table_name, &id_name)? else { + // The id is missing from the database. This should be resolved + // via update_db_parent_for_removed_from_disk. + continue; + }; + let index_path = base_path.join(id_name); + let names = Self::names_from(&index_path)?; + let max = names.len(); + for (i, name) in names.iter().enumerate() { + if select_stmt.exists(params![id, name])? { + continue; + } + let dir_path = index_path.join(name); + if !dir_path.is_dir() { + continue; + } + progress.tick(i, max, "")?; + let size = if populate_size { + Some(du(&dir_path, table_name)?) + } else { + None + }; + insert_stmt.execute(params![id, name, size, now])?; + } + } + Ok(()) + } + + /// Fills in the `size` column where it is NULL. + /// + /// This can happen when something is added to disk by an older version of + /// cargo, and one of the mark functions marked it without knowing the + /// size. + /// + /// `update_db_for_removed` should be called before this is called. + fn update_null_sizes( + conn: &Connection, + config: &Config, + parent_table_name: &str, + id_column_name: &str, + table_name: &str, + base_path: &Path, + ) -> CargoResult<()> { + let _p = crate::util::profile::start(format!("update NULL sizes {table_name}")); + trace!("updating NULL size information in {table_name}"); + let mut null_stmt = conn.prepare_cached(&format!( + "SELECT {table_name}.rowid, {table_name}.name, {parent_table_name}.name + FROM {table_name}, {parent_table_name} + WHERE {table_name}.size IS NULL AND {table_name}.{id_column_name} = {parent_table_name}.id", + ))?; + let mut update_stmt = conn.prepare_cached(&format!( + "UPDATE {table_name} SET size = ?1 WHERE rowid = ?2" + ))?; + let mut progress = Progress::with_style("Scanning", ProgressStyle::Ratio, config); + let rows: Vec<_> = null_stmt + .query_map([], |row| { + Ok((row.get_unwrap(0), row.get_unwrap(1), row.get_unwrap(2))) + })? + .collect(); + let max = rows.len(); + for (i, row) in rows.into_iter().enumerate() { + let (rowid, name, id_name): (i64, String, String) = row?; + let path = base_path.join(id_name).join(name); + progress.tick(i, max, "")?; + // Missing files should have already been taken care of by + // update_db_for_removed. 
+ let size = du(&path, table_name)?;
+ update_stmt.execute(params![size, rowid])?;
+ }
+ Ok(())
+ }
+
+ /// Adds paths to delete from either registry_crate or registry_src whose
+ /// last use is older than the given timestamp.
+ fn get_registry_items_to_clean_age(
+ conn: &Connection,
+ max_age: Timestamp,
+ table_name: &str,
+ base_path: &Path,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning {table_name} since {max_age:?}");
+ let mut stmt = conn.prepare_cached(&format!(
+ "DELETE FROM {table_name} WHERE timestamp < ?1
+ RETURNING registry_id, name"
+ ))?;
+ let rows = stmt
+ .query_map(params![max_age], |row| {
+ let registry_id = row.get_unwrap(0);
+ let name: String = row.get_unwrap(1);
+ Ok((registry_id, name))
+ })?
+ .collect::<Result<Vec<_>, _>>()?;
+ let ids: Vec<_> = rows.iter().map(|r| r.0).collect();
+ let id_map = Self::get_id_map(conn, REGISTRY_INDEX_TABLE, &ids)?;
+ for (id, name) in rows {
+ let encoded_registry_name = &id_map[&id];
+ delete_paths.push(base_path.join(encoded_registry_name).join(name));
+ }
+ Ok(())
+ }
+
+ /// Adds paths to delete from either `registry_crate` or `registry_src` in
+ /// order to keep the total size under the given max size.
+ fn get_registry_items_to_clean_size(
+ conn: &Connection,
+ max_size: u64,
+ table_name: &str,
+ base_path: &Path,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning {table_name} till under {max_size:?}");
+ let total_size: u64 = conn.query_row(
+ &format!("SELECT coalesce(SUM(size), 0) FROM {table_name}"),
+ [],
+ |row| row.get(0),
+ )?;
+ if total_size <= max_size {
+ return Ok(());
+ }
+ // This SQL statement selects all of the rows ordered by timestamp,
+ // and then uses a window function to keep a running total of the
+ // size. It selects all rows until the running total exceeds the
+ // threshold of the total number of bytes that we want to delete.
+ //
+ // The window function essentially computes an aggregate over all
+ // previous rows as it goes along. As long as the running size is
+ // below the total amount that we need to delete, it keeps picking
+ // more rows.
+ //
+ // The ORDER BY includes `name` mainly for test purposes so that
+ // entries with the same timestamp have deterministic behavior.
+ //
+ // The coalesce helps convert NULL to 0.
+ let mut stmt = conn.prepare(&format!(
+ "DELETE FROM {table_name} WHERE rowid IN \
+ (SELECT x.rowid FROM \
+ (SELECT rowid, size, SUM(size) OVER \
+ (ORDER BY timestamp, name ROWS UNBOUNDED PRECEDING) AS running_amount \
+ FROM {table_name}) x \
+ WHERE coalesce(x.running_amount, 0) - x.size < ?1) \
+ RETURNING registry_id, name;"
+ ))?;
+ let rows = stmt
+ .query_map(params![total_size - max_size], |row| {
+ let id = row.get_unwrap(0);
+ let name: String = row.get_unwrap(1);
+ Ok((id, name))
+ })?
+ .collect::<Result<Vec<_>, _>>()?;
+ // Convert registry_id to the encoded registry name, and join those.
+ let ids: Vec<_> = rows.iter().map(|r| r.0).collect();
+ let id_map = Self::get_id_map(conn, REGISTRY_INDEX_TABLE, &ids)?;
+ for (id, name) in rows {
+ let encoded_name = &id_map[&id];
+ delete_paths.push(base_path.join(encoded_name).join(name));
+ }
+ Ok(())
+ }
+
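The window-function query above can be hard to read in SQL; in plain Rust the selection it performs looks roughly like this (a sketch, not cargo's code): sort oldest-first and take rows until enough bytes have been freed.

    /// `entries` are (timestamp, size) pairs; `excess` is how many bytes
    /// must be freed to get back under the limit.
    fn victims(mut entries: Vec<(u64, u64)>, excess: u64) -> Vec<(u64, u64)> {
        entries.sort_by_key(|&(timestamp, _)| timestamp);
        let mut freed = 0u64;
        entries
            .into_iter()
            .take_while(|&(_, size)| {
                // Same test as `running_amount - size < ?1` in the SQL:
                // take this row if the rows before it weren't enough.
                let take = freed < excess;
                freed += size;
                take
            })
            .collect()
    }
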
+ /// Adds paths to delete from both `registry_crate` and `registry_src` in
+ /// order to keep the total size under the given max size.
+ fn get_registry_items_to_clean_size_both(
+ conn: &Connection,
+ max_size: u64,
+ base: &BasePaths,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning download till under {max_size:?}");
+
+ // This SQL statement selects from both registry_src and
+ // registry_crate so that sorting of timestamps incorporates both of
+ // them at the same time. It uses a const value of 1 or 2 as the first
+ // column so that the code below can determine which table the value
+ // came from.
+ let mut stmt = conn.prepare_cached(
+ "SELECT 1, registry_src.rowid, registry_src.name AS name, registry_index.name,
+ registry_src.size, registry_src.timestamp AS timestamp
+ FROM registry_src, registry_index
+ WHERE registry_src.registry_id = registry_index.id AND registry_src.size NOT NULL
+
+ UNION
+
+ SELECT 2, registry_crate.rowid, registry_crate.name AS name, registry_index.name,
+ registry_crate.size, registry_crate.timestamp AS timestamp
+ FROM registry_crate, registry_index
+ WHERE registry_crate.registry_id = registry_index.id
+
+ ORDER BY timestamp, name",
+ )?;
+ let mut delete_src_stmt =
+ conn.prepare_cached("DELETE FROM registry_src WHERE rowid = ?1")?;
+ let mut delete_crate_stmt =
+ conn.prepare_cached("DELETE FROM registry_crate WHERE rowid = ?1")?;
+ let rows = stmt
+ .query_map([], |row| {
+ Ok((
+ row.get_unwrap(0),
+ row.get_unwrap(1),
+ row.get_unwrap(2),
+ row.get_unwrap(3),
+ row.get_unwrap(4),
+ ))
+ })?
+ .collect::<Result<Vec<_>, _>>()?;
+ let mut total_size: u64 = rows.iter().map(|r| r.4).sum();
+ debug!("total download cache size appears to be {total_size}");
+ for (table, rowid, name, index_name, size) in rows {
+ if total_size <= max_size {
+ break;
+ }
+ if table == 1 {
+ delete_paths.push(base.src.join(index_name).join(name));
+ delete_src_stmt.execute([rowid])?;
+ } else {
+ delete_paths.push(base.crate_dir.join(index_name).join(name));
+ delete_crate_stmt.execute([rowid])?;
+ }
+ // TODO: If delete crate, ensure src is also deleted.
+ total_size -= size;
+ }
+ Ok(())
+ }
+
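The constant `1`/`2` column in the UNION above is a sentinel recording which table each row came from, since the two tables have to be interleaved into a single timestamp-ordered scan. A sketch of how such a tag might be decoded (plain Rust, illustrative names):

    enum DownloadRow {
        Src { rowid: i64 },   // extracted source directory
        Crate { rowid: i64 }, // downloaded .crate file
    }

    fn decode(tag: u32, rowid: i64) -> DownloadRow {
        match tag {
            1 => DownloadRow::Src { rowid },
            _ => DownloadRow::Crate { rowid },
        }
    }
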
+ /// Adds paths to delete from the git cache, keeping the total size under
+ /// the given value.
+ ///
+ /// Paths are relative to the `git` directory in the cache directory.
+ fn get_git_items_to_clean_size(
+ conn: &Connection,
+ max_size: u64,
+ base: &BasePaths,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning git till under {max_size:?}");
+
+ // Collect all the sizes from git_db and git_checkouts, and then sort them by timestamp.
+ let mut stmt = conn.prepare_cached("SELECT rowid, name, timestamp FROM git_db")?;
+ let mut git_info = stmt
+ .query_map([], |row| {
+ let rowid: i64 = row.get_unwrap(0);
+ let name: String = row.get_unwrap(1);
+ let timestamp: Timestamp = row.get_unwrap(2);
+ // Size is added below so that the error doesn't need to be
+ // converted to a rusqlite error.
+ Ok((timestamp, rowid, None, name, 0))
+ })?
+ .collect::<Result<Vec<_>, _>>()?;
+ for info in &mut git_info {
+ let size = cargo_util::du(&base.git_db.join(&info.3), &[])?;
+ info.4 = size;
+ }
+
+ let mut stmt = conn.prepare_cached(
+ "SELECT git_checkout.rowid, git_db.name, git_checkout.name,
+ git_checkout.size, git_checkout.timestamp
+ FROM git_checkout, git_db
+ WHERE git_checkout.git_id = git_db.id AND git_checkout.size NOT NULL",
+ )?;
+ let git_co_rows = stmt
+ .query_map([], |row| {
+ let rowid = row.get_unwrap(0);
+ let db_name: String = row.get_unwrap(1);
+ let name = row.get_unwrap(2);
+ let size = row.get_unwrap(3);
+ let timestamp = row.get_unwrap(4);
+ Ok((timestamp, rowid, Some(db_name), name, size))
+ })?
+ .collect::<Result<Vec<_>, _>>()?;
+ git_info.extend(git_co_rows);
+
+ // Sort by timestamp, and name. The name is included mostly for test
+ // purposes so that entries with the same timestamp have deterministic
+ // behavior.
+ git_info.sort_by(|a, b| (b.0, &b.3).cmp(&(a.0, &a.3)));
+
+ // Collect paths to delete.
+ let mut delete_db_stmt = conn.prepare_cached("DELETE FROM git_db WHERE rowid = ?1")?;
+ let mut delete_co_stmt =
+ conn.prepare_cached("DELETE FROM git_checkout WHERE rowid = ?1")?;
+ let mut total_size: u64 = git_info.iter().map(|r| r.4).sum();
+ debug!("total git cache size appears to be {total_size}");
+ while let Some((_timestamp, rowid, db_name, name, size)) = git_info.pop() {
+ if total_size <= max_size {
+ break;
+ }
+ if let Some(db_name) = db_name {
+ delete_paths.push(base.git_co.join(db_name).join(name));
+ delete_co_stmt.execute([rowid])?;
+ total_size -= size;
+ } else {
+ total_size -= size;
+ delete_paths.push(base.git_db.join(&name));
+ delete_db_stmt.execute([rowid])?;
+ // If the db is deleted, then all the checkouts must be deleted.
+ let mut i = 0;
+ while i < git_info.len() {
+ if git_info[i].2.as_deref() == Some(name.as_ref()) {
+ let (_, rowid, db_name, name, size) = git_info.remove(i);
+ delete_paths.push(base.git_co.join(db_name.unwrap()).join(name));
+ delete_co_stmt.execute([rowid])?;
+ total_size -= size;
+ } else {
+ i += 1;
+ }
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Adds paths to delete from `registry_index` whose last use is older
+ /// than the given timestamp.
+ fn get_registry_index_to_clean(
+ conn: &Connection,
+ max_age: Timestamp,
+ base: &BasePaths,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning index since {max_age:?}");
+ let mut stmt = conn.prepare_cached(
+ "DELETE FROM registry_index WHERE timestamp < ?1
+ RETURNING name",
+ )?;
+ let mut rows = stmt.query([max_age])?;
+ while let Some(row) = rows.next()? {
+ let name: String = row.get_unwrap(0);
+ delete_paths.push(base.index.join(&name));
+ // Also delete .crate and src directories, since by definition
+ // they cannot be used without their index.
+ delete_paths.push(base.src.join(&name));
+ delete_paths.push(base.crate_dir.join(&name));
+ }
+ Ok(())
+ }
+
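Deleting an index implies deleting everything downloaded from it, which is why `get_registry_index_to_clean` pushes three paths per row. A sketch of that cascade (plain Rust, not cargo's code):

    use std::path::{Path, PathBuf};

    fn cascade(
        base_index: &Path,
        base_src: &Path,
        base_cache: &Path,
        name: &str,
        delete_paths: &mut Vec<PathBuf>,
    ) {
        delete_paths.push(base_index.join(name));
        // By definition these cannot be used without their index:
        delete_paths.push(base_src.join(name));   // extracted sources
        delete_paths.push(base_cache.join(name)); // downloaded .crate files
    }
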
+ /// Adds paths to delete from `git_checkout` whose last use is
+ /// older than the given timestamp.
+ fn get_git_co_items_to_clean(
+ conn: &Connection,
+ max_age: Timestamp,
+ base_path: &Path,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning git co since {max_age:?}");
+ let mut stmt = conn.prepare_cached(
+ "DELETE FROM git_checkout WHERE timestamp < ?1
+ RETURNING git_id, name",
+ )?;
+ let rows = stmt
+ .query_map(params![max_age], |row| {
+ let git_id = row.get_unwrap(0);
+ let name: String = row.get_unwrap(1);
+ Ok((git_id, name))
+ })?
+ .collect::<Result<Vec<_>, _>>()?;
+ let ids: Vec<_> = rows.iter().map(|r| r.0).collect();
+ let id_map = Self::get_id_map(conn, GIT_DB_TABLE, &ids)?;
+ for (id, name) in rows {
+ let encoded_git_name = &id_map[&id];
+ delete_paths.push(base_path.join(encoded_git_name).join(name));
+ }
+ Ok(())
+ }
+
+ /// Adds paths to delete from `git_db` whose last use is older than the
+ /// given timestamp.
+ fn get_git_db_items_to_clean(
+ conn: &Connection,
+ max_age: Timestamp,
+ base: &BasePaths,
+ delete_paths: &mut Vec<PathBuf>,
+ ) -> CargoResult<()> {
+ debug!("cleaning git db since {max_age:?}");
+ let mut stmt = conn.prepare_cached(
+ "DELETE FROM git_db WHERE timestamp < ?1
+ RETURNING name",
+ )?;
+ let mut rows = stmt.query([max_age])?;
+ while let Some(row) = rows.next()? {
+ let name: String = row.get_unwrap(0);
+ delete_paths.push(base.git_db.join(&name));
+ // Also delete checkout directories, since by definition they
+ // cannot be used without their db.
+ delete_paths.push(base.git_co.join(&name));
+ }
+ Ok(())
+ }
+}
+
+/// Helper to generate the upsert for the parent tables.
+///
+/// This handles checking if the row already exists, and only updates the
+/// timestamp if it hasn't been updated recently. This also handles keeping
+/// a cached map of the `id` value.
+///
+/// Unfortunately it is a bit tricky to share this code without a macro.
+macro_rules! insert_or_update_parent {
+ ($self:expr, $conn:expr, $table_name:expr, $timestamps_field:ident, $keys_field:ident, $encoded_name:ident) => {
+ let mut select_stmt = $conn.prepare_cached(concat!(
+ "SELECT id, timestamp FROM ",
+ $table_name,
+ " WHERE name = ?1"
+ ))?;
+ let mut insert_stmt = $conn.prepare_cached(concat!(
+ "INSERT INTO ",
+ $table_name,
+ " (name, timestamp)
+ VALUES (?1, ?2)
+ ON CONFLICT DO UPDATE SET timestamp=excluded.timestamp
+ RETURNING id",
+ ))?;
+ let mut update_stmt = $conn.prepare_cached(concat!(
+ "UPDATE ",
+ $table_name,
+ " SET timestamp = ?1 WHERE id = ?2"
+ ))?;
+ for (parent, new_timestamp) in std::mem::take(&mut $self.$timestamps_field) {
+ trace!(
+ concat!("insert ", $table_name, " {:?} {}"),
+ parent,
+ new_timestamp
+ );
+ let mut rows = select_stmt.query([parent.$encoded_name])?;
+ let id = if let Some(row) = rows.next()? {
+ let id: i64 = row.get_unwrap(0);
+ let timestamp: Timestamp = row.get_unwrap(1);
+ if timestamp < new_timestamp - UPDATE_RESOLUTION {
+ update_stmt.execute(params![new_timestamp, id])?;
+ }
+ id
+ } else {
+ insert_stmt.query_row(params![parent.$encoded_name, new_timestamp], |row| {
+ row.get(0)
+ })?
+ }; + match $self.$keys_field.entry(parent.$encoded_name) { + hash_map::Entry::Occupied(o) => { + assert_eq!(*o.get(), id); + } + hash_map::Entry::Vacant(v) => { + v.insert(id); + } + } + } + return Ok(()); + }; +} + +impl DeferredGlobalLastUse { + pub fn new() -> DeferredGlobalLastUse { + DeferredGlobalLastUse { + registry_keys: HashMap::new(), + git_keys: HashMap::new(), + registry_index_timestamps: HashMap::new(), + registry_crate_timestamps: HashMap::new(), + registry_src_timestamps: HashMap::new(), + git_db_timestamps: HashMap::new(), + git_checkout_timestamps: HashMap::new(), + save_err_has_warned: false, + now: now(), + } + } + + pub fn is_empty(&self) -> bool { + self.registry_index_timestamps.is_empty() + && self.registry_crate_timestamps.is_empty() + && self.registry_src_timestamps.is_empty() + && self.git_db_timestamps.is_empty() + && self.git_checkout_timestamps.is_empty() + } + + fn clear(&mut self) { + self.registry_index_timestamps.clear(); + self.registry_crate_timestamps.clear(); + self.registry_src_timestamps.clear(); + self.git_db_timestamps.clear(); + self.git_checkout_timestamps.clear(); + } + + /// Indicates the given [`RegistryIndex`] has been used right now. + pub fn mark_registry_index_used(&mut self, registry_index: RegistryIndex) { + self.mark_registry_index_used_stamp(registry_index, None); + } + + /// Indicates the given [`RegistryCrate`] has been used right now. + /// + /// Also implicitly marks the index used, too. + pub fn mark_registry_crate_used(&mut self, registry_crate: RegistryCrate) { + self.mark_registry_crate_used_stamp(registry_crate, None); + } + + /// Indicates the given [`RegistrySrc`] has been used right now. + /// + /// Also implicitly marks the index used, too. + pub fn mark_registry_src_used(&mut self, registry_src: RegistrySrc) { + self.mark_registry_src_used_stamp(registry_src, None); + } + + /// Indicates the given [`GitCheckout`] has been used right now. + /// + /// Also implicitly marks the git db used, too. + pub fn mark_git_checkout_used(&mut self, git_checkout: GitCheckout) { + self.mark_git_checkout_used_stamp(git_checkout, None); + } + + /// Indicates the given [`RegistryIndex`] has been used with the given + /// time (or "now" if `None`). + pub fn mark_registry_index_used_stamp( + &mut self, + registry_index: RegistryIndex, + timestamp: Option<&SystemTime>, + ) { + let timestamp = timestamp.map_or(self.now, to_timestamp); + self.registry_index_timestamps + .insert(registry_index, timestamp); + } + + /// Indicates the given [`RegistryCrate`] has been used with the given + /// time (or "now" if `None`). + /// + /// Also implicitly marks the index used, too. + pub fn mark_registry_crate_used_stamp( + &mut self, + registry_crate: RegistryCrate, + timestamp: Option<&SystemTime>, + ) { + let timestamp = timestamp.map_or(self.now, to_timestamp); + let index = RegistryIndex { + encoded_registry_name: registry_crate.encoded_registry_name, + }; + self.registry_index_timestamps.insert(index, timestamp); + self.registry_crate_timestamps + .insert(registry_crate, timestamp); + } + + /// Indicates the given [`RegistrySrc`] has been used with the given + /// time (or "now" if `None`). + /// + /// Also implicitly marks the index used, too. 
+ pub fn mark_registry_src_used_stamp(
+ &mut self,
+ registry_src: RegistrySrc,
+ timestamp: Option<&SystemTime>,
+ ) {
+ let timestamp = timestamp.map_or(self.now, to_timestamp);
+ let index = RegistryIndex {
+ encoded_registry_name: registry_src.encoded_registry_name,
+ };
+ self.registry_index_timestamps.insert(index, timestamp);
+ self.registry_src_timestamps.insert(registry_src, timestamp);
+ }
+
+ /// Indicates the given [`GitCheckout`] has been used with the given
+ /// time (or "now" if `None`).
+ ///
+ /// Also implicitly marks the git db used, too.
+ pub fn mark_git_checkout_used_stamp(
+ &mut self,
+ git_checkout: GitCheckout,
+ timestamp: Option<&SystemTime>,
+ ) {
+ let timestamp = timestamp.map_or(self.now, to_timestamp);
+ let db = GitDb {
+ encoded_git_name: git_checkout.encoded_git_name,
+ };
+ self.git_db_timestamps.insert(db, timestamp);
+ self.git_checkout_timestamps.insert(git_checkout, timestamp);
+ }
+
+ /// Saves all of the deferred information to the database.
+ ///
+ /// This will also clear the state of `self`.
+ pub fn save(&mut self, tracker: &mut GlobalCacheTracker) -> CargoResult<()> {
+ let _p = crate::util::profile::start("saving last-use data");
+ trace!("saving last-use data");
+ if self.is_empty() {
+ return Ok(());
+ }
+ let tx = tracker.conn.transaction()?;
+ // These must run before the ones that refer to their IDs.
+ self.insert_registry_index_from_cache(&tx)?;
+ self.insert_git_db_from_cache(&tx)?;
+ self.insert_registry_crate_from_cache(&tx)?;
+ self.insert_registry_src_from_cache(&tx)?;
+ self.insert_git_checkout_from_cache(&tx)?;
+ tx.commit()?;
+ trace!("last-use save complete");
+ Ok(())
+ }
+
+ /// Variant of [`DeferredGlobalLastUse::save`] that does not return an
+ /// error.
+ ///
+ /// This will log or display a warning to the user.
+ pub fn save_no_error(&mut self, config: &Config) {
+ if let Err(e) = self.save_with_config(config) {
+ // Because there is an assertion in auto-gc that checks if this is
+ // empty, be sure to clear it so that assertion doesn't fail.
+ self.clear();
+ if !self.save_err_has_warned {
+ if is_silent_error(&e) && config.shell().verbosity() != Verbosity::Verbose {
+ tracing::warn!("failed to save last-use data: {e:?}");
+ } else {
+ crate::display_warning_with_error(
+ "failed to save last-use data\n\
+ This may prevent cargo from accurately tracking what is being \
+ used in its global cache. This information is used for \
+ automatically removing unused data in the cache.",
+ &e,
+ &mut config.shell(),
+ );
+ self.save_err_has_warned = true;
+ }
+ }
+ }
+ }
+
+ fn save_with_config(&mut self, config: &Config) -> CargoResult<()> {
+ let mut tracker = config.global_cache_tracker()?;
+ self.save(&mut tracker)
+ }
+
+ /// Flushes all of the `registry_index_timestamps` to the database,
+ /// clearing `registry_index_timestamps`.
+ fn insert_registry_index_from_cache(&mut self, conn: &Connection) -> CargoResult<()> {
+ insert_or_update_parent!(
+ self,
+ conn,
+ "registry_index",
+ registry_index_timestamps,
+ registry_keys,
+ encoded_registry_name
+ );
+ }
+
+ /// Flushes all of the `git_db_timestamps` to the database,
+ /// clearing `git_db_timestamps`.
+ fn insert_git_db_from_cache(&mut self, conn: &Connection) -> CargoResult<()> {
+ insert_or_update_parent!(
+ self,
+ conn,
+ "git_db",
+ git_db_timestamps,
+ git_keys,
+ encoded_git_name
+ );
+ }
+
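The upsert macro used by the two functions above throttles timestamp writes: a row is only updated when the stored value is older than `UPDATE_RESOLUTION`, so repeated builds don't rewrite the same row over and over. A sketch of the check (assumes `rusqlite`; the function name and table are illustrative):

    use rusqlite::{params, Connection};

    fn touch(
        conn: &Connection,
        id: i64,
        stored: u64,
        new: u64,
        resolution: u64,
    ) -> rusqlite::Result<()> {
        // Skip the write entirely when the row is already fresh enough.
        if stored < new.saturating_sub(resolution) {
            conn.execute(
                "UPDATE registry_index SET timestamp = ?1 WHERE id = ?2",
                params![new, id],
            )?;
        }
        Ok(())
    }
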
+ /// Flushes all of the `registry_crate_timestamps` to the database,
+ /// clearing `registry_crate_timestamps`.
+ fn insert_registry_crate_from_cache(&mut self, conn: &Connection) -> CargoResult<()> {
+ let registry_crate_timestamps = std::mem::take(&mut self.registry_crate_timestamps);
+ for (registry_crate, timestamp) in registry_crate_timestamps {
+ trace!("insert registry crate {registry_crate:?} {timestamp}");
+ let registry_id = self.registry_id(conn, registry_crate.encoded_registry_name)?;
+ let mut stmt = conn.prepare_cached(
+ "INSERT INTO registry_crate (registry_id, name, size, timestamp)
+ VALUES (?1, ?2, ?3, ?4)
+ ON CONFLICT DO UPDATE SET timestamp=excluded.timestamp
+ WHERE timestamp < ?5
+ ",
+ )?;
+ stmt.execute(params![
+ registry_id,
+ registry_crate.crate_filename,
+ registry_crate.size,
+ timestamp,
+ timestamp - UPDATE_RESOLUTION
+ ])?;
+ }
+ Ok(())
+ }
+
+ /// Flushes all of the `registry_src_timestamps` to the database,
+ /// clearing `registry_src_timestamps`.
+ fn insert_registry_src_from_cache(&mut self, conn: &Connection) -> CargoResult<()> {
+ let registry_src_timestamps = std::mem::take(&mut self.registry_src_timestamps);
+ for (registry_src, timestamp) in registry_src_timestamps {
+ trace!("insert registry src {registry_src:?} {timestamp}");
+ let registry_id = self.registry_id(conn, registry_src.encoded_registry_name)?;
+ let mut stmt = conn.prepare_cached(
+ "INSERT INTO registry_src (registry_id, name, size, timestamp)
+ VALUES (?1, ?2, ?3, ?4)
+ ON CONFLICT DO UPDATE SET timestamp=excluded.timestamp
+ WHERE timestamp < ?5
+ ",
+ )?;
+ stmt.execute(params![
+ registry_id,
+ registry_src.package_dir,
+ registry_src.size,
+ timestamp,
+ timestamp - UPDATE_RESOLUTION
+ ])?;
+ }
+
+ Ok(())
+ }
+
+ /// Flushes all of the `git_checkout_timestamps` to the database,
+ /// clearing `git_checkout_timestamps`.
+ fn insert_git_checkout_from_cache(&mut self, conn: &Connection) -> CargoResult<()> {
+ let git_checkout_timestamps = std::mem::take(&mut self.git_checkout_timestamps);
+ for (git_checkout, timestamp) in git_checkout_timestamps {
+ let git_id = self.git_id(conn, git_checkout.encoded_git_name)?;
+ let mut stmt = conn.prepare_cached(
+ "INSERT INTO git_checkout (git_id, name, size, timestamp)
+ VALUES (?1, ?2, ?3, ?4)
+ ON CONFLICT DO UPDATE SET timestamp=excluded.timestamp
+ WHERE timestamp < ?5",
+ )?;
+ stmt.execute(params![
+ git_id,
+ git_checkout.short_name,
+ git_checkout.size,
+ timestamp,
+ timestamp - UPDATE_RESOLUTION
+ ])?;
+ }
+
+ Ok(())
+ }
+
+ /// Returns the numeric ID of the registry, either fetching from the local
+ /// cache, or getting it from the database.
+ ///
+ /// It is an error if the registry does not exist.
+ fn registry_id(
+ &mut self,
+ conn: &Connection,
+ encoded_registry_name: InternedString,
+ ) -> CargoResult<i64> {
+ match self.registry_keys.get(&encoded_registry_name) {
+ Some(i) => Ok(*i),
+ None => {
+ let Some(id) = GlobalCacheTracker::id_from_name(
+ conn,
+ REGISTRY_INDEX_TABLE,
+ &encoded_registry_name,
+ )?
+ else {
+ bail!("expected registry_index {encoded_registry_name} to exist, but wasn't found");
+ };
+ self.registry_keys.insert(encoded_registry_name, id);
+ Ok(id)
+ }
+ }
+ }
+
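`registry_id` (and `git_id` below) memoize the name-to-id lookup in a local map so that flushing many rows only queries the parent table once per name. The shape of that pattern, reduced to a sketch (plain Rust, not cargo's code):

    use std::collections::HashMap;

    fn cached_id(
        cache: &mut HashMap<String, i64>,
        name: &str,
        lookup: impl FnOnce(&str) -> Option<i64>,
    ) -> Option<i64> {
        if let Some(id) = cache.get(name) {
            return Some(*id);
        }
        let id = lookup(name)?; // hit the database only on a cache miss
        cache.insert(name.to_string(), id);
        Some(id)
    }
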
+ /// Returns the numeric ID of the git db, either fetching from the local
+ /// cache, or getting it from the database.
+ ///
+ /// It is an error if the git db does not exist.
+ fn git_id(&mut self, conn: &Connection, encoded_git_name: InternedString) -> CargoResult<i64> {
+ match self.git_keys.get(&encoded_git_name) {
+ Some(i) => Ok(*i),
+ None => {
+ let Some(id) =
+ GlobalCacheTracker::id_from_name(conn, GIT_DB_TABLE, &encoded_git_name)?
+ else {
+ bail!("expected git_db {encoded_git_name} to exist, but wasn't found")
+ };
+ self.git_keys.insert(encoded_git_name, id);
+ Ok(id)
+ }
+ }
+ }
+}
+
+/// Converts a [`SystemTime`] to a [`Timestamp`] which can be stored in the database.
+fn to_timestamp(t: &SystemTime) -> Timestamp {
+ t.duration_since(SystemTime::UNIX_EPOCH)
+ .expect("invalid clock")
+ .as_secs()
+}
+
+/// Returns the current time.
+///
+/// This supports pretending that the time is different for testing using an
+/// environment variable.
+///
+/// If possible, try to avoid calling this too often since accessing clocks
+/// can be a little slow on some systems.
+#[allow(clippy::disallowed_methods)]
+fn now() -> Timestamp {
+ match std::env::var("__CARGO_TEST_LAST_USE_NOW") {
+ Ok(now) => now.parse().unwrap(),
+ Err(_) => to_timestamp(&SystemTime::now()),
+ }
+}
+
+/// Returns whether or not the given error should cause a warning to be
+/// displayed to the user.
+///
+/// In some situations, like a read-only global cache, we don't want to spam
+/// the user with a warning. Once cargo has controllable lints, we should
+/// consider changing this to always warn, but give the user an option to
+/// silence the warning.
+pub fn is_silent_error(e: &anyhow::Error) -> bool {
+ if let Some(e) = e.downcast_ref::<rusqlite::Error>() {
+ if matches!(
+ e.sqlite_error_code(),
+ Some(ErrorCode::CannotOpen | ErrorCode::ReadOnly)
+ ) {
+ return true;
+ }
+ }
+ false
+}
+
+fn du(path: &Path, table_name: &str) -> CargoResult<u64> {
+ // !.git is used because clones typically use hardlinks for the git
+ // contents. TODO: Verify behavior on Windows.
+ // TODO: Or even better, switch to worktrees, and remove this.
+ let patterns = if table_name == GIT_CO_TABLE {
+ &["!.git"][..]
+ } else {
+ &[][..]
+ };
+ cargo_util::du(&path, patterns)
+}
diff --git a/src/cargo/core/mod.rs b/src/cargo/core/mod.rs
index 2add52d5c..808091061 100644
--- a/src/cargo/core/mod.rs
+++ b/src/cargo/core/mod.rs
@@ -19,6 +19,8 @@ pub use crate::util::toml::schema::InheritableFields;
 pub mod compiler;
 pub mod dependency;
 pub mod features;
+pub mod gc;
+pub mod global_cache_tracker;
 pub mod manifest;
 pub mod package;
 pub mod package_id;
diff --git a/src/cargo/core/package.rs b/src/cargo/core/package.rs
index 274798474..d87f81036 100644
--- a/src/cargo/core/package.rs
+++ b/src/cargo/core/package.rs
@@ -491,6 +491,10 @@ impl<'cfg> PackageSet<'cfg> {
 pkgs.push(downloads.wait()?);
 }
 downloads.success = true;
+ drop(downloads);
+
+ let mut deferred = self.config.deferred_global_last_use()?;
+ deferred.save_no_error(self.config);
 Ok(pkgs)
 }
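The `PackageSet` hunk above shows the intended usage pattern for `DeferredGlobalLastUse`: mark entries cheaply as they are touched, then flush the whole batch when downloads complete. Roughly (a sketch, not cargo's code):

    struct Deferred {
        marks: Vec<(String, u64)>, // (cache entry name, last-use timestamp)
    }

    impl Deferred {
        /// Cheap in-memory record; no database I/O here.
        fn mark(&mut self, name: &str, now: u64) {
            self.marks.push((name.to_string(), now));
        }

        /// One transaction for the whole batch, then reset.
        fn save(&mut self) {
            // ... write all marks inside a single sqlite transaction ...
            self.marks.clear();
        }
    }
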
diff --git a/src/cargo/ops/cargo_clean.rs b/src/cargo/ops/cargo_clean.rs
index 6f58b8bdc..35c7063f4 100644
--- a/src/cargo/ops/cargo_clean.rs
+++ b/src/cargo/ops/cargo_clean.rs
@@ -1,7 +1,10 @@
 use crate::core::compiler::{CompileKind, CompileMode, Layout, RustcTargetData};
+use crate::core::gc::{AutoGcKind, Gc, GcOpts};
+use crate::core::global_cache_tracker::GlobalCacheTracker;
 use crate::core::profiles::Profiles;
 use crate::core::{PackageIdSpec, TargetKind, Workspace};
 use crate::ops;
+use crate::util::cache_lock::CacheLockMode;
 use crate::util::edit_distance;
 use crate::util::errors::CargoResult;
 use crate::util::interning::InternedString;
@@ -25,6 +28,7 @@ pub struct CleanOptions<'cfg> {
 pub doc: bool,
 /// If set, doesn't delete anything.
 pub dry_run: bool,
+ pub gc_opts: GcOpts,
 }
 
 pub struct CleanContext<'cfg> {
@@ -37,45 +41,76 @@
 }
 
 /// Cleans various caches.
-pub fn clean(ws: &Workspace<'_>, opts: &CleanOptions<'_>) -> CargoResult<()> {
- let mut target_dir = ws.target_dir();
+pub fn clean(ws: CargoResult<Workspace<'_>>, opts: &CleanOptions<'_>) -> CargoResult<()> {
 let config = opts.config;
 let mut ctx = CleanContext::new(config);
 ctx.dry_run = opts.dry_run;
- if opts.doc {
- if !opts.spec.is_empty() {
- // FIXME: https://github.com/rust-lang/cargo/issues/8790
- // This should support the ability to clean specific packages
- // within the doc directory. It's a little tricky since it
- // needs to find all documentable targets, but also consider
- // the fact that target names might overlap with dependency
- // names and such.
- bail!("--doc cannot be used with -p");
- }
- // If the doc option is set, we just want to delete the doc directory.
- target_dir = target_dir.join("doc");
- ctx.remove_paths(&[target_dir.into_path_unlocked()])?;
- } else {
- let profiles = Profiles::new(&ws, opts.requested_profile)?;
+ let any_download_cache_opts = opts.gc_opts.is_download_cache_opt_set();
 
- if opts.profile_specified {
- // After parsing profiles we know the dir-name of the profile, if a profile
- // was passed from the command line. If so, delete only the directory of
- // that profile.
- let dir_name = profiles.get_dir_name();
- target_dir = target_dir.join(dir_name);
- }
+ // The following options need a workspace.
+ let any_ws_opts = !opts.spec.is_empty()
+ || !opts.targets.is_empty()
+ || opts.profile_specified
+ || opts.doc
+ || opts.gc_opts.is_target_opt_set();
 
- // If we have a spec, then we need to delete some packages, otherwise, just
- // remove the whole target directory and be done with it!
- //
- // Note that we don't bother grabbing a lock here as we're just going to
- // blow it all away anyway.
- if opts.spec.is_empty() {
+ // When no options are specified, do the default action.
+ let no_opts_specified = !any_download_cache_opts && !any_ws_opts;
+
+ if any_ws_opts || no_opts_specified {
+ let ws = ws?;
+ let mut target_dir = ws.target_dir();
+
+ if opts.doc {
+ if !opts.spec.is_empty() {
+ // FIXME: https://github.com/rust-lang/cargo/issues/8790
+ // This should support the ability to clean specific packages
+ // within the doc directory. It's a little tricky since it
+ // needs to find all documentable targets, but also consider
+ // the fact that target names might overlap with dependency
+ // names and such.
+ bail!("--doc cannot be used with -p");
+ }
+ // If the doc option is set, we just want to delete the doc directory.
+ target_dir = target_dir.join("doc");
 ctx.remove_paths(&[target_dir.into_path_unlocked()])?;
 } else {
- clean_specs(&mut ctx, &ws, &profiles, &opts.targets, &opts.spec)?;
+ let profiles = Profiles::new(&ws, opts.requested_profile)?;
+
+ if opts.profile_specified {
+ // After parsing profiles we know the dir-name of the profile, if a profile
+ // was passed from the command line. If so, delete only the directory of
+ // that profile.
+ let dir_name = profiles.get_dir_name();
+ target_dir = target_dir.join(dir_name);
+ }
+
+ // If we have a spec, then we need to delete some packages, otherwise, just
+ // remove the whole target directory and be done with it!
+ //
+ // Note that we don't bother grabbing a lock here as we're just going to
+ // blow it all away anyway.
+ if opts.spec.is_empty() {
+ ctx.remove_paths(&[target_dir.into_path_unlocked()])?;
+ } else {
+ clean_specs(&mut ctx, &ws, &profiles, &opts.targets, &opts.spec)?;
+ }
+ }
+ }
+
+ if config.cli_unstable().gc {
+ let _lock = config.acquire_package_cache_lock(CacheLockMode::MutateExclusive)?;
+ let mut cache_track = GlobalCacheTracker::new(&config)?;
+ let mut gc = Gc::new(config, &mut cache_track)?;
+ if no_opts_specified {
+ // This is the behavior for `cargo clean` without *any* options.
+ // It uses the defaults from config to determine what is cleaned.
+ let mut gc_opts = opts.gc_opts.clone();
+ gc_opts.update_for_auto_gc(config, &[AutoGcKind::All], None)?;
+ gc.gc(&mut ctx, &gc_opts)?;
+ } else {
+ gc.gc(&mut ctx, &opts.gc_opts)?;
+ }
+ }
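The signature change to `clean` (taking `CargoResult<Workspace<'_>>` rather than `&Workspace<'_>`) defers the workspace error until an option actually needs a workspace, so cache-only cleaning works outside of any Cargo project. The idea in miniature (plain Rust, not cargo's code):

    fn clean(ws: Result<String, String>, needs_workspace: bool) -> Result<(), String> {
        if needs_workspace {
            // Only now does a missing workspace become an error.
            let ws = ws?;
            let _ = ws;
        }
        // Cache-only options proceed without a workspace.
        Ok(())
    }
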
diff --git a/src/cargo/ops/cargo_compile/mod.rs b/src/cargo/ops/cargo_compile/mod.rs
index 94c6cf9de..3522ef9d3 100644
--- a/src/cargo/ops/cargo_compile/mod.rs
+++ b/src/cargo/ops/cargo_compile/mod.rs
@@ -153,6 +153,7 @@ pub fn compile_ws<'a>(
 unit_graph::emit_serialized_unit_graph(&bcx.roots, &bcx.unit_graph, ws.config())?;
 return Compilation::new(&bcx);
 }
+ crate::core::gc::auto_gc(bcx.config);
 let _p = profile::start("compiling");
 let cx = Context::new(&bcx)?;
 cx.compile(exec)
diff --git a/src/cargo/ops/cargo_fetch.rs b/src/cargo/ops/cargo_fetch.rs
index 6acdbddef..ac2b60aab 100644
--- a/src/cargo/ops/cargo_fetch.rs
+++ b/src/cargo/ops/cargo_fetch.rs
@@ -76,6 +76,7 @@ pub fn fetch<'a>(
 }
 
 packages.get_many(to_download)?;
+ crate::core::gc::auto_gc(config);
 Ok((resolve, packages))
 }
diff --git a/src/cargo/ops/mod.rs b/src/cargo/ops/mod.rs
index 13613eaf6..76fa91d25 100644
--- a/src/cargo/ops/mod.rs
+++ b/src/cargo/ops/mod.rs
@@ -1,6 +1,6 @@
 use crate::sources::CRATES_IO_DOMAIN;
-pub use self::cargo_clean::{clean, CleanOptions};
+pub use self::cargo_clean::{clean, CleanContext, CleanOptions};
 pub use self::cargo_compile::{
 compile, compile_with_exec, compile_ws, create_bcx, print, resolve_all_features, CompileOptions,
 };
diff --git a/src/cargo/ops/resolve.rs b/src/cargo/ops/resolve.rs
index 8ca72f77c..00d3b1144 100644
--- a/src/cargo/ops/resolve.rs
+++ b/src/cargo/ops/resolve.rs
@@ -530,6 +530,9 @@ pub fn resolve_with_previous<'cfg>(
 if let Some(previous) = previous {
 resolved.merge_from(previous)?;
 }
+ let config = ws.config();
+ let mut deferred = config.deferred_global_last_use()?;
+ deferred.save_no_error(config);
 Ok(resolved)
 }
diff --git a/src/cargo/sources/git/source.rs b/src/cargo/sources/git/source.rs
index a75c1ec6d..9fc874b29 100644
--- a/src/cargo/sources/git/source.rs
+++ b/src/cargo/sources/git/source.rs
@@ -1,5 +1,6 @@
 //! See [GitSource].
 
+use crate::core::global_cache_tracker;
 use crate::core::GitReference;
 use crate::core::SourceId;
 use crate::core::{Dependency, Package, PackageId, Summary};
@@ -11,6 +12,7 @@ use crate::sources::PathSource;
 use crate::util::cache_lock::CacheLockMode;
 use crate::util::errors::CargoResult;
 use crate::util::hex::short_hash;
+use crate::util::interning::InternedString;
 use crate::util::Config;
 use anyhow::Context;
 use cargo_util::paths::exclude_from_backups_and_indexing;
@@ -74,9 +76,10 @@ pub struct GitSource<'cfg> {
 source_id: SourceId,
 /// The underlying path source to discover packages inside the Git repository.
 path_source: Option<PathSource<'cfg>>,
+ short_id: Option<InternedString>,
 /// The identifier of this source for Cargo's Git cache directory.
 /// See [`ident`] for more.
- ident: String,
+ ident: InternedString,
 config: &'cfg Config,
 /// Disables status messages.
quiet: bool, @@ -104,7 +107,8 @@ impl<'cfg> GitSource<'cfg> { locked_rev, source_id, path_source: None, - ident, + short_id: None, + ident: ident.into(), config, quiet: false, }; @@ -127,6 +131,17 @@ impl<'cfg> GitSource<'cfg> { } self.path_source.as_mut().unwrap().read_packages() } + + fn mark_used(&self, size: Option) -> CargoResult<()> { + self.config + .deferred_global_last_use()? + .mark_git_checkout_used(global_cache_tracker::GitCheckout { + encoded_git_name: self.ident, + short_name: self.short_id.expect("update before download"), + size, + }); + Ok(()) + } } /// Create an identifier from a URL, @@ -200,6 +215,7 @@ impl<'cfg> Source for GitSource<'cfg> { fn block_until_ready(&mut self) -> CargoResult<()> { if self.path_source.is_some() { + self.mark_used(None)?; return Ok(()); } @@ -290,8 +306,19 @@ impl<'cfg> Source for GitSource<'cfg> { let path_source = PathSource::new_recursive(&checkout_path, source_id, self.config); self.path_source = Some(path_source); + self.short_id = Some(short_id.as_str().into()); self.locked_rev = Some(actual_rev); - self.path_source.as_mut().unwrap().update() + self.path_source.as_mut().unwrap().update()?; + + // Hopefully this shouldn't incur too much of a performance hit since + // most of this should already be in cache since it was just + // extracted. + // + // !.git is used because clones typically use hardlinks for the git + // contents. TODO: Verify behavior on Windows. + let size = cargo_util::du(&checkout_path, &["!.git"])?; + self.mark_used(Some(size))?; + Ok(()) } fn download(&mut self, id: PackageId) -> CargoResult { @@ -300,6 +327,7 @@ impl<'cfg> Source for GitSource<'cfg> { id, self.remote ); + self.mark_used(None)?; self.path_source .as_mut() .expect("BUG: `update()` must be called before `get()`") diff --git a/src/cargo/sources/registry/download.rs b/src/cargo/sources/registry/download.rs index 786432835..daf1d0537 100644 --- a/src/cargo/sources/registry/download.rs +++ b/src/cargo/sources/registry/download.rs @@ -3,11 +3,13 @@ //! [`HttpRegistry`]: super::http_remote::HttpRegistry //! 
[`RemoteRegistry`]: super::remote::RemoteRegistry +use crate::util::interning::InternedString; use anyhow::Context; use cargo_credential::Operation; use cargo_util::registry::make_dep_path; use cargo_util::Sha256; +use crate::core::global_cache_tracker; use crate::core::PackageId; use crate::sources::registry::MaybeLock; use crate::sources::registry::RegistryConfig; @@ -34,6 +36,7 @@ const CHECKSUM_TEMPLATE: &str = "{sha256-checksum}"; pub(super) fn download( cache_path: &Filesystem, config: &Config, + encoded_registry_name: InternedString, pkg: PackageId, checksum: &str, registry_config: RegistryConfig, @@ -50,6 +53,13 @@ pub(super) fn download( if let Ok(dst) = File::open(path) { let meta = dst.metadata()?; if meta.len() > 0 { + config.deferred_global_last_use()?.mark_registry_crate_used( + global_cache_tracker::RegistryCrate { + encoded_registry_name, + crate_filename: pkg.tarball_name().into(), + size: meta.len(), + }, + ); return Ok(MaybeLock::Ready(dst)); } } @@ -106,6 +116,7 @@ pub(super) fn download( pub(super) fn finish_download( cache_path: &Filesystem, config: &Config, + encoded_registry_name: InternedString, pkg: PackageId, checksum: &str, data: &[u8], @@ -115,6 +126,13 @@ pub(super) fn finish_download( if actual != checksum { anyhow::bail!("failed to verify the checksum of `{}`", pkg) } + config.deferred_global_last_use()?.mark_registry_crate_used( + global_cache_tracker::RegistryCrate { + encoded_registry_name, + crate_filename: pkg.tarball_name().into(), + size: data.len() as u64, + }, + ); cache_path.create_dir()?; let path = cache_path.join(&pkg.tarball_name()); diff --git a/src/cargo/sources/registry/http_remote.rs b/src/cargo/sources/registry/http_remote.rs index 3d31110c3..1dfae4ad8 100644 --- a/src/cargo/sources/registry/http_remote.rs +++ b/src/cargo/sources/registry/http_remote.rs @@ -1,11 +1,13 @@ //! Access to a HTTP-based crate registry. See [`HttpRegistry`] for details. +use crate::core::global_cache_tracker; use crate::core::{PackageId, SourceId}; use crate::sources::registry::download; use crate::sources::registry::MaybeLock; use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData}; use crate::util::cache_lock::CacheLockMode; use crate::util::errors::{CargoResult, HttpNotSuccessful}; +use crate::util::interning::InternedString; use crate::util::network::http::http_handle; use crate::util::network::retry::{Retry, RetryResult}; use crate::util::network::sleep::SleepTracker; @@ -52,6 +54,7 @@ const UNKNOWN: &'static str = "Unknown"; /// /// [RFC 2789]: https://github.com/rust-lang/rfcs/pull/2789 pub struct HttpRegistry<'cfg> { + name: InternedString, /// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`). /// /// To be fair, `HttpRegistry` doesn't store the registry index it @@ -199,6 +202,7 @@ impl<'cfg> HttpRegistry<'cfg> { .expect("a url with the sparse+ stripped should still be valid"); Ok(HttpRegistry { + name: name.into(), index_path: config.registry_index_path().join(name), cache_path: config.registry_cache_path().join(name), source_id, @@ -454,6 +458,11 @@ impl<'cfg> HttpRegistry<'cfg> { impl<'cfg> RegistryData for HttpRegistry<'cfg> { fn prepare(&self) -> CargoResult<()> { + self.config + .deferred_global_last_use()? 
+ .mark_registry_index_used(global_cache_tracker::RegistryIndex { + encoded_registry_name: self.name, + }); Ok(()) } @@ -750,6 +759,7 @@ impl<'cfg> RegistryData for HttpRegistry<'cfg> { download::download( &self.cache_path, &self.config, + self.name.clone(), pkg, checksum, registry_config, @@ -762,7 +772,14 @@ impl<'cfg> RegistryData for HttpRegistry<'cfg> { checksum: &str, data: &[u8], ) -> CargoResult { - download::finish_download(&self.cache_path, &self.config, pkg, checksum, data) + download::finish_download( + &self.cache_path, + &self.config, + self.name.clone(), + pkg, + checksum, + data, + ) } fn is_crate_downloaded(&self, pkg: PackageId) -> bool { diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs index 7ee461edd..f884eec30 100644 --- a/src/cargo/sources/registry/mod.rs +++ b/src/cargo/sources/registry/mod.rs @@ -201,6 +201,7 @@ use tar::Archive; use tracing::debug; use crate::core::dependency::Dependency; +use crate::core::global_cache_tracker; use crate::core::{Package, PackageId, SourceId, Summary}; use crate::sources::source::MaybePackage; use crate::sources::source::QueryKind; @@ -239,6 +240,7 @@ struct LockMetadata { /// /// For general concepts of registries, see the [module-level documentation](crate::sources::registry). pub struct RegistrySource<'cfg> { + name: InternedString, /// The unique identifier of this source. source_id: SourceId, /// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`). @@ -514,6 +516,7 @@ impl<'cfg> RegistrySource<'cfg> { yanked_whitelist: &HashSet, ) -> RegistrySource<'cfg> { RegistrySource { + name: name.into(), src_path: config.registry_source_path().join(name), config, source_id, @@ -589,6 +592,13 @@ impl<'cfg> RegistrySource<'cfg> { match fs::read_to_string(path) { Ok(ok) => match serde_json::from_str::(&ok) { Ok(lock_meta) if lock_meta.v == 1 => { + self.config + .deferred_global_last_use()? + .mark_registry_src_used(global_cache_tracker::RegistrySrc { + encoded_registry_name: self.name, + package_dir: package_dir.into(), + size: None, + }); return Ok(unpack_dir.to_path_buf()); } _ => { @@ -613,6 +623,7 @@ impl<'cfg> RegistrySource<'cfg> { set_mask(&mut tar); tar }; + let mut bytes_written = 0; let prefix = unpack_dir.file_name().unwrap(); let parent = unpack_dir.parent().unwrap(); for entry in tar.entries()? { @@ -644,6 +655,7 @@ impl<'cfg> RegistrySource<'cfg> { continue; } // Unpacking failed + bytes_written += entry.size(); let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from); if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) { result = result.with_context(|| { @@ -670,6 +682,14 @@ impl<'cfg> RegistrySource<'cfg> { let lock_meta = LockMetadata { v: 1 }; write!(ok, "{}", serde_json::to_string(&lock_meta).unwrap())?; + self.config + .deferred_global_last_use()? + .mark_registry_src_used(global_cache_tracker::RegistrySrc { + encoded_registry_name: self.name, + package_dir: package_dir.into(), + size: Some(bytes_written), + }); + Ok(unpack_dir.to_path_buf()) } diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs index ba171eac3..4e7dd5f6c 100644 --- a/src/cargo/sources/registry/remote.rs +++ b/src/cargo/sources/registry/remote.rs @@ -1,5 +1,6 @@ //! Access to a Git index based registry. See [`RemoteRegistry`] for details. 
+use crate::core::global_cache_tracker; use crate::core::{GitReference, PackageId, SourceId}; use crate::sources::git; use crate::sources::git::fetch::RemoteKind; @@ -47,6 +48,7 @@ use tracing::{debug, trace}; /// /// [`HttpRegistry`]: super::http_remote::HttpRegistry pub struct RemoteRegistry<'cfg> { + name: InternedString, /// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`). index_path: Filesystem, /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`). @@ -87,6 +89,7 @@ impl<'cfg> RemoteRegistry<'cfg> { /// registry index are stored. Expect to be unique. pub fn new(source_id: SourceId, config: &'cfg Config, name: &str) -> RemoteRegistry<'cfg> { RemoteRegistry { + name: name.into(), index_path: config.registry_index_path().join(name), cache_path: config.registry_cache_path().join(name), source_id, @@ -211,6 +214,11 @@ impl<'cfg> RemoteRegistry<'cfg> { impl<'cfg> RegistryData for RemoteRegistry<'cfg> { fn prepare(&self) -> CargoResult<()> { self.repo()?; + self.config + .deferred_global_last_use()? + .mark_registry_index_used(global_cache_tracker::RegistryIndex { + encoded_registry_name: self.name, + }); Ok(()) } @@ -403,6 +411,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { download::download( &self.cache_path, &self.config, + self.name, pkg, checksum, registry_config, @@ -415,7 +424,14 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { checksum: &str, data: &[u8], ) -> CargoResult { - download::finish_download(&self.cache_path, &self.config, pkg, checksum, data) + download::finish_download( + &self.cache_path, + &self.config, + self.name.clone(), + pkg, + checksum, + data, + ) } fn is_crate_downloaded(&self, pkg: PackageId) -> bool { diff --git a/src/cargo/util/config/mod.rs b/src/cargo/util/config/mod.rs index 50153466b..b054541d0 100644 --- a/src/cargo/util/config/mod.rs +++ b/src/cargo/util/config/mod.rs @@ -68,6 +68,7 @@ use std::time::Instant; use self::ConfigValue as CV; use crate::core::compiler::rustdoc::RustdocExternMap; +use crate::core::global_cache_tracker::{DeferredGlobalLastUse, GlobalCacheTracker}; use crate::core::shell::Verbosity; use crate::core::{features, CliUnstable, Shell, SourceId, Workspace, WorkspaceRootConfig}; use crate::ops::RegistryCredentialConfig; @@ -244,6 +245,8 @@ pub struct Config { pub nightly_features_allowed: bool, /// WorkspaceRootConfigs that have been found pub ws_roots: RefCell>, + global_cache_tracker: LazyCell>, + deferred_global_last_use: LazyCell>, } impl Config { @@ -317,6 +320,8 @@ impl Config { env_config: LazyCell::new(), nightly_features_allowed: matches!(&*features::channel(), "nightly" | "dev"), ws_roots: RefCell::new(HashMap::new()), + global_cache_tracker: LazyCell::new(), + deferred_global_last_use: LazyCell::new(), } } @@ -1919,6 +1924,25 @@ impl Config { ) -> CargoResult>> { self.package_cache_lock.try_lock(self, mode) } + + /// Returns a reference to the shared [`GlobalCacheTracker`]. + /// + /// The package cache lock must be held to call this function (and to use + /// it in general). + pub fn global_cache_tracker(&self) -> CargoResult> { + let tracker = self.global_cache_tracker.try_borrow_with(|| { + Ok::<_, anyhow::Error>(RefCell::new(GlobalCacheTracker::new(self)?)) + })?; + Ok(tracker.borrow_mut()) + } + + /// Returns a reference to the shared [`DeferredGlobalLastUse`]. 
+ pub fn deferred_global_last_use(&self) -> CargoResult> { + let deferred = self.deferred_global_last_use.try_borrow_with(|| { + Ok::<_, anyhow::Error>(RefCell::new(DeferredGlobalLastUse::new())) + })?; + Ok(deferred.borrow_mut()) + } } /// Internal error for serde errors. diff --git a/tests/testsuite/clean.rs b/tests/testsuite/clean.rs index fbb4d3e5b..fef351e9d 100644 --- a/tests/testsuite/clean.rs +++ b/tests/testsuite/clean.rs @@ -1,5 +1,6 @@ //! Tests for the `cargo clean` command. +use cargo_test_support::paths::CargoPathExt; use cargo_test_support::registry::Package; use cargo_test_support::{ basic_bin_manifest, basic_manifest, git, main_file, project, project_in, rustc_host, @@ -805,15 +806,6 @@ fn clean_dry_run() { .file("src/lib.rs", "") .build(); - let ls_r = || -> Vec<_> { - let mut file_list: Vec<_> = walkdir::WalkDir::new(p.build_dir()) - .into_iter() - .filter_map(|e| e.map(|e| e.path().to_owned()).ok()) - .collect(); - file_list.sort(); - file_list - }; - // Start with no files. p.cargo("clean --dry-run") .with_stdout("") @@ -823,7 +815,7 @@ fn clean_dry_run() { ) .run(); p.cargo("check").run(); - let before = ls_r(); + let before = p.build_dir().ls_r(); p.cargo("clean --dry-run") .with_stderr( "[SUMMARY] [..] files, [..] total\n\ @@ -831,7 +823,7 @@ fn clean_dry_run() { ) .run(); // Verify it didn't delete anything. - let after = ls_r(); + let after = p.build_dir().ls_r(); assert_eq!(before, after); let expected = cargo::util::iter_join(before.iter().map(|p| p.to_str().unwrap()), "\n"); eprintln!("{expected}"); diff --git a/tests/testsuite/global_cache_tracker.rs b/tests/testsuite/global_cache_tracker.rs new file mode 100644 index 000000000..7c658f7c0 --- /dev/null +++ b/tests/testsuite/global_cache_tracker.rs @@ -0,0 +1,1890 @@ +//! Tests for last-use tracking and auto-gc. +//! +//! Cargo supports an environment variable called `__CARGO_TEST_LAST_USE_NOW` +//! to have cargo pretend that the current time is the given time (in seconds +//! since the unix epoch). This is used throughout these tests to simulate +//! what happens when time passes. The [`days_ago_unix`] and +//! [`months_ago_unix`] functions help with setting this value. + +use super::config::ConfigBuilder; +use cargo::core::global_cache_tracker::{self, DeferredGlobalLastUse, GlobalCacheTracker}; +use cargo::util::cache_lock::CacheLockMode; +use cargo::util::interning::InternedString; +use cargo::Config; +use cargo_test_support::paths::{self, CargoPathExt}; +use cargo_test_support::registry::{Package, RegistryBuilder}; +use cargo_test_support::{ + basic_manifest, cargo_process, execs, git, project, retry, sleep_ms, thread_wait_timeout, + Project, +}; +use itertools::Itertools; +use std::fmt::Write; +use std::path::PathBuf; +use std::process::Stdio; +use std::time::{Duration, SystemTime}; + +/// Helper to create a simple `foo` project which depends on a registry +/// dependency called `bar`. +fn basic_foo_bar_project() -> Project { + Package::new("bar", "1.0.0").publish(); + project() + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + bar = "1.0" + "#, + ) + .file("src/lib.rs", "") + .build() +} + +/// Helper to get the names of files in a directory as strings. 
+fn get_names(glob: &str) -> Vec<String> {
+ let mut names: Vec<_> = glob::glob(paths::home().join(glob).to_str().unwrap())
+ .unwrap()
+ .map(|p| p.unwrap().file_name().unwrap().to_str().unwrap().to_owned())
+ .collect();
+ names.sort();
+ names
+}
+
+fn get_registry_names(which: &str) -> Vec<String> {
+ get_names(&format!(".cargo/registry/{which}/*/*"))
+}
+
+fn get_index_names() -> Vec<String> {
+ get_names(&format!(".cargo/registry/index/*"))
+}
+
+fn get_git_db_names() -> Vec<String> {
+ get_names(&format!(".cargo/git/db/*"))
+}
+
+fn get_git_checkout_names(db_name: &str) -> Vec<String> {
+ get_names(&format!(".cargo/git/checkouts/{db_name}/*"))
+}
+
+fn days_ago(n: u64) -> SystemTime {
+ SystemTime::now() - Duration::from_secs(60 * 60 * 24 * n)
+}
+
+/// Helper for simulating running cargo in the past. Use with the
+/// __CARGO_TEST_LAST_USE_NOW environment variable.
+fn days_ago_unix(n: u64) -> String {
+ days_ago(n)
+ .duration_since(SystemTime::UNIX_EPOCH)
+ .unwrap()
+ .as_secs()
+ .to_string()
+}
+
+/// Helper for simulating running cargo in the past. Use with the
+/// __CARGO_TEST_LAST_USE_NOW environment variable.
+fn months_ago_unix(n: u64) -> String {
+ days_ago_unix(n * 30)
+}
+
+/// Populates last-use database and the cache files.
+///
+/// This makes it easier to more accurately specify exact sizes. Creating
+/// specific sizes with `Package` is too difficult.
+fn populate_cache(config: &Config, test_crates: &[(&str, u64, u64, u64)]) -> (PathBuf, PathBuf) {
+ let cache_dir = paths::home().join(".cargo/registry/cache/example.com-a6c4a5adcb232b9a");
+ let src_dir = paths::home().join(".cargo/registry/src/example.com-a6c4a5adcb232b9a");
+
+ GlobalCacheTracker::db_path(&config)
+ .into_path_unlocked()
+ .rm_rf();
+
+ let _lock = config
+ .acquire_package_cache_lock(CacheLockMode::MutateExclusive)
+ .unwrap();
+ let mut tracker = GlobalCacheTracker::new(&config).unwrap();
+ let mut deferred = DeferredGlobalLastUse::new();
+
+ cache_dir.rm_rf();
+ cache_dir.mkdir_p();
+ src_dir.rm_rf();
+ src_dir.mkdir_p();
+ paths::home()
+ .join(".cargo/registry/index/example.com-a6c4a5adcb232b9a")
+ .mkdir_p();
+ let mut create = |name: &str, age, crate_size: u64, src_size: u64| {
+ let crate_filename = InternedString::new(&format!("{name}.crate"));
+ deferred.mark_registry_crate_used_stamp(
+ global_cache_tracker::RegistryCrate {
+ encoded_registry_name: "example.com-a6c4a5adcb232b9a".into(),
+ crate_filename,
+ size: crate_size,
+ },
+ Some(&days_ago(age)),
+ );
+ deferred.mark_registry_src_used_stamp(
+ global_cache_tracker::RegistrySrc {
+ encoded_registry_name: "example.com-a6c4a5adcb232b9a".into(),
+ package_dir: name.into(),
+ size: Some(src_size),
+ },
+ Some(&days_ago(age)),
+ );
+ std::fs::write(
+ cache_dir.join(crate_filename),
+ "x".repeat(crate_size as usize),
+ )
+ .unwrap();
+ let path = src_dir.join(name);
+ path.mkdir_p();
+ std::fs::write(path.join("data"), "x".repeat(src_size as usize)).unwrap()
+ };
+
+ for (name, age, crate_size, src_size) in test_crates {
+ create(name, *age, *crate_size, *src_size);
+ }
+ deferred.save(&mut tracker).unwrap();
+
+ (cache_dir, src_dir)
+}
+
+#[cargo_test]
+fn auto_gc_gated() {
+ // Requires -Zgc to both track last-use data and to run auto-gc.
+ let p = basic_foo_bar_project();
+ p.cargo("check")
+ .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4))
+ .run();
+ // Check that it did not create a database or delete anything.
+ let config = ConfigBuilder::new().build(); + assert!(!GlobalCacheTracker::db_path(&config) + .into_path_unlocked() + .exists()); + assert_eq!(get_index_names().len(), 1); + + // Again in the future, shouldn't auto-gc. + p.cargo("check").run(); + assert!(!GlobalCacheTracker::db_path(&config) + .into_path_unlocked() + .exists()); + assert_eq!(get_index_names().len(), 1); +} + +#[cargo_test] +fn cache_clean_options_gated() { + // Checks that all cache clean options require -Zgc. + let p = project().build(); + for opt in [ + "--gc", + "--max-src-age=0 day", + "--max-index-age=0 day", + "--max-git-co-age=0 day", + "--max-git-db-age=0 day", + "--max-download-age=0 day", + "--max-src-size=0", + "--max-crate-size=0", + "--max-download-size=0", + ] { + let trimmed_opt = opt.trim_start_matches('-').split('=').next().unwrap(); + p.cargo("clean") + .arg(opt) + .with_status(101) + .with_stderr(&format!( + "\ +error: the `{trimmed_opt}` flag is unstable, [..] +See [..] +See [..] for more information about the `{trimmed_opt}` flag. +" + )) + .run(); + } + + for opt in [ + "--max-target-age=0 day", + "--max-shared-target-age=0 day", + "--max-target-size=0", + "--max-shared-target-size=0", + ] { + let trimmed_opt = opt.split('=').next().unwrap(); + p.cargo("clean") + .arg(opt) + .with_status(101) + .with_stderr(&format!( + "error: option {trimmed_opt} is not yet implemented" + )) + .run(); + } +} + +#[cargo_test] +fn implies_source() { + // Checks that when a src, crate, or checkout is marked as used, the + // corresponding index or git db also gets marked as used. + let config = ConfigBuilder::new().unstable_flag("gc").build(); + let _lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let mut deferred = DeferredGlobalLastUse::new(); + let mut tracker = GlobalCacheTracker::new(&config).unwrap(); + + deferred.mark_registry_crate_used(global_cache_tracker::RegistryCrate { + encoded_registry_name: "example.com-a6c4a5adcb232b9a".into(), + crate_filename: "regex-1.8.4.crate".into(), + size: 123, + }); + deferred.mark_registry_src_used(global_cache_tracker::RegistrySrc { + encoded_registry_name: "index.crates.io-6f17d22bba15001f".into(), + package_dir: "rand-0.8.5".into(), + size: None, + }); + deferred.mark_git_checkout_used(global_cache_tracker::GitCheckout { + encoded_git_name: "cargo-e7ff1db891893a9e".into(), + short_name: "f0a4ee0".into(), + size: None, + }); + deferred.save(&mut tracker).unwrap(); + + let mut indexes = tracker.registry_index_all().unwrap(); + assert_eq!(indexes.len(), 2); + indexes.sort_by(|a, b| a.0.encoded_registry_name.cmp(&b.0.encoded_registry_name)); + assert_eq!( + indexes[0].0.encoded_registry_name, + "example.com-a6c4a5adcb232b9a" + ); + assert_eq!( + indexes[1].0.encoded_registry_name, + "index.crates.io-6f17d22bba15001f" + ); + + let dbs = tracker.git_db_all().unwrap(); + assert_eq!(dbs.len(), 1); + assert_eq!(dbs[0].0.encoded_git_name, "cargo-e7ff1db891893a9e"); +} + +#[cargo_test] +fn auto_gc_defaults() { + // Checks that the auto-gc deletes old entries, and leaves new ones intact. + Package::new("old", "1.0.0").publish(); + Package::new("new", "1.0.0").publish(); + let p = project() + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + old = "1.0" + new = "1.0" + "#, + ) + .file("src/lib.rs", "") + .build(); + // Populate the last-use data. 
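+ // (Both packages are recorded as last used 4 months ago, which is past
+ // the default expiration thresholds exercised below.)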
+ p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + assert_eq!(get_registry_names("src"), ["new-1.0.0", "old-1.0.0"]); + assert_eq!( + get_registry_names("cache"), + ["new-1.0.0.crate", "old-1.0.0.crate"] + ); + + // Run again with just one package. Make sure the old src gets deleted, + // but .crate does not. + p.change_file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + new = "1.0" + "#, + ); + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(2)) + .run(); + assert_eq!(get_registry_names("src"), ["new-1.0.0"]); + assert_eq!( + get_registry_names("cache"), + ["new-1.0.0.crate", "old-1.0.0.crate"] + ); + + // Run again after the .crate should have aged out. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + assert_eq!(get_registry_names("src"), ["new-1.0.0"]); + assert_eq!(get_registry_names("cache"), ["new-1.0.0.crate"]); +} + +#[cargo_test] +fn auto_gc_config() { + // Can configure auto gc settings. + Package::new("old", "1.0.0").publish(); + Package::new("new", "1.0.0").publish(); + let p = project() + .file( + ".cargo/config.toml", + r#" + [gc.auto] + frequency = "always" + max-src-age = "1 day" + max-crate-age = "3 days" + max-index-age = "3 days" + max-git-co-age = "1 day" + max-git-db-age = "3 days" + "#, + ) + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + old = "1.0" + new = "1.0" + "#, + ) + .file("src/lib.rs", "") + .build(); + // Populate the last-use data. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(4)) + .run(); + assert_eq!(get_registry_names("src"), ["new-1.0.0", "old-1.0.0"]); + assert_eq!( + get_registry_names("cache"), + ["new-1.0.0.crate", "old-1.0.0.crate"] + ); + + // Run again with just one package. Make sure the old src gets deleted, + // but .crate does not. + p.change_file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + new = "1.0" + "#, + ); + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(2)) + .run(); + assert_eq!(get_registry_names("src"), ["new-1.0.0"]); + assert_eq!( + get_registry_names("cache"), + ["new-1.0.0.crate", "old-1.0.0.crate"] + ); + + // Run again after the .crate should have aged out. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + assert_eq!(get_registry_names("src"), ["new-1.0.0"]); + assert_eq!(get_registry_names("cache"), ["new-1.0.0.crate"]); +} + +#[cargo_test] +fn frequency() { + // gc.auto.frequency settings + let p = basic_foo_bar_project(); + p.change_file( + ".cargo/config.toml", + r#" + [gc.auto] + frequency = "never" + "#, + ); + // Populate data in the past. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + assert_eq!(get_index_names().len(), 1); + assert_eq!(get_registry_names("src"), ["bar-1.0.0"]); + assert_eq!(get_registry_names("cache"), ["bar-1.0.0.crate"]); + + p.change_file("Cargo.toml", &basic_manifest("foo", "0.2.0")); + + // Try after the default expiration time, with "never" it shouldn't gc. 
+ p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + assert_eq!(get_index_names().len(), 1); + assert_eq!(get_registry_names("src"), ["bar-1.0.0"]); + assert_eq!(get_registry_names("cache"), ["bar-1.0.0.crate"]); + + // Try again with a setting that allows it to run. + p.cargo("check -Zgc") + .env("CARGO_GC_AUTO_FREQUENCY", "1 day") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + assert_eq!(get_index_names().len(), 0); + assert_eq!(get_registry_names("src").len(), 0); + assert_eq!(get_registry_names("cache").len(), 0); +} + +#[cargo_test] +fn auto_gc_index() { + // Deletes the index if it hasn't been used in a while. + let p = basic_foo_bar_project(); + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + assert_eq!(get_index_names().len(), 1); + + // Make sure it stays within the time frame. + p.change_file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + "#, + ); + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(2)) + .run(); + assert_eq!(get_index_names().len(), 1); + + // After it expires, it should be deleted. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + assert_eq!(get_index_names().len(), 0); +} + +#[cargo_test] +fn auto_gc_git() { + // auto-gc should delete git checkouts and dbs. + + // Returns the short git name of a a checkout. + let short_id = |repo: &git2::Repository| -> String { + let head = repo.revparse_single("HEAD").unwrap(); + let short_id = head.short_id().unwrap(); + short_id.as_str().unwrap().to_owned() + }; + + // Set up a git dependency and fetch it and populate the database, + // 6 months in the past. + let (git_project, git_repo) = git::new_repo("bar", |p| { + p.file("Cargo.toml", &basic_manifest("bar", "1.0.0")) + .file("src/lib.rs", "") + }); + let p = project() + .file( + "Cargo.toml", + &format!( + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + bar = {{ git = '{}' }} + "#, + git_project.url() + ), + ) + .file("src/lib.rs", "") + .build(); + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(6)) + .run(); + let db_names = get_git_db_names(); + assert_eq!(db_names.len(), 1); + let first_short_oid = short_id(&git_repo); + assert_eq!( + get_git_checkout_names(&db_names[0]), + [first_short_oid.clone()] + ); + + // Use a new git checkout, should keep both. + git_project.change_file("src/lib.rs", "// modified"); + git::add(&git_repo); + git::commit(&git_repo); + p.cargo("update -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(6)) + .run(); + assert_eq!(get_git_db_names().len(), 1); + let second_short_oid = short_id(&git_repo); + let mut both = vec![first_short_oid, second_short_oid.clone()]; + both.sort(); + assert_eq!(get_git_checkout_names(&db_names[0]), both); + + // In the future, using the second checkout should delete the first. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + assert_eq!(get_git_db_names().len(), 1); + assert_eq!( + get_git_checkout_names(&db_names[0]), + [second_short_oid.clone()] + ); + + // After three months, the db should get deleted. 
+ p.change_file("Cargo.toml", &basic_manifest("foo", "0.2.0")); + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + assert_eq!(get_git_db_names().len(), 0); + assert_eq!(get_git_checkout_names(&db_names[0]).len(), 0); +} + +#[cargo_test] +fn auto_gc_various_commands() { + // Checks that auto gc works with a variety of commands. + // + // Auto-gc is only run on a subset of commands. Generally it is run on + // commands that are already doing a lot of work, or heavily involve the + // use of the registry. + Package::new("bar", "1.0.0").publish(); + let cmds = ["check", "fetch"]; + for cmd in cmds { + eprintln!("checking command {cmd}"); + let p = project() + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + bar = "1.0" + "#, + ) + .file("src/lib.rs", "") + .build(); + // Populate the last-use data. + p.cargo(cmd) + .arg("-Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + let config = ConfigBuilder::new().unstable_flag("gc").build(); + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let tracker = GlobalCacheTracker::new(&config).unwrap(); + let indexes = tracker.registry_index_all().unwrap(); + assert_eq!(indexes.len(), 1); + let crates = tracker.registry_crate_all().unwrap(); + assert_eq!(crates.len(), 1); + let srcs = tracker.registry_src_all().unwrap(); + assert_eq!(srcs.len(), 1); + drop(lock); + + // After everything is aged out, it should all be deleted. + p.change_file("Cargo.toml", &basic_manifest("foo", "0.2.0")); + p.cargo(cmd) + .arg("-Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let indexes = tracker.registry_index_all().unwrap(); + assert_eq!(indexes.len(), 0); + let crates = tracker.registry_crate_all().unwrap(); + assert_eq!(crates.len(), 0); + let srcs = tracker.registry_src_all().unwrap(); + assert_eq!(srcs.len(), 0); + drop(tracker); + drop(lock); + paths::home().join(".cargo/registry").rm_rf(); + GlobalCacheTracker::db_path(&config) + .into_path_unlocked() + .rm_rf(); + } +} + +#[cargo_test] +fn updates_last_use_various_commands() { + // Checks that last-use tracking is updated by various commands. + // + // Not *all* commands update the index tracking, even though they + // technically involve reading the index. There isn't a convenient place + // to ensure it gets saved while avoiding saving too often in other + // commands. For the most part, this should be fine, since these commands + // usually aren't run without running one of the commands that does save + // the tracking. Some of the commands are: + // + // - login, owner, yank, search + // - report future-incompatibilities + // - package --no-verify + // - fetch --locked + Package::new("bar", "1.0.0").publish(); + let cmds = [ + // name, expected_crates (0=doesn't download) + ("check", 1), + ("fetch", 1), + ("tree", 1), + ("generate-lockfile", 0), + ("update", 0), + ("metadata", 1), + ("vendor --respect-source-config", 1), + ]; + for (cmd, expected_crates) in cmds { + eprintln!("checking command {cmd}"); + let p = project() + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + bar = "1.0" + "#, + ) + .file("src/lib.rs", "") + .build(); + // Populate the last-use data. 
+ p.cargo(cmd) + .arg("-Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + let config = ConfigBuilder::new().unstable_flag("gc").build(); + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let tracker = GlobalCacheTracker::new(&config).unwrap(); + let indexes = tracker.registry_index_all().unwrap(); + assert_eq!(indexes.len(), 1); + let crates = tracker.registry_crate_all().unwrap(); + assert_eq!(crates.len(), expected_crates); + let srcs = tracker.registry_src_all().unwrap(); + assert_eq!(srcs.len(), expected_crates); + drop(tracker); + drop(lock); + paths::home().join(".cargo/registry").rm_rf(); + GlobalCacheTracker::db_path(&config) + .into_path_unlocked() + .rm_rf(); + } +} + +#[cargo_test] +fn both_git_and_http_index_cleans() { + // Checks that either the git or http index cache gets cleaned. + let _crates_io = RegistryBuilder::new().build(); + let _alternative = RegistryBuilder::new().alternative().http_index().build(); + Package::new("from_git", "1.0.0").publish(); + Package::new("from_http", "1.0.0") + .alternative(true) + .publish(); + let p = project() + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + from_git = "1.0" + from_http = { version = "1.0", registry = "alternative" } + "#, + ) + .file("src/lib.rs", "") + .build(); + + p.cargo("update -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + let config = ConfigBuilder::new().unstable_flag("gc").build(); + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let tracker = GlobalCacheTracker::new(&config).unwrap(); + let indexes = tracker.registry_index_all().unwrap(); + assert_eq!(indexes.len(), 2); + assert_eq!(get_index_names().len(), 2); + drop(lock); + + // Running in the future without these indexes should delete them. + p.change_file("Cargo.toml", &basic_manifest("foo", "0.2.0")); + p.cargo("clean --gc -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let indexes = tracker.registry_index_all().unwrap(); + assert_eq!(indexes.len(), 0); + assert_eq!(get_index_names().len(), 0); + drop(lock); +} + +#[cargo_test] +fn clean_gc_dry_run() { + // Basic `clean --gc --dry-run` test. + let p = basic_foo_bar_project(); + // Populate the last-use data. + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + + let registry_root = paths::home().join(".cargo/registry"); + let glob_registry = |name| -> PathBuf { + let mut paths: Vec<_> = glob::glob(registry_root.join(name).join("*").to_str().unwrap()) + .unwrap() + .map(|p| p.unwrap()) + .collect(); + assert_eq!(paths.len(), 1); + paths.pop().unwrap() + }; + let index = glob_registry("index").ls_r(); + let src = glob_registry("src").ls_r(); + let cache = glob_registry("cache").ls_r(); + let expected_files = index + .iter() + .chain(src.iter()) + .chain(cache.iter()) + .map(|p| p.to_str().unwrap()) + .join("\n"); + + p.cargo("clean --gc --dry-run -v -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stdout_unordered(&expected_files) + .with_stderr( + "[SUMMARY] [..] files, [..] total\n\ + [WARNING] no files deleted due to --dry-run", + ) + .run(); + + // Again, make sure the information is still tracked. 
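+ // (A dry-run should not delete the last-use entries themselves, so a
+ // second dry-run reports the same set of files.)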
+ p.cargo("clean --gc --dry-run -v -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stdout_unordered(&expected_files) + .with_stderr( + "[SUMMARY] [..] files, [..] total\n\ + [WARNING] no files deleted due to --dry-run", + ) + .run(); +} + +#[cargo_test] +fn clean_default_gc() { + // `clean` without options should also gc + let p = basic_foo_bar_project(); + // Populate the last-use data. + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + p.cargo("clean -v -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr_unordered( + "\ +[REMOVING] [ROOT]/home/.cargo/registry/index/[..] +[REMOVING] [ROOT]/home/.cargo/registry/src/[..] +[REMOVING] [ROOT]/home/.cargo/registry/cache/[..] +[REMOVED] [..] files, [..] total +", + ) + .run(); +} + +#[cargo_test] +fn tracks_sizes() { + // Checks that sizes are properly tracked in the db. + Package::new("dep1", "1.0.0") + .file("src/lib.rs", "") + .publish(); + Package::new("dep2", "1.0.0") + .file("src/lib.rs", "") + .file("data", &"abcdefghijklmnopqrstuvwxyz".repeat(1000)) + .publish(); + let p = project() + .file( + "Cargo.toml", + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + dep1 = "1.0" + dep2 = "1.0" + "#, + ) + .file("src/lib.rs", "") + .build(); + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + + // Check that the crate sizes are the same as on disk. + let config = ConfigBuilder::new().unstable_flag("gc").build(); + let _lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let tracker = GlobalCacheTracker::new(&config).unwrap(); + let mut crates = tracker.registry_crate_all().unwrap(); + crates.sort_by(|a, b| a.0.crate_filename.cmp(&b.0.crate_filename)); + let db_sizes: Vec<_> = crates.iter().map(|c| c.0.size).collect(); + + let mut actual: Vec<_> = p + .glob(paths::home().join(".cargo/registry/cache/*/*")) + .map(|p| p.unwrap()) + .collect(); + actual.sort(); + let actual_sizes: Vec<_> = actual + .iter() + .map(|path| std::fs::metadata(path).unwrap().len()) + .collect(); + assert_eq!(db_sizes, actual_sizes); + + // Also check the src sizes are computed. + let mut srcs = tracker.registry_src_all().unwrap(); + srcs.sort_by(|a, b| a.0.package_dir.cmp(&b.0.package_dir)); + let db_sizes: Vec<_> = srcs.iter().map(|c| c.0.size.unwrap()).collect(); + let mut actual: Vec<_> = p + .glob(paths::home().join(".cargo/registry/src/*/*")) + .map(|p| p.unwrap()) + .collect(); + actual.sort(); + // .cargo-ok is not tracked in the size. + actual.iter().for_each(|p| p.join(".cargo-ok").rm_rf()); + let actual_sizes: Vec<_> = actual + .iter() + .map(|path| cargo_util::du(path, &[]).unwrap()) + .collect(); + assert_eq!(db_sizes, actual_sizes); + assert!(db_sizes[1] > 26000); +} + +#[cargo_test] +fn max_size() { + // Checks --max-crate-size and --max-src-size with various cleaning thresholds. + let config = ConfigBuilder::new().unstable_flag("gc").build(); + + let test_crates = [ + // name, age, crate_size, src_size + ("a-1.0.0", 5, 1, 1), + ("b-1.0.0", 6, 2, 2), + ("c-1.0.0", 3, 3, 3), + ("d-1.0.0", 2, 4, 4), + ("e-1.0.0", 2, 5, 5), + ("f-1.0.0", 9, 6, 6), + ("g-1.0.0", 1, 1, 1), + ]; + + // Determine the order things get deleted so they can be verified. 
+ let mut names_by_timestamp: Vec<_> = test_crates
+ .iter()
+ .map(|(name, age, _, _)| (days_ago_unix(*age), name))
+ .collect();
+ names_by_timestamp.sort();
+ let names_by_timestamp: Vec<_> = names_by_timestamp
+ .into_iter()
+ .map(|(_, name)| name)
+ .collect();
+
+ // This exercises the different boundary conditions.
+ for (clean_size, files, bytes) in [
+ (22, 0, 0),
+ (21, 1, 6),
+ (16, 1, 6),
+ (15, 2, 8),
+ (14, 2, 8),
+ (13, 3, 9),
+ (12, 4, 12),
+ (10, 4, 12),
+ (9, 5, 16),
+ (6, 5, 16),
+ (5, 6, 21),
+ (1, 6, 21),
+ (0, 7, 22),
+ ] {
+ let (removed, kept) = names_by_timestamp.split_at(files);
+ // --max-crate-size
+ let (cache_dir, src_dir) = populate_cache(&config, &test_crates);
+ let mut stderr = String::new();
+ for name in removed {
+ writeln!(stderr, "[REMOVING] [..]{name}.crate").unwrap();
+ }
+ let total_display = if removed.is_empty() {
+ String::new()
+ } else {
+ format!(", {bytes}B total")
+ };
+ let files_display = if files == 1 {
+ format!("1 file")
+ } else {
+ format!("{files} files")
+ };
+ write!(stderr, "[REMOVED] {files_display}{total_display}").unwrap();
+ cargo_process(&format!("clean -Zgc -v --max-crate-size={clean_size}"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr_unordered(&stderr)
+ .run();
+ for name in kept {
+ assert!(cache_dir.join(format!("{name}.crate")).exists());
+ }
+ for name in removed {
+ assert!(!cache_dir.join(format!("{name}.crate")).exists());
+ }
+
+ // --max-src-size
+ populate_cache(&config, &test_crates);
+ let mut stderr = String::new();
+ for name in removed {
+ writeln!(stderr, "[REMOVING] [..]{name}").unwrap();
+ }
+ let total_display = if files == 0 {
+ String::new()
+ } else {
+ format!(", {bytes}B total")
+ };
+ write!(stderr, "[REMOVED] {files_display}{total_display}").unwrap();
+ cargo_process(&format!("clean -Zgc -v --max-src-size={clean_size}"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr_unordered(&stderr)
+ .run();
+ for name in kept {
+ assert!(src_dir.join(name).exists());
+ }
+ for name in removed {
+ assert!(!src_dir.join(name).exists());
+ }
+ }
+}
+
+#[cargo_test]
+fn max_size_untracked_crate() {
+ // When a .crate file exists from an older version of cargo that did not
+ // track sizes, `clean --max-crate-size` should populate the db with the
+ // sizes.
+ let config = ConfigBuilder::new().unstable_flag("gc").build();
+ let cache = paths::home().join(".cargo/registry/cache/example.com-a6c4a5adcb232b9a");
+ cache.mkdir_p();
+ paths::home()
+ .join(".cargo/registry/index/example.com-a6c4a5adcb232b9a")
+ .mkdir_p();
+ // Create the `.crate` files.
+ let test_crates = [
+ // name, size
+ ("a-1.0.0.crate", 1234),
+ ("b-1.0.0.crate", 42),
+ ("c-1.0.0.crate", 0),
+ ];
+ for (name, size) in test_crates {
+ std::fs::write(cache.join(name), "x".repeat(size as usize)).unwrap()
+ }
+ // This should scan the directory and populate the db with the size information.
+ cargo_process("clean -Zgc -v --max-crate-size=100000")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr("[REMOVED] 0 files")
+ .run();
+ // Check that it stored the size data.
+ let _lock = config
+ .acquire_package_cache_lock(CacheLockMode::MutateExclusive)
+ .unwrap();
+ let tracker = GlobalCacheTracker::new(&config).unwrap();
+ let crates = tracker.registry_crate_all().unwrap();
+ let mut actual: Vec<_> = crates
+ .iter()
+ .map(|(rc, _time)| (rc.crate_filename.as_str(), rc.size))
+ .collect();
+ actual.sort();
+ assert_eq!(test_crates, actual.as_slice());
+}
+
+/// Helper to prepare the max-size test.
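+///
+/// Returns a `Config` and a project with a fetched dependency, with the
+/// last-use database removed to simulate a cache populated by an older
+/// cargo that did not track last-use data.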
+fn max_size_untracked_prepare() -> (Config, Project) { + // First, publish and download a dependency. + let p = basic_foo_bar_project(); + p.cargo("fetch").run(); + // Pretend it was an older version that did not track last-use. + let config = ConfigBuilder::new().unstable_flag("gc").build(); + GlobalCacheTracker::db_path(&config) + .into_path_unlocked() + .rm_rf(); + (config, p) +} + +/// Helper to verify the max-size test. +fn max_size_untracked_verify(config: &Config) { + let actual: Vec<_> = glob::glob( + paths::home() + .join(".cargo/registry/src/*/*") + .to_str() + .unwrap(), + ) + .unwrap() + .map(|p| p.unwrap()) + .collect(); + assert_eq!(actual.len(), 1); + let actual_size = cargo_util::du(&actual[0], &[]).unwrap(); + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let tracker = GlobalCacheTracker::new(&config).unwrap(); + let srcs = tracker.registry_src_all().unwrap(); + assert_eq!(srcs.len(), 1); + assert_eq!(srcs[0].0.size, Some(actual_size)); + drop(lock); +} + +#[cargo_test] +fn max_size_untracked_src_from_use() { + // When a src directory exists from an older version of cargo that did not + // track sizes, doing a build should populate the db with an entry with an + // unknown size. `clean --max-src-size` should then fix the size. + let (config, p) = max_size_untracked_prepare(); + + // Run a command that will update the db with an unknown src size. + p.cargo("tree -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + // Check that it is None. + let lock = config + .acquire_package_cache_lock(CacheLockMode::MutateExclusive) + .unwrap(); + let tracker = GlobalCacheTracker::new(&config).unwrap(); + let srcs = tracker.registry_src_all().unwrap(); + assert_eq!(srcs.len(), 1); + assert_eq!(srcs[0].0.size, None); + drop(lock); + + // Fix the size. + p.cargo("clean -v --max-src-size=10000 -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr("[REMOVED] 0 files") + .run(); + max_size_untracked_verify(&config); +} + +#[cargo_test] +fn max_size_untracked_src_from_clean() { + // When a src directory exists from an older version of cargo that did not + // track sizes, `clean --max-src-size` should populate the db with the + // sizes. + let (config, p) = max_size_untracked_prepare(); + + // Clean should scan the src and update the db. + p.cargo("clean -v --max-src-size=10000 -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr("[REMOVED] 0 files") + .run(); + max_size_untracked_verify(&config); +} + +#[cargo_test] +fn max_download_size() { + // --max-download-size + // + // This creates some sample crates of specific sizes, and then tries + // deleting at various specific size thresholds that exercise different + // edge conditions. + let config = ConfigBuilder::new().unstable_flag("gc").build(); + + let test_crates = [ + // name, age, crate_size, src_size + ("d-1.0.0", 4, 4, 5), + ("c-1.0.0", 3, 3, 3), + ("a-1.0.0", 1, 2, 5), + ("b-1.0.0", 1, 1, 7), + ]; + + for (max_size, num_deleted, files_deleted, bytes) in [ + (30, 0, 0, 0), + (29, 1, 1, 5), + (24, 2, 2, 9), + (20, 3, 3, 12), + (1, 7, 7, 29), + (0, 8, 8, 30), + ] { + populate_cache(&config, &test_crates); + // Determine the order things will be deleted. 
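+ // (In this test's data the entries are listed oldest-first, and for each
+ // package the extracted src directory is deleted before its .crate file,
+ // so the expected removals are a prefix of this list.)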
+ let delete_order: Vec<String> = test_crates
+ .iter()
+ .flat_map(|(name, _, _, _)| [name.to_string(), format!("{name}.crate")])
+ .collect();
+ let (removed, _kept) = delete_order.split_at(num_deleted);
+ let mut stderr = String::new();
+ for name in removed {
+ writeln!(stderr, "[REMOVING] [..]{name}").unwrap();
+ }
+ let files_display = if files_deleted == 1 {
+ format!("1 file")
+ } else {
+ format!("{files_deleted} files")
+ };
+ let total_display = if removed.is_empty() {
+ String::new()
+ } else {
+ format!(", {bytes}B total")
+ };
+ write!(stderr, "[REMOVED] {files_display}{total_display}").unwrap();
+ cargo_process(&format!("clean -Zgc -v --max-download-size={max_size}"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr_unordered(&stderr)
+ .run();
+ }
+}
+
+#[cargo_test]
+fn package_cache_lock_during_build() {
+ // Verifies that a shared lock is held during a build. Resolution and
+ // downloads should be OK while that is held, but mutation should block.
+ //
+ // This works by launching a build with a build script that will pause.
+ // Then it performs other cargo commands and verifies their behavior.
+ Package::new("bar", "1.0.0").publish();
+ let p_foo = project()
+ .file(
+ "Cargo.toml",
+ r#"
+ [package]
+ name = "foo"
+ version = "0.1.0"
+
+ [dependencies]
+ bar = "1.0"
+ "#,
+ )
+ .file("src/lib.rs", "")
+ .file(
+ "build.rs",
+ r#"
+ fn main() {
+ std::fs::write("blocking", "").unwrap();
+ let path = std::path::Path::new("ready");
+ loop {
+ if path.exists() {
+ break;
+ } else {
+ std::thread::sleep(std::time::Duration::from_millis(100))
+ }
+ }
+ }
+ "#,
+ )
+ .build();
+ let p_foo2 = project()
+ .at("foo2")
+ .file(
+ "Cargo.toml",
+ r#"
+ [package]
+ name = "foo2"
+ version = "0.1.0"
+
+ [dependencies]
+ bar = "1.0"
+ "#,
+ )
+ .file("src/lib.rs", "")
+ .build();
+
+ // Start a build that will pause once the build starts.
+ let mut foo_child = p_foo
+ .cargo("check -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .build_command()
+ .stdout(Stdio::piped())
+ .stderr(Stdio::piped())
+ .spawn()
+ .unwrap();
+
+ // Wait for it to enter build script.
+ retry(100, || p_foo.root().join("blocking").exists().then_some(()));
+
+ // Start a build with a different target directory. It should not block,
+ // even though it gets a download lock, and then a shared lock.
+ //
+ // Also verify that auto-gc gets disabled.
+ p_foo2
+ .cargo("check -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .env("CARGO_GC_AUTO_FREQUENCY", "always")
+ .env("CARGO_LOG", "cargo::core::gc=debug")
+ .with_stderr_contains("[UPDATING] `dummy-registry` index")
+ .with_stderr_contains("[CHECKING] bar v1.0.0")
+ .with_stderr_contains("[CHECKING] foo2 v0.1.0 [..]")
+ .with_stderr_contains("[FINISHED] [..]")
+ .with_stderr_contains("[..]unable to acquire mutate lock, auto gc disabled")
+ .run();
+
+ // Ensure that the first build really blocked.
+ assert!(matches!(foo_child.try_wait(), Ok(None)));
+
+ // Cleaning while a command is running should block.
+ let mut clean_cmd = p_foo2
+ .cargo("clean --max-download-size=0 -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .build_command();
+ clean_cmd.stderr(Stdio::piped());
+ let mut clean_child = clean_cmd.spawn().unwrap();
+
+ // Give the clean command a chance to finish (it shouldn't).
+ sleep_ms(500);
+ // They should both still be running.
+ assert!(matches!(foo_child.try_wait(), Ok(None)));
+ assert!(matches!(clean_child.try_wait(), Ok(None)));
+
+ // Let the original build finish.
+ p_foo.change_file("ready", "");
+
+ // Wait for clean to finish.
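+ // (wait_with_output() would hang if clean were still blocked, so run it
+ // on a helper thread bounded by a timeout.)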
+ let thread = std::thread::spawn(|| clean_child.wait_with_output().unwrap());
+ let output = thread_wait_timeout(100, thread);
+ assert!(output.status.success());
+ // Validate the output of the clean.
+ execs()
+ .with_stderr(
+ "\
+[BLOCKING] waiting for file lock on package cache mutation
+[REMOVED] [..]
+",
+ )
+ .run_output(&output);
+}
+
+#[cargo_test]
+fn read_only_locking_auto_gc() {
+ // Tests the behavior for auto-gc on a read-only directory.
+ let p = basic_foo_bar_project();
+ // Populate cache.
+ p.cargo("fetch -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .run();
+ let cargo_home = paths::home().join(".cargo");
+ let mut perms = std::fs::metadata(&cargo_home).unwrap().permissions();
+ // Test when it can't update auto-gc db.
+ perms.set_readonly(true);
+ std::fs::set_permissions(&cargo_home, perms.clone()).unwrap();
+ p.cargo("check -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr(
+ "\
+[CHECKING] bar v1.0.0
+[CHECKING] foo v0.1.0 [..]
+[FINISHED] [..]
+",
+ )
+ .run();
+ // Try again without the last-use existing (such as if the cache was
+ // populated by an older version of cargo).
+ perms.set_readonly(false);
+ std::fs::set_permissions(&cargo_home, perms.clone()).unwrap();
+ let config = ConfigBuilder::new().build();
+ GlobalCacheTracker::db_path(&config)
+ .into_path_unlocked()
+ .rm_rf();
+ perms.set_readonly(true);
+ std::fs::set_permissions(&cargo_home, perms.clone()).unwrap();
+ p.cargo("check -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr("[FINISHED] [..]")
+ .run();
+ perms.set_readonly(false);
+ std::fs::set_permissions(&cargo_home, perms).unwrap();
+}
+
+#[cargo_test]
+fn delete_index_also_deletes_crates() {
+ // Checks that when an index is deleted, the src and cache directories
+ // also get deleted.
+ let p = basic_foo_bar_project();
+ p.cargo("fetch -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4))
+ .run();
+
+ assert_eq!(get_registry_names("src"), ["bar-1.0.0"]);
+ assert_eq!(get_registry_names("cache"), ["bar-1.0.0.crate"]);
+
+ p.cargo("clean")
+ .arg("--max-index-age=0 days")
+ .arg("-Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr("[REMOVED] [..]")
+ .run();
+
+ assert_eq!(get_registry_names("src").len(), 0);
+ assert_eq!(get_registry_names("cache").len(), 0);
+}
+
+#[cargo_test]
+fn clean_syncs_missing_files() {
+ // When files go missing in the cache, clean operations that need to track
+ // the size should also remove them from the database.
+ Package::new("bar", "1.0.0").publish();
+ Package::new("baz", "1.0.0").publish();
+ let p = project()
+ .file(
+ "Cargo.toml",
+ r#"
+ [package]
+ name = "foo"
+ version = "0.1.0"
+
+ [dependencies]
+ bar = "1.0"
+ baz = "1.0"
+ "#,
+ )
+ .file("src/lib.rs", "")
+ .build();
+ p.cargo("fetch -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .run();
+
+ // Verify things are tracked.
+ let config = ConfigBuilder::new().unstable_flag("gc").build();
+ let lock = config
+ .acquire_package_cache_lock(CacheLockMode::MutateExclusive)
+ .unwrap();
+ let tracker = GlobalCacheTracker::new(&config).unwrap();
+ let crates = tracker.registry_crate_all().unwrap();
+ assert_eq!(crates.len(), 2);
+ let srcs = tracker.registry_src_all().unwrap();
+ assert_eq!(srcs.len(), 2);
+ drop(lock);
+
+ // Remove the files.
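+ // (Delete bar's .crate file and src directory behind cargo's back so the
+ // database is out of sync with what is actually on disk.)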
+ for pattern in [ + ".cargo/registry/cache/*/bar-1.0.0.crate", + ".cargo/registry/src/*/bar-1.0.0", + ] { + p.glob(paths::home().join(pattern)) + .map(|p| p.unwrap()) + .next() + .unwrap() + .rm_rf(); + } + + // Clean should update the db. + p.cargo("clean -v --max-download-size=1GB -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr("[REMOVED] 0 files") + .run(); + + // Verify + let crates = tracker.registry_crate_all().unwrap(); + assert_eq!(crates.len(), 1); + let srcs = tracker.registry_src_all().unwrap(); + assert_eq!(srcs.len(), 1); +} + +#[cargo_test] +fn offline_doesnt_auto_gc() { + // When running offline, auto-gc shouldn't run. + let p = basic_foo_bar_project(); + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + // Remove the dependency. + p.change_file("Cargo.toml", &basic_manifest("foo", "0.1.0")); + // Run offline, make sure it doesn't delete anything + p.cargo("check --offline -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr("[CHECKING] foo v0.1.0[..]\n[FINISHED][..]") + .run(); + assert_eq!(get_registry_names("src"), ["bar-1.0.0"]); + assert_eq!(get_registry_names("cache"), ["bar-1.0.0.crate"]); + // Run online, make sure auto-gc runs. + p.cargo("check -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr("[FINISHED][..]") + .run(); + assert_eq!(get_registry_names("src"), &[] as &[String]); + assert_eq!(get_registry_names("cache"), &[] as &[String]); +} + +#[cargo_test] +fn can_handle_future_schema() -> anyhow::Result<()> { + // It should work when a future version of cargo has made schema changes + // to the database. + let p = basic_foo_bar_project(); + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + // Modify the schema to pretend this is done by a future version of cargo. + let config = ConfigBuilder::new().build(); + let db_path = GlobalCacheTracker::db_path(&config).into_path_unlocked(); + let conn = rusqlite::Connection::open(&db_path)?; + let user_version: u32 = + conn.query_row("SELECT user_version FROM pragma_user_version", [], |row| { + row.get(0) + })?; + conn.execute("ALTER TABLE global_data ADD COLUMN foo DEFAULT 123", [])?; + conn.pragma_update(None, "user_version", &(user_version + 1))?; + drop(conn); + // Verify it doesn't blow up. + p.cargo("clean --max-download-size=0 -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr("[REMOVED] 4 files, [..] total") + .run(); + Ok(()) +} + +#[cargo_test] +fn clean_max_git_age() { + // --max-git-*-age flags + let (git_a, git_a_repo) = git::new_repo("git_a", |p| { + p.file("Cargo.toml", &basic_manifest("git_a", "1.0.0")) + .file("src/lib.rs", "") + }); + let p = project() + .file( + "Cargo.toml", + &format!( + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + git_a = {{ git = '{}' }} + "#, + git_a.url() + ), + ) + .file("src/lib.rs", "") + .build(); + // Populate last-use tracking. + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(4)) + .run(); + // Update git_a to create a separate checkout. + git_a.change_file("src/lib.rs", "// test"); + git::add(&git_a_repo); + git::commit(&git_a_repo); + // Update last-use tracking, where the first git checkout will stay "old". 
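+ // (Only the new checkout gets the 2-days-ago timestamp; the first one
+ // keeps its 4-days-ago timestamp and will age out first.)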
+ p.cargo("update -p git_a -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(2)) + .with_stderr( + "\ +[UPDATING] git repository [..] +[UPDATING] git_a v1.0.0 [..] +", + ) + .run(); + + let db_names = get_git_db_names(); + assert_eq!(db_names.len(), 1); + let db_name = &db_names[0]; + let co_names = get_git_checkout_names(&db_name); + assert_eq!(co_names.len(), 2); + + // Delete the first checkout + p.cargo("clean -v -Zgc") + .arg("--max-git-co-age=3 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [ROOT]/home/.cargo/git/checkouts/git_a-[..]/[..] +[REMOVED] [..] +", + ) + .run(); + + let db_names = get_git_db_names(); + assert_eq!(db_names.len(), 1); + let co_names = get_git_checkout_names(&db_name); + assert_eq!(co_names.len(), 1); + + // delete the second checkout + p.cargo("clean -v -Zgc") + .arg("--max-git-co-age=0 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [ROOT]/home/.cargo/git/checkouts/git_a-[..]/[..] +[REMOVED] [..] +", + ) + .run(); + + let db_names = get_git_db_names(); + assert_eq!(db_names.len(), 1); + let co_names = get_git_checkout_names(&db_name); + assert_eq!(co_names.len(), 0); + + // delete the db + p.cargo("clean -v -Zgc") + .arg("--max-git-db-age=1 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [ROOT]/home/.cargo/git/db/git_a-[..] +[REMOVING] [ROOT]/home/.cargo/git/checkouts/git_a-[..] +[REMOVED] [..] +", + ) + .run(); + + let db_names = get_git_db_names(); + assert_eq!(db_names.len(), 0); + let co_names = get_git_checkout_names(&db_name); + assert_eq!(co_names.len(), 0); +} + +#[cargo_test] +fn clean_max_src_crate_age() { + // --max-src-age and --max-crate-age flags + let p = basic_foo_bar_project(); + // Populate last-use tracking. + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(4)) + .run(); + // Update bar to create a separate copy with a different timestamp. + Package::new("bar", "1.0.1").publish(); + p.cargo("update -p bar -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(2)) + .with_stderr( + "\ +[UPDATING] `dummy-registry` index +[UPDATING] bar v1.0.0 -> v1.0.1 +", + ) + .run(); + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(2)) + .with_stderr( + "\ +[DOWNLOADING] crates ... +[DOWNLOADED] bar v1.0.1 [..] +", + ) + .run(); + + assert_eq!(get_registry_names("src"), ["bar-1.0.0", "bar-1.0.1"]); + assert_eq!( + get_registry_names("cache"), + ["bar-1.0.0.crate", "bar-1.0.1.crate"] + ); + + // Delete the old src. + p.cargo("clean -v -Zgc") + .arg("--max-src-age=3 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [..]/bar-1.0.0 +[REMOVED] [..] +", + ) + .run(); + + // delete the second src + p.cargo("clean -v -Zgc") + .arg("--max-src-age=0 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [..]/bar-1.0.1 +[REMOVED] [..] +", + ) + .run(); + + // delete the old crate + p.cargo("clean -v -Zgc") + .arg("--max-crate-age=3 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [..]/bar-1.0.0.crate +[REMOVED] [..] +", + ) + .run(); + + // delete the seecond crate + p.cargo("clean -v -Zgc") + .arg("--max-crate-age=0 days") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [..]/bar-1.0.1.crate +[REMOVED] [..] 
+", + ) + .run(); +} + +#[cargo_test] +fn clean_doc_with_cache() { + // clean --doc with other cache flags should do both. + let p = basic_foo_bar_project(); + p.cargo("doc -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", months_ago_unix(4)) + .run(); + assert_eq!(get_registry_names("src"), ["bar-1.0.0"]); + assert_eq!(get_registry_names("cache"), ["bar-1.0.0.crate"]); + assert!(p.build_dir().join("doc").exists()); + p.cargo("clean --doc --max-download-size=0 -v -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr_unordered( + "\ +[REMOVING] [ROOT]/foo/target/doc +[REMOVING] [ROOT]/home/.cargo/registry/src/[..]/bar-1.0.0 +[REMOVING] [ROOT]/home/.cargo/registry/cache/[..]/bar-1.0.0.crate +[REMOVED] [..] +", + ) + .run(); +} + +#[cargo_test] +fn clean_max_git_size() { + // clean --max-git-size + // + // Creates two checkouts. The sets a size threshold to delete one. And + // then with 0 max size to delete everything. + let (git_project, git_repo) = git::new_repo("bar", |p| { + p.file("Cargo.toml", &basic_manifest("bar", "1.0.0")) + .file("src/lib.rs", "") + }); + let p = project() + .file( + "Cargo.toml", + &format!( + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + bar = {{ git = '{}' }} + "#, + git_project.url() + ), + ) + .file("src/lib.rs", "") + .build(); + // Fetch and populate db. + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(3)) + .run(); + + // Figure out the name of the first checkout. + let git_root = paths::home().join(".cargo/git"); + let db_names = get_git_db_names(); + assert_eq!(db_names.len(), 1); + let db_name = &db_names[0]; + let co_names = get_git_checkout_names(&db_name); + assert_eq!(co_names.len(), 1); + let first_co_name = &co_names[0]; + + // Make an update and create a new checkout. + git_project.change_file("src/lib.rs", "// modified"); + git::add(&git_repo); + git::commit(&git_repo); + p.cargo("update -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + // Use a different time so that the first checkout timestamp is less + // than the second. + .env("__CARGO_TEST_LAST_USE_NOW", days_ago_unix(2)) + .run(); + + // Figure out the threshold to use. + let mut co_names = get_git_checkout_names(&db_name); + assert_eq!(co_names.len(), 2); + co_names.retain(|name| name != first_co_name); + assert_eq!(co_names.len(), 1); + let second_co_name = &co_names[0]; + let second_co_path = git_root + .join("checkouts") + .join(db_name) + .join(second_co_name); + let second_co_size = cargo_util::du(&second_co_path, &["!.git"]).unwrap(); + + let db_size = cargo_util::du(&git_root.join("db").join(db_name), &[]).unwrap(); + + let threshold = db_size + second_co_size; + + p.cargo(&format!("clean --max-git-size={threshold} -Zgc -v")) + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr(&format!( + "\ +[REMOVING] [ROOT]/home/.cargo/git/checkouts/{db_name}/{first_co_name} +[REMOVED] [..] +" + )) + .run(); + + // And then try cleaning everything. + p.cargo("clean --max-git-size=0 -Zgc -v") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr_unordered(&format!( + "\ +[REMOVING] [ROOT]/home/.cargo/git/checkouts/{db_name}/{second_co_name} +[REMOVING] [ROOT]/home/.cargo/git/db/{db_name} +[REMOVED] [..] +" + )) + .run(); +} + +// Helper for setting up fake git sizes for git size cleaning. 
+fn setup_fake_git_sizes(db_name: &str, db_size: usize, co_sizes: &[usize]) {
+ let base_git = paths::home().join(".cargo/git");
+ let db_path = base_git.join("db").join(db_name);
+ db_path.mkdir_p();
+ std::fs::write(db_path.join("test"), "x".repeat(db_size)).unwrap();
+ let base_co = base_git.join("checkouts").join(db_name);
+ for (i, size) in co_sizes.iter().enumerate() {
+ let co_name = format!("co{i}");
+ let co_path = base_co.join(co_name);
+ co_path.mkdir_p();
+ std::fs::write(co_path.join("test"), "x".repeat(*size)).unwrap();
+ }
+}
+
+#[cargo_test]
+fn clean_max_git_size_untracked() {
+ // If there are git directories that aren't tracked in the database,
+ // `--max-git-size` should pick them up.
+ //
+ // The db_name of "example" depends on the sorting order of the names ("e"
+ // should be after "c"), so that the db comes after the checkouts.
+ setup_fake_git_sizes("example", 5000, &[1000, 2000]);
+ cargo_process(&format!("clean -Zgc -v --max-git-size=7000"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr(
+ "\
+[REMOVING] [ROOT]/home/.cargo/git/checkouts/example/co0
+[REMOVED] [..]
+",
+ )
+ .run();
+ cargo_process(&format!("clean -Zgc -v --max-git-size=5000"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr(
+ "\
+[REMOVING] [ROOT]/home/.cargo/git/checkouts/example/co1
+[REMOVED] [..]
+",
+ )
+ .run();
+ cargo_process(&format!("clean -Zgc -v --max-git-size=0"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr(
+ "\
+[REMOVING] [ROOT]/home/.cargo/git/db/example
+[REMOVED] [..]
+",
+ )
+ .run();
+}
+
+#[cargo_test]
+fn clean_max_git_size_deletes_co_from_db() {
+ // In the scenario where it thinks it needs to delete the db, it should
+ // also delete all the checkouts.
+ //
+ // The db_name of "abc" depends on the sorting order of the names ("a"
+ // should be before "c"), so that the db comes before the checkouts.
+ setup_fake_git_sizes("abc", 5000, &[1000, 2000]);
+ // This deletes everything because it tries to delete the db, which then
+ // deletes all checkouts.
+ cargo_process(&format!("clean -Zgc -v --max-git-size=3000"))
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr(
+ "\
+[REMOVING] [ROOT]/home/.cargo/git/db/abc
+[REMOVING] [ROOT]/home/.cargo/git/checkouts/abc/co1
+[REMOVING] [ROOT]/home/.cargo/git/checkouts/abc/co0
+[REMOVED] [..]
+",
+ )
+ .run();
+}
+
+#[cargo_test]
+fn handles_missing_index() {
+ // Checks behavior when index is missing.
+ let p = basic_foo_bar_project();
+ p.cargo("fetch -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .run();
+ paths::home().join(".cargo/registry/index").rm_rf();
+ cargo_process("clean -v --max-download-size=0 -Zgc")
+ .masquerade_as_nightly_cargo(&["gc"])
+ .with_stderr_unordered(
+ "\
+[REMOVING] [ROOT]/home/.cargo/registry/cache/[..]
+[REMOVING] [ROOT]/home/.cargo/registry/src/[..]
+[REMOVED] [..]
+",
+ )
+ .run();
+}
+
+#[cargo_test]
+fn handles_missing_git_db() {
+ // Checks behavior when git db is missing.
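+ // (The checkouts remain on disk, so clean should still remove them even
+ // though the db they were created from is gone.)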
+ let git_project = git::new("bar", |p| { + p.file("Cargo.toml", &basic_manifest("bar", "1.0.0")) + .file("src/lib.rs", "") + }); + let p = project() + .file( + "Cargo.toml", + &format!( + r#" + [package] + name = "foo" + version = "0.1.0" + + [dependencies] + bar = {{ git = '{}' }} + "#, + git_project.url() + ), + ) + .file("src/lib.rs", "") + .build(); + p.cargo("fetch -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .run(); + paths::home().join(".cargo/git/db").rm_rf(); + cargo_process("clean -v --max-git-size=0 -Zgc") + .masquerade_as_nightly_cargo(&["gc"]) + .with_stderr( + "\ +[REMOVING] [ROOT]/home/.cargo/git/checkouts/[..] +[REMOVED] [..] +", + ) + .run(); +} diff --git a/tests/testsuite/main.rs b/tests/testsuite/main.rs index 07f749e34..e2e46c400 100644 --- a/tests/testsuite/main.rs +++ b/tests/testsuite/main.rs @@ -98,6 +98,7 @@ mod git_auth; mod git_gc; mod git_shallow; mod glob_targets; +mod global_cache_tracker; mod help; mod https; mod inheritable_workspace_fields;