Auto merge of #7368 - alexcrichton:canonical-urls-omg, r=ehuss

Work with canonical URLs in `[patch]` This commit addresses an issue with how the resolver processes `[patch]` annotations in manifests and lock files. Previously the resolver would use the raw `Url` coming out of a manifest, but the rest of resolution, when comparing `SourceId`, uses a canonical form of a `Url` rather than the actual raw `Url`. This ended up causing discrepancies like those found in #7282. To fix the issue all `patch` intermediate storage in the resolver uses a newly-added `CanonicalUrl` type instead of a `Url`. This `CanonicalUrl` is then also used throughout the codebase, and all lookups in the resolver as switched to using `CanonicalUrl` instead of `Url`, which... Closes #7282
2025-09-28 11:20:36 +00:00 · 2019-09-17 16:42:36 +00:00 · 2019-09-17 16:42:36 +00:00 · 35c55a9320
commit 35c55a9320
parent 658bde16c3 e545412251
9 changed files with 200 additions and 91 deletions
--- a/src/cargo/core/registry.rs
+++ b/src/cargo/core/registry.rs
@ -9,7 +9,7 @@ use crate::core::PackageSet;
 use crate::core::{Dependency, PackageId, Source, SourceId, SourceMap, Summary};
 use crate::sources::config::SourceConfigMap;
 use crate::util::errors::{CargoResult, CargoResultExt};
-use crate::util::{profile, Config};
+use crate::util::{profile, CanonicalUrl, Config};

 /// Source of information about a group of packages.
 ///
@ -75,9 +75,9 @@ pub struct PackageRegistry<'cfg> {
    yanked_whitelist: HashSet<PackageId>,
    source_config: SourceConfigMap<'cfg>,

-    patches: HashMap<Url, Vec<Summary>>,
+    patches: HashMap<CanonicalUrl, Vec<Summary>>,
    patches_locked: bool,
-    patches_available: HashMap<Url, Vec<PackageId>>,
+    patches_available: HashMap<CanonicalUrl, Vec<PackageId>>,
 }

 /// A map of all "locked packages" which is filled in when parsing a lock file
@ -230,6 +230,8 @@ impl<'cfg> PackageRegistry<'cfg> {
    /// `query` until `lock_patches` is called below, which should be called
    /// once all patches have been added.
    pub fn patch(&mut self, url: &Url, deps: &[Dependency]) -> CargoResult<()> {
+        let canonical = CanonicalUrl::new(url)?;
+
        // First up we need to actually resolve each `deps` specification to
        // precisely one summary. We're not using the `query` method below as it
        // internally uses maps we're building up as part of this method
@ -284,7 +286,7 @@ impl<'cfg> PackageRegistry<'cfg> {
                        url
                    )
                }
-                if summary.package_id().source_id().url() == url {
+                if *summary.package_id().source_id().canonical_url() == canonical {
                    failure::bail!(
                        "patch for `{}` in `{}` points to the same source, but \
                         patches must point to different sources",
@ -317,8 +319,8 @@ impl<'cfg> PackageRegistry<'cfg> {
        // `lock` method) and otherwise store the unlocked summaries in
        // `patches` to get locked in a future call to `lock_patches`.
        let ids = unlocked_summaries.iter().map(|s| s.package_id()).collect();
-        self.patches_available.insert(url.clone(), ids);
-        self.patches.insert(url.clone(), unlocked_summaries);
+        self.patches_available.insert(canonical.clone(), ids);
+        self.patches.insert(canonical, unlocked_summaries);

        Ok(())
    }
@ -340,8 +342,11 @@ impl<'cfg> PackageRegistry<'cfg> {
        self.patches_locked = true;
    }

-    pub fn patches(&self) -> &HashMap<Url, Vec<Summary>> {
-        &self.patches
+    pub fn patches(&self) -> Vec<Summary> {
+        self.patches
+            .values()
+            .flat_map(|v| v.iter().cloned())
+            .collect()
    }

    fn load(&mut self, source_id: SourceId, kind: Kind) -> CargoResult<()> {
@ -472,7 +477,7 @@ impl<'cfg> Registry for PackageRegistry<'cfg> {
            // This means that `dep.matches(..)` will always return false, when
            // what we really care about is the name/version match.
            let mut patches = Vec::<Summary>::new();
-            if let Some(extra) = self.patches.get(dep.source_id().url()) {
+            if let Some(extra) = self.patches.get(dep.source_id().canonical_url()) {
                patches.extend(
                    extra
                        .iter()
@ -605,7 +610,11 @@ impl<'cfg> Registry for PackageRegistry<'cfg> {
    }
 }

-fn lock(locked: &LockedMap, patches: &HashMap<Url, Vec<PackageId>>, summary: Summary) -> Summary {
+fn lock(
+    locked: &LockedMap,
+    patches: &HashMap<CanonicalUrl, Vec<PackageId>>,
+    summary: Summary,
+) -> Summary {
    let pair = locked
        .get(&summary.source_id())
        .and_then(|map| map.get(&*summary.name()))
@ -669,7 +678,7 @@ fn lock(locked: &LockedMap, patches: &HashMap<Url, Vec<PackageId>>, summary: Sum
                // map, and we see if `id` is contained in the list of patches
                // for that url. If it is then this lock is still valid,
                // otherwise the lock is no longer valid.
-                match patches.get(dep.source_id().url()) {
+                match patches.get(dep.source_id().canonical_url()) {
                    Some(list) => list.contains(&id),
                    None => false,
                }
--- a/src/cargo/core/resolver/resolve.rs
+++ b/src/cargo/core/resolver/resolve.rs
@ -3,8 +3,6 @@ use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::iter::FromIterator;

-use url::Url;
-
 use crate::core::dependency::Kind;
 use crate::core::{Dependency, PackageId, PackageIdSpec, Summary, Target};
 use crate::util::errors::CargoResult;
@ -114,8 +112,8 @@ impl Resolve {
        self.graph.path_to_top(pkg)
    }

-    pub fn register_used_patches(&mut self, patches: &HashMap<Url, Vec<Summary>>) {
-        for summary in patches.values().flat_map(|v| v) {
+    pub fn register_used_patches(&mut self, patches: &[Summary]) {
+        for summary in patches {
            if self.iter().any(|id| id == summary.package_id()) {
                continue;
            }
--- a/src/cargo/core/source/source_id.rs
+++ b/src/cargo/core/source/source_id.rs
@ -15,10 +15,9 @@ use url::Url;

 use crate::core::PackageId;
 use crate::ops;
-use crate::sources::git;
 use crate::sources::DirectorySource;
 use crate::sources::{GitSource, PathSource, RegistrySource, CRATES_IO_INDEX};
-use crate::util::{CargoResult, Config, IntoUrl};
+use crate::util::{CanonicalUrl, CargoResult, Config, IntoUrl};

 lazy_static::lazy_static! {
    static ref SOURCE_ID_CACHE: Mutex<HashSet<&'static SourceIdInner>> = Mutex::new(HashSet::new());
@ -34,8 +33,8 @@ pub struct SourceId {
 struct SourceIdInner {
    /// The source URL.
    url: Url,
-    /// The result of `git::canonicalize_url()` on `url` field.
-    canonical_url: Url,
+    /// The canonical version of the above url
+    canonical_url: CanonicalUrl,
    /// The source kind.
    kind: Kind,
    /// For example, the exact Git revision of the specified branch for a Git Source.
@ -80,7 +79,7 @@ impl SourceId {
    fn new(kind: Kind, url: Url) -> CargoResult<SourceId> {
        let source_id = SourceId::wrap(SourceIdInner {
            kind,
-            canonical_url: git::canonicalize_url(&url)?,
+            canonical_url: CanonicalUrl::new(&url)?,
            url,
            precise: None,
            name: None,
@ -216,7 +215,7 @@ impl SourceId {
        let url = config.get_registry_index(key)?;
        Ok(SourceId::wrap(SourceIdInner {
            kind: Kind::Registry,
-            canonical_url: git::canonicalize_url(&url)?,
+            canonical_url: CanonicalUrl::new(&url)?,
            url,
            precise: None,
            name: Some(key.to_string()),
@ -228,6 +227,12 @@ impl SourceId {
        &self.inner.url
    }

+    /// Gets the canonical URL of this source, used for internal comparison
+    /// purposes.
+    pub fn canonical_url(&self) -> &CanonicalUrl {
+        &self.inner.canonical_url
+    }
+
    pub fn display_index(self) -> String {
        if self.is_default_registry() {
            "crates.io index".to_string()
@ -508,7 +513,7 @@ impl Hash for SourceId {
    fn hash<S: hash::Hasher>(&self, into: &mut S) {
        self.inner.kind.hash(into);
        match self.inner.kind {
-            Kind::Git(_) => self.inner.canonical_url.as_str().hash(into),
+            Kind::Git(_) => self.inner.canonical_url.hash(into),
            _ => self.inner.url.as_str().hash(into),
        }
    }
--- a/src/cargo/ops/resolve.rs
+++ b/src/cargo/ops/resolve.rs
@ -343,7 +343,7 @@ pub fn resolve_with_previous<'cfg>(
        Some(ws.config()),
        ws.features().require(Feature::public_dependency()).is_ok(),
    )?;
-    resolved.register_used_patches(registry.patches());
+    resolved.register_used_patches(&registry.patches());
    if register_patches {
        // It would be good if this warning was more targeted and helpful
        // (such as showing close candidates that failed to match). However,
--- a/src/cargo/sources/git/mod.rs
+++ b/src/cargo/sources/git/mod.rs
@ -1,4 +1,4 @@
-pub use self::source::{canonicalize_url, GitSource};
+pub use self::source::GitSource;
 pub use self::utils::{fetch, GitCheckout, GitDatabase, GitRemote, GitRevision};
 mod source;
 mod utils;
--- a/src/cargo/sources/git/source.rs
+++ b/src/cargo/sources/git/source.rs
@ -27,7 +27,7 @@ impl<'cfg> GitSource<'cfg> {
        assert!(source_id.is_git(), "id is not git, id={}", source_id);

        let remote = GitRemote::new(source_id.url());
-        let ident = ident(source_id.url())?;
+        let ident = ident(&source_id);

        let reference = match source_id.precise() {
            Some(s) => GitReference::Rev(s.to_string()),
@ -59,58 +59,17 @@ impl<'cfg> GitSource<'cfg> {
    }
 }

-fn ident(url: &Url) -> CargoResult<String> {
-    let url = canonicalize_url(url)?;
-    let ident = url
+fn ident(id: &SourceId) -> String {
+    let ident = id
+        .canonical_url()
+        .raw_canonicalized_url()
        .path_segments()
-        .and_then(|mut s| s.next_back())
+        .and_then(|s| s.rev().next())
        .unwrap_or("");

    let ident = if ident == "" { "_empty" } else { ident };

-    Ok(format!("{}-{}", ident, short_hash(&url)))
-}
-
-// Some hacks and heuristics for making equivalent URLs hash the same.
-pub fn canonicalize_url(url: &Url) -> CargoResult<Url> {
-    let mut url = url.clone();
-
-    // cannot-be-a-base-urls (e.g., `github.com:rust-lang-nursery/rustfmt.git`)
-    // are not supported.
-    if url.cannot_be_a_base() {
-        failure::bail!(
-            "invalid url `{}`: cannot-be-a-base-URLs are not supported",
-            url
-        )
-    }
-
-    // Strip a trailing slash.
-    if url.path().ends_with('/') {
-        url.path_segments_mut().unwrap().pop_if_empty();
-    }
-
-    // HACK: for GitHub URLs specifically, just lower-case
-    // everything. GitHub treats both the same, but they hash
-    // differently, and we're gonna be hashing them. This wants a more
-    // general solution, and also we're almost certainly not using the
-    // same case conversion rules that GitHub does. (See issue #84.)
-    if url.host_str() == Some("github.com") {
-        url.set_scheme("https").unwrap();
-        let path = url.path().to_lowercase();
-        url.set_path(&path);
-    }
-
-    // Repos can generally be accessed with or without `.git` extension.
-    let needs_chopping = url.path().ends_with(".git");
-    if needs_chopping {
-        let last = {
-            let last = url.path_segments().unwrap().next_back().unwrap();
-            last[..last.len() - 4].to_owned()
-        };
-        url.path_segments_mut().unwrap().pop().push(&last);
-    }
-
-    Ok(url)
+    format!("{}-{}", ident, short_hash(id.canonical_url()))
 }

 impl<'cfg> Debug for GitSource<'cfg> {
@ -241,56 +200,54 @@ impl<'cfg> Source for GitSource<'cfg> {
 #[cfg(test)]
 mod test {
    use super::ident;
+    use crate::core::{GitReference, SourceId};
    use crate::util::IntoUrl;
-    use url::Url;

    #[test]
    pub fn test_url_to_path_ident_with_path() {
-        let ident = ident(&url("https://github.com/carlhuda/cargo")).unwrap();
+        let ident = ident(&src("https://github.com/carlhuda/cargo"));
        assert!(ident.starts_with("cargo-"));
    }

    #[test]
    pub fn test_url_to_path_ident_without_path() {
-        let ident = ident(&url("https://github.com")).unwrap();
+        let ident = ident(&src("https://github.com"));
        assert!(ident.starts_with("_empty-"));
    }

    #[test]
    fn test_canonicalize_idents_by_stripping_trailing_url_slash() {
-        let ident1 = ident(&url("https://github.com/PistonDevelopers/piston/")).unwrap();
-        let ident2 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
+        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston/"));
+        let ident2 = ident(&src("https://github.com/PistonDevelopers/piston"));
        assert_eq!(ident1, ident2);
    }

    #[test]
    fn test_canonicalize_idents_by_lowercasing_github_urls() {
-        let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
-        let ident2 = ident(&url("https://github.com/pistondevelopers/piston")).unwrap();
+        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
+        let ident2 = ident(&src("https://github.com/pistondevelopers/piston"));
        assert_eq!(ident1, ident2);
    }

    #[test]
    fn test_canonicalize_idents_by_stripping_dot_git() {
-        let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
-        let ident2 = ident(&url("https://github.com/PistonDevelopers/piston.git")).unwrap();
+        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
+        let ident2 = ident(&src("https://github.com/PistonDevelopers/piston.git"));
        assert_eq!(ident1, ident2);
    }

    #[test]
    fn test_canonicalize_idents_different_protocols() {
-        let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
-        let ident2 = ident(&url("git://github.com/PistonDevelopers/piston")).unwrap();
+        let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
+        let ident2 = ident(&src("git://github.com/PistonDevelopers/piston"));
        assert_eq!(ident1, ident2);
    }

-    #[test]
-    fn test_canonicalize_cannot_be_a_base_urls() {
-        assert!(ident(&url("github.com:PistonDevelopers/piston")).is_err());
-        assert!(ident(&url("google.com:PistonDevelopers/piston")).is_err());
-    }
-
-    fn url(s: &str) -> Url {
-        s.into_url().unwrap()
+    fn src(s: &str) -> SourceId {
+        SourceId::for_git(
+            &s.into_url().unwrap(),
+            GitReference::Branch("master".to_string()),
+        )
+        .unwrap()
    }
 }
--- a/src/cargo/util/canonical_url.rs
+++ b/src/cargo/util/canonical_url.rs
@ -0,0 +1,73 @@
+use crate::util::errors::CargoResult;
+use std::hash::{self, Hash};
+use url::Url;
+
+/// A newtype wrapper around `Url` which represents a "canonical" version of an
+/// original URL.
+///
+/// A "canonical" url is only intended for internal comparison purposes in
+/// Cargo. It's to help paper over mistakes such as depending on
+/// `github.com/foo/bar` vs `github.com/foo/bar.git`. This is **only** for
+/// internal purposes within Cargo and provides no means to actually read the
+/// underlying string value of the `Url` it contains. This is intentional,
+/// because all fetching should still happen within the context of the original
+/// URL.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
+pub struct CanonicalUrl(Url);
+
+impl CanonicalUrl {
+    pub fn new(url: &Url) -> CargoResult<CanonicalUrl> {
+        let mut url = url.clone();
+
+        // cannot-be-a-base-urls (e.g., `github.com:rust-lang-nursery/rustfmt.git`)
+        // are not supported.
+        if url.cannot_be_a_base() {
+            failure::bail!(
+                "invalid url `{}`: cannot-be-a-base-URLs are not supported",
+                url
+            )
+        }
+
+        // Strip a trailing slash.
+        if url.path().ends_with('/') {
+            url.path_segments_mut().unwrap().pop_if_empty();
+        }
+
+        // For GitHub URLs specifically, just lower-case everything. GitHub
+        // treats both the same, but they hash differently, and we're gonna be
+        // hashing them. This wants a more general solution, and also we're
+        // almost certainly not using the same case conversion rules that GitHub
+        // does. (See issue #84)
+        if url.host_str() == Some("github.com") {
+            url.set_scheme("https").unwrap();
+            let path = url.path().to_lowercase();
+            url.set_path(&path);
+        }
+
+        // Repos can generally be accessed with or without `.git` extension.
+        let needs_chopping = url.path().ends_with(".git");
+        if needs_chopping {
+            let last = {
+                let last = url.path_segments().unwrap().next_back().unwrap();
+                last[..last.len() - 4].to_owned()
+            };
+            url.path_segments_mut().unwrap().pop().push(&last);
+        }
+
+        Ok(CanonicalUrl(url))
+    }
+
+    /// Returns the raw canonicalized URL, although beware that this should
+    /// never be used/displayed/etc, it should only be used for internal data
+    /// structures and hashes and such.
+    pub fn raw_canonicalized_url(&self) -> &Url {
+        &self.0
+    }
+}
+
+// See comment in `source_id.rs` for why we explicitly use `as_str()` here.
+impl Hash for CanonicalUrl {
+    fn hash<S: hash::Hasher>(&self, into: &mut S) {
+        self.0.as_str().hash(into);
+    }
+}
--- a/src/cargo/util/mod.rs
+++ b/src/cargo/util/mod.rs
@ -1,5 +1,6 @@
 use std::time::Duration;

+pub use self::canonical_url::CanonicalUrl;
 pub use self::cfg::{Cfg, CfgExpr};
 pub use self::config::{homedir, Config, ConfigValue};
 pub use self::dependency_queue::DependencyQueue;
@ -28,6 +29,7 @@ pub use self::workspace::{
    print_available_tests,
 };

+mod canonical_url;
 mod cfg;
 pub mod command_prelude;
 pub mod config;
--- a/tests/testsuite/patch.rs
+++ b/tests/testsuite/patch.rs
@ -1338,3 +1338,68 @@ version. [..]
        )
        .run();
 }
+
+#[cargo_test]
+fn canonicalize_a_bunch() {
+    let base = git::repo(&paths::root().join("base"))
+        .file("Cargo.toml", &basic_manifest("base", "0.1.0"))
+        .file("src/lib.rs", "")
+        .build();
+
+    let intermediate = git::repo(&paths::root().join("intermediate"))
+        .file(
+            "Cargo.toml",
+            &format!(
+                r#"
+                    [package]
+                    name = "intermediate"
+                    version = "0.1.0"
+
+                    [dependencies]
+                    # Note the lack of trailing slash
+                    base = {{ git = '{}' }}
+                "#,
+                base.url(),
+            ),
+        )
+        .file("src/lib.rs", "pub fn f() { base::f() }")
+        .build();
+
+    let newbase = git::repo(&paths::root().join("newbase"))
+        .file("Cargo.toml", &basic_manifest("base", "0.1.0"))
+        .file("src/lib.rs", "pub fn f() {}")
+        .build();
+
+    let p = project()
+        .file(
+            "Cargo.toml",
+            &format!(
+                r#"
+                    [package]
+                    name = "foo"
+                    version = "0.0.1"
+
+                    [dependencies]
+                    # Note the trailing slashes
+                    base = {{ git = '{base}/' }}
+                    intermediate = {{ git = '{intermediate}/' }}
+
+                    [patch.'{base}'] # Note the lack of trailing slash
+                    base = {{ git = '{newbase}' }}
+                "#,
+                base = base.url(),
+                intermediate = intermediate.url(),
+                newbase = newbase.url(),
+            ),
+        )
+        .file("src/lib.rs", "pub fn a() { base::f(); intermediate::f() }")
+        .build();
+
+    // Once to make sure it actually works
+    p.cargo("build").run();
+
+    // Then a few more times for good measure to ensure no weird warnings about
+    // `[patch]` are printed.
+    p.cargo("build").with_stderr("[FINISHED] [..]").run();
+    p.cargo("build").with_stderr("[FINISHED] [..]").run();
+}