Auto merge of #7368 - alexcrichton:canonical-urls-omg, r=ehuss

Work with canonical URLs in `[patch]`

This commit addresses an issue with how the resolver processes `[patch]`
annotations in manifests and lock files. Previously the resolver would
use the raw `Url` coming out of a manifest, but the rest of resolution,
when comparing `SourceId`, uses a canonical form of a `Url` rather than
the actual raw `Url`. This ended up causing discrepancies like those
found in #7282.

To fix the issue all `patch` intermediate storage in the resolver uses a
newly-added `CanonicalUrl` type instead of a `Url`. This
`CanonicalUrl` is then also used throughout the codebase, and all
lookups in the resolver as switched to using `CanonicalUrl` instead of
`Url`, which...

Closes #7282
This commit is contained in:
bors 2019-09-17 16:42:36 +00:00
commit 35c55a9320
9 changed files with 200 additions and 91 deletions

View File

@ -9,7 +9,7 @@ use crate::core::PackageSet;
use crate::core::{Dependency, PackageId, Source, SourceId, SourceMap, Summary};
use crate::sources::config::SourceConfigMap;
use crate::util::errors::{CargoResult, CargoResultExt};
use crate::util::{profile, Config};
use crate::util::{profile, CanonicalUrl, Config};
/// Source of information about a group of packages.
///
@ -75,9 +75,9 @@ pub struct PackageRegistry<'cfg> {
yanked_whitelist: HashSet<PackageId>,
source_config: SourceConfigMap<'cfg>,
patches: HashMap<Url, Vec<Summary>>,
patches: HashMap<CanonicalUrl, Vec<Summary>>,
patches_locked: bool,
patches_available: HashMap<Url, Vec<PackageId>>,
patches_available: HashMap<CanonicalUrl, Vec<PackageId>>,
}
/// A map of all "locked packages" which is filled in when parsing a lock file
@ -230,6 +230,8 @@ impl<'cfg> PackageRegistry<'cfg> {
/// `query` until `lock_patches` is called below, which should be called
/// once all patches have been added.
pub fn patch(&mut self, url: &Url, deps: &[Dependency]) -> CargoResult<()> {
let canonical = CanonicalUrl::new(url)?;
// First up we need to actually resolve each `deps` specification to
// precisely one summary. We're not using the `query` method below as it
// internally uses maps we're building up as part of this method
@ -284,7 +286,7 @@ impl<'cfg> PackageRegistry<'cfg> {
url
)
}
if summary.package_id().source_id().url() == url {
if *summary.package_id().source_id().canonical_url() == canonical {
failure::bail!(
"patch for `{}` in `{}` points to the same source, but \
patches must point to different sources",
@ -317,8 +319,8 @@ impl<'cfg> PackageRegistry<'cfg> {
// `lock` method) and otherwise store the unlocked summaries in
// `patches` to get locked in a future call to `lock_patches`.
let ids = unlocked_summaries.iter().map(|s| s.package_id()).collect();
self.patches_available.insert(url.clone(), ids);
self.patches.insert(url.clone(), unlocked_summaries);
self.patches_available.insert(canonical.clone(), ids);
self.patches.insert(canonical, unlocked_summaries);
Ok(())
}
@ -340,8 +342,11 @@ impl<'cfg> PackageRegistry<'cfg> {
self.patches_locked = true;
}
pub fn patches(&self) -> &HashMap<Url, Vec<Summary>> {
&self.patches
pub fn patches(&self) -> Vec<Summary> {
self.patches
.values()
.flat_map(|v| v.iter().cloned())
.collect()
}
fn load(&mut self, source_id: SourceId, kind: Kind) -> CargoResult<()> {
@ -472,7 +477,7 @@ impl<'cfg> Registry for PackageRegistry<'cfg> {
// This means that `dep.matches(..)` will always return false, when
// what we really care about is the name/version match.
let mut patches = Vec::<Summary>::new();
if let Some(extra) = self.patches.get(dep.source_id().url()) {
if let Some(extra) = self.patches.get(dep.source_id().canonical_url()) {
patches.extend(
extra
.iter()
@ -605,7 +610,11 @@ impl<'cfg> Registry for PackageRegistry<'cfg> {
}
}
fn lock(locked: &LockedMap, patches: &HashMap<Url, Vec<PackageId>>, summary: Summary) -> Summary {
fn lock(
locked: &LockedMap,
patches: &HashMap<CanonicalUrl, Vec<PackageId>>,
summary: Summary,
) -> Summary {
let pair = locked
.get(&summary.source_id())
.and_then(|map| map.get(&*summary.name()))
@ -669,7 +678,7 @@ fn lock(locked: &LockedMap, patches: &HashMap<Url, Vec<PackageId>>, summary: Sum
// map, and we see if `id` is contained in the list of patches
// for that url. If it is then this lock is still valid,
// otherwise the lock is no longer valid.
match patches.get(dep.source_id().url()) {
match patches.get(dep.source_id().canonical_url()) {
Some(list) => list.contains(&id),
None => false,
}

View File

@ -3,8 +3,6 @@ use std::collections::{HashMap, HashSet};
use std::fmt;
use std::iter::FromIterator;
use url::Url;
use crate::core::dependency::Kind;
use crate::core::{Dependency, PackageId, PackageIdSpec, Summary, Target};
use crate::util::errors::CargoResult;
@ -114,8 +112,8 @@ impl Resolve {
self.graph.path_to_top(pkg)
}
pub fn register_used_patches(&mut self, patches: &HashMap<Url, Vec<Summary>>) {
for summary in patches.values().flat_map(|v| v) {
pub fn register_used_patches(&mut self, patches: &[Summary]) {
for summary in patches {
if self.iter().any(|id| id == summary.package_id()) {
continue;
}

View File

@ -15,10 +15,9 @@ use url::Url;
use crate::core::PackageId;
use crate::ops;
use crate::sources::git;
use crate::sources::DirectorySource;
use crate::sources::{GitSource, PathSource, RegistrySource, CRATES_IO_INDEX};
use crate::util::{CargoResult, Config, IntoUrl};
use crate::util::{CanonicalUrl, CargoResult, Config, IntoUrl};
lazy_static::lazy_static! {
static ref SOURCE_ID_CACHE: Mutex<HashSet<&'static SourceIdInner>> = Mutex::new(HashSet::new());
@ -34,8 +33,8 @@ pub struct SourceId {
struct SourceIdInner {
/// The source URL.
url: Url,
/// The result of `git::canonicalize_url()` on `url` field.
canonical_url: Url,
/// The canonical version of the above url
canonical_url: CanonicalUrl,
/// The source kind.
kind: Kind,
/// For example, the exact Git revision of the specified branch for a Git Source.
@ -80,7 +79,7 @@ impl SourceId {
fn new(kind: Kind, url: Url) -> CargoResult<SourceId> {
let source_id = SourceId::wrap(SourceIdInner {
kind,
canonical_url: git::canonicalize_url(&url)?,
canonical_url: CanonicalUrl::new(&url)?,
url,
precise: None,
name: None,
@ -216,7 +215,7 @@ impl SourceId {
let url = config.get_registry_index(key)?;
Ok(SourceId::wrap(SourceIdInner {
kind: Kind::Registry,
canonical_url: git::canonicalize_url(&url)?,
canonical_url: CanonicalUrl::new(&url)?,
url,
precise: None,
name: Some(key.to_string()),
@ -228,6 +227,12 @@ impl SourceId {
&self.inner.url
}
/// Gets the canonical URL of this source, used for internal comparison
/// purposes.
pub fn canonical_url(&self) -> &CanonicalUrl {
&self.inner.canonical_url
}
pub fn display_index(self) -> String {
if self.is_default_registry() {
"crates.io index".to_string()
@ -508,7 +513,7 @@ impl Hash for SourceId {
fn hash<S: hash::Hasher>(&self, into: &mut S) {
self.inner.kind.hash(into);
match self.inner.kind {
Kind::Git(_) => self.inner.canonical_url.as_str().hash(into),
Kind::Git(_) => self.inner.canonical_url.hash(into),
_ => self.inner.url.as_str().hash(into),
}
}

View File

@ -343,7 +343,7 @@ pub fn resolve_with_previous<'cfg>(
Some(ws.config()),
ws.features().require(Feature::public_dependency()).is_ok(),
)?;
resolved.register_used_patches(registry.patches());
resolved.register_used_patches(&registry.patches());
if register_patches {
// It would be good if this warning was more targeted and helpful
// (such as showing close candidates that failed to match). However,

View File

@ -1,4 +1,4 @@
pub use self::source::{canonicalize_url, GitSource};
pub use self::source::GitSource;
pub use self::utils::{fetch, GitCheckout, GitDatabase, GitRemote, GitRevision};
mod source;
mod utils;

View File

@ -27,7 +27,7 @@ impl<'cfg> GitSource<'cfg> {
assert!(source_id.is_git(), "id is not git, id={}", source_id);
let remote = GitRemote::new(source_id.url());
let ident = ident(source_id.url())?;
let ident = ident(&source_id);
let reference = match source_id.precise() {
Some(s) => GitReference::Rev(s.to_string()),
@ -59,58 +59,17 @@ impl<'cfg> GitSource<'cfg> {
}
}
fn ident(url: &Url) -> CargoResult<String> {
let url = canonicalize_url(url)?;
let ident = url
fn ident(id: &SourceId) -> String {
let ident = id
.canonical_url()
.raw_canonicalized_url()
.path_segments()
.and_then(|mut s| s.next_back())
.and_then(|s| s.rev().next())
.unwrap_or("");
let ident = if ident == "" { "_empty" } else { ident };
Ok(format!("{}-{}", ident, short_hash(&url)))
}
// Some hacks and heuristics for making equivalent URLs hash the same.
pub fn canonicalize_url(url: &Url) -> CargoResult<Url> {
let mut url = url.clone();
// cannot-be-a-base-urls (e.g., `github.com:rust-lang-nursery/rustfmt.git`)
// are not supported.
if url.cannot_be_a_base() {
failure::bail!(
"invalid url `{}`: cannot-be-a-base-URLs are not supported",
url
)
}
// Strip a trailing slash.
if url.path().ends_with('/') {
url.path_segments_mut().unwrap().pop_if_empty();
}
// HACK: for GitHub URLs specifically, just lower-case
// everything. GitHub treats both the same, but they hash
// differently, and we're gonna be hashing them. This wants a more
// general solution, and also we're almost certainly not using the
// same case conversion rules that GitHub does. (See issue #84.)
if url.host_str() == Some("github.com") {
url.set_scheme("https").unwrap();
let path = url.path().to_lowercase();
url.set_path(&path);
}
// Repos can generally be accessed with or without `.git` extension.
let needs_chopping = url.path().ends_with(".git");
if needs_chopping {
let last = {
let last = url.path_segments().unwrap().next_back().unwrap();
last[..last.len() - 4].to_owned()
};
url.path_segments_mut().unwrap().pop().push(&last);
}
Ok(url)
format!("{}-{}", ident, short_hash(id.canonical_url()))
}
impl<'cfg> Debug for GitSource<'cfg> {
@ -241,56 +200,54 @@ impl<'cfg> Source for GitSource<'cfg> {
#[cfg(test)]
mod test {
use super::ident;
use crate::core::{GitReference, SourceId};
use crate::util::IntoUrl;
use url::Url;
#[test]
pub fn test_url_to_path_ident_with_path() {
let ident = ident(&url("https://github.com/carlhuda/cargo")).unwrap();
let ident = ident(&src("https://github.com/carlhuda/cargo"));
assert!(ident.starts_with("cargo-"));
}
#[test]
pub fn test_url_to_path_ident_without_path() {
let ident = ident(&url("https://github.com")).unwrap();
let ident = ident(&src("https://github.com"));
assert!(ident.starts_with("_empty-"));
}
#[test]
fn test_canonicalize_idents_by_stripping_trailing_url_slash() {
let ident1 = ident(&url("https://github.com/PistonDevelopers/piston/")).unwrap();
let ident2 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
let ident1 = ident(&src("https://github.com/PistonDevelopers/piston/"));
let ident2 = ident(&src("https://github.com/PistonDevelopers/piston"));
assert_eq!(ident1, ident2);
}
#[test]
fn test_canonicalize_idents_by_lowercasing_github_urls() {
let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
let ident2 = ident(&url("https://github.com/pistondevelopers/piston")).unwrap();
let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
let ident2 = ident(&src("https://github.com/pistondevelopers/piston"));
assert_eq!(ident1, ident2);
}
#[test]
fn test_canonicalize_idents_by_stripping_dot_git() {
let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
let ident2 = ident(&url("https://github.com/PistonDevelopers/piston.git")).unwrap();
let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
let ident2 = ident(&src("https://github.com/PistonDevelopers/piston.git"));
assert_eq!(ident1, ident2);
}
#[test]
fn test_canonicalize_idents_different_protocols() {
let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap();
let ident2 = ident(&url("git://github.com/PistonDevelopers/piston")).unwrap();
let ident1 = ident(&src("https://github.com/PistonDevelopers/piston"));
let ident2 = ident(&src("git://github.com/PistonDevelopers/piston"));
assert_eq!(ident1, ident2);
}
#[test]
fn test_canonicalize_cannot_be_a_base_urls() {
assert!(ident(&url("github.com:PistonDevelopers/piston")).is_err());
assert!(ident(&url("google.com:PistonDevelopers/piston")).is_err());
}
fn url(s: &str) -> Url {
s.into_url().unwrap()
fn src(s: &str) -> SourceId {
SourceId::for_git(
&s.into_url().unwrap(),
GitReference::Branch("master".to_string()),
)
.unwrap()
}
}

View File

@ -0,0 +1,73 @@
use crate::util::errors::CargoResult;
use std::hash::{self, Hash};
use url::Url;
/// A newtype wrapper around `Url` which represents a "canonical" version of an
/// original URL.
///
/// A "canonical" url is only intended for internal comparison purposes in
/// Cargo. It's to help paper over mistakes such as depending on
/// `github.com/foo/bar` vs `github.com/foo/bar.git`. This is **only** for
/// internal purposes within Cargo and provides no means to actually read the
/// underlying string value of the `Url` it contains. This is intentional,
/// because all fetching should still happen within the context of the original
/// URL.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct CanonicalUrl(Url);
impl CanonicalUrl {
pub fn new(url: &Url) -> CargoResult<CanonicalUrl> {
let mut url = url.clone();
// cannot-be-a-base-urls (e.g., `github.com:rust-lang-nursery/rustfmt.git`)
// are not supported.
if url.cannot_be_a_base() {
failure::bail!(
"invalid url `{}`: cannot-be-a-base-URLs are not supported",
url
)
}
// Strip a trailing slash.
if url.path().ends_with('/') {
url.path_segments_mut().unwrap().pop_if_empty();
}
// For GitHub URLs specifically, just lower-case everything. GitHub
// treats both the same, but they hash differently, and we're gonna be
// hashing them. This wants a more general solution, and also we're
// almost certainly not using the same case conversion rules that GitHub
// does. (See issue #84)
if url.host_str() == Some("github.com") {
url.set_scheme("https").unwrap();
let path = url.path().to_lowercase();
url.set_path(&path);
}
// Repos can generally be accessed with or without `.git` extension.
let needs_chopping = url.path().ends_with(".git");
if needs_chopping {
let last = {
let last = url.path_segments().unwrap().next_back().unwrap();
last[..last.len() - 4].to_owned()
};
url.path_segments_mut().unwrap().pop().push(&last);
}
Ok(CanonicalUrl(url))
}
/// Returns the raw canonicalized URL, although beware that this should
/// never be used/displayed/etc, it should only be used for internal data
/// structures and hashes and such.
pub fn raw_canonicalized_url(&self) -> &Url {
&self.0
}
}
// See comment in `source_id.rs` for why we explicitly use `as_str()` here.
impl Hash for CanonicalUrl {
fn hash<S: hash::Hasher>(&self, into: &mut S) {
self.0.as_str().hash(into);
}
}

View File

@ -1,5 +1,6 @@
use std::time::Duration;
pub use self::canonical_url::CanonicalUrl;
pub use self::cfg::{Cfg, CfgExpr};
pub use self::config::{homedir, Config, ConfigValue};
pub use self::dependency_queue::DependencyQueue;
@ -28,6 +29,7 @@ pub use self::workspace::{
print_available_tests,
};
mod canonical_url;
mod cfg;
pub mod command_prelude;
pub mod config;

View File

@ -1338,3 +1338,68 @@ version. [..]
)
.run();
}
#[cargo_test]
fn canonicalize_a_bunch() {
let base = git::repo(&paths::root().join("base"))
.file("Cargo.toml", &basic_manifest("base", "0.1.0"))
.file("src/lib.rs", "")
.build();
let intermediate = git::repo(&paths::root().join("intermediate"))
.file(
"Cargo.toml",
&format!(
r#"
[package]
name = "intermediate"
version = "0.1.0"
[dependencies]
# Note the lack of trailing slash
base = {{ git = '{}' }}
"#,
base.url(),
),
)
.file("src/lib.rs", "pub fn f() { base::f() }")
.build();
let newbase = git::repo(&paths::root().join("newbase"))
.file("Cargo.toml", &basic_manifest("base", "0.1.0"))
.file("src/lib.rs", "pub fn f() {}")
.build();
let p = project()
.file(
"Cargo.toml",
&format!(
r#"
[package]
name = "foo"
version = "0.0.1"
[dependencies]
# Note the trailing slashes
base = {{ git = '{base}/' }}
intermediate = {{ git = '{intermediate}/' }}
[patch.'{base}'] # Note the lack of trailing slash
base = {{ git = '{newbase}' }}
"#,
base = base.url(),
intermediate = intermediate.url(),
newbase = newbase.url(),
),
)
.file("src/lib.rs", "pub fn a() { base::f(); intermediate::f() }")
.build();
// Once to make sure it actually works
p.cargo("build").run();
// Then a few more times for good measure to ensure no weird warnings about
// `[patch]` are printed.
p.cargo("build").with_stderr("[FINISHED] [..]").run();
p.cargo("build").with_stderr("[FINISHED] [..]").run();
}