fix(embedded): Handle more parsing corner cases (#15187)

### What does this PR try to resolve?

This is part of #12207. I found these while implementing frontmatter support within rustc.

I'll likely do another pass when I finish rustc support to
- Unify tests between cargo and rustc
- Improve error messages

### How should we test and review this PR?

### Additional information
2025-09-25 11:14:46 +00:00 · 2025-02-16 05:05:53 +00:00 · 2025-02-16 05:05:53 +00:00 · 3b784a42e3
commit 3b784a42e3
parent 0a4aff2e78 fd2000b106
1 changed files with 118 additions and 33 deletions
--- a/src/cargo/util/toml/embedded.rs
+++ b/src/cargo/util/toml/embedded.rs
@ -140,44 +140,28 @@ impl<'s> ScriptSource<'s> {
            content: input,
        };
-        // See rust-lang/rust's compiler/rustc_lexer/src/lib.rs's `strip_shebang`
+        if let Some(shebang_end) = strip_shebang(source.content) {
-        // Shebang must start with `#!` literally, without any preceding whitespace.
+            let (shebang, content) = source.content.split_at(shebang_end);
        // For simplicity we consider any line starting with `#!` a shebang,
        // regardless of restrictions put on shebangs by specific platforms.
        if let Some(rest) = source.content.strip_prefix("#!") {
            // Ok, this is a shebang but if the next non-whitespace token is `[`,
            // then it may be valid Rust code, so consider it Rust code.
            //
            // NOTE: rustc considers line and block comments to be whitespace but to avoid
            // any more awareness of Rust grammar, we are excluding it.
            if rest.trim_start().starts_with('[') {
                return Ok(source);
            }
            // No other choice than to consider this a shebang.
            let newline_end = source
                .content
                .find('\n')
                .map(|pos| pos + 1)
                .unwrap_or(source.content.len());
            let (shebang, content) = source.content.split_at(newline_end);
            source.shebang = Some(shebang);
            source.content = content;
        }
        const FENCE_CHAR: char = '-';
-        let mut trimmed_content = source.content;
+        let mut rest = source.content;
-        while !trimmed_content.is_empty() {
+        while !rest.is_empty() {
-            let c = trimmed_content;
+            let without_spaces = rest.trim_start_matches([' ', '\t']);
-            let c = c.trim_start_matches([' ', '\t']);
+            let without_nl = without_spaces.trim_start_matches(['\r', '\n']);
-            let c = c.trim_start_matches(['\r', '\n']);
+            if without_nl == rest {
-            if c == trimmed_content {
+                // nothing trimmed
                break;
            } else if without_nl == without_spaces {
                // frontmatter must come after a newline
                return Ok(source);
            }
-            trimmed_content = c;
+            rest = without_nl;
        }
-        let fence_end = trimmed_content
+        let fence_end = rest
            .char_indices()
            .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
            .unwrap_or(source.content.len());
@ -190,8 +174,9 @@ impl<'s> ScriptSource<'s> {
                    "found {fence_end} `{FENCE_CHAR}` in rust frontmatter, expected at least 3"
                )
            }
-            _ => trimmed_content.split_at(fence_end),
+            _ => rest.split_at(fence_end),
        };
        let nl_fence_pattern = format!("\n{fence_pattern}");
        let (info, content) = rest.split_once("\n").unwrap_or((rest, ""));
        let info = info.trim();
        if !info.is_empty() {
@ -199,11 +184,11 @@ impl<'s> ScriptSource<'s> {
        }
        source.content = content;
-        let Some((frontmatter, content)) = source.content.split_once(fence_pattern) else {
+        let Some(frontmatter_nl) = source.content.find(&nl_fence_pattern) else {
            anyhow::bail!("no closing `{fence_pattern}` found for frontmatter");
        };
-        source.frontmatter = Some(frontmatter);
+        source.frontmatter = Some(&source.content[..frontmatter_nl + 1]);
-        source.content = content;
+        source.content = &source.content[frontmatter_nl + nl_fence_pattern.len()..];
        let (line, content) = source
            .content
@ -235,6 +220,26 @@ impl<'s> ScriptSource<'s> {
    }
 }
 /// Returns the byte length of a leading shebang line in `input`, if there is one.
 ///
 /// The returned offset includes the terminating `\n` when present; otherwise it
 /// is the length of the whole input. `None` means `input` has no shebang.
 fn strip_shebang(input: &str) -> Option<usize> {
    // Modeled on `strip_shebang` in rust-lang/rust's compiler/rustc_lexer/src/lib.rs.
    // The `#!` must be the very first bytes of the input (no leading whitespace).
    // Platform-specific shebang restrictions are deliberately ignored: any line
    // that starts with `#!` is a candidate.
    let after_marker = input.strip_prefix("#!")?;

    // `#!` followed by `[` as the next non-whitespace character may be valid Rust
    // code, so it is treated as Rust code rather than a shebang.
    //
    // NOTE: rustc also treats line and block comments as whitespace here; that is
    // intentionally not replicated to avoid more awareness of Rust grammar.
    if after_marker.trim_start().starts_with('[') {
        return None;
    }

    // Anything else has to be a shebang; it runs through the end of the first line.
    let shebang_end = match input.find('\n') {
        Some(newline) => newline + 1,
        None => input.len(),
    };
    Some(shebang_end)
 }
 #[cfg(test)]
 mod test_expand {
    use snapbox::assert_data_eq;
@ -466,6 +471,86 @@ fn main() {}
        );
    }
    // A fence that is indented is not recognized as frontmatter: the expected
    // snapshot reports `frontmatter: None` and the indented lines stay in `content`.
    #[test]
    fn split_indent() {
        assert_source(
            r#"#!/usr/bin/env cargo
    ---
    [dependencies]
    time="0.1.25"
    ----
 fn main() {}
 "#,
            str![[r##"
 shebang: "#!/usr/bin/env cargo\n"
 info: None
 frontmatter: None
 content: "    ---\n    [dependencies]\n    time=\"0.1.25\"\n    ----\n\nfn main() {}\n"
 "##]],
        );
    }
    // A longer outer fence (`-----`) allows shorter `---` lines inside the
    // frontmatter body: they are expected to be captured verbatim as frontmatter.
    #[test]
    fn split_escaped() {
        assert_source(
            r#"#!/usr/bin/env cargo
 -----
 ---
 ---
 -----
 fn main() {}
 "#,
            str![[r##"
 shebang: "#!/usr/bin/env cargo\n"
 info: None
 frontmatter: "---\n---\n"
 content: "\nfn main() {}\n"
 "##]],
        );
    }
    // The inverse of escaping: a `---` fence wrapping longer `-----` lines is
    // rejected. A `-----` line is matched as the closing `---` fence, and the
    // leftover `--` is reported as unexpected trailing content.
    #[test]
    fn split_invalid_escaped() {
        assert_err(
            ScriptSource::parse(
                r#"#!/usr/bin/env cargo
 ---
 -----
 -----
 ---
 fn main() {}
 "#,
            ),
            str!["unexpected trailing content on closing fence: `--`"],
        );
    }
    // Dashes in the middle of a line (`Hello---`) do not close the frontmatter;
    // the expected frontmatter keeps that line and ends at the `---` line that
    // starts at the beginning of a line.
    #[test]
    fn split_dashes_in_body() {
        assert_source(
            r#"#!/usr/bin/env cargo
 ---
 Hello---
 World
 ---
 fn main() {}
 "#,
            str![[r##"
 shebang: "#!/usr/bin/env cargo\n"
 info: None
 frontmatter: "Hello---\nWorld\n"
 content: "\nfn main() {}\n"
 "##]],
        );
    }
    #[test]
    fn split_mismatched_dashes() {
        assert_err(