mirror of
https://github.com/rust-lang/rust-analyzer.git
synced 2025-12-27 16:07:46 +00:00
Merge pull request #20854 from epage/frontmatter
feat(parser): Don't error on frontmatter
This commit is contained in:
commit
e78de709eb
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -1583,6 +1583,7 @@ dependencies = [
|
||||
"rustc-literal-escaper 0.0.4",
|
||||
"stdx",
|
||||
"tracing",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@ -19,6 +19,7 @@ rustc-literal-escaper.workspace = true
|
||||
tracing = { workspace = true, optional = true }
|
||||
|
||||
edition.workspace = true
|
||||
winnow = { version = "0.7.13", default-features = false }
|
||||
|
||||
[dev-dependencies]
|
||||
expect-test = "1.5.1"
|
||||
|
||||
348
crates/parser/src/frontmatter.rs
Normal file
348
crates/parser/src/frontmatter.rs
Normal file
@ -0,0 +1,348 @@
|
||||
// Copied from https://github.com/rust-lang/cargo/blob/367fd9f213750cd40317803dd0a5a3ce3f0c676d/src/cargo/util/frontmatter.rs
|
||||
#![expect(dead_code)] // avoid editing
|
||||
#![expect(unreachable_pub)] // avoid editing
|
||||
#![expect(clippy::useless_format)] // avoid editing
|
||||
|
||||
type Span = std::ops::Range<usize>;
|
||||
|
||||
/// Breakdown of a cargo-script source file into its shebang, frontmatter
/// fence, and remaining content.
///
/// All spans index into `raw`; accessors resolve them back to `&str` slices.
#[derive(Debug)]
pub struct ScriptSource<'s> {
    /// The full file
    raw: &'s str,
    /// The `#!/usr/bin/env cargo` line, if present
    shebang: Option<Span>,
    /// The code fence opener (`---`)
    open: Option<Span>,
    /// Trailing text after `ScriptSource::open` that identifies the meaning of
    /// `ScriptSource::frontmatter`
    info: Option<Span>,
    /// The lines between `ScriptSource::open` and `ScriptSource::close`
    frontmatter: Option<Span>,
    /// The code fence closer (`---`)
    close: Option<Span>,
    /// All content after the frontmatter and shebang
    content: Span,
}
|
||||
|
||||
impl<'s> ScriptSource<'s> {
    /// Splits `raw` into shebang / frontmatter / content spans.
    ///
    /// Returns `Ok` with all frontmatter fields `None` when no fence is
    /// present; returns a [`FrontmatterError`] (with diagnostic spans into
    /// `raw`) for malformed, unclosed, mismatched, or duplicate fences.
    pub fn parse(raw: &'s str) -> Result<Self, FrontmatterError> {
        use winnow::stream::FindSlice as _;
        use winnow::stream::Location as _;
        use winnow::stream::Offset as _;
        use winnow::stream::Stream as _;

        let content_end = raw.len();
        // Start from "the whole file is content"; fields are filled in as
        // pieces are recognized below.
        let mut source = Self {
            raw,
            shebang: None,
            open: None,
            info: None,
            frontmatter: None,
            close: None,
            content: 0..content_end,
        };

        // LocatingSlice tracks the absolute byte offset of the unconsumed
        // input, which is how all spans below are derived.
        let mut input = winnow::stream::LocatingSlice::new(raw);

        if let Some(shebang_end) = strip_shebang(input.as_ref()) {
            let shebang_start = input.current_token_start();
            let _ = input.next_slice(shebang_end);
            let shebang_end = input.current_token_start();
            source.shebang = Some(shebang_start..shebang_end);
            source.content = shebang_end..content_end;
        }

        // Whitespace may precede a frontmatter but must end with a newline
        if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
            let _ = input.next_slice(nl_end);
        }

        // Opens with a line that starts with 3 or more `-` followed by an optional identifier
        const FENCE_CHAR: char = '-';
        // Count the run of leading `-`; if the rest of the input is all `-`,
        // the fence length is everything that remains.
        let fence_length = input
            .as_ref()
            .char_indices()
            .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
            .unwrap_or_else(|| input.eof_offset());
        let open_start = input.current_token_start();
        let fence_pattern = input.next_slice(fence_length);
        let open_end = input.current_token_start();
        match fence_length {
            0 => {
                // No fence at all: plain source file, no frontmatter.
                return Ok(source);
            }
            1 | 2 => {
                // either not a frontmatter or invalid frontmatter opening
                return Err(FrontmatterError::new(
                    format!(
                        "found {fence_length} `{FENCE_CHAR}` in rust frontmatter, expected at least 3"
                    ),
                    raw.len()..raw.len(),
                ).push_visible_span(open_start..open_end));
            }
            _ => {}
        }
        source.open = Some(open_start..open_end);
        // The opening fence line must be newline-terminated; otherwise the
        // frontmatter can never be closed.
        let Some(info_nl) = input.find_slice("\n") else {
            return Err(FrontmatterError::new(
                format!("unclosed frontmatter; expected `{fence_pattern}`"),
                raw.len()..raw.len(),
            )
            .push_visible_span(open_start..open_end));
        };
        let info = input.next_slice(info_nl.start);
        let info = info.strip_suffix('\r').unwrap_or(info); // already excludes `\n`
        let info = info.trim_matches(is_horizontal_whitespace);
        if !info.is_empty() {
            // `offset_from` recovers the absolute position of the trimmed
            // info string within `raw`.
            let info_start = info.offset_from(&raw);
            let info_end = info_start + info.len();
            source.info = Some(info_start..info_end);
        }

        // Ends with a line that starts with a matching number of `-` only followed by whitespace
        let nl_fence_pattern = format!("\n{fence_pattern}");
        let Some(frontmatter_nl) = input.find_slice(nl_fence_pattern.as_str()) else {
            // No exact-length closer found: probe for progressively shorter
            // dash runs so the error can say "closer has N fewer dashes"
            // instead of just "unclosed".
            for len in (2..(nl_fence_pattern.len() - 1)).rev() {
                let Some(frontmatter_nl) = input.find_slice(&nl_fence_pattern[0..len]) else {
                    continue;
                };
                let _ = input.next_slice(frontmatter_nl.start + 1);
                let close_start = input.current_token_start();
                let _ = input.next_slice(len);
                let close_end = input.current_token_start();
                let fewer_dashes = fence_length - len;
                return Err(FrontmatterError::new(
                    format!(
                        "closing code fence has {fewer_dashes} less `-` than the opening fence"
                    ),
                    close_start..close_end,
                )
                .push_visible_span(open_start..open_end));
            }
            return Err(FrontmatterError::new(
                format!("unclosed frontmatter; expected `{fence_pattern}`"),
                raw.len()..raw.len(),
            )
            .push_visible_span(open_start..open_end));
        };
        let frontmatter_start = input.current_token_start() + 1; // skip nl from infostring
        // `+ 1` keeps the newline before the closing fence inside the
        // frontmatter span, so the closer starts at the `-` run itself.
        let _ = input.next_slice(frontmatter_nl.start + 1);
        let frontmatter_end = input.current_token_start();
        source.frontmatter = Some(frontmatter_start..frontmatter_end);
        let close_start = input.current_token_start();
        let _ = input.next_slice(fence_length);
        let close_end = input.current_token_start();
        source.close = Some(close_start..close_end);

        // Consume the rest of the closing-fence line; content starts on the
        // next line (or at EOF when there is no trailing newline).
        let nl = input.find_slice("\n");
        let after_closing_fence =
            input.next_slice(nl.map(|span| span.end).unwrap_or_else(|| input.eof_offset()));
        let content_start = input.current_token_start();
        let extra_dashes = after_closing_fence.chars().take_while(|b| *b == FENCE_CHAR).count();
        if 0 < extra_dashes {
            // Closer is longer than the opener, e.g. `----` closing `---`.
            let extra_start = close_end;
            let extra_end = extra_start + extra_dashes;
            return Err(FrontmatterError::new(
                format!("closing code fence has {extra_dashes} more `-` than the opening fence"),
                extra_start..extra_end,
            )
            .push_visible_span(open_start..open_end));
        } else {
            // Only horizontal whitespace may follow the closing fence.
            let after_closing_fence = strip_newline(after_closing_fence);
            let after_closing_fence = after_closing_fence.trim_matches(is_horizontal_whitespace);
            if !after_closing_fence.is_empty() {
                // extra characters beyond the original fence pattern
                let after_start = after_closing_fence.offset_from(&raw);
                let after_end = after_start + after_closing_fence.len();
                return Err(FrontmatterError::new(
                    format!("unexpected characters after frontmatter close"),
                    after_start..after_end,
                )
                .push_visible_span(open_start..open_end));
            }
        }

        source.content = content_start..content_end;

        // Look past blank lines for the start of a second fence; a second
        // frontmatter block is rejected.
        if let Some(nl_end) = strip_ws_lines(input.as_ref()) {
            let _ = input.next_slice(nl_end);
        }
        let fence_length = input
            .as_ref()
            .char_indices()
            .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
            .unwrap_or_else(|| input.eof_offset());
        if 0 < fence_length {
            let fence_start = input.current_token_start();
            let fence_end = fence_start + fence_length;
            return Err(FrontmatterError::new(
                format!("only one frontmatter is supported"),
                fence_start..fence_end,
            )
            .push_visible_span(open_start..open_end)
            .push_visible_span(close_start..close_end));
        }

        Ok(source)
    }

    /// The `#!` line, if present (includes the trailing newline when one exists).
    pub fn shebang(&self) -> Option<&'s str> {
        self.shebang.clone().map(|span| &self.raw[span])
    }

    /// Byte span of the shebang line within the original input.
    pub fn shebang_span(&self) -> Option<Span> {
        self.shebang.clone()
    }

    /// Byte span of the opening code fence (`---`).
    pub fn open_span(&self) -> Option<Span> {
        self.open.clone()
    }

    /// The trimmed info string following the opening fence, if non-empty.
    pub fn info(&self) -> Option<&'s str> {
        self.info.clone().map(|span| &self.raw[span])
    }

    /// Byte span of the info string within the original input.
    pub fn info_span(&self) -> Option<Span> {
        self.info.clone()
    }

    /// The lines between the opening and closing fences.
    pub fn frontmatter(&self) -> Option<&'s str> {
        self.frontmatter.clone().map(|span| &self.raw[span])
    }

    /// Byte span of the frontmatter body within the original input.
    pub fn frontmatter_span(&self) -> Option<Span> {
        self.frontmatter.clone()
    }

    /// Byte span of the closing code fence (`---`).
    pub fn close_span(&self) -> Option<Span> {
        self.close.clone()
    }

    /// Everything after the shebang and frontmatter (the actual Rust source).
    pub fn content(&self) -> &'s str {
        &self.raw[self.content.clone()]
    }

    /// Byte span of the content within the original input.
    pub fn content_span(&self) -> Span {
        self.content.clone()
    }
}
|
||||
|
||||
/// Returns the index after the shebang line, if present.
///
/// The returned index is one past the line's `\n` (or `input.len()` when the
/// shebang is the last line), so slicing `&input[..end]` yields the whole
/// shebang line including its newline.
pub fn strip_shebang(input: &str) -> Option<usize> {
    // See rust-lang/rust's compiler/rustc_lexer/src/lib.rs's `strip_shebang`
    // Shebang must start with `#!` literally, without any preceding whitespace.
    // For simplicity we consider any line starting with `#!` a shebang,
    // regardless of restrictions put on shebangs by specific platforms.
    if let Some(rest) = input.strip_prefix("#!") {
        // Ok, this is a shebang but if the next non-whitespace token is `[`,
        // then it may be valid Rust code, so consider it Rust code.
        //
        // NOTE: rustc considers line and block comments to be whitespace but to avoid
        // any more awareness of Rust grammar, we are excluding it.
        if !rest.trim_start().starts_with('[') {
            // No other choice than to consider this a shebang.
            let newline_end = input.find('\n').map(|pos| pos + 1).unwrap_or(input.len());
            return Some(newline_end);
        }
    }
    None
}
|
||||
|
||||
/// Returns the index after any lines with only whitespace, if present.
///
/// Only complete whitespace *lines* are stripped: the result always lands
/// just past a `\n`, so trailing horizontal whitespace on a non-blank line
/// is left untouched. Returns `None` when the input does not begin with
/// whitespace or when no newline occurs within the leading whitespace.
pub fn strip_ws_lines(input: &str) -> Option<usize> {
    let ws_end = input.find(|c| !is_whitespace(c)).unwrap_or(input.len());
    if ws_end == 0 {
        return None;
    }

    // Back up to the last newline inside the whitespace run so a partial
    // (horizontal-only) prefix of the next line is not consumed.
    let nl_start = input[0..ws_end].rfind('\n')?;
    let nl_end = nl_start + 1;
    Some(nl_end)
}
|
||||
|
||||
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (ie, it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.

    matches!(
        c,
        // End-of-line characters
        | '\u{000A}' // line feed (\n)
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // carriage return (\r)
        | '\u{0085}' // next line (from latin1)
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR

        // `Default_Ignorable_Code_Point` characters
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK

        // Horizontal space characters
        | '\u{0009}' // tab (\t)
        | '\u{0020}' // space
    )
}
|
||||
|
||||
/// True if `c` is considered horizontal whitespace according to Rust language definition.
///
/// This is the subset of [`is_whitespace`] that never terminates a line:
/// only tab and space.
fn is_horizontal_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (ie, it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.

    matches!(
        c,
        // Horizontal space characters
        '\u{0009}' // tab (\t)
        | '\u{0020}' // space
    )
}
|
||||
|
||||
/// Removes a single trailing line ending (`\r\n` or `\n`) from `text`,
/// returning `text` unchanged when neither suffix is present.
fn strip_newline(text: &str) -> &str {
    // Try `\r\n` first so a CRLF ending is not left with a dangling `\r`.
    text.strip_suffix("\r\n").or_else(|| text.strip_suffix('\n')).unwrap_or(text)
}
|
||||
|
||||
/// A frontmatter parse failure, carrying spans into the original input for
/// rendering diagnostics.
#[derive(Debug)]
pub struct FrontmatterError {
    // Human-readable description; also serves as the `Display` output.
    message: String,
    // The span the diagnostic points at directly.
    primary_span: Span,
    // Related spans (e.g. the opening fence) shown for context.
    visible_spans: Vec<Span>,
}
|
||||
|
||||
impl FrontmatterError {
    /// Creates an error with `message` pointing at `span`, with no extra
    /// context spans.
    pub fn new(message: impl Into<String>, span: Span) -> Self {
        Self { message: message.into(), primary_span: span, visible_spans: Vec::new() }
    }

    /// Builder-style: appends a related context span and returns `self`.
    pub fn push_visible_span(mut self, span: Span) -> Self {
        self.visible_spans.push(span);
        self
    }

    /// The human-readable error description.
    pub fn message(&self) -> &str {
        self.message.as_str()
    }

    /// The span the diagnostic points at directly.
    pub fn primary_span(&self) -> Span {
        self.primary_span.clone()
    }

    /// Additional spans to show for context, in insertion order.
    pub fn visible_spans(&self) -> &[Span] {
        &self.visible_spans
    }
}
|
||||
|
||||
impl std::fmt::Display for FrontmatterError {
    // Display just forwards the message; spans are for callers that render
    // diagnostics themselves.
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.message.fmt(fmt)
    }
}

// Marker impl so the error can flow through `Box<dyn Error>` / `?`.
impl std::error::Error for FrontmatterError {}
|
||||
@ -37,10 +37,17 @@ impl<'a> LexedStr<'a> {
|
||||
pub fn new(edition: Edition, text: &'a str) -> LexedStr<'a> {
|
||||
let _p = tracing::info_span!("LexedStr::new").entered();
|
||||
let mut conv = Converter::new(edition, text);
|
||||
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
|
||||
conv.res.push(SHEBANG, conv.offset);
|
||||
conv.offset = shebang_len;
|
||||
};
|
||||
if let Ok(script) = crate::frontmatter::ScriptSource::parse(text) {
|
||||
if let Some(shebang) = script.shebang_span() {
|
||||
conv.push(SHEBANG, shebang.end - shebang.start, Vec::new());
|
||||
}
|
||||
if script.frontmatter().is_some() {
|
||||
conv.push(FRONTMATTER, script.content_span().start - conv.offset, Vec::new());
|
||||
}
|
||||
} else if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
|
||||
// Leave error reporting to `rustc_lexer`
|
||||
conv.push(SHEBANG, shebang_len, Vec::new());
|
||||
}
|
||||
|
||||
// Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
|
||||
// but we want to split it to two in edition <2024.
|
||||
|
||||
@ -26,6 +26,7 @@ extern crate ra_ap_rustc_lexer as rustc_lexer;
|
||||
extern crate rustc_lexer;
|
||||
|
||||
mod event;
|
||||
mod frontmatter;
|
||||
mod grammar;
|
||||
mod input;
|
||||
mod lexed_str;
|
||||
|
||||
12
crates/parser/test_data/lexer/ok/frontmatter.rast
Normal file
12
crates/parser/test_data/lexer/ok/frontmatter.rast
Normal file
@ -0,0 +1,12 @@
|
||||
FRONTMATTER "\n---\n[dependencies]\nclap = \"4\"\n---\n"
|
||||
WHITESPACE "\n"
|
||||
FN_KW "fn"
|
||||
WHITESPACE " "
|
||||
IDENT "main"
|
||||
L_PAREN "("
|
||||
R_PAREN ")"
|
||||
WHITESPACE " "
|
||||
L_CURLY "{"
|
||||
WHITESPACE "\n"
|
||||
R_CURLY "}"
|
||||
WHITESPACE "\n"
|
||||
8
crates/parser/test_data/lexer/ok/frontmatter.rs
Normal file
8
crates/parser/test_data/lexer/ok/frontmatter.rs
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
---
|
||||
[dependencies]
|
||||
clap = "4"
|
||||
---
|
||||
|
||||
fn main() {
|
||||
}
|
||||
13
crates/parser/test_data/lexer/ok/shebang_frontmatter.rast
Normal file
13
crates/parser/test_data/lexer/ok/shebang_frontmatter.rast
Normal file
@ -0,0 +1,13 @@
|
||||
SHEBANG "#!/usr/bin/env cargo\n"
|
||||
FRONTMATTER "\n---\n[dependencies]\nclap = \"4\"\n---\n"
|
||||
WHITESPACE "\n"
|
||||
FN_KW "fn"
|
||||
WHITESPACE " "
|
||||
IDENT "main"
|
||||
L_PAREN "("
|
||||
R_PAREN ")"
|
||||
WHITESPACE " "
|
||||
L_CURLY "{"
|
||||
WHITESPACE "\n"
|
||||
R_CURLY "}"
|
||||
WHITESPACE "\n"
|
||||
9
crates/parser/test_data/lexer/ok/shebang_frontmatter.rs
Normal file
9
crates/parser/test_data/lexer/ok/shebang_frontmatter.rs
Normal file
@ -0,0 +1,9 @@
|
||||
#!/usr/bin/env cargo
|
||||
|
||||
---
|
||||
[dependencies]
|
||||
clap = "4"
|
||||
---
|
||||
|
||||
fn main() {
|
||||
}
|
||||
@ -1,5 +1,4 @@
|
||||
SHEBANG "#!/usr/bin/env bash"
|
||||
WHITESPACE "\n"
|
||||
SHEBANG "#!/usr/bin/env bash\n"
|
||||
COMMENT "// hello"
|
||||
WHITESPACE "\n"
|
||||
COMMENT "//! World"
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
SOURCE_FILE
|
||||
SHEBANG "#!/use/bin/env rusti"
|
||||
WHITESPACE "\n"
|
||||
SHEBANG "#!/use/bin/env rusti\n"
|
||||
ATTR
|
||||
POUND "#"
|
||||
BANG "!"
|
||||
|
||||
@ -259,7 +259,7 @@ impl TidyDocs {
|
||||
}
|
||||
|
||||
fn is_exclude_file(d: &Path) -> bool {
|
||||
let file_names = ["tests.rs", "famous_defs_fixture.rs"];
|
||||
let file_names = ["tests.rs", "famous_defs_fixture.rs", "frontmatter.rs"];
|
||||
|
||||
d.file_name()
|
||||
.unwrap_or_default()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user