diff --git a/lib/ungrammar/.github/ci.rs b/lib/ungrammar/.github/ci.rs
new file mode 100644
index 0000000000..87eb307d63
--- /dev/null
+++ b/lib/ungrammar/.github/ci.rs
@@ -0,0 +1,114 @@
+use std::{
+    env, fs,
+    process::{self, Command, ExitStatus, Stdio},
+    time::Instant,
+};
+
+type Error = Box<dyn std::error::Error>;
+type Result<T, E = Error> = std::result::Result<T, E>;
+
+fn main() {
+    if let Err(err) = try_main() {
+        eprintln!("{}", err);
+        process::exit(1);
+    }
+}
+
+fn try_main() -> Result<()> {
+    let cwd = env::current_dir()?;
+    let cargo_toml = cwd.join("Cargo.toml");
+    assert!(
+        cargo_toml.exists(),
+        "Cargo.toml not found, cwd: {}",
+        cwd.display()
+    );
+
+    {
+        let _s = Section::new("BUILD");
+        shell("cargo test --workspace --no-run")?;
+    }
+
+    {
+        let _s = Section::new("TEST");
+        shell("cargo test --workspace")?;
+    }
+
+    let current_branch = shell_output("git branch --show-current")?;
+    if &current_branch == "master" {
+        let _s = Section::new("PUBLISH");
+        let manifest = fs::read_to_string(&cargo_toml)?;
+        let version = get_field(&manifest, "version")?;
+        let tag = format!("v{}", version);
+        let tags = shell_output("git tag --list")?;
+
+        if !tags.contains(&tag) {
+            let token = env::var("CRATES_IO_TOKEN").unwrap();
+            shell(&format!("git tag v{}", version))?;
+            shell(&format!("cargo publish --token {}", token))?;
+            shell("git push --tags")?;
+        }
+    }
+    Ok(())
+}
+
+fn get_field<'a>(text: &'a str, name: &str) -> Result<&'a str> {
+    for line in text.lines() {
+        let words = line.split_ascii_whitespace().collect::<Vec<_>>();
+        match words.as_slice() {
+            [n, "=", v, ..] if n.trim() == name => {
+                assert!(v.starts_with('"') && v.ends_with('"'));
+                return Ok(&v[1..v.len() - 1]);
+            }
+            _ => (),
+        }
+    }
+    Err(format!("can't find `{}` in\n----\n{}\n----\n", name, text))?
+}
+
+fn shell(cmd: &str) -> Result<()> {
+    let status = command(cmd).status()?;
+    check_status(status)
+}
+
+fn shell_output(cmd: &str) -> Result<String> {
+    let output = command(cmd).stderr(Stdio::inherit()).output()?;
+    check_status(output.status)?;
+    let res = String::from_utf8(output.stdout)?;
+    Ok(res.trim().to_string())
+}
+
+fn command(cmd: &str) -> Command {
+    eprintln!("> {}", cmd);
+    let words = cmd.split_ascii_whitespace().collect::<Vec<_>>();
+    let (cmd, args) = words.split_first().unwrap();
+    let mut res = Command::new(cmd);
+    res.args(args);
+    res
+}
+
+fn check_status(status: ExitStatus) -> Result<()> {
+    if !status.success() {
+        Err(format!("$status: {}", status))?;
+    }
+    Ok(())
+}
+
+struct Section {
+    name: &'static str,
+    start: Instant,
+}
+
+impl Section {
+    fn new(name: &'static str) -> Section {
+        println!("::group::{}", name);
+        let start = Instant::now();
+        Section { name, start }
+    }
+}
+
+impl Drop for Section {
+    fn drop(&mut self) {
+        eprintln!("{}: {:.2?}", self.name, self.start.elapsed());
+        println!("::endgroup::");
+    }
+}
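Note: ci.rs above is a zero-dependency script that the workflow below compiles directly with rustc. Its `get_field` helper deliberately scans Cargo.toml line by line instead of pulling in a TOML parser; a minimal sanity check of what it matches (hypothetical `demo` function, not part of the diff):

```rust
fn demo() {
    // `version = "1.16.1"` splits into ["version", "=", "\"1.16.1\""],
    // so the `[n, "=", v, ..]` slice pattern matches and the quotes are trimmed.
    let manifest = "name = \"ungrammar\"\nversion = \"1.16.1\"\n";
    assert_eq!(get_field(manifest, "version").unwrap(), "1.16.1");
}
```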
diff --git a/lib/ungrammar/.github/workflows/ci.yaml b/lib/ungrammar/.github/workflows/ci.yaml
new file mode 100644
index 0000000000..88f133867e
--- /dev/null
+++ b/lib/ungrammar/.github/workflows/ci.yaml
@@ -0,0 +1,36 @@
+name: CI
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+      - staging
+      - trying
+
+env:
+  CARGO_INCREMENTAL: 0
+  CARGO_NET_RETRY: 10
+  CI: 1
+  RUST_BACKTRACE: short
+  RUSTFLAGS: -D warnings
+  RUSTUP_MAX_RETRIES: 10
+
+jobs:
+  rust:
+    name: Rust
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Install Rust toolchain
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+          override: true
+
+      - run: rustc ./.github/ci.rs && ./ci
+        env:
+          CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
diff --git a/lib/ungrammar/.gitignore b/lib/ungrammar/.gitignore
new file mode 100644
index 0000000000..e3bd43f693
--- /dev/null
+++ b/lib/ungrammar/.gitignore
@@ -0,0 +1,3 @@
+/ci
+/Cargo.lock
+/target
diff --git a/lib/ungrammar/Cargo.toml b/lib/ungrammar/Cargo.toml
new file mode 100644
index 0000000000..920d9ef49d
--- /dev/null
+++ b/lib/ungrammar/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "ungrammar"
+description = "A DSL for describing concrete syntax trees"
+version = "1.16.1"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/rust-analyzer/ungrammar"
+edition = "2018"
+
+exclude = ["/bors.toml", "/.github"]
+
+[workspace]
+members = ["ungrammar2json"]
+
+[dependencies]
+# nope
diff --git a/lib/ungrammar/README.md b/lib/ungrammar/README.md
new file mode 100644
index 0000000000..a5e130fedf
--- /dev/null
+++ b/lib/ungrammar/README.md
@@ -0,0 +1,21 @@
+# ungrammar
+
+A DSL for specifying concrete syntax trees.
+
+See the [blog post][post] for an introduction.
+
+See [./rust.ungram](./rust.ungram) for an example.
+
+## Editor support
+
+- Vim
+  - [vim-ungrammar][]
+  - [ungrammar.vim][]
+- VSCode
+  - [ungrammar-tools][]
+
+[post]:
+  https://rust-analyzer.github.io/blog/2020/10/24/introducing-ungrammar.html
+[vim-ungrammar]: https://github.com/Iron-E/vim-ungrammar
+[ungrammar.vim]: https://github.com/drtychai/ungrammar.vim
+[ungrammar-tools]: https://github.com/azdavis/ungrammar-tools
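As a quick taste of the library this README describes (the API is defined in src/lib.rs further down in this diff), parsing and inspecting a one-rule grammar looks roughly like this sketch, assuming `ungrammar` is on the dependency list:

```rust
use ungrammar::Grammar;

fn main() {
    // Grammar implements FromStr, so a grammar parses from any string.
    let grammar: Grammar = "Name =\n  'ident' | 'self'".parse().unwrap();
    for node in grammar.iter() {
        // Indexing with a Node handle yields its NodeData (name + rule).
        // Prints: Name = Alt([Token(Token(0)), Token(Token(1))])
        println!("{} = {:?}", grammar[node].name, grammar[node].rule);
    }
}
```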
diff --git a/lib/ungrammar/bors.toml b/lib/ungrammar/bors.toml
new file mode 100644
index 0000000000..b92b99ac30
--- /dev/null
+++ b/lib/ungrammar/bors.toml
@@ -0,0 +1,2 @@
+status = [ "Rust" ]
+delete_merged_branches = true
diff --git a/lib/ungrammar/rust.ungram b/lib/ungrammar/rust.ungram
new file mode 100644
index 0000000000..7781e719e2
--- /dev/null
+++ b/lib/ungrammar/rust.ungram
@@ -0,0 +1,666 @@
+// Note: this grammar file does not reflect the current language, as it is no longer maintained.
+
+// Rust Un-Grammar.
+//
+// This grammar specifies the structure of Rust's concrete syntax tree.
+// It does not specify parsing rules (ambiguities, precedence, etc. are out of scope).
+// Tokens are processed -- contextual keywords are recognised, compound operators glued.
+//
+// Legend:
+//
+//   //          -- comment
+//   Name =      -- non-terminal definition
+//   'ident'     -- token (terminal)
+//   A B         -- sequence
+//   A | B       -- alternation
+//   A*          -- zero or more repetition
+//   A?          -- zero or one repetition
+//   (A)         -- same as A
+//   label:A     -- suggested name for field of AST node
+
+//*************************//
+// Names, Paths and Macros //
+//*************************//
+
+Name =
+  'ident' | 'self'
+
+NameRef =
+  'ident' | 'int_number' | 'self' | 'super' | 'crate' | 'Self'
+
+Lifetime =
+  'lifetime_ident'
+
+Path =
+  (qualifier:Path '::')? segment:PathSegment
+
+PathSegment =
+  '::'? NameRef
+| NameRef GenericArgList?
+| NameRef ParamList RetType?
+| '<' PathType ('as' PathType)? '>'
+
+GenericArgList =
+  '::'? '<' (GenericArg (',' GenericArg)* ','?)? '>'
+
+GenericArg =
+  TypeArg
+| AssocTypeArg
+| LifetimeArg
+| ConstArg
+
+TypeArg =
+  Type
+
+AssocTypeArg =
+  NameRef GenericParamList? (':' TypeBoundList | '=' Type)
+
+LifetimeArg =
+  Lifetime
+
+ConstArg =
+  Expr
+
+MacroCall =
+  Attr* Path '!' TokenTree ';'?
+
+TokenTree =
+  '(' ')'
+| '{' '}'
+| '[' ']'
+
+MacroItems =
+  Item*
+
+MacroStmts =
+  statements:Stmt*
+  Expr?
+
+//*************************//
+//          Items          //
+//*************************//
+
+SourceFile =
+  'shebang'?
+  Attr*
+  Item*
+
+Item =
+  Const
+| Enum
+| ExternBlock
+| ExternCrate
+| Fn
+| Impl
+| MacroCall
+| MacroRules
+| MacroDef
+| Module
+| Static
+| Struct
+| Trait
+| TypeAlias
+| Union
+| Use
+
+MacroRules =
+  Attr* Visibility?
+  'macro_rules' '!' Name
+  TokenTree
+
+MacroDef =
+  Attr* Visibility?
+  'macro' Name args:TokenTree?
+  body:TokenTree
+
+Module =
+  Attr* Visibility?
+  'mod' Name
+  (ItemList | ';')
+
+ItemList =
+  '{' Attr* Item* '}'
+
+ExternCrate =
+  Attr* Visibility?
+  'extern' 'crate' NameRef Rename? ';'
+
+Rename =
+  'as' (Name | '_')
+
+Use =
+  Attr* Visibility?
+  'use' UseTree ';'
+
+UseTree =
+  (Path? '::')? ('*' | UseTreeList)
+| Path Rename?
+
+UseTreeList =
+  '{' (UseTree (',' UseTree)* ','?)? '}'
+
+Fn =
+  Attr* Visibility?
+  'default'? 'const'? 'async'? 'unsafe'? Abi?
+  'fn' Name GenericParamList? ParamList RetType? WhereClause?
+  (body:BlockExpr | ';')
+
+Abi =
+  'extern' 'string'?
+
+ParamList =
+  '('(
+    SelfParam
+  | (SelfParam ',')? (Param (',' Param)* ','?)?
+  )')'
+| '|' (Param (',' Param)* ','?)? '|'
+
+SelfParam =
+  Attr* (
+    ('&' Lifetime?)? 'mut'? Name
+  | 'mut'? Name ':' Type
+  )
+
+Param =
+  Attr* (
+    Pat (':' Type)?
+  | Type
+  | '...'
+  )
+
+RetType =
+  '->' Type
+
+TypeAlias =
+  Attr* Visibility?
+  'default'?
+  'type' Name GenericParamList? (':' TypeBoundList?)? WhereClause?
+  ('=' Type)? ';'
+
+Struct =
+  Attr* Visibility?
+  'struct' Name GenericParamList? (
+    WhereClause? (RecordFieldList | ';')
+  | TupleFieldList WhereClause? ';'
+  )
+
+RecordFieldList =
+  '{' fields:(RecordField (',' RecordField)* ','?)? '}'
+
+RecordField =
+  Attr* Visibility?
+  Name ':' Type
+
+TupleFieldList =
+  '(' fields:(TupleField (',' TupleField)* ','?)? ')'
+
+TupleField =
+  Attr* Visibility?
+  Type
+
+FieldList =
+  RecordFieldList
+| TupleFieldList
+
+Enum =
+  Attr* Visibility?
+  'enum' Name GenericParamList? WhereClause?
+  VariantList
+
+VariantList =
+  '{' (Variant (',' Variant)* ','?)? '}'
+
+Variant =
+  Attr* Visibility?
+  Name FieldList? ('=' Expr)?
+
+Union =
+  Attr* Visibility?
+  'union' Name GenericParamList? WhereClause?
+  RecordFieldList
+
+// A Data Type.
+//
+// Not used directly in the grammar, but handy to have anyway.
+Adt =
+  Enum
+| Struct
+| Union
+
+Const =
+  Attr* Visibility?
+  'default'?
+  'const' (Name | '_') ':' Type
+  ('=' body:Expr)? ';'
+
+Static =
+  Attr* Visibility?
+  'static' 'mut'? Name ':' Type
+  ('=' body:Expr)? ';'
+
+Trait =
+  Attr* Visibility?
+  'unsafe'? 'auto'?
+  'trait' Name GenericParamList? (':' TypeBoundList?)? WhereClause?
+  AssocItemList
+
+AssocItemList =
+  '{' Attr* AssocItem* '}'
+
+AssocItem =
+  Const
+| Fn
+| MacroCall
+| TypeAlias
+
+Impl =
+  Attr* Visibility?
+  'default'? 'unsafe'?
+  'impl' GenericParamList? ('const'? '!'? trait:Type 'for')? self_ty:Type WhereClause?
+  AssocItemList
+
+ExternBlock =
+  Attr* 'unsafe'? Abi ExternItemList
+
+ExternItemList =
+  '{' Attr* ExternItem* '}'
+
+ExternItem =
+  Fn
+| MacroCall
+| Static
+| TypeAlias
+
+GenericParamList =
+  '<' (GenericParam (',' GenericParam)* ','?)? '>'
+
+GenericParam =
+  ConstParam
+| LifetimeParam
+| TypeParam
+
+TypeParam =
+  Attr* Name (':' TypeBoundList?)?
+  ('=' default_type:Type)?
+
+ConstParam =
+  Attr* 'const' Name ':' Type
+  ('=' default_val:Expr)?
+
+LifetimeParam =
+  Attr* Lifetime (':' TypeBoundList?)?
+
+WhereClause =
+  'where' predicates:(WherePred (',' WherePred)* ','?)
+
+WherePred =
+  ('for' GenericParamList)? (Lifetime | Type) ':' TypeBoundList?
+
+Visibility =
+  'pub' ('(' 'in'? Path ')')?
+
+Attr =
+  '#' '!'? '[' Meta ']'
+
+Meta =
+  Path ('=' Expr | TokenTree)?
+
+//****************************//
+// Statements and Expressions //
+//****************************//
+
+Stmt =
+  ';'
+| ExprStmt
+| Item
+| LetStmt
+
+LetStmt =
+  Attr* 'let' Pat (':' Type)?
+  '=' initializer:Expr
+  LetElse?
+  ';'
+
+LetElse =
+  'else' BlockExpr
+
+ExprStmt =
+  Expr ';'?
+
+Expr =
+  ArrayExpr
+| AwaitExpr
+| BinExpr
+| BlockExpr
+| BoxExpr
+| BreakExpr
+| CallExpr
+| CastExpr
+| ClosureExpr
+| ContinueExpr
+| FieldExpr
+| ForExpr
+| IfExpr
+| IndexExpr
+| Literal
+| LoopExpr
+| MacroCall
+| MacroStmts
+| MatchExpr
+| MethodCallExpr
+| ParenExpr
+| PathExpr
+| PrefixExpr
+| RangeExpr
+| RecordExpr
+| RefExpr
+| ReturnExpr
+| TryExpr
+| TupleExpr
+| WhileExpr
+| YieldExpr
+| LetExpr
+| UnderscoreExpr
+
+Literal =
+  Attr* value:(
+    'int_number' | 'float_number'
+  | 'string' | 'raw_string'
+  | 'byte_string' | 'raw_byte_string'
+  | 'true' | 'false'
+  | 'char' | 'byte'
+  )
+
+PathExpr =
+  Attr* Path
+
+StmtList =
+  '{'
+    Attr*
+    statements:Stmt*
+    tail_expr:Expr?
+  '}'
+
+RefExpr =
+  Attr* '&' ('raw' | 'mut' | 'const') Expr
+
+TryExpr =
+  Attr* Expr '?'
+
+BlockExpr =
+  Attr* Label? ('try' | 'unsafe' | 'async' | 'const') StmtList
+
+PrefixExpr =
+  Attr* op:('-' | '!' | '*') Expr
+
+BinExpr =
+  Attr*
+  lhs:Expr
+  op:(
+    '||' | '&&'
+  | '==' | '!=' | '<=' | '>=' | '<' | '>'
+  | '+' | '*' | '-' | '/' | '%' | '<<' | '>>' | '^' | '|' | '&'
+  | '=' | '+=' | '/=' | '*=' | '%=' | '>>=' | '<<=' | '-=' | '|=' | '&=' | '^='
+  )
+  rhs:Expr
+
+CastExpr =
+  Attr* Expr 'as' Type
+
+ParenExpr =
+  Attr* '(' Attr* Expr ')'
+
+ArrayExpr =
+  Attr* '[' Attr* (
+    (Expr (',' Expr)* ','?)?
+  | Expr ';' Expr
+  ) ']'
+
+IndexExpr =
+  Attr* base:Expr '[' index:Expr ']'
+
+TupleExpr =
+  Attr* '(' Attr* fields:(Expr (',' Expr)* ','?)? ')'
+
+RecordExpr =
+  Path RecordExprFieldList
+
+RecordExprFieldList =
+  '{'
+    Attr*
+    fields:(RecordExprField (',' RecordExprField)* ','?)?
+    ('..' spread:Expr?)?
+  '}'
+
+RecordExprField =
+  Attr* (NameRef ':')? Expr
+
+CallExpr =
+  Attr* Expr ArgList
+
+ArgList =
+  '(' args:(Expr (',' Expr)* ','?)? ')'
+
+MethodCallExpr =
+  Attr* receiver:Expr '.' NameRef GenericArgList? ArgList
+
+FieldExpr =
+  Attr* Expr '.' NameRef
+
+ClosureExpr =
+  Attr* 'static'? 'async'? 'move'? ParamList RetType?
+  body:Expr
+
+IfExpr =
+  Attr* 'if' condition:Expr then_branch:BlockExpr
+  ('else' else_branch:(IfExpr | BlockExpr))?
+
+LoopExpr =
+  Attr* Label? 'loop'
+  loop_body:BlockExpr
+
+ForExpr =
+  Attr* Label? 'for' Pat 'in' iterable:Expr
+  loop_body:BlockExpr
+
+WhileExpr =
+  Attr* Label? 'while' condition:Expr
+  loop_body:BlockExpr
+
+Label =
+  Lifetime ':'
+
+BreakExpr =
+  Attr* 'break' Lifetime? Expr?
+
+ContinueExpr =
+  Attr* 'continue' Lifetime?
+
+RangeExpr =
+  Attr* start:Expr? op:('..' | '..=') end:Expr?
+
+MatchExpr =
+  Attr* 'match' Expr MatchArmList
+
+MatchArmList =
+  '{'
+    Attr*
+    arms:MatchArm*
+  '}'
+
+MatchArm =
+  Attr* Pat guard:MatchGuard? '=>' Expr ','?
+
+MatchGuard =
+  'if' condition:Expr
+
+ReturnExpr =
+  Attr* 'return' Expr?
+
+YieldExpr =
+  Attr* 'yield' Expr?
+
+LetExpr =
+  Attr* 'let' Pat '=' Expr
+
+UnderscoreExpr =
+  Attr* '_'
+
+AwaitExpr =
+  Attr* Expr '.' 'await'
+
+BoxExpr =
+  Attr* 'box' Expr
+
+//*************************//
+//          Types          //
+//*************************//
+
+Type =
+  ArrayType
+| DynTraitType
+| FnPtrType
+| ForType
+| ImplTraitType
+| InferType
+| MacroType
+| NeverType
+| ParenType
+| PathType
+| PtrType
+| RefType
+| SliceType
+| TupleType
+
+ParenType =
+  '(' Type ')'
+
+NeverType =
+  '!'
+
+MacroType =
+  MacroCall
+
+PathType =
+  Path
+
+TupleType =
+  '(' fields:(Type (',' Type)* ','?)? ')'
+
+PtrType =
+  '*' ('const' | 'mut') Type
+
+RefType =
+  '&' Lifetime? 'mut'? Type
+
+ArrayType =
+  '[' Type ';' Expr ']'
+
+SliceType =
+  '[' Type ']'
+
+InferType =
+  '_'
+
+FnPtrType =
+  'const'? 'async'? 'unsafe'? Abi? 'fn' ParamList RetType?
+
+ForType =
+  'for' GenericParamList Type
+
+ImplTraitType =
+  'impl' TypeBoundList
+
+DynTraitType =
+  'dyn' TypeBoundList
+
+TypeBoundList =
+  bounds:(TypeBound ('+' TypeBound)* '+'?)
+
+TypeBound =
+  Lifetime
+| ('?' | '~' 'const')? Type
+
+//************************//
+//        Patterns        //
+//************************//
+
+Pat =
+  IdentPat
+| BoxPat
+| RestPat
+| LiteralPat
+| MacroPat
+| OrPat
+| ParenPat
+| PathPat
+| WildcardPat
+| RangePat
+| RecordPat
+| RefPat
+| SlicePat
+| TuplePat
+| TupleStructPat
+| ConstBlockPat
+
+LiteralPat =
+  Literal
+
+IdentPat =
+  Attr* 'ref'? 'mut'? Name ('@' Pat)?
+
+WildcardPat =
+  '_'
+
+RangePat =
+  // 1..
+  start:Pat op:('..' | '..=')
+  // 1..2
+| start:Pat op:('..' | '..=') end:Pat
+  // ..2
+| op:('..' | '..=') end:Pat
+
+RefPat =
+  '&' 'mut'? Pat
+
+RecordPat =
+  Path RecordPatFieldList
+
+RecordPatFieldList =
+  '{'
+    fields:(RecordPatField (',' RecordPatField)* ','?)?
+    RestPat?
+  '}'
+
+RecordPatField =
+  Attr* (NameRef ':')? Pat
+
+TupleStructPat =
+  Path '(' fields:(Pat (',' Pat)* ','?)? ')'
+
+TuplePat =
+  '(' fields:(Pat (',' Pat)* ','?)? ')'
+
+ParenPat =
+  '(' Pat ')'
+
+SlicePat =
+  '[' (Pat (',' Pat)* ','?)? ']'
+
+PathPat =
+  Path
+
+OrPat =
+  (Pat ('|' Pat)* '|'?)
+
+BoxPat =
+  'box' Pat
+
+RestPat =
+  Attr* '..'
+
+MacroPat =
+  MacroCall
+
+ConstBlockPat =
+  'const' BlockExpr
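The `label:A` form from the legend above appears throughout this file (e.g. `qualifier:Path`, `condition:Expr`) and surfaces as `Rule::Labeled` in the crate's data model (defined in src/lib.rs below); a small sketch with a made-up `Point` grammar:

```rust
use ungrammar::{Grammar, Rule};

fn main() {
    let g: Grammar = "Point =\n  x:'int_number' y:'int_number'".parse().unwrap();
    let point = g.iter().next().unwrap();
    // The rule is Seq([Labeled { label: "x", .. }, Labeled { label: "y", .. }]).
    if let Rule::Seq(fields) = &g[point].rule {
        for field in fields {
            if let Rule::Labeled { label, .. } = field {
                println!("suggested field name: {}", label); // x, then y
            }
        }
    }
}
```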
diff --git a/lib/ungrammar/src/error.rs b/lib/ungrammar/src/error.rs
new file mode 100644
index 0000000000..355e0b7ebc
--- /dev/null
+++ b/lib/ungrammar/src/error.rs
@@ -0,0 +1,50 @@
+//! Boilerplate error definitions.
+use std::fmt;
+
+use crate::lexer::Location;
+
+/// A type alias for std's Result with the Error as our error type.
+pub type Result<T, E = Error> = std::result::Result<T, E>;
+
+/// An error encountered when parsing a Grammar.
+#[derive(Debug)]
+pub struct Error {
+    pub(crate) message: String,
+    pub(crate) location: Option<Location>,
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(loc) = self.location {
+            // Report 1-based indices, to match text editors
+            write!(f, "{}:{}: ", loc.line + 1, loc.column + 1)?
+        }
+        write!(f, "{}", self.message)
+    }
+}
+
+impl std::error::Error for Error {}
+
+impl Error {
+    pub(crate) fn with_location(self, location: Location) -> Error {
+        Error {
+            location: Some(location),
+            ..self
+        }
+    }
+}
+
+macro_rules! _format_err {
+    ($($tt:tt)*) => {
+        $crate::error::Error {
+            message: format!($($tt)*),
+            location: None,
+        }
+    };
+}
+pub(crate) use _format_err as format_err;
+
+macro_rules! _bail {
+    ($($tt:tt)*) => { return Err($crate::error::format_err!($($tt)*)) };
+}
+pub(crate) use _bail as bail;
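The underscore-prefixed macros plus `pub(crate) use` re-exports are a trick for crate-internal macros without `#[macro_export]`: `format_err!` builds an `Error` carrying no position, and callers attach one afterwards. A crate-internal sketch of the intended flow (hypothetical `reject` function):

```rust
use crate::error::{format_err, Result};
use crate::lexer::Location;

fn reject(loc: Location) -> Result<()> {
    // format_err! sets location: None; with_location pins the error to a
    // source position, which Display then renders 1-based as "line:col: ...".
    Err(format_err!("unexpected token").with_location(loc))
}
```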
diff --git a/lib/ungrammar/src/lexer.rs b/lib/ungrammar/src/lexer.rs
new file mode 100644
index 0000000000..f4c979b5bd
--- /dev/null
+++ b/lib/ungrammar/src/lexer.rs
@@ -0,0 +1,129 @@
+//! Simple hand-written ungrammar lexer
+use crate::error::{bail, Result};
+
+#[derive(Debug, Eq, PartialEq)]
+pub(crate) enum TokenKind {
+    Node(String),
+    Token(String),
+    Eq,
+    Star,
+    Pipe,
+    QMark,
+    Colon,
+    LParen,
+    RParen,
+}
+
+#[derive(Debug)]
+pub(crate) struct Token {
+    pub(crate) kind: TokenKind,
+    pub(crate) loc: Location,
+}
+
+#[derive(Copy, Clone, Default, Debug)]
+pub(crate) struct Location {
+    pub(crate) line: usize,
+    pub(crate) column: usize,
+}
+
+impl Location {
+    fn advance(&mut self, text: &str) {
+        match text.rfind('\n') {
+            Some(idx) => {
+                self.line += text.chars().filter(|&it| it == '\n').count();
+                self.column = text[idx + 1..].chars().count();
+            }
+            None => self.column += text.chars().count(),
+        }
+    }
+}
+
+pub(crate) fn tokenize(mut input: &str) -> Result<Vec<Token>> {
+    let mut res = Vec::new();
+    let mut loc = Location::default();
+    while !input.is_empty() {
+        let old_input = input;
+        skip_ws(&mut input);
+        skip_comment(&mut input);
+        if old_input.len() == input.len() {
+            match advance(&mut input) {
+                Ok(kind) => {
+                    res.push(Token { kind, loc });
+                }
+                Err(err) => return Err(err.with_location(loc)),
+            }
+        }
+        let consumed = old_input.len() - input.len();
+        loc.advance(&old_input[..consumed]);
+    }
+
+    Ok(res)
+}
+
+fn skip_ws(input: &mut &str) {
+    *input = input.trim_start_matches(is_whitespace)
+}
+fn skip_comment(input: &mut &str) {
+    if input.starts_with("//") {
+        let idx = input.find('\n').map_or(input.len(), |it| it + 1);
+        *input = &input[idx..]
+    }
+}
+
+fn advance(input: &mut &str) -> Result<TokenKind> {
+    let mut chars = input.chars();
+    let c = chars.next().unwrap();
+    let res = match c {
+        '=' => TokenKind::Eq,
+        '*' => TokenKind::Star,
+        '?' => TokenKind::QMark,
+        '(' => TokenKind::LParen,
+        ')' => TokenKind::RParen,
+        '|' => TokenKind::Pipe,
+        ':' => TokenKind::Colon,
+        '\'' => {
+            let mut buf = String::new();
+            loop {
+                match chars.next() {
+                    None => bail!("unclosed token literal"),
+                    Some('\\') => match chars.next() {
+                        Some(c) if is_escapable(c) => buf.push(c),
+                        _ => bail!("invalid escape in token literal"),
+                    },
+                    Some('\'') => break,
+                    Some(c) => buf.push(c),
+                }
+            }
+            TokenKind::Token(buf)
+        }
+        c if is_ident_char(c) => {
+            let mut buf = String::new();
+            buf.push(c);
+            loop {
+                match chars.clone().next() {
+                    Some(c) if is_ident_char(c) => {
+                        chars.next();
+                        buf.push(c);
+                    }
+                    _ => break,
+                }
+            }
+            TokenKind::Node(buf)
+        }
+        '\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"),
+        c => bail!("unexpected character: `{}`", c),
+    };
+
+    *input = chars.as_str();
+    Ok(res)
+}
+
+fn is_escapable(c: char) -> bool {
+    matches!(c, '\\' | '\'')
+}
+fn is_whitespace(c: char) -> bool {
+    matches!(c, ' ' | '\t' | '\n')
+}
+fn is_ident_char(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
+}
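One subtlety worth noting: `tokenize` records each token's location before consuming it, and `Location::advance` counts chars rather than bytes, so columns stay correct in the presence of multi-byte characters. A crate-internal sketch of how `advance` moves the cursor (hypothetical values, consistent with the rfind logic above):

```rust
fn demo_advance() {
    let mut loc = Location::default();
    loc.advance("Name =\n  ");
    // One '\n' was seen, so line becomes 1; two chars follow it, so column is 2.
    assert_eq!((loc.line, loc.column), (1, 2));
}
```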
diff --git a/lib/ungrammar/src/lib.rs b/lib/ungrammar/src/lib.rs
new file mode 100644
index 0000000000..6adf8ef8ea
--- /dev/null
+++ b/lib/ungrammar/src/lib.rs
@@ -0,0 +1,137 @@
+//! Ungrammar -- a DSL for specifying concrete syntax tree grammar.
+//!
+//! Producing a parser is an explicit non-goal -- it's ok for this grammar to be
+//! ambiguous, non LL, non LR, etc.
+//!
+//! See this
+//! [introductory post](https://rust-analyzer.github.io/blog/2020/10/24/introducing-ungrammar.html)
+//! for details.
+
+#![deny(missing_debug_implementations)]
+#![deny(missing_docs)]
+#![deny(rust_2018_idioms)]
+
+mod error;
+mod lexer;
+mod parser;
+
+use std::{ops, str::FromStr};
+
+pub use crate::error::{Error, Result};
+
+/// Returns a Rust grammar.
+pub fn rust_grammar() -> Grammar {
+    let src = include_str!("../rust.ungram");
+    src.parse().unwrap()
+}
+
+/// A node, like `A = 'b' | 'c'`.
+///
+/// Indexing into a [`Grammar`] with a [`Node`] returns a reference to a
+/// [`NodeData`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Node(usize);
+
+/// A token, denoted with single quotes, like `'+'` or `'struct'`.
+///
+/// Indexing into a [`Grammar`] with a [`Token`] returns a reference to a
+/// [`TokenData`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Token(usize);
+
+/// An Ungrammar grammar.
+#[derive(Default, Debug)]
+pub struct Grammar {
+    nodes: Vec<NodeData>,
+    tokens: Vec<TokenData>,
+}
+
+impl FromStr for Grammar {
+    type Err = Error;
+    fn from_str(s: &str) -> Result<Grammar> {
+        let tokens = lexer::tokenize(s)?;
+        parser::parse(tokens)
+    }
+}
+
+impl Grammar {
+    /// Returns an iterator over all nodes in the grammar.
+    pub fn iter(&self) -> impl Iterator<Item = Node> + '_ {
+        (0..self.nodes.len()).map(Node)
+    }
+
+    /// Returns an iterator over all tokens in the grammar.
+    pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
+        (0..self.tokens.len()).map(Token)
+    }
+}
+
+impl ops::Index<Node> for Grammar {
+    type Output = NodeData;
+    fn index(&self, Node(index): Node) -> &NodeData {
+        &self.nodes[index]
+    }
+}
+
+impl ops::Index<Token> for Grammar {
+    type Output = TokenData;
+    fn index(&self, Token(index): Token) -> &TokenData {
+        &self.tokens[index]
+    }
+}
+
+/// Data about a node.
+#[derive(Debug)]
+pub struct NodeData {
+    /// The name of the node.
+    ///
+    /// In the rule `A = 'b' | 'c'`, this is `"A"`.
+    pub name: String,
+    /// The rule for this node.
+    ///
+    /// In the rule `A = 'b' | 'c'`, this represents `'b' | 'c'`.
+    pub rule: Rule,
+}
+
+/// Data about a token.
+#[derive(Debug)]
+pub struct TokenData {
+    /// The name of the token.
+    pub name: String,
+}
+
+/// A production rule.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub enum Rule {
+    /// A labeled rule, like `a:B` (`"a"` is the label, `B` is the rule).
+    Labeled {
+        /// The label.
+        label: String,
+        /// The rule.
+        rule: Box<Rule>,
+    },
+    /// A node, like `A`.
+    Node(Node),
+    /// A token, like `'struct'`.
+    Token(Token),
+    /// A sequence of rules, like `'while' '(' Expr ')' Stmt`.
+    Seq(Vec<Rule>),
+    /// An alternative between many rules, like `'+' | '-' | '*' | '/'`.
+    Alt(Vec<Rule>),
+    /// An optional rule, like `A?`.
+    Opt(Box<Rule>),
+    /// A repeated rule, like `A*`.
+    Rep(Box<Rule>),
+}
+
+#[test]
+fn smoke() {
+    let grammar = include_str!("../ungrammar.ungram");
+    let grammar = grammar.parse::<Grammar>().unwrap();
+    drop(grammar)
+}
+
+#[test]
+fn test_rust_grammar() {
+    let _ = rust_grammar();
+}
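Since `Node` and `Token` are plain index newtypes, all actual data lives in the `Grammar` and is reached through the `ops::Index` impls; a sketch using the bundled Rust grammar:

```rust
use ungrammar::rust_grammar;

fn main() {
    let g = rust_grammar(); // parses the bundled rust.ungram via include_str!
    println!("{} nodes, {} tokens", g.iter().count(), g.tokens().count());
    // Nodes are numbered in definition order, so this prints
    // "Name", "NameRef", "Lifetime" for the grammar above.
    for node in g.iter().take(3) {
        println!("{}", g[node].name);
    }
}
```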
diff --git a/lib/ungrammar/src/parser.rs b/lib/ungrammar/src/parser.rs
new file mode 100644
index 0000000000..a4ce9c1202
--- /dev/null
+++ b/lib/ungrammar/src/parser.rs
@@ -0,0 +1,225 @@
+//! Simple hand-written ungrammar parser.
+use std::collections::HashMap;
+
+use crate::{
+    error::{format_err, Result},
+    lexer::{self, TokenKind},
+    Grammar, Node, NodeData, Rule, Token, TokenData,
+};
+
+// A local variant of `error::bail!` that also attaches a location; note that
+// `error::bail` is deliberately not imported, to avoid clashing with this name.
+macro_rules! bail {
+    ($loc:expr, $($tt:tt)*) => {{
+        let err = $crate::error::format_err!($($tt)*)
+            .with_location($loc);
+        return Err(err);
+    }};
+}
+
+pub(crate) fn parse(tokens: Vec<lexer::Token>) -> Result<Grammar> {
+    let mut p = Parser::new(tokens);
+    while !p.is_eof() {
+        node(&mut p)?;
+    }
+    p.finish()
+}
+
+#[derive(Default)]
+struct Parser {
+    grammar: Grammar,
+    tokens: Vec<lexer::Token>,
+    node_table: HashMap<String, Node>,
+    token_table: HashMap<String, Token>,
+}
+
+const DUMMY_RULE: Rule = Rule::Node(Node(!0));
+
+impl Parser {
+    fn new(mut tokens: Vec<lexer::Token>) -> Parser {
+        tokens.reverse();
+        Parser {
+            tokens,
+            ..Parser::default()
+        }
+    }
+
+    fn peek(&self) -> Option<&lexer::Token> {
+        self.peek_n(0)
+    }
+    fn peek_n(&self, n: usize) -> Option<&lexer::Token> {
+        self.tokens.iter().nth_back(n)
+    }
+    fn bump(&mut self) -> Result<lexer::Token> {
+        self.tokens
+            .pop()
+            .ok_or_else(|| format_err!("unexpected EOF"))
+    }
+    fn expect(&mut self, kind: TokenKind, what: &str) -> Result<()> {
+        let token = self.bump()?;
+        if token.kind != kind {
+            bail!(token.loc, "unexpected token, expected `{}`", what);
+        }
+        Ok(())
+    }
+    fn is_eof(&self) -> bool {
+        self.tokens.is_empty()
+    }
+    fn finish(self) -> Result<Grammar> {
+        for node_data in &self.grammar.nodes {
+            if matches!(node_data.rule, DUMMY_RULE) {
+                crate::error::bail!("Undefined node: {}", node_data.name)
+            }
+        }
+        Ok(self.grammar)
+    }
+    fn intern_node(&mut self, name: String) -> Node {
+        let len = self.node_table.len();
+        let grammar = &mut self.grammar;
+        *self.node_table.entry(name.clone()).or_insert_with(|| {
+            grammar.nodes.push(NodeData {
+                name,
+                rule: DUMMY_RULE,
+            });
+            Node(len)
+        })
+    }
+    fn intern_token(&mut self, name: String) -> Token {
+        let len = self.token_table.len();
+        let grammar = &mut self.grammar;
+        *self.token_table.entry(name.clone()).or_insert_with(|| {
+            grammar.tokens.push(TokenData { name });
+            Token(len)
+        })
+    }
+}
+
+fn node(p: &mut Parser) -> Result<()> {
+    let token = p.bump()?;
+    let node = match token.kind {
+        TokenKind::Node(it) => p.intern_node(it),
+        _ => bail!(token.loc, "expected ident"),
+    };
+    p.expect(TokenKind::Eq, "=")?;
+    if !matches!(p.grammar[node].rule, DUMMY_RULE) {
+        bail!(token.loc, "duplicate rule: `{}`", p.grammar[node].name)
+    }
+
+    let rule = rule(p)?;
+    p.grammar.nodes[node.0].rule = rule;
+    Ok(())
+}
+
+fn rule(p: &mut Parser) -> Result<Rule> {
+    if let Some(lexer::Token { kind: TokenKind::Pipe, loc }) = p.peek() {
+        bail!(
+            *loc,
+            "The first element in a sequence of productions or alternatives \
+             must not have a leading pipe (`|`)"
+        );
+    }
+
+    let lhs = seq_rule(p)?;
+    let mut alt = vec![lhs];
+    while let Some(token) = p.peek() {
+        if token.kind != TokenKind::Pipe {
+            break;
+        }
+        p.bump()?;
+        let rule = seq_rule(p)?;
+        alt.push(rule)
+    }
+    let res = if alt.len() == 1 {
+        alt.pop().unwrap()
+    } else {
+        Rule::Alt(alt)
+    };
+    Ok(res)
+}
+
+fn seq_rule(p: &mut Parser) -> Result<Rule> {
+    let lhs = atom_rule(p)?;
+
+    let mut seq = vec![lhs];
+    while let Some(rule) = opt_atom_rule(p)? {
+        seq.push(rule)
+    }
+    let res = if seq.len() == 1 {
+        seq.pop().unwrap()
+    } else {
+        Rule::Seq(seq)
+    };
+    Ok(res)
+}
+
+fn atom_rule(p: &mut Parser) -> Result<Rule> {
+    match opt_atom_rule(p)? {
+        Some(it) => Ok(it),
+        None => {
+            let token = p.bump()?;
+            bail!(token.loc, "unexpected token")
+        }
+    }
+}
+
+fn opt_atom_rule(p: &mut Parser) -> Result<Option<Rule>> {
+    let token = match p.peek() {
+        Some(it) => it,
+        None => return Ok(None),
+    };
+    let mut res = match &token.kind {
+        TokenKind::Node(name) => {
+            if let Some(lookahead) = p.peek_n(1) {
+                match lookahead.kind {
+                    TokenKind::Eq => return Ok(None),
+                    TokenKind::Colon => {
+                        let label = name.clone();
+                        p.bump()?;
+                        p.bump()?;
+                        let rule = atom_rule(p)?;
+                        let res = Rule::Labeled {
+                            label,
+                            rule: Box::new(rule),
+                        };
+                        return Ok(Some(res));
+                    }
+                    _ => (),
+                }
+            }
+            match p.peek_n(1) {
+                Some(token) if token.kind == TokenKind::Eq => return Ok(None),
+                _ => (),
+            }
+            let name = name.clone();
+            p.bump()?;
+            let node = p.intern_node(name);
+            Rule::Node(node)
+        }
+        TokenKind::Token(name) => {
+            let name = name.clone();
+            p.bump()?;
+            let token = p.intern_token(name);
+            Rule::Token(token)
+        }
+        TokenKind::LParen => {
+            p.bump()?;
+            let rule = rule(p)?;
+            p.expect(TokenKind::RParen, ")")?;
+            rule
+        }
+        _ => return Ok(None),
+    };
+
+    if let Some(token) = p.peek() {
+        match &token.kind {
+            TokenKind::QMark => {
+                p.bump()?;
+                res = Rule::Opt(Box::new(res));
+            }
+            TokenKind::Star => {
+                p.bump()?;
+                res = Rule::Rep(Box::new(res));
+            }
+            _ => (),
+        }
+    }
+    Ok(Some(res))
+}
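Two parser behaviours are worth spelling out: forward references are interned immediately with `DUMMY_RULE` and rejected in `finish` if never defined, and `|` binds loosest while postfix `?`/`*` bind tightest. A sketch demonstrating both (hypothetical snippets):

```rust
use ungrammar::{Grammar, Rule};

fn main() {
    // An undefined forward reference is caught by finish():
    let err = "A = B".parse::<Grammar>().unwrap_err();
    eprintln!("{}", err); // "Undefined node: B"

    // `'a' 'b'* | 'c'` groups as ('a' Rep('b')) | 'c'.
    let g: Grammar = "R =\n  'a' 'b'* | 'c'".parse().unwrap();
    let r = g.iter().next().unwrap();
    assert!(matches!(&g[r].rule, Rule::Alt(alts) if alts.len() == 2));
}
```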
diff --git a/lib/ungrammar/ungrammar.ungram b/lib/ungrammar/ungrammar.ungram
new file mode 100644
index 0000000000..856a6cede0
--- /dev/null
+++ b/lib/ungrammar/ungrammar.ungram
@@ -0,0 +1,16 @@
+/// ungrammar for ungrammar
+Grammar =
+  Node *
+
+Node =
+  name:'ident' '=' Rule
+
+Rule =
+  'ident'
+| 'token_ident'
+| Rule *
+| Rule ( '|' Rule) *
+| Rule '?'
+| Rule '*'
+| '(' Rule ')'
+| label:'ident' ':' Rule
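This self-description is deliberately ambiguous (precedence among the `Rule` alternatives is resolved by the hand-written parser above), which is fine given the library's stated non-goals; the `smoke` test in src/lib.rs parses this very file. One consequence of the lexer design is that whitespace before a postfix operator is insignificant, so the `Node *` spelling above tokenizes the same as `Node*`; a sketch:

```rust
use ungrammar::Grammar;

fn main() {
    // Both spellings yield the same Rule tree (Rule derives PartialEq).
    let a: Grammar = "G =\n  N *\n\nN =\n  'x'".parse().unwrap();
    let b: Grammar = "G =\n  N*\n\nN =\n  'x'".parse().unwrap();
    let (ga, gb) = (a.iter().next().unwrap(), b.iter().next().unwrap());
    assert_eq!(a[ga].rule, b[gb].rule);
}
```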
diff --git a/lib/ungrammar/ungrammar2json/Cargo.toml b/lib/ungrammar/ungrammar2json/Cargo.toml
new file mode 100644
index 0000000000..19ca3d8324
--- /dev/null
+++ b/lib/ungrammar/ungrammar2json/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "ungrammar2json"
+description = "Convert ungrammar files to JSON"
+version = "1.0.0"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/matklad/ungrammar"
+authors = ["Aleksey Kladov <aleksey.kladov@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+write-json = "0.1.1"
+ungrammar = { path = "../", version = "1.1.0" }
diff --git a/lib/ungrammar/ungrammar2json/src/main.rs b/lib/ungrammar/ungrammar2json/src/main.rs
new file mode 100644
index 0000000000..f588ed5eb6
--- /dev/null
+++ b/lib/ungrammar/ungrammar2json/src/main.rs
@@ -0,0 +1,77 @@
+use std::{
+    env,
+    io::{self, Read},
+    process,
+};
+
+use ungrammar::{Grammar, Rule};
+
+fn main() {
+    if let Err(err) = try_main() {
+        eprintln!("{}", err);
+        process::exit(101);
+    }
+}
+
+fn try_main() -> io::Result<()> {
+    if env::args().count() != 1 {
+        eprintln!("Usage: ungrammar2json < grammar.ungram > grammar.json");
+        return Ok(());
+    }
+    let grammar = read_stdin()?;
+    let grammar = grammar
+        .parse::<Grammar>()
+        .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?;
+
+    let mut buf = String::new();
+    grammar_to_json(&grammar, write_json::object(&mut buf));
+    println!("{}", buf);
+    Ok(())
+}
+
+fn read_stdin() -> io::Result<String> {
+    let mut buf = String::new();
+    io::stdin().lock().read_to_string(&mut buf)?;
+    Ok(buf)
+}
+
+fn grammar_to_json(grammar: &Grammar, mut obj: write_json::Object<'_>) {
+    for node in grammar.iter() {
+        let node = &grammar[node];
+        rule_to_json(grammar, &node.rule, obj.object(&node.name));
+    }
+}
+
+fn rule_to_json(grammar: &Grammar, rule: &Rule, mut obj: write_json::Object<'_>) {
+    match rule {
+        Rule::Labeled { label, rule } => {
+            obj.string("label", label);
+            rule_to_json(grammar, rule, obj.object("rule"))
+        }
+        Rule::Node(node) => {
+            obj.string("node", &grammar[*node].name);
+        }
+        Rule::Token(token) => {
+            obj.string("token", &grammar[*token].name);
+        }
+        Rule::Seq(rules) | Rule::Alt(rules) => {
+            let tag = match rule {
+                Rule::Seq(_) => "seq",
+                Rule::Alt(_) => "alt",
+                _ => unreachable!(),
+            };
+            let mut array = obj.array(tag);
+            for rule in rules {
+                rule_to_json(grammar, rule, array.object());
+            }
+        }
+        Rule::Opt(arg) | Rule::Rep(arg) => {
+            let tag = match rule {
+                Rule::Opt(_) => "opt",
+                Rule::Rep(_) => "rep",
+                _ => unreachable!(),
+            };
+            rule_to_json(grammar, arg, obj.object(tag));
+        }
+    }
+}
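For reference, a grammar round-tripped through the functions above yields one JSON member per node; a sketch of the expected shape (illustrative; exact bytes depend on write-json's compact formatting):

```rust
use ungrammar::Grammar;

fn demo() {
    let grammar: Grammar = "R =\n  'a' 'b'*".parse().unwrap();
    let mut buf = String::new();
    grammar_to_json(&grammar, write_json::object(&mut buf));
    // buf now holds {"R":{"seq":[{"token":"a"},{"rep":{"token":"b"}}]}}
    println!("{}", buf);
}
```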