Subtree ungrammar

Lukas Wirth 2025-11-07 20:19:36 +01:00
commit f48b2f65d3
14 changed files with 1503 additions and 0 deletions

lib/ungrammar/.github/ci.rs (vendored, new file)

@@ -0,0 +1,114 @@
use std::{
env, fs,
process::{self, Command, ExitStatus, Stdio},
time::Instant,
};
type Error = Box<dyn std::error::Error>;
type Result<T> = std::result::Result<T, Error>;
fn main() {
if let Err(err) = try_main() {
eprintln!("{}", err);
process::exit(1);
}
}
fn try_main() -> Result<()> {
let cwd = env::current_dir()?;
let cargo_toml = cwd.join("Cargo.toml");
assert!(
cargo_toml.exists(),
"Cargo.toml not found, cwd: {}",
cwd.display()
);
{
let _s = Section::new("BUILD");
shell("cargo test --workspace --no-run")?;
}
{
let _s = Section::new("TEST");
shell("cargo test --workspace")?;
}
let current_branch = shell_output("git branch --show-current")?;
if &current_branch == "master" {
let _s = Section::new("PUBLISH");
let manifest = fs::read_to_string(&cargo_toml)?;
let version = get_field(&manifest, "version")?;
let tag = format!("v{}", version);
let tags = shell_output("git tag --list")?;
if !tags.contains(&tag) {
let token = env::var("CRATES_IO_TOKEN").unwrap();
shell(&format!("git tag v{}", version))?;
shell(&format!("cargo publish --token {}", token))?;
shell("git push --tags")?;
}
}
Ok(())
}
fn get_field<'a>(text: &'a str, name: &str) -> Result<&'a str> {
for line in text.lines() {
let words = line.split_ascii_whitespace().collect::<Vec<_>>();
match words.as_slice() {
[n, "=", v, ..] if n.trim() == name => {
assert!(v.starts_with('"') && v.ends_with('"'));
return Ok(&v[1..v.len() - 1]);
}
_ => (),
}
}
Err(format!("can't find `{}` in\n----\n{}\n----\n", name, text))?
}
fn shell(cmd: &str) -> Result<()> {
let status = command(cmd).status()?;
check_status(status)
}
fn shell_output(cmd: &str) -> Result<String> {
let output = command(cmd).stderr(Stdio::inherit()).output()?;
check_status(output.status)?;
let res = String::from_utf8(output.stdout)?;
Ok(res.trim().to_string())
}
fn command(cmd: &str) -> Command {
eprintln!("> {}", cmd);
let words = cmd.split_ascii_whitespace().collect::<Vec<_>>();
let (cmd, args) = words.split_first().unwrap();
let mut res = Command::new(cmd);
res.args(args);
res
}
fn check_status(status: ExitStatus) -> Result<()> {
if !status.success() {
Err(format!("$status: {}", status))?;
}
Ok(())
}
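// RAII timing guard: constructing a Section opens a GitHub Actions log group
// (`::group::`); dropping it prints the elapsed time and `::endgroup::`.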
struct Section {
name: &'static str,
start: Instant,
}
impl Section {
fn new(name: &'static str) -> Section {
println!("::group::{}", name);
let start = Instant::now();
Section { name, start }
}
}
impl Drop for Section {
fn drop(&mut self) {
eprintln!("{}: {:.2?}", self.name, self.start.elapsed());
println!("::endgroup::");
}
}

lib/ungrammar/.github/workflows/ci.yaml (vendored, new file)

@@ -0,0 +1,36 @@
name: CI
on:
  pull_request:
  push:
    branches:
      - master
      - staging
      - trying

env:
  CARGO_INCREMENTAL: 0
  CARGO_NET_RETRY: 10
  CI: 1
  RUST_BACKTRACE: short
  RUSTFLAGS: -D warnings
  RUSTUP_MAX_RETRIES: 10

jobs:
  rust:
    name: Rust
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2
      - name: Install Rust toolchain
        uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - run: rustc ./.github/ci.rs && ./ci
        env:
          CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}

lib/ungrammar/.gitignore (vendored, new file)

@@ -0,0 +1,3 @@
/ci
/Cargo.lock
/target

lib/ungrammar/Cargo.toml (new file)

@@ -0,0 +1,15 @@
[package]
name = "ungrammar"
description = "A DSL for describing concrete syntax trees"
version = "1.16.1"
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-analyzer/ungrammar"
edition = "2018"
exclude = ["/bors.toml", "/.github"]
[workspace]
members = ["ungrammar2json"]
[dependencies]
# nope

lib/ungrammar/README.md (new file)

@@ -0,0 +1,21 @@
# ungrammar
A DSL for specifying concrete syntax trees.
See the [blog post][post] for an introduction.
See [./rust.ungram](./rust.ungram) for an example.
## Editor support
- Vim
- [vim-ungrammar][]
- [ungrammar.vim][]
- VSCode
- [ungrammar-tools][]
[post]:
https://rust-analyzer.github.io/blog/2020/10/24/introducing-ungrammar.html
[vim-ungrammar]: https://github.com/Iron-E/vim-ungrammar
[ungrammar.vim]: https://github.com/drtychai/ungrammar.vim
[ungrammar-tools]: https://github.com/azdavis/ungrammar-tools

lib/ungrammar/bors.toml (new file)

@@ -0,0 +1,2 @@
status = [ "Rust" ]
delete_merged_branches = true

lib/ungrammar/rust.ungram (new file)

@@ -0,0 +1,666 @@
// Note: this grammar file is no longer maintained and does not reflect the current language.
// Rust Un-Grammar.
//
// This grammar specifies the structure of Rust's concrete syntax tree.
// It does not specify parsing rules (ambiguities, precedence, etc are out of scope).
// Tokens are processed -- contextual keywords are recognised, compound operators glued.
//
// Legend:
//
// // -- comment
// Name = -- non-terminal definition
// 'ident' -- token (terminal)
// A B -- sequence
// A | B -- alternation
// A* -- zero or more repetition
// A? -- zero or one repetition
// (A) -- same as A
// label:A -- suggested name for field of AST node
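//
// For example, the rule `ConstParam = Attr* 'const' Name ':' Type ('=' default_val:Expr)?`
// (defined below) reads: zero or more attributes, the `const` token, a name, a colon,
// a type, and an optional default value whose suggested field name is `default_val`.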
//*************************//
// Names, Paths and Macros //
//*************************//
Name =
'ident' | 'self'
NameRef =
'ident' | 'int_number' | 'self' | 'super' | 'crate' | 'Self'
Lifetime =
'lifetime_ident'
Path =
(qualifier:Path '::')? segment:PathSegment
PathSegment =
'::'? NameRef
| NameRef GenericArgList?
| NameRef ParamList RetType?
| '<' PathType ('as' PathType)? '>'
GenericArgList =
'::'? '<' (GenericArg (',' GenericArg)* ','?)? '>'
GenericArg =
TypeArg
| AssocTypeArg
| LifetimeArg
| ConstArg
TypeArg =
Type
AssocTypeArg =
NameRef GenericParamList? (':' TypeBoundList | '=' Type)
LifetimeArg =
Lifetime
ConstArg =
Expr
MacroCall =
Attr* Path '!' TokenTree ';'?
TokenTree =
'(' ')'
| '{' '}'
| '[' ']'
MacroItems =
Item*
MacroStmts =
statements:Stmt*
Expr?
//*************************//
// Items //
//*************************//
SourceFile =
'shebang'?
Attr*
Item*
Item =
Const
| Enum
| ExternBlock
| ExternCrate
| Fn
| Impl
| MacroCall
| MacroRules
| MacroDef
| Module
| Static
| Struct
| Trait
| TypeAlias
| Union
| Use
MacroRules =
Attr* Visibility?
'macro_rules' '!' Name
TokenTree
MacroDef =
Attr* Visibility?
'macro' Name args:TokenTree?
body:TokenTree
Module =
Attr* Visibility?
'mod' Name
(ItemList | ';')
ItemList =
'{' Attr* Item* '}'
ExternCrate =
Attr* Visibility?
'extern' 'crate' NameRef Rename? ';'
Rename =
'as' (Name | '_')
Use =
Attr* Visibility?
'use' UseTree ';'
UseTree =
(Path? '::')? ('*' | UseTreeList)
| Path Rename?
UseTreeList =
'{' (UseTree (',' UseTree)* ','?)? '}'
Fn =
Attr* Visibility?
'default'? 'const'? 'async'? 'unsafe'? Abi?
'fn' Name GenericParamList? ParamList RetType? WhereClause?
(body:BlockExpr | ';')
Abi =
'extern' 'string'?
ParamList =
'('(
SelfParam
| (SelfParam ',')? (Param (',' Param)* ','?)?
)')'
| '|' (Param (',' Param)* ','?)? '|'
SelfParam =
Attr* (
('&' Lifetime?)? 'mut'? Name
| 'mut'? Name ':' Type
)
Param =
Attr* (
Pat (':' Type)?
| Type
| '...'
)
RetType =
'->' Type
TypeAlias =
Attr* Visibility?
'default'?
'type' Name GenericParamList? (':' TypeBoundList?)? WhereClause?
('=' Type)? ';'
Struct =
Attr* Visibility?
'struct' Name GenericParamList? (
WhereClause? (RecordFieldList | ';')
| TupleFieldList WhereClause? ';'
)
RecordFieldList =
'{' fields:(RecordField (',' RecordField)* ','?)? '}'
RecordField =
Attr* Visibility?
Name ':' Type
TupleFieldList =
'(' fields:(TupleField (',' TupleField)* ','?)? ')'
TupleField =
Attr* Visibility?
Type
FieldList =
RecordFieldList
| TupleFieldList
Enum =
Attr* Visibility?
'enum' Name GenericParamList? WhereClause?
VariantList
VariantList =
'{' (Variant (',' Variant)* ','?)? '}'
Variant =
Attr* Visibility?
Name FieldList? ('=' Expr)?
Union =
Attr* Visibility?
'union' Name GenericParamList? WhereClause?
RecordFieldList
// A Data Type.
//
// Not used directly in the grammar, but handy to have anyway.
Adt =
Enum
| Struct
| Union
Const =
Attr* Visibility?
'default'?
'const' (Name | '_') ':' Type
('=' body:Expr)? ';'
Static =
Attr* Visibility?
'static' 'mut'? Name ':' Type
('=' body:Expr)? ';'
Trait =
Attr* Visibility?
'unsafe'? 'auto'?
'trait' Name GenericParamList? (':' TypeBoundList?)? WhereClause?
AssocItemList
AssocItemList =
'{' Attr* AssocItem* '}'
AssocItem =
Const
| Fn
| MacroCall
| TypeAlias
Impl =
Attr* Visibility?
'default'? 'unsafe'?
'impl' GenericParamList? ('const'? '!'? trait:Type 'for')? self_ty:Type WhereClause?
AssocItemList
ExternBlock =
Attr* 'unsafe'? Abi ExternItemList
ExternItemList =
'{' Attr* ExternItem* '}'
ExternItem =
Fn
| MacroCall
| Static
| TypeAlias
GenericParamList =
'<' (GenericParam (',' GenericParam)* ','?)? '>'
GenericParam =
ConstParam
| LifetimeParam
| TypeParam
TypeParam =
Attr* Name (':' TypeBoundList?)?
('=' default_type:Type)?
ConstParam =
Attr* 'const' Name ':' Type
('=' default_val:Expr)?
LifetimeParam =
Attr* Lifetime (':' TypeBoundList?)?
WhereClause =
'where' predicates:(WherePred (',' WherePred)* ','?)
WherePred =
('for' GenericParamList)? (Lifetime | Type) ':' TypeBoundList?
Visibility =
'pub' ('(' 'in'? Path ')')?
Attr =
'#' '!'? '[' Meta ']'
Meta =
Path ('=' Expr | TokenTree)?
//****************************//
// Statements and Expressions //
//****************************//
Stmt =
';'
| ExprStmt
| Item
| LetStmt
LetStmt =
Attr* 'let' Pat (':' Type)?
'=' initializer:Expr
LetElse?
';'
LetElse =
'else' BlockExpr
ExprStmt =
Expr ';'?
Expr =
ArrayExpr
| AwaitExpr
| BinExpr
| BlockExpr
| BoxExpr
| BreakExpr
| CallExpr
| CastExpr
| ClosureExpr
| ContinueExpr
| FieldExpr
| ForExpr
| IfExpr
| IndexExpr
| Literal
| LoopExpr
| MacroCall
| MacroStmts
| MatchExpr
| MethodCallExpr
| ParenExpr
| PathExpr
| PrefixExpr
| RangeExpr
| RecordExpr
| RefExpr
| ReturnExpr
| TryExpr
| TupleExpr
| WhileExpr
| YieldExpr
| LetExpr
| UnderscoreExpr
Literal =
Attr* value:(
'int_number' | 'float_number'
| 'string' | 'raw_string'
| 'byte_string' | 'raw_byte_string'
| 'true' | 'false'
| 'char' | 'byte'
)
PathExpr =
Attr* Path
StmtList =
'{'
Attr*
statements:Stmt*
tail_expr:Expr?
'}'
RefExpr =
Attr* '&' ('raw' | 'mut' | 'const') Expr
TryExpr =
Attr* Expr '?'
BlockExpr =
Attr* Label? ('try' | 'unsafe' | 'async' | 'const') StmtList
PrefixExpr =
Attr* op:('-' | '!' | '*') Expr
BinExpr =
Attr*
lhs:Expr
op:(
'||' | '&&'
| '==' | '!=' | '<=' | '>=' | '<' | '>'
| '+' | '*' | '-' | '/' | '%' | '<<' | '>>' | '^' | '|' | '&'
| '=' | '+=' | '/=' | '*=' | '%=' | '>>=' | '<<=' | '-=' | '|=' | '&=' | '^='
)
rhs:Expr
CastExpr =
Attr* Expr 'as' Type
ParenExpr =
Attr* '(' Attr* Expr ')'
ArrayExpr =
Attr* '[' Attr* (
(Expr (',' Expr)* ','?)?
| Expr ';' Expr
) ']'
IndexExpr =
Attr* base:Expr '[' index:Expr ']'
TupleExpr =
Attr* '(' Attr* fields:(Expr (',' Expr)* ','?)? ')'
RecordExpr =
Path RecordExprFieldList
RecordExprFieldList =
'{'
Attr*
fields:(RecordExprField (',' RecordExprField)* ','?)?
('..' spread:Expr?)?
'}'
RecordExprField =
Attr* (NameRef ':')? Expr
CallExpr =
Attr* Expr ArgList
ArgList =
'(' args:(Expr (',' Expr)* ','?)? ')'
MethodCallExpr =
Attr* receiver:Expr '.' NameRef GenericArgList? ArgList
FieldExpr =
Attr* Expr '.' NameRef
ClosureExpr =
Attr* 'static'? 'async'? 'move'? ParamList RetType?
body:Expr
IfExpr =
Attr* 'if' condition:Expr then_branch:BlockExpr
('else' else_branch:(IfExpr | BlockExpr))?
LoopExpr =
Attr* Label? 'loop'
loop_body:BlockExpr
ForExpr =
Attr* Label? 'for' Pat 'in' iterable:Expr
loop_body:BlockExpr
WhileExpr =
Attr* Label? 'while' condition:Expr
loop_body:BlockExpr
Label =
Lifetime ':'
BreakExpr =
Attr* 'break' Lifetime? Expr?
ContinueExpr =
Attr* 'continue' Lifetime?
RangeExpr =
Attr* start:Expr? op:('..' | '..=') end:Expr?
MatchExpr =
Attr* 'match' Expr MatchArmList
MatchArmList =
'{'
Attr*
arms:MatchArm*
'}'
MatchArm =
Attr* Pat guard:MatchGuard? '=>' Expr ','?
MatchGuard =
'if' condition:Expr
ReturnExpr =
Attr* 'return' Expr?
YieldExpr =
Attr* 'yield' Expr?
LetExpr =
Attr* 'let' Pat '=' Expr
UnderscoreExpr =
Attr* '_'
AwaitExpr =
Attr* Expr '.' 'await'
BoxExpr =
Attr* 'box' Expr
//*************************//
// Types //
//*************************//
Type =
ArrayType
| DynTraitType
| FnPtrType
| ForType
| ImplTraitType
| InferType
| MacroType
| NeverType
| ParenType
| PathType
| PtrType
| RefType
| SliceType
| TupleType
ParenType =
'(' Type ')'
NeverType =
'!'
MacroType =
MacroCall
PathType =
Path
TupleType =
'(' fields:(Type (',' Type)* ','?)? ')'
PtrType =
'*' ('const' | 'mut') Type
RefType =
'&' Lifetime? 'mut'? Type
ArrayType =
'[' Type ';' Expr ']'
SliceType =
'[' Type ']'
InferType =
'_'
FnPtrType =
'const'? 'async'? 'unsafe'? Abi? 'fn' ParamList RetType?
ForType =
'for' GenericParamList Type
ImplTraitType =
'impl' TypeBoundList
DynTraitType =
'dyn' TypeBoundList
TypeBoundList =
bounds:(TypeBound ('+' TypeBound)* '+'?)
TypeBound =
Lifetime
| ('?' | '~' 'const')? Type
//************************//
// Patterns //
//************************//
Pat =
IdentPat
| BoxPat
| RestPat
| LiteralPat
| MacroPat
| OrPat
| ParenPat
| PathPat
| WildcardPat
| RangePat
| RecordPat
| RefPat
| SlicePat
| TuplePat
| TupleStructPat
| ConstBlockPat
LiteralPat =
Literal
IdentPat =
Attr* 'ref'? 'mut'? Name ('@' Pat)?
WildcardPat =
'_'
RangePat =
// 1..
start:Pat op:('..' | '..=')
// 1..2
| start:Pat op:('..' | '..=') end:Pat
// ..2
| op:('..' | '..=') end:Pat
RefPat =
'&' 'mut'? Pat
RecordPat =
Path RecordPatFieldList
RecordPatFieldList =
'{'
fields:(RecordPatField (',' RecordPatField)* ','?)?
RestPat?
'}'
RecordPatField =
Attr* (NameRef ':')? Pat
TupleStructPat =
Path '(' fields:(Pat (',' Pat)* ','?)? ')'
TuplePat =
'(' fields:(Pat (',' Pat)* ','?)? ')'
ParenPat =
'(' Pat ')'
SlicePat =
'[' (Pat (',' Pat)* ','?)? ']'
PathPat =
Path
OrPat =
(Pat ('|' Pat)* '|'?)
BoxPat =
'box' Pat
RestPat =
Attr* '..'
MacroPat =
MacroCall
ConstBlockPat =
'const' BlockExpr

lib/ungrammar/src/error.rs (new file)

@@ -0,0 +1,50 @@
//! Boilerplate error definitions.
use std::fmt;
use crate::lexer::Location;
/// A type alias for std's Result with the Error as our error type.
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// An error encountered when parsing a Grammar.
#[derive(Debug)]
pub struct Error {
pub(crate) message: String,
pub(crate) location: Option<Location>,
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(loc) = self.location {
// Report 1-based indices, to match text editors
write!(f, "{}:{}: ", loc.line + 1, loc.column + 1)?
}
write!(f, "{}", self.message)
}
}
impl std::error::Error for Error {}
impl Error {
pub(crate) fn with_location(self, location: Location) -> Error {
Error {
location: Some(location),
..self
}
}
}
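// `macro_rules!` macros are textually scoped, so each is defined with a leading
// underscore and re-exported via `pub(crate) use`, making it importable as
// `crate::error::format_err` / `crate::error::bail`.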
macro_rules! _format_err {
($($tt:tt)*) => {
$crate::error::Error {
message: format!($($tt)*),
location: None,
}
};
}
pub(crate) use _format_err as format_err;
macro_rules! _bail {
($($tt:tt)*) => { return Err($crate::error::format_err!($($tt)*)) };
}
pub(crate) use _bail as bail;

lib/ungrammar/src/lexer.rs (new file)

@@ -0,0 +1,129 @@
//! Simple hand-written ungrammar lexer
use crate::error::{bail, Result};
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum TokenKind {
Node(String),
Token(String),
Eq,
Star,
Pipe,
QMark,
Colon,
LParen,
RParen,
}
#[derive(Debug)]
pub(crate) struct Token {
pub(crate) kind: TokenKind,
pub(crate) loc: Location,
}
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct Location {
pub(crate) line: usize,
pub(crate) column: usize,
}
impl Location {
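// Advance past `text`: bump `line` by the number of newlines seen and
// recompute `column` as the char count since the last newline.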
fn advance(&mut self, text: &str) {
match text.rfind('\n') {
Some(idx) => {
self.line += text.chars().filter(|&it| it == '\n').count();
self.column = text[idx + 1..].chars().count();
}
None => self.column += text.chars().count(),
}
}
}
pub(crate) fn tokenize(mut input: &str) -> Result<Vec<Token>> {
let mut res = Vec::new();
let mut loc = Location::default();
while !input.is_empty() {
let old_input = input;
skip_ws(&mut input);
skip_comment(&mut input);
if old_input.len() == input.len() {
match advance(&mut input) {
Ok(kind) => {
res.push(Token { kind, loc });
}
Err(err) => return Err(err.with_location(loc)),
}
}
let consumed = old_input.len() - input.len();
loc.advance(&old_input[..consumed]);
}
Ok(res)
}
fn skip_ws(input: &mut &str) {
*input = input.trim_start_matches(is_whitespace)
}
fn skip_comment(input: &mut &str) {
if input.starts_with("//") {
let idx = input.find('\n').map_or(input.len(), |it| it + 1);
*input = &input[idx..]
}
}
fn advance(input: &mut &str) -> Result<TokenKind> {
let mut chars = input.chars();
let c = chars.next().unwrap();
let res = match c {
'=' => TokenKind::Eq,
'*' => TokenKind::Star,
'?' => TokenKind::QMark,
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'|' => TokenKind::Pipe,
':' => TokenKind::Colon,
'\'' => {
let mut buf = String::new();
loop {
match chars.next() {
None => bail!("unclosed token literal"),
Some('\\') => match chars.next() {
Some(c) if is_escapable(c) => buf.push(c),
_ => bail!("invalid escape in token literal"),
},
Some('\'') => break,
Some(c) => buf.push(c),
}
}
TokenKind::Token(buf)
}
c if is_ident_char(c) => {
let mut buf = String::new();
buf.push(c);
loop {
match chars.clone().next() {
Some(c) if is_ident_char(c) => {
chars.next();
buf.push(c);
}
_ => break,
}
}
TokenKind::Node(buf)
}
'\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"),
c => bail!("unexpected character: `{}`", c),
};
*input = chars.as_str();
Ok(res)
}
fn is_escapable(c: char) -> bool {
matches!(c, '\\' | '\'')
}
fn is_whitespace(c: char) -> bool {
matches!(c, ' ' | '\t' | '\n')
}
fn is_ident_char(c: char) -> bool {
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}

lib/ungrammar/src/lib.rs (new file)

@@ -0,0 +1,137 @@
//! Ungrammar -- a DSL for specifying concrete syntax tree grammar.
//!
//! Producing a parser is an explicit non-goal -- it's ok for this grammar to be
//! ambiguous, non LL, non LR, etc.
//!
//! See this
//! [introductory post](https://rust-analyzer.github.io/blog/2020/10/24/introducing-ungrammar.html)
//! for details.
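//!
//! A minimal usage sketch (the inline grammar is illustrative only):
//!
//! ```
//! let grammar: ungrammar::Grammar = "A = 'b' | 'c'".parse().unwrap();
//! for node in grammar.iter() {
//!     println!("{} = {:?}", grammar[node].name, grammar[node].rule);
//! }
//! ```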
#![deny(missing_debug_implementations)]
#![deny(missing_docs)]
#![deny(rust_2018_idioms)]
mod error;
mod lexer;
mod parser;
use std::{ops, str::FromStr};
pub use error::{Error, Result};
/// Returns a Rust grammar.
pub fn rust_grammar() -> Grammar {
let src = include_str!("../rust.ungram");
src.parse().unwrap()
}
/// A node, like `A = 'b' | 'c'`.
///
/// Indexing into a [`Grammar`] with a [`Node`] returns a reference to a
/// [`NodeData`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Node(usize);
/// A token, denoted with single quotes, like `'+'` or `'struct'`.
///
/// Indexing into a [`Grammar`] with a [`Token`] returns a reference to a
/// [`TokenData`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token(usize);
/// An Ungrammar grammar.
#[derive(Default, Debug)]
pub struct Grammar {
nodes: Vec<NodeData>,
tokens: Vec<TokenData>,
}
impl FromStr for Grammar {
type Err = Error;
fn from_str(s: &str) -> Result<Self> {
let tokens = lexer::tokenize(s)?;
parser::parse(tokens)
}
}
impl Grammar {
/// Returns an iterator over all nodes in the grammar.
pub fn iter(&self) -> impl Iterator<Item = Node> + '_ {
(0..self.nodes.len()).map(Node)
}
/// Returns an iterator over all tokens in the grammar.
pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
(0..self.tokens.len()).map(Token)
}
}
impl ops::Index<Node> for Grammar {
type Output = NodeData;
fn index(&self, Node(index): Node) -> &NodeData {
&self.nodes[index]
}
}
impl ops::Index<Token> for Grammar {
type Output = TokenData;
fn index(&self, Token(index): Token) -> &TokenData {
&self.tokens[index]
}
}
/// Data about a node.
#[derive(Debug)]
pub struct NodeData {
/// The name of the node.
///
/// In the rule `A = 'b' | 'c'`, this is `"A"`.
pub name: String,
/// The rule for this node.
///
/// In the rule `A = 'b' | 'c'`, this represents `'b' | 'c'`.
pub rule: Rule,
}
/// Data about a token.
#[derive(Debug)]
pub struct TokenData {
/// The name of the token.
pub name: String,
}
/// A production rule.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Rule {
/// A labeled rule, like `a:B` (`"a"` is the label, `B` is the rule).
Labeled {
/// The label.
label: String,
/// The rule.
rule: Box<Rule>,
},
/// A node, like `A`.
Node(Node),
/// A token, like `'struct'`.
Token(Token),
/// A sequence of rules, like `'while' '(' Expr ')' Stmt`.
Seq(Vec<Rule>),
/// An alternative between many rules, like `'+' | '-' | '*' | '/'`.
Alt(Vec<Rule>),
/// An optional rule, like `A?`.
Opt(Box<Rule>),
/// A repeated rule, like `A*`.
Rep(Box<Rule>),
}
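// Illustrative test (not part of the original commit): `A = 'b' | 'c'`
// parses into an `Alt` of two `Token` rules.
#[test]
fn alt_of_tokens() {
    let grammar: Grammar = "A = 'b' | 'c'".parse().unwrap();
    let node = grammar.iter().next().unwrap();
    match &grammar[node].rule {
        Rule::Alt(alts) => assert!(matches!(alts.as_slice(), [Rule::Token(_), Rule::Token(_)])),
        rule => panic!("expected Alt, got {:?}", rule),
    }
}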
#[test]
fn smoke() {
let grammar = include_str!("../ungrammar.ungram");
let grammar = grammar.parse::<Grammar>().unwrap();
drop(grammar)
}
#[test]
fn test_rust_grammar() {
let _ = rust_grammar();
}

lib/ungrammar/src/parser.rs (new file)

@@ -0,0 +1,225 @@
//! Simple hand-written ungrammar parser.
use std::collections::HashMap;
use crate::{
error::{format_err, Result},
lexer::{self, TokenKind},
Grammar, Node, NodeData, Rule, Token, TokenData,
};
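// Location-aware variant of `error::bail!`: attaches `$loc` to the error.
// (`crate::error::bail!` is invoked by path below, so it is not imported.)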
macro_rules! bail {
($loc:expr, $($tt:tt)*) => {{
let err = $crate::error::format_err!($($tt)*)
.with_location($loc);
return Err(err);
}};
}
pub(crate) fn parse(tokens: Vec<lexer::Token>) -> Result<Grammar> {
let mut p = Parser::new(tokens);
while !p.is_eof() {
node(&mut p)?;
}
p.finish()
}
#[derive(Default)]
struct Parser {
grammar: Grammar,
tokens: Vec<lexer::Token>,
node_table: HashMap<String, Node>,
token_table: HashMap<String, Token>,
}
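// Sentinel assigned when a node is referenced before it is defined;
// `finish` reports any node whose rule is still this sentinel.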
const DUMMY_RULE: Rule = Rule::Node(Node(!0));
impl Parser {
fn new(mut tokens: Vec<lexer::Token>) -> Parser {
tokens.reverse();
Parser {
tokens,
..Parser::default()
}
}
fn peek(&self) -> Option<&lexer::Token> {
self.peek_n(0)
}
fn peek_n(&self, n: usize) -> Option<&lexer::Token> {
self.tokens.iter().nth_back(n)
}
fn bump(&mut self) -> Result<lexer::Token> {
self.tokens
.pop()
.ok_or_else(|| format_err!("unexpected EOF"))
}
fn expect(&mut self, kind: TokenKind, what: &str) -> Result<()> {
let token = self.bump()?;
if token.kind != kind {
bail!(token.loc, "unexpected token, expected `{}`", what);
}
Ok(())
}
fn is_eof(&self) -> bool {
self.tokens.is_empty()
}
fn finish(self) -> Result<Grammar> {
for node_data in &self.grammar.nodes {
if matches!(node_data.rule, DUMMY_RULE) {
crate::error::bail!("Undefined node: {}", node_data.name)
}
}
Ok(self.grammar)
}
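// Interning invariant: `node_table.len()` always equals `grammar.nodes.len()`
// (likewise for tokens), so the current table length is the next fresh index.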
fn intern_node(&mut self, name: String) -> Node {
let len = self.node_table.len();
let grammar = &mut self.grammar;
*self.node_table.entry(name.clone()).or_insert_with(|| {
grammar.nodes.push(NodeData {
name,
rule: DUMMY_RULE,
});
Node(len)
})
}
fn intern_token(&mut self, name: String) -> Token {
let len = self.token_table.len();
let grammar = &mut self.grammar;
*self.token_table.entry(name.clone()).or_insert_with(|| {
grammar.tokens.push(TokenData { name });
Token(len)
})
}
}
fn node(p: &mut Parser) -> Result<()> {
let token = p.bump()?;
let node = match token.kind {
TokenKind::Node(it) => p.intern_node(it),
_ => bail!(token.loc, "expected ident"),
};
p.expect(TokenKind::Eq, "=")?;
if !matches!(p.grammar[node].rule, DUMMY_RULE) {
bail!(token.loc, "duplicate rule: `{}`", p.grammar[node].name)
}
let rule = rule(p)?;
p.grammar.nodes[node.0].rule = rule;
Ok(())
}
fn rule(p: &mut Parser) -> Result<Rule> {
if let Some(lexer::Token { kind: TokenKind::Pipe, loc }) = p.peek() {
bail!(
*loc,
"The first element in a sequence of productions or alternatives \
must not have a leading pipe (`|`)"
);
}
let lhs = seq_rule(p)?;
let mut alt = vec![lhs];
while let Some(token) = p.peek() {
if token.kind != TokenKind::Pipe {
break;
}
p.bump()?;
let rule = seq_rule(p)?;
alt.push(rule)
}
let res = if alt.len() == 1 {
alt.pop().unwrap()
} else {
Rule::Alt(alt)
};
Ok(res)
}
fn seq_rule(p: &mut Parser) -> Result<Rule> {
let lhs = atom_rule(p)?;
let mut seq = vec![lhs];
while let Some(rule) = opt_atom_rule(p)? {
seq.push(rule)
}
let res = if seq.len() == 1 {
seq.pop().unwrap()
} else {
Rule::Seq(seq)
};
Ok(res)
}
fn atom_rule(p: &mut Parser) -> Result<Rule> {
match opt_atom_rule(p)? {
Some(it) => Ok(it),
None => {
let token = p.bump()?;
bail!(token.loc, "unexpected token")
}
}
}
fn opt_atom_rule(p: &mut Parser) -> Result<Option<Rule>> {
let token = match p.peek() {
Some(it) => it,
None => return Ok(None),
};
let mut res = match &token.kind {
TokenKind::Node(name) => {
if let Some(lookahead) = p.peek_n(1) {
match lookahead.kind {
TokenKind::Eq => return Ok(None),
TokenKind::Colon => {
let label = name.clone();
p.bump()?;
p.bump()?;
let rule = atom_rule(p)?;
let res = Rule::Labeled {
label,
rule: Box::new(rule),
};
return Ok(Some(res));
}
_ => (),
}
}
match p.peek_n(1) {
Some(token) if token.kind == TokenKind::Eq => return Ok(None),
_ => (),
}
let name = name.clone();
p.bump()?;
let node = p.intern_node(name);
Rule::Node(node)
}
TokenKind::Token(name) => {
let name = name.clone();
p.bump()?;
let token = p.intern_token(name);
Rule::Token(token)
}
TokenKind::LParen => {
p.bump()?;
let rule = rule(p)?;
p.expect(TokenKind::RParen, ")")?;
rule
}
_ => return Ok(None),
};
if let Some(token) = p.peek() {
match &token.kind {
TokenKind::QMark => {
p.bump()?;
res = Rule::Opt(Box::new(res));
}
TokenKind::Star => {
p.bump()?;
res = Rule::Rep(Box::new(res));
}
_ => (),
}
}
Ok(Some(res))
}

lib/ungrammar/ungrammar.ungram (new file)

@@ -0,0 +1,16 @@
/// ungrammar for ungrammar
Grammar =
Node*
Node =
name:'ident' '=' Rule
Rule =
'ident'
| 'token_ident'
| Rule*
| Rule ('|' Rule)*
| Rule '?'
| Rule '*'
| '(' Rule ')'
| label:'ident' ':' Rule

lib/ungrammar/ungrammar2json/Cargo.toml (new file)

@@ -0,0 +1,12 @@
[package]
name = "ungrammar2json"
description = "Convert ungrammar files to JSON"
version = "1.0.0"
license = "MIT OR Apache-2.0"
repository = "https://github.com/matklad/ungrammar"
authors = ["Aleksey Kladov <aleksey.kladov@gmail.com>"]
edition = "2018"
[dependencies]
write-json = "0.1.1"
ungrammar = { path = "../", version = "1.1.0" }

lib/ungrammar/ungrammar2json/src/main.rs (new file)

@@ -0,0 +1,77 @@
use std::{
env,
io::{self, Read},
process,
};
use ungrammar::{Grammar, Rule};
fn main() {
if let Err(err) = try_main() {
eprintln!("{}", err);
process::exit(101);
}
}
fn try_main() -> io::Result<()> {
if env::args().count() != 1 {
eprintln!("Usage: ungrammar2json < grammar.ungram > grammar.json");
return Ok(());
}
let grammar = read_stdin()?;
let grammar = grammar
.parse::<Grammar>()
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))?;
let mut buf = String::new();
grammar_to_json(&grammar, write_json::object(&mut buf));
println!("{}", buf);
Ok(())
}
fn read_stdin() -> io::Result<String> {
let mut buf = String::new();
io::stdin().lock().read_to_string(&mut buf)?;
Ok(buf)
}
fn grammar_to_json(grammar: &Grammar, mut obj: write_json::Object<'_>) {
for node in grammar.iter() {
let node = &grammar[node];
rule_to_json(grammar, &node.rule, obj.object(&node.name));
}
}
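// Serialize one rule as a JSON object; Seq/Alt and Opt/Rep recurse into their
// child rules.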
fn rule_to_json(grammar: &Grammar, rule: &Rule, mut obj: write_json::Object) {
match rule {
Rule::Labeled { label, rule } => {
obj.string("label", label);
rule_to_json(grammar, rule, obj.object("rule"))
}
Rule::Node(node) => {
obj.string("node", &grammar[*node].name);
}
Rule::Token(token) => {
obj.string("token", &grammar[*token].name);
}
Rule::Seq(rules) | Rule::Alt(rules) => {
let tag = match rule {
Rule::Seq(_) => "seq",
Rule::Alt(_) => "alt",
_ => unreachable!(),
};
let mut array = obj.array(tag);
for rule in rules {
rule_to_json(grammar, rule, array.object());
}
}
Rule::Opt(arg) | Rule::Rep(arg) => {
let tag = match rule {
Rule::Opt(_) => "opt",
Rule::Rep(_) => "rep",
_ => unreachable!(),
};
rule_to_json(grammar, arg, obj.object(tag));
}
}
}