From 767842ab64f601b324b02a0f7f55ebc0b849f251 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 21 Apr 2024 13:57:45 +0200 Subject: rebel-parse: separate tokenization step --- crates/rebel-parse/examples/parse-string.rs | 37 +++-- crates/rebel-parse/src/ast.rs | 15 +- crates/rebel-parse/src/grammar/mod.rs | 1 + crates/rebel-parse/src/grammar/recipe.rs | 220 ++++++++++++++-------------- crates/rebel-parse/src/grammar/tokenize.rs | 64 ++++++++ crates/rebel-parse/src/lib.rs | 3 + crates/rebel-parse/src/token.rs | 32 ++++ 7 files changed, 252 insertions(+), 120 deletions(-) create mode 100644 crates/rebel-parse/src/grammar/tokenize.rs create mode 100644 crates/rebel-parse/src/token.rs diff --git a/crates/rebel-parse/examples/parse-string.rs b/crates/rebel-parse/examples/parse-string.rs index ba6a378..47e30a5 100644 --- a/crates/rebel-parse/examples/parse-string.rs +++ b/crates/rebel-parse/examples/parse-string.rs @@ -1,11 +1,12 @@ -use std::{fmt::Debug, time::Instant}; +use std::{fmt::Debug, process, time::Instant}; use clap::{Parser, ValueEnum}; -use rebel_parse::recipe; +use rebel_parse::{recipe, tokenize}; -#[derive(Clone, Debug, ValueEnum)] +#[derive(Clone, Debug, PartialEq, Eq, ValueEnum)] enum Rule { + Tokenize, Recipe, RecipeStmt, Body, @@ -27,15 +28,32 @@ fn main() { Box::new(v) } + let start = Instant::now(); + let result = tokenize::token_stream(input); + let dur = Instant::now().duration_since(start); + println!("Tokenization took {} us", dur.as_micros()); + + let tokens = match result { + Ok(value) => value, + Err(err) => { + println!("{err}"); + process::exit(1); + } + }; + let start = Instant::now(); let result = match opts.rule { - Rule::Recipe => recipe::recipe(input).map(as_debug), - Rule::RecipeStmt => recipe::recipe_stmt(input).map(as_debug), - Rule::Body => recipe::body(input).map(as_debug), - Rule::BodyStmt => recipe::body_stmt(input).map(as_debug), - Rule::Expr => recipe::expr(input).map(as_debug), + Rule::Tokenize => Ok(as_debug(tokens)), + Rule::Recipe => recipe::recipe(&tokens).map(as_debug), + Rule::RecipeStmt => recipe::recipe_stmt(&tokens).map(as_debug), + Rule::Body => recipe::body(&tokens).map(as_debug), + Rule::BodyStmt => recipe::body_stmt(&tokens).map(as_debug), + Rule::Expr => recipe::expr(&tokens).map(as_debug), }; - let dur = Instant::now().duration_since(start); + if opts.rule != Rule::Tokenize { + let dur = Instant::now().duration_since(start); + println!("Parsing took {} us", dur.as_micros()); + } match result { Ok(value) => { @@ -45,5 +63,4 @@ fn main() { println!("{err}"); } }; - println!("Took {} us", dur.as_micros()); } diff --git a/crates/rebel-parse/src/ast.rs b/crates/rebel-parse/src/ast.rs index d923f2a..1f98f15 100644 --- a/crates/rebel-parse/src/ast.rs +++ b/crates/rebel-parse/src/ast.rs @@ -138,9 +138,18 @@ pub enum Literal<'a> { } impl<'a> Literal<'a> { - pub(crate) fn integer(s: &'a str, radix: u32) -> Result { - let s = s.replace('_', ""); - let value = u64::from_str_radix(&s, radix).or(Err("Failed to parse number"))?; + pub(crate) fn number(s: &'a str) -> Result { + let (radix, rest) = if let Some(rest) = s.strip_prefix("0x") { + (16, rest) + } else if let Some(rest) = s.strip_prefix("0o") { + (8, rest) + } else if let Some(rest) = s.strip_prefix("0b") { + (2, rest) + } else { + (10, s) + }; + let digits = rest.replace('_', ""); + let value = u64::from_str_radix(&digits, radix).or(Err("number"))?; Ok(Literal::Integer(value)) } } diff --git a/crates/rebel-parse/src/grammar/mod.rs b/crates/rebel-parse/src/grammar/mod.rs index ed180f0..de06991 100644 --- a/crates/rebel-parse/src/grammar/mod.rs +++ b/crates/rebel-parse/src/grammar/mod.rs @@ -1,2 +1,3 @@ pub mod recipe; pub mod task_ref; +pub mod tokenize; diff --git a/crates/rebel-parse/src/grammar/recipe.rs b/crates/rebel-parse/src/grammar/recipe.rs index 5ae6b8b..d1deba4 100644 --- a/crates/rebel-parse/src/grammar/recipe.rs +++ b/crates/rebel-parse/src/grammar/recipe.rs @@ -1,156 +1,162 @@ -use crate::ast::*; +use crate::ast::{self, Expr}; +use crate::token::*; pub use rules::*; peg::parser! { - pub grammar rules() for str { - use OpBinary::*; - use OpUnary::*; + pub grammar rules<'a>() for [Token<'a>] { + use ast::OpBinary::*; + use ast::OpUnary::*; - pub rule recipe() -> Recipe<'input> - = _ recipe:recipe_stmt()* { recipe } + pub rule recipe() -> ast::Recipe<'a> + = recipe:recipe_stmt()* { recipe } - pub rule recipe_stmt() -> RecipeStmt<'input> - = stmt:body_stmt() { - RecipeStmt::BodyStmt(stmt) + pub rule recipe_stmt() -> ast::RecipeStmt<'a> + = keyword_fetch() name:ident() p('{') body:body() p('}') { + ast::RecipeStmt::Fetch { name, body: Vec::new() } } - / "fetch" __ name:ident() _ "{" _ body:body() _ "}" _ { - RecipeStmt::Fetch { name, body } + / keyword_task() name:ident() p('(') args:argtypes() p(')') p('{') body:body() p('}') { + ast::RecipeStmt::Task { name, args, body } } - / "task" __ name:ident() _ "(" _ args:argtypes() _ ")" _ "{" _ body:body() _ "}" _ { - RecipeStmt::Task { name, args, body } + / stmt:body_stmt() { + ast::RecipeStmt::BodyStmt(stmt) } - pub rule body() -> Body<'input> - = recipe:body_stmt()* { recipe } + pub rule body() -> ast::Body<'a> + = body:body_stmt()* { body } - pub rule body_stmt() -> BodyStmt<'input> - = left:typed_expr() _ op:assign_op() _ right:expr() _ ";" _ { - BodyStmt::assign(left, op, right) + pub rule body_stmt() -> ast::BodyStmt<'a> + = left:typed_expr() op:assign_op() right:expr() p(';') { + ast::BodyStmt::assign(left, op, right) } - rule assign_op() -> Option - = "+=" { Some(Add) } - / "-=" { Some(Sub) } - / "*=" { Some(Mul) } - / "/=" { Some(Div) } - / "%=" { Some(Rem) } - / "=" { None } + rule assign_op() -> Option + = p2('+', '=') { Some(Add) } + / p2('-', '=') { Some(Sub) } + / p2('*', '=') { Some(Mul) } + / p2('/', '=') { Some(Div) } + / p2('%', '=') { Some(Rem) } + / p('=') { None } - rule typed_expr() -> TypedExpr<'input> - = expr:expr() typ:tagged(<_ ":" _>, )? { TypedExpr { expr, typ } } + rule typed_expr() -> ast::TypedExpr<'a> + = expr:expr() typ:tagged(, )? { ast::TypedExpr { expr, typ } } - - rule typ() -> Path<'input> - = path() - - pub rule expr() -> Expr<'input> = precedence! { - left:(@) _ "||" _ right:@ { Expr::binary(left, Or, right) } + pub rule expr() -> Expr<'a> = precedence! { + left:(@) p2('|', '|') right:@ { Expr::binary(left, Or, right) } -- - left:(@) _ "&&" _ right:@ { Expr::binary(left, And, right) } + left:(@) p2('&', '&') right:@ { Expr::binary(left, And, right) } -- - left:(@) _ "==" _ right:@ { Expr::binary(left, Eq, right) } - left:(@) _ "!=" _ right:@ { Expr::binary(left, Ne, right) } - left:(@) _ "<" _ right:@ { Expr::binary(left, Lt, right) } - left:(@) _ ">" _ right:@ { Expr::binary(left, Gt, right) } - left:(@) _ "<=" _ right:@ { Expr::binary(left, Le, right) } - left:(@) _ ">=" _ right:@ { Expr::binary(left, Ge, right) } + left:(@) p2('=', '=') right:@ { Expr::binary(left, Eq, right) } + left:(@) p2('!', '=') right:@ { Expr::binary(left, Ne, right) } + left:(@) p('<') right:@ { Expr::binary(left, Lt, right) } + left:(@) p('>') right:@ { Expr::binary(left, Gt, right) } + left:(@) p2('<', '=') right:@ { Expr::binary(left, Le, right) } + left:(@) p2('>', '=') right:@ { Expr::binary(left, Ge, right) } -- - left:(@) _ "+" _ right:@ { Expr::binary(left, Add, right) } - left:(@) _ "-" _ right:@ { Expr::binary(left, Sub, right) } + left:(@) p('+') right:@ { Expr::binary(left, Add, right) } + left:(@) p('-') right:@ { Expr::binary(left, Sub, right) } -- - left:(@) _ "*" _ right:@ { Expr::binary(left, Mul, right) } - left:(@) _ "/" _ right:@ { Expr::binary(left, Div, right) } - left:(@) _ "%" _ right:@ { Expr::binary(left, Rem, right) } + left:(@) p('*') right:@ { Expr::binary(left, Mul, right) } + left:(@) p('/') right:@ { Expr::binary(left, Div, right) } + left:(@) p('%') right:@ { Expr::binary(left, Rem, right) } -- - "-" _ expr:@ { Expr::unary(Neg, expr) } - "!" _ expr:@ { Expr::unary(Not, expr) } + p('-') expr:@ { Expr::unary(Neg, expr) } + p('!') expr:@ { Expr::unary(Not, expr) } -- - expr:@ _ "(" _ args:args() _ ")" { Expr::apply(expr, args) } - expr:@ _ "[" _ index:expr() _ "]" { Expr::index(expr, index) } + expr:@ p('(') args:args() p(')') { Expr::apply(expr, args) } + expr:@ p('[') index:expr() p(']') { Expr::index(expr, index) } -- - expr:@ _ "." _ field:field() { Expr::field(expr, field) } + expr:@ p('.') field:field() { Expr::field(expr, field) } -- - "(" _ e:expr() _ ")" { Expr::paren(e) } + p('(') e:expr() p(')') { Expr::paren(e) } e:atom() { e } } - rule atom() -> Expr<'input> + rule atom() -> Expr<'a> = lit:literal() { Expr::Literal(lit) } / path:path() { Expr::Path(path) } - rule args() -> Vec> - = args:delimited(, <_ "," _>) { args } + rule args() -> Vec> + = args:delimited(, ) { args } - rule arg() -> Arg<'input> - = expr:expr() { Arg { expr } } + rule arg() -> ast::Arg<'a> + = expr:expr() { ast::Arg { expr } } - rule argtypes() -> Vec> - = args:delimited(, <_ "," _>) { args } + rule argtypes() -> Vec> + = args:delimited(, ) { args } - rule argtype() -> ArgType<'input> - = expr:typed_expr() { ArgType { expr } } + rule argtype() -> ast::ArgType<'a> + = name:ident() p(':') typ:expr() { ast::ArgType { name, typ } } - rule literal() -> Literal<'input> - = "true" { Literal::Boolean(true) } - / "false" { Literal::Boolean(false) } - / "0x" s:$((['0'..='9' | 'a'..='f' | 'A'..='F']+) ++ "_") { ? - Literal::integer(s, 16) + rule literal() -> ast::Literal<'a> + = keyword_true() { ast::Literal::Boolean(true) } + / keyword_false() { ast::Literal::Boolean(false) } + / [Token::Literal(Literal { content, kind: LiteralKind::Number })] { ? + ast::Literal::number(content) + } + / [Token::Literal(Literal { content, kind: LiteralKind::String })] { + ast::Literal::String(content) + } + / [Token::Literal(Literal { content, kind: LiteralKind::RawString })] { + ast::Literal::RawString(content) } - / "0o" s:$((['0'..='7']+) ++ "_") { ? - Literal::integer(s, 8) + / [Token::Literal(Literal { content, kind: LiteralKind::ScriptString })] { + ast::Literal::ScriptString(content) } - / "0b" s:$((['0'..='1']+) ++ "_") { ? - Literal::integer(s, 2) + / p('(') p(')') { ast::Literal::Unit } + / p('(') elements:(expr() ** p(',')) p(',')? p(')') { + ast::Literal::Tuple(elements) } - / s:$((['0'..='9']+) ++ "_") { ? - Literal::integer(s, 10) + / p('[') elements:delimited(, ) p(']') { + ast::Literal::Array(elements) } - / "\"" s:$(string_char()*) "\"" { Literal::String(s) } - / "r\"" s:$([^'"']*) "\"" { Literal::RawString(s) } - / "```" newline() s:$((!"```" [_])+) "```" { Literal::ScriptString(s) } - / "(" _ ")" { Literal::Unit } - / "(" _ elements:(expr() ** (_ "," _)) (_ ",")? _ ")" { Literal::Tuple(elements) } - / "[" _ elements:delimited(, <_ "," _>) _ "]" { Literal::Array(elements) } - / "{" _ entries:delimited(, <_ "," _>) _ "}" { Literal::Map(entries) } - - rule map_entry() -> MapEntry<'input> - = left:typed_expr() _ "=" _ right:expr() { - MapEntry { left, right } + / p('{') entries:delimited(, ) p('}') { + ast::Literal::Map(entries) } - rule string_char() - = [^'"' | '\\'] - / "\\" [_] + rule map_entry() -> ast::MapEntry<'a> + = key:field() p('=') value:expr() { + ast::MapEntry { key: key.name, value } + } + + rule path() -> ast::Path<'a> + = components:ident() ++ p2(':', ':') { ast::Path { components } } + + rule field() -> ast::Ident<'a> + = ident() + / [Token::Literal(Literal {content, kind: LiteralKind::Number} )] { + ast::Ident { name: content } + } - rule path() -> Path<'input> - = components:ident() ++ (_ "::" _) { Path { components } } + rule p_(ch: char) + = [Token::Punct(Punct(c, Spacing::Joint)) if c == ch] {} - rule field() -> Ident<'input> - = name:$( - ['a'..='z' | 'A' ..='Z' | '0'..='9' | '_']* - ) { Ident { name } } + rule p(ch: char) -> () + = [Token::Punct(Punct(c, _)) if c == ch] {} - rule ident() -> Ident<'input> - = name:$( - ['a'..='z' | 'A' ..='Z' | '_' ] - ['a'..='z' | 'A' ..='Z' | '0'..='9' | '_']* - ) { Ident { name } } + rule p2(ch1: char, ch2: char) + = p_(ch1) p(ch2) - /// Mandatory whitespace - rule __ - = ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+ + rule ident() -> ast::Ident<'a> + = !keyword() [Token::Ident(Ident(name))] { ast::Ident { name } } - /// Optional whitespace - rule _ - = quiet!{__?} + rule keyword() + = keyword_true() + / keyword_false() + / keyword_fetch() + / keyword_task() - rule comment() - = "//" (!newline() [_])* (newline() / ![_]) - / "/*" (!"*/" [_])* "*/" + rule keyword_true() + = const_ident("true") + rule keyword_false() + = const_ident("false") + rule keyword_fetch() + = const_ident("fetch") + rule keyword_task() + = const_ident("task") - rule newline() - = ['\n' | '\r'] + rule const_ident(keyword: &str) + = [Token::Ident(Ident(name)) if keyword == name] rule delimited(expr: rule, delim: rule<()>) -> Vec = values:(expr() ++ delim()) delim()? { values } diff --git a/crates/rebel-parse/src/grammar/tokenize.rs b/crates/rebel-parse/src/grammar/tokenize.rs new file mode 100644 index 0000000..a30e299 --- /dev/null +++ b/crates/rebel-parse/src/grammar/tokenize.rs @@ -0,0 +1,64 @@ +use crate::token::*; + +pub use rules::*; + +peg::parser! { + pub grammar rules() for str { + pub rule token_stream() -> Vec> + = _ tokens:(token() ** _) _ { tokens } + + pub rule token() -> Token<'input> + = literal:literal() { Token::Literal(literal) } + / ident:ident() { Token::Ident(ident) } + / punct:punct() { Token::Punct(punct) } + + rule ident() -> Ident<'input> + = name:$( + ['a'..='z' | 'A' ..='Z' | '_' ] + ['a'..='z' | 'A' ..='Z' | '_' | '0'..='9']* + ) { Ident(name) } + + rule punct() -> Punct + = ch:punct_char() spacing:spacing() { Punct(ch, spacing) } + + rule punct_char() -> char + = !literal() !ident() !__ ch:[_] { ch } + + rule spacing() -> Spacing + = &punct_char() { Spacing::Joint } + / { Spacing::Alone } + + rule literal() -> Literal<'input> + = content:$(['0'..='9'] ['0'..='9' | 'a'..='z' | 'A'..='Z' | '_']*) { + Literal { content, kind: LiteralKind::Number } + } + / "\"" content:$(string_char()*) "\"" { + Literal { content, kind: LiteralKind::String } + } + / "r\"" content:$([^'"']*) "\"" { + Literal { content, kind: LiteralKind::RawString } + } + / "```" newline() content:$((!"```" [_])+) "```" { + Literal { content, kind: LiteralKind::ScriptString } + } + + rule string_char() + = [^'"' | '\\'] + / "\\" [_] + + /// Mandatory whitespace + rule __ + = ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+ + + /// Optional whitespace + rule _ + = quiet!{__?} + + rule comment() + = "//" (!newline() [_])* (newline() / ![_]) + / "/*" (!"*/" [_])* "*/" + + rule newline() + = ['\n' | '\r'] + } +} diff --git a/crates/rebel-parse/src/lib.rs b/crates/rebel-parse/src/lib.rs index 8019d00..4a8c431 100644 --- a/crates/rebel-parse/src/lib.rs +++ b/crates/rebel-parse/src/lib.rs @@ -1,5 +1,8 @@ pub mod ast; +pub mod token; + mod grammar; pub use grammar::recipe; pub use grammar::task_ref; +pub use grammar::tokenize; diff --git a/crates/rebel-parse/src/token.rs b/crates/rebel-parse/src/token.rs new file mode 100644 index 0000000..3147899 --- /dev/null +++ b/crates/rebel-parse/src/token.rs @@ -0,0 +1,32 @@ +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Token<'a> { + Ident(Ident<'a>), + Punct(Punct), + Literal(Literal<'a>), +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Ident<'a>(pub &'a str); + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Punct(pub char, pub Spacing); + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Spacing { + Alone, + Joint, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Literal<'a> { + pub content: &'a str, + pub kind: LiteralKind, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LiteralKind { + Number, + String, + RawString, + ScriptString, +} -- cgit v1.2.3