summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Matthias Schiffer <mschiffer@universe-factory.net> 2024-04-21 13:57:45 +0200
committer Matthias Schiffer <mschiffer@universe-factory.net> 2024-04-22 17:53:49 +0200
commit 767842ab64f601b324b02a0f7f55ebc0b849f251 (patch)
tree 71af28ee7b3f5837b60a29d752b7df25843a920d
parent 458afaa9b725b295fc634121350da547cf668f25 (diff)
download rebel-767842ab64f601b324b02a0f7f55ebc0b849f251.tar
rebel-767842ab64f601b324b02a0f7f55ebc0b849f251.zip
rebel-parse: separate tokenization step
-rw-r--r-- crates/rebel-parse/examples/parse-string.rs 37
-rw-r--r-- crates/rebel-parse/src/ast.rs 15
-rw-r--r-- crates/rebel-parse/src/grammar/mod.rs 1
-rw-r--r-- crates/rebel-parse/src/grammar/recipe.rs 220
-rw-r--r-- crates/rebel-parse/src/grammar/tokenize.rs 64
-rw-r--r-- crates/rebel-parse/src/lib.rs 3
-rw-r--r-- crates/rebel-parse/src/token.rs 32
7 files changed, 252 insertions, 120 deletions
diff --git a/crates/rebel-parse/examples/parse-string.rs b/crates/rebel-parse/examples/parse-string.rs
index ba6a378..47e30a5 100644
--- a/crates/rebel-parse/examples/parse-string.rs
+++ b/crates/rebel-parse/examples/parse-string.rs
@@ -1,11 +1,12 @@
-use std::{fmt::Debug, time::Instant};
+use std::{fmt::Debug, process, time::Instant};
use clap::{Parser, ValueEnum};
-use rebel_parse::recipe;
+use rebel_parse::{recipe, tokenize};
-#[derive(Clone, Debug, ValueEnum)]
+#[derive(Clone, Debug, PartialEq, Eq, ValueEnum)]
enum Rule {
+ Tokenize,
Recipe,
RecipeStmt,
Body,
@@ -28,14 +29,31 @@ fn main() {
}
let start = Instant::now();
+ let result = tokenize::token_stream(input);
+ let dur = Instant::now().duration_since(start);
+ println!("Tokenization took {} us", dur.as_micros());
+
+ let tokens = match result {
+ Ok(value) => value,
+ Err(err) => {
+ println!("{err}");
+ process::exit(1);
+ }
+ };
+
+ let start = Instant::now();
let result = match opts.rule {
- Rule::Recipe => recipe::recipe(input).map(as_debug),
- Rule::RecipeStmt => recipe::recipe_stmt(input).map(as_debug),
- Rule::Body => recipe::body(input).map(as_debug),
- Rule::BodyStmt => recipe::body_stmt(input).map(as_debug),
- Rule::Expr => recipe::expr(input).map(as_debug),
+ Rule::Tokenize => Ok(as_debug(tokens)),
+ Rule::Recipe => recipe::recipe(&tokens).map(as_debug),
+ Rule::RecipeStmt => recipe::recipe_stmt(&tokens).map(as_debug),
+ Rule::Body => recipe::body(&tokens).map(as_debug),
+ Rule::BodyStmt => recipe::body_stmt(&tokens).map(as_debug),
+ Rule::Expr => recipe::expr(&tokens).map(as_debug),
};
- let dur = Instant::now().duration_since(start);
+ if opts.rule != Rule::Tokenize {
+ let dur = Instant::now().duration_since(start);
+ println!("Parsing took {} us", dur.as_micros());
+ }
match result {
Ok(value) => {
@@ -45,5 +63,4 @@ fn main() {
println!("{err}");
}
};
- println!("Took {} us", dur.as_micros());
}
diff --git a/crates/rebel-parse/src/ast.rs b/crates/rebel-parse/src/ast.rs
index d923f2a..1f98f15 100644
--- a/crates/rebel-parse/src/ast.rs
+++ b/crates/rebel-parse/src/ast.rs
@@ -138,9 +138,18 @@ pub enum Literal<'a> {
}
impl<'a> Literal<'a> {
- pub(crate) fn integer(s: &'a str, radix: u32) -> Result<Self, &'static str> {
- let s = s.replace('_', "");
- let value = u64::from_str_radix(&s, radix).or(Err("Failed to parse number"))?;
+ pub(crate) fn number(s: &'a str) -> Result<Self, &'static str> {
+ let (radix, rest) = if let Some(rest) = s.strip_prefix("0x") {
+ (16, rest)
+ } else if let Some(rest) = s.strip_prefix("0o") {
+ (8, rest)
+ } else if let Some(rest) = s.strip_prefix("0b") {
+ (2, rest)
+ } else {
+ (10, s)
+ };
+ let digits = rest.replace('_', "");
+ let value = u64::from_str_radix(&digits, radix).or(Err("number"))?;
Ok(Literal::Integer(value))
}
}
diff --git a/crates/rebel-parse/src/grammar/mod.rs b/crates/rebel-parse/src/grammar/mod.rs
index ed180f0..de06991 100644
--- a/crates/rebel-parse/src/grammar/mod.rs
+++ b/crates/rebel-parse/src/grammar/mod.rs
@@ -1,2 +1,3 @@
pub mod recipe;
pub mod task_ref;
+pub mod tokenize;
diff --git a/crates/rebel-parse/src/grammar/recipe.rs b/crates/rebel-parse/src/grammar/recipe.rs
index 5ae6b8b..d1deba4 100644
--- a/crates/rebel-parse/src/grammar/recipe.rs
+++ b/crates/rebel-parse/src/grammar/recipe.rs
@@ -1,156 +1,162 @@
-use crate::ast::*;
+use crate::ast::{self, Expr};
+use crate::token::*;
pub use rules::*;
peg::parser! {
- pub grammar rules() for str {
- use OpBinary::*;
- use OpUnary::*;
+ pub grammar rules<'a>() for [Token<'a>] {
+ use ast::OpBinary::*;
+ use ast::OpUnary::*;
- pub rule recipe() -> Recipe<'input>
- = _ recipe:recipe_stmt()* { recipe }
+ pub rule recipe() -> ast::Recipe<'a>
+ = recipe:recipe_stmt()* { recipe }
- pub rule recipe_stmt() -> RecipeStmt<'input>
- = stmt:body_stmt() {
- RecipeStmt::BodyStmt(stmt)
+ pub rule recipe_stmt() -> ast::RecipeStmt<'a>
+ = keyword_fetch() name:ident() p('{') body:body() p('}') {
+ ast::RecipeStmt::Fetch { name, body }
}
- / "fetch" __ name:ident() _ "{" _ body:body() _ "}" _ {
- RecipeStmt::Fetch { name, body }
+ / keyword_task() name:ident() p('(') args:argtypes() p(')') p('{') body:body() p('}') {
+ ast::RecipeStmt::Task { name, args, body }
}
- / "task" __ name:ident() _ "(" _ args:argtypes() _ ")" _ "{" _ body:body() _ "}" _ {
- RecipeStmt::Task { name, args, body }
+ / stmt:body_stmt() {
+ ast::RecipeStmt::BodyStmt(stmt)
}
- pub rule body() -> Body<'input>
- = recipe:body_stmt()* { recipe }
+ pub rule body() -> ast::Body<'a>
+ = body:body_stmt()* { body }
- pub rule body_stmt() -> BodyStmt<'input>
- = left:typed_expr() _ op:assign_op() _ right:expr() _ ";" _ {
- BodyStmt::assign(left, op, right)
+ pub rule body_stmt() -> ast::BodyStmt<'a>
+ = left:typed_expr() op:assign_op() right:expr() p(';') {
+ ast::BodyStmt::assign(left, op, right)
}
- rule assign_op() -> Option<OpBinary>
- = "+=" { Some(Add) }
- / "-=" { Some(Sub) }
- / "*=" { Some(Mul) }
- / "/=" { Some(Div) }
- / "%=" { Some(Rem) }
- / "=" { None }
+ rule assign_op() -> Option<ast::OpBinary>
+ = p2('+', '=') { Some(Add) }
+ / p2('-', '=') { Some(Sub) }
+ / p2('*', '=') { Some(Mul) }
+ / p2('/', '=') { Some(Div) }
+ / p2('%', '=') { Some(Rem) }
+ / p('=') { None }
- rule typed_expr() -> TypedExpr<'input>
- = expr:expr() typ:tagged(<_ ":" _>, <typ()>)? { TypedExpr { expr, typ } }
+ rule typed_expr() -> ast::TypedExpr<'a>
+ = expr:expr() typ:tagged(<p(':')>, <expr()>)? { ast::TypedExpr { expr, typ } }
-
- rule typ() -> Path<'input>
- = path()
-
- pub rule expr() -> Expr<'input> = precedence! {
- left:(@) _ "||" _ right:@ { Expr::binary(left, Or, right) }
+ pub rule expr() -> Expr<'a> = precedence! {
+ left:(@) p2('|', '|') right:@ { Expr::binary(left, Or, right) }
--
- left:(@) _ "&&" _ right:@ { Expr::binary(left, And, right) }
+ left:(@) p2('&', '&') right:@ { Expr::binary(left, And, right) }
--
- left:(@) _ "==" _ right:@ { Expr::binary(left, Eq, right) }
- left:(@) _ "!=" _ right:@ { Expr::binary(left, Ne, right) }
- left:(@) _ "<" _ right:@ { Expr::binary(left, Lt, right) }
- left:(@) _ ">" _ right:@ { Expr::binary(left, Gt, right) }
- left:(@) _ "<=" _ right:@ { Expr::binary(left, Le, right) }
- left:(@) _ ">=" _ right:@ { Expr::binary(left, Ge, right) }
+ left:(@) p2('=', '=') right:@ { Expr::binary(left, Eq, right) }
+ left:(@) p2('!', '=') right:@ { Expr::binary(left, Ne, right) }
+ left:(@) p('<') right:@ { Expr::binary(left, Lt, right) }
+ left:(@) p('>') right:@ { Expr::binary(left, Gt, right) }
+ left:(@) p2('<', '=') right:@ { Expr::binary(left, Le, right) }
+ left:(@) p2('>', '=') right:@ { Expr::binary(left, Ge, right) }
--
- left:(@) _ "+" _ right:@ { Expr::binary(left, Add, right) }
- left:(@) _ "-" _ right:@ { Expr::binary(left, Sub, right) }
+ left:(@) p('+') right:@ { Expr::binary(left, Add, right) }
+ left:(@) p('-') right:@ { Expr::binary(left, Sub, right) }
--
- left:(@) _ "*" _ right:@ { Expr::binary(left, Mul, right) }
- left:(@) _ "/" _ right:@ { Expr::binary(left, Div, right) }
- left:(@) _ "%" _ right:@ { Expr::binary(left, Rem, right) }
+ left:(@) p('*') right:@ { Expr::binary(left, Mul, right) }
+ left:(@) p('/') right:@ { Expr::binary(left, Div, right) }
+ left:(@) p('%') right:@ { Expr::binary(left, Rem, right) }
--
- "-" _ expr:@ { Expr::unary(Neg, expr) }
- "!" _ expr:@ { Expr::unary(Not, expr) }
+ p('-') expr:@ { Expr::unary(Neg, expr) }
+ p('!') expr:@ { Expr::unary(Not, expr) }
--
- expr:@ _ "(" _ args:args() _ ")" { Expr::apply(expr, args) }
- expr:@ _ "[" _ index:expr() _ "]" { Expr::index(expr, index) }
+ expr:@ p('(') args:args() p(')') { Expr::apply(expr, args) }
+ expr:@ p('[') index:expr() p(']') { Expr::index(expr, index) }
--
- expr:@ _ "." _ field:field() { Expr::field(expr, field) }
+ expr:@ p('.') field:field() { Expr::field(expr, field) }
--
- "(" _ e:expr() _ ")" { Expr::paren(e) }
+ p('(') e:expr() p(')') { Expr::paren(e) }
e:atom() { e }
}
- rule atom() -> Expr<'input>
+ rule atom() -> Expr<'a>
= lit:literal() { Expr::Literal(lit) }
/ path:path() { Expr::Path(path) }
- rule args() -> Vec<Arg<'input>>
- = args:delimited(<arg()>, <_ "," _>) { args }
+ rule args() -> Vec<ast::Arg<'a>>
+ = args:delimited(<arg()>, <p(',')>) { args }
- rule arg() -> Arg<'input>
- = expr:expr() { Arg { expr } }
+ rule arg() -> ast::Arg<'a>
+ = expr:expr() { ast::Arg { expr } }
- rule argtypes() -> Vec<ArgType<'input>>
- = args:delimited(<argtype()>, <_ "," _>) { args }
+ rule argtypes() -> Vec<ast::ArgType<'a>>
+ = args:delimited(<argtype()>, <p(',')>) { args }
- rule argtype() -> ArgType<'input>
- = expr:typed_expr() { ArgType { expr } }
+ rule argtype() -> ast::ArgType<'a>
+ = name:ident() p(':') typ:expr() { ast::ArgType { name, typ } }
- rule literal() -> Literal<'input>
- = "true" { Literal::Boolean(true) }
- / "false" { Literal::Boolean(false) }
- / "0x" s:$((['0'..='9' | 'a'..='f' | 'A'..='F']+) ++ "_") { ?
- Literal::integer(s, 16)
+ rule literal() -> ast::Literal<'a>
+ = keyword_true() { ast::Literal::Boolean(true) }
+ / keyword_false() { ast::Literal::Boolean(false) }
+ / [Token::Literal(Literal { content, kind: LiteralKind::Number })] { ?
+ ast::Literal::number(content)
+ }
+ / [Token::Literal(Literal { content, kind: LiteralKind::String })] {
+ ast::Literal::String(content)
+ }
+ / [Token::Literal(Literal { content, kind: LiteralKind::RawString })] {
+ ast::Literal::RawString(content)
}
- / "0o" s:$((['0'..='7']+) ++ "_") { ?
- Literal::integer(s, 8)
+ / [Token::Literal(Literal { content, kind: LiteralKind::ScriptString })] {
+ ast::Literal::ScriptString(content)
}
- / "0b" s:$((['0'..='1']+) ++ "_") { ?
- Literal::integer(s, 2)
+ / p('(') p(')') { ast::Literal::Unit }
+ / p('(') elements:(expr() ** p(',')) p(',')? p(')') {
+ ast::Literal::Tuple(elements)
}
- / s:$((['0'..='9']+) ++ "_") { ?
- Literal::integer(s, 10)
+ / p('[') elements:delimited(<expr()>, <p(',')>) p(']') {
+ ast::Literal::Array(elements)
}
- / "\"" s:$(string_char()*) "\"" { Literal::String(s) }
- / "r\"" s:$([^'"']*) "\"" { Literal::RawString(s) }
- / "```" newline() s:$((!"```" [_])+) "```" { Literal::ScriptString(s) }
- / "(" _ ")" { Literal::Unit }
- / "(" _ elements:(expr() ** (_ "," _)) (_ ",")? _ ")" { Literal::Tuple(elements) }
- / "[" _ elements:delimited(<expr()>, <_ "," _>) _ "]" { Literal::Array(elements) }
- / "{" _ entries:delimited(<map_entry()>, <_ "," _>) _ "}" { Literal::Map(entries) }
-
- rule map_entry() -> MapEntry<'input>
- = left:typed_expr() _ "=" _ right:expr() {
- MapEntry { left, right }
+ / p('{') entries:delimited(<map_entry()>, <p(',')>) p('}') {
+ ast::Literal::Map(entries)
}
- rule string_char()
- = [^'"' | '\\']
- / "\\" [_]
+ rule map_entry() -> ast::MapEntry<'a>
+ = key:field() p('=') value:expr() {
+ ast::MapEntry { key: key.name, value }
+ }
+
+ rule path() -> ast::Path<'a>
+ = components:ident() ++ p2(':', ':') { ast::Path { components } }
+
+ rule field() -> ast::Ident<'a>
+ = ident()
+ / [Token::Literal(Literal {content, kind: LiteralKind::Number} )] {
+ ast::Ident { name: content }
+ }
- rule path() -> Path<'input>
- = components:ident() ++ (_ "::" _) { Path { components } }
+ rule p_(ch: char)
+ = [Token::Punct(Punct(c, Spacing::Joint)) if c == ch] {}
- rule field() -> Ident<'input>
- = name:$(
- ['a'..='z' | 'A' ..='Z' | '0'..='9' | '_']*
- ) { Ident { name } }
+ rule p(ch: char) -> ()
+ = [Token::Punct(Punct(c, _)) if c == ch] {}
- rule ident() -> Ident<'input>
- = name:$(
- ['a'..='z' | 'A' ..='Z' | '_' ]
- ['a'..='z' | 'A' ..='Z' | '0'..='9' | '_']*
- ) { Ident { name } }
+ rule p2(ch1: char, ch2: char)
+ = p_(ch1) p(ch2)
- /// Mandatory whitespace
- rule __
- = ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+
+ rule ident() -> ast::Ident<'a>
+ = !keyword() [Token::Ident(Ident(name))] { ast::Ident { name } }
- /// Optional whitespace
- rule _
- = quiet!{__?}
+ rule keyword()
+ = keyword_true()
+ / keyword_false()
+ / keyword_fetch()
+ / keyword_task()
- rule comment()
- = "//" (!newline() [_])* (newline() / ![_])
- / "/*" (!"*/" [_])* "*/"
+ rule keyword_true()
+ = const_ident("true")
+ rule keyword_false()
+ = const_ident("false")
+ rule keyword_fetch()
+ = const_ident("fetch")
+ rule keyword_task()
+ = const_ident("task")
- rule newline()
- = ['\n' | '\r']
+ rule const_ident(keyword: &str)
+ = [Token::Ident(Ident(name)) if keyword == name]
rule delimited<T>(expr: rule<T>, delim: rule<()>) -> Vec<T>
= values:(expr() ++ delim()) delim()? { values }
diff --git a/crates/rebel-parse/src/grammar/tokenize.rs b/crates/rebel-parse/src/grammar/tokenize.rs
new file mode 100644
index 0000000..a30e299
--- /dev/null
+++ b/crates/rebel-parse/src/grammar/tokenize.rs
@@ -0,0 +1,64 @@
+use crate::token::*;
+
+pub use rules::*;
+
+peg::parser! {
+ pub grammar rules() for str {
+ pub rule token_stream() -> Vec<Token<'input>>
+ = _ tokens:(token() ** _) _ { tokens }
+
+ pub rule token() -> Token<'input>
+ = literal:literal() { Token::Literal(literal) }
+ / ident:ident() { Token::Ident(ident) }
+ / punct:punct() { Token::Punct(punct) }
+
+ rule ident() -> Ident<'input>
+ = name:$(
+ ['a'..='z' | 'A' ..='Z' | '_' ]
+ ['a'..='z' | 'A' ..='Z' | '_' | '0'..='9']*
+ ) { Ident(name) }
+
+ rule punct() -> Punct
+ = ch:punct_char() spacing:spacing() { Punct(ch, spacing) }
+
+ rule punct_char() -> char
+ = !literal() !ident() !__ ch:[_] { ch }
+
+ rule spacing() -> Spacing
+ = &punct_char() { Spacing::Joint }
+ / { Spacing::Alone }
+
+ rule literal() -> Literal<'input>
+ = content:$(['0'..='9'] ['0'..='9' | 'a'..='z' | 'A'..='Z' | '_']*) {
+ Literal { content, kind: LiteralKind::Number }
+ }
+ / "\"" content:$(string_char()*) "\"" {
+ Literal { content, kind: LiteralKind::String }
+ }
+ / "r\"" content:$([^'"']*) "\"" {
+ Literal { content, kind: LiteralKind::RawString }
+ }
+ / "```" newline() content:$((!"```" [_])+) "```" {
+ Literal { content, kind: LiteralKind::ScriptString }
+ }
+
+ rule string_char()
+ = [^'"' | '\\']
+ / "\\" [_]
+
+ /// Mandatory whitespace
+ rule __
+ = ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+
+
+ /// Optional whitespace
+ rule _
+ = quiet!{__?}
+
+ rule comment()
+ = "//" (!newline() [_])* (newline() / ![_])
+ / "/*" (!"*/" [_])* "*/"
+
+ rule newline()
+ = ['\n' | '\r']
+ }
+}
diff --git a/crates/rebel-parse/src/lib.rs b/crates/rebel-parse/src/lib.rs
index 8019d00..4a8c431 100644
--- a/crates/rebel-parse/src/lib.rs
+++ b/crates/rebel-parse/src/lib.rs
@@ -1,5 +1,8 @@
pub mod ast;
+pub mod token;
+
mod grammar;
pub use grammar::recipe;
pub use grammar::task_ref;
+pub use grammar::tokenize;
diff --git a/crates/rebel-parse/src/token.rs b/crates/rebel-parse/src/token.rs
new file mode 100644
index 0000000..3147899
--- /dev/null
+++ b/crates/rebel-parse/src/token.rs
@@ -0,0 +1,32 @@
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Token<'a> {
+ Ident(Ident<'a>),
+ Punct(Punct),
+ Literal(Literal<'a>),
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Ident<'a>(pub &'a str);
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Punct(pub char, pub Spacing);
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Spacing {
+ Alone,
+ Joint,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Literal<'a> {
+ pub content: &'a str,
+ pub kind: LiteralKind,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LiteralKind {
+ Number,
+ String,
+ RawString,
+ ScriptString,
+}