From 767842ab64f601b324b02a0f7f55ebc0b849f251 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 21 Apr 2024 13:57:45 +0200
Subject: rebel-parse: separate tokenization step

---
 crates/rebel-parse/examples/parse-string.rs |  37 +++--
 crates/rebel-parse/src/ast.rs               |  15 +-
 crates/rebel-parse/src/grammar/mod.rs       |   1 +
 crates/rebel-parse/src/grammar/recipe.rs    | 220 ++++++++++++++--------------
 crates/rebel-parse/src/grammar/tokenize.rs  |  64 ++++++++
 crates/rebel-parse/src/lib.rs               |   3 +
 crates/rebel-parse/src/token.rs             |  32 ++++
 7 files changed, 252 insertions(+), 120 deletions(-)
 create mode 100644 crates/rebel-parse/src/grammar/tokenize.rs
 create mode 100644 crates/rebel-parse/src/token.rs

diff --git a/crates/rebel-parse/examples/parse-string.rs b/crates/rebel-parse/examples/parse-string.rs
index ba6a378..47e30a5 100644
--- a/crates/rebel-parse/examples/parse-string.rs
+++ b/crates/rebel-parse/examples/parse-string.rs
@@ -1,11 +1,12 @@
-use std::{fmt::Debug, time::Instant};
+use std::{fmt::Debug, process, time::Instant};
 
 use clap::{Parser, ValueEnum};
 
-use rebel_parse::recipe;
+use rebel_parse::{recipe, tokenize};
 
-#[derive(Clone, Debug, ValueEnum)]
+#[derive(Clone, Debug, PartialEq, Eq, ValueEnum)]
 enum Rule {
+	Tokenize,
 	Recipe,
 	RecipeStmt,
 	Body,
@@ -27,15 +28,32 @@ fn main() {
 		Box::new(v)
 	}
 
+	let start = Instant::now();
+	let result = tokenize::token_stream(input);
+	let dur = Instant::now().duration_since(start);
+	println!("Tokenization took {} us", dur.as_micros());
+
+	let tokens = match result {
+		Ok(value) => value,
+		Err(err) => {
+			println!("{err}");
+			process::exit(1);
+		}
+	};
+
 	let start = Instant::now();
 	let result = match opts.rule {
-		Rule::Recipe => recipe::recipe(input).map(as_debug),
-		Rule::RecipeStmt => recipe::recipe_stmt(input).map(as_debug),
-		Rule::Body => recipe::body(input).map(as_debug),
-		Rule::BodyStmt => recipe::body_stmt(input).map(as_debug),
-		Rule::Expr => recipe::expr(input).map(as_debug),
+		Rule::Tokenize => Ok(as_debug(tokens)),
+		Rule::Recipe => recipe::recipe(&tokens).map(as_debug),
+		Rule::RecipeStmt => recipe::recipe_stmt(&tokens).map(as_debug),
+		Rule::Body => recipe::body(&tokens).map(as_debug),
+		Rule::BodyStmt => recipe::body_stmt(&tokens).map(as_debug),
+		Rule::Expr => recipe::expr(&tokens).map(as_debug),
 	};
-	let dur = Instant::now().duration_since(start);
+	if opts.rule != Rule::Tokenize {
+		let dur = Instant::now().duration_since(start);
+		println!("Parsing took {} us", dur.as_micros());
+	}
 
 	match result {
 		Ok(value) => {
@@ -45,5 +63,4 @@ fn main() {
 			println!("{err}");
 		}
 	};
-	println!("Took {} us", dur.as_micros());
 }
diff --git a/crates/rebel-parse/src/ast.rs b/crates/rebel-parse/src/ast.rs
index d923f2a..1f98f15 100644
--- a/crates/rebel-parse/src/ast.rs
+++ b/crates/rebel-parse/src/ast.rs
@@ -138,9 +138,18 @@ pub enum Literal<'a> {
 }
 
 impl<'a> Literal<'a> {
-	pub(crate) fn integer(s: &'a str, radix: u32) -> Result<Self, &'static str> {
-		let s = s.replace('_', "");
-		let value = u64::from_str_radix(&s, radix).or(Err("Failed to parse number"))?;
+	/// Parses an integer literal token: an optional `0x`/`0o`/`0b` radix prefix
+	/// followed by digits, where `_` separators are ignored.
+	pub(crate) fn number(s: &'a str) -> Result<Self, &'static str> {
+		const PREFIXES: [(&str, u32); 3] = [("0x", 16), ("0o", 8), ("0b", 2)];
+		// No recognised prefix means the whole string is read as decimal.
+		let (radix, rest) = PREFIXES
+			.iter()
+			.find_map(|&(prefix, radix)| Some((radix, s.strip_prefix(prefix)?)))
+			.unwrap_or((10, s));
+		let digits: String = rest.chars().filter(|&c| c != '_').collect();
+		// Any conversion failure is reported with the fixed message "number".
+		let value = u64::from_str_radix(&digits, radix).map_err(|_| "number")?;
 		Ok(Literal::Integer(value))
 	}
 }
diff --git a/crates/rebel-parse/src/grammar/mod.rs b/crates/rebel-parse/src/grammar/mod.rs
index ed180f0..de06991 100644
--- a/crates/rebel-parse/src/grammar/mod.rs
+++ b/crates/rebel-parse/src/grammar/mod.rs
@@ -1,2 +1,3 @@
 pub mod recipe;
 pub mod task_ref;
+pub mod tokenize;
diff --git a/crates/rebel-parse/src/grammar/recipe.rs b/crates/rebel-parse/src/grammar/recipe.rs
index 5ae6b8b..d1deba4 100644
--- a/crates/rebel-parse/src/grammar/recipe.rs
+++ b/crates/rebel-parse/src/grammar/recipe.rs
@@ -1,156 +1,162 @@
-use crate::ast::*;
+use crate::ast::{self, Expr};
+use crate::token::*;
 
 pub use rules::*;
 
 peg::parser! {
-	pub grammar rules() for str {
-		use OpBinary::*;
-		use OpUnary::*;
+	pub grammar rules<'a>() for [Token<'a>] {
+		use ast::OpBinary::*;
+		use ast::OpUnary::*;
 
-		pub rule recipe() -> Recipe<'input>
-			= _ recipe:recipe_stmt()* { recipe }
+		pub rule recipe() -> ast::Recipe<'a> // entry point: zero or more recipe statements
+			= recipe:recipe_stmt()* { recipe }
 
-		pub rule recipe_stmt() -> RecipeStmt<'input>
-			= stmt:body_stmt() {
-				RecipeStmt::BodyStmt(stmt)
+		pub rule recipe_stmt() -> ast::RecipeStmt<'a> // keyword forms are tried before the generic body statement
+			= keyword_fetch() name:ident() p('{') body:body() p('}') {
+				ast::RecipeStmt::Fetch { name, body }
 			}
-			/ "fetch" __ name:ident() _ "{" _ body:body() _ "}" _ {
-				RecipeStmt::Fetch { name, body }
+			/ keyword_task() name:ident() p('(') args:argtypes() p(')') p('{') body:body() p('}') {
+				ast::RecipeStmt::Task { name, args, body }
 			}
-			/ "task" __ name:ident() _ "(" _ args:argtypes() _ ")" _ "{" _ body:body() _ "}" _ {
-				RecipeStmt::Task { name, args, body }
+			/ stmt:body_stmt() {
+				ast::RecipeStmt::BodyStmt(stmt)
 			}
 
-		pub rule body() -> Body<'input>
-			= recipe:body_stmt()* { recipe }
+		pub rule body() -> ast::Body<'a> // zero or more body statements
+			= body:body_stmt()* { body }
 
-		pub rule body_stmt() -> BodyStmt<'input>
-			= left:typed_expr() _ op:assign_op() _ right:expr() _ ";" _ {
-				BodyStmt::assign(left, op, right)
+		pub rule body_stmt() -> ast::BodyStmt<'a> // `<typed expr> <assign op> <expr> ;`
+			= left:typed_expr() op:assign_op() right:expr() p(';') {
+				ast::BodyStmt::assign(left, op, right)
 			}
 
-		rule assign_op() -> Option<OpBinary>
-			= "+=" { Some(Add) }
-			/ "-=" { Some(Sub) }
-			/ "*=" { Some(Mul) }
-			/ "/=" { Some(Div) }
-			/ "%=" { Some(Rem) }
-			/ "="  { None }
+		rule assign_op() -> Option<ast::OpBinary> // compound assignments yield Some(op); plain `=` yields None
+			= p2('+', '=') { Some(Add) }
+			/ p2('-', '=') { Some(Sub) }
+			/ p2('*', '=') { Some(Mul) }
+			/ p2('/', '=') { Some(Div) }
+			/ p2('%', '=') { Some(Rem) }
+			/ p('=')  { None }
 
-		rule typed_expr() -> TypedExpr<'input>
-			= expr:expr() typ:tagged(<_ ":" _>, <typ()>)? { TypedExpr { expr, typ } }
+		rule typed_expr() -> ast::TypedExpr<'a> // expression with an optional `:`-tagged annotation (tagged() is defined outside this hunk)
+			= expr:expr() typ:tagged(<p(':')>, <expr()>)? { ast::TypedExpr { expr, typ } }
 
-
-		rule typ() -> Path<'input>
-			= path()
-
-		pub rule expr() -> Expr<'input> = precedence! {
-			left:(@) _ "||" _ right:@ { Expr::binary(left, Or, right) }
+		pub rule expr() -> Expr<'a> = precedence! { // levels are separated by `--`, loosest binding first
+			left:(@) p2('|', '|') right:@ { Expr::binary(left, Or, right) }
 			--
-			left:(@) _ "&&" _ right:@ { Expr::binary(left, And, right) }
+			left:(@) p2('&', '&') right:@ { Expr::binary(left, And, right) }
 			--
-			left:(@) _ "==" _ right:@ { Expr::binary(left, Eq, right) }
-			left:(@) _ "!=" _ right:@ { Expr::binary(left, Ne, right) }
-			left:(@) _ "<"  _ right:@ { Expr::binary(left, Lt, right) }
-			left:(@) _ ">"  _ right:@ { Expr::binary(left, Gt, right) }
-			left:(@) _ "<=" _ right:@ { Expr::binary(left, Le, right) }
-			left:(@) _ ">=" _ right:@ { Expr::binary(left, Ge, right) }
+			left:(@) p2('=', '=') right:@ { Expr::binary(left, Eq, right) } // comparison operators share one level
+			left:(@) p2('!', '=') right:@ { Expr::binary(left, Ne, right) }
+			left:(@) p('<')  right:@ { Expr::binary(left, Lt, right) }
+			left:(@) p('>')  right:@ { Expr::binary(left, Gt, right) }
+			left:(@) p2('<', '=') right:@ { Expr::binary(left, Le, right) }
+			left:(@) p2('>', '=') right:@ { Expr::binary(left, Ge, right) }
 			--
-			left:(@) _ "+"  _ right:@ { Expr::binary(left, Add, right) }
-			left:(@) _ "-"  _ right:@ { Expr::binary(left, Sub, right) }
+			left:(@) p('+')  right:@ { Expr::binary(left, Add, right) }
+			left:(@) p('-')  right:@ { Expr::binary(left, Sub, right) }
 			--
-			left:(@) _ "*"  _ right:@ { Expr::binary(left, Mul, right) }
-			left:(@) _ "/"  _ right:@ { Expr::binary(left, Div, right) }
-			left:(@) _ "%"  _ right:@ { Expr::binary(left, Rem, right) }
+			left:(@) p('*')  right:@ { Expr::binary(left, Mul, right) }
+			left:(@) p('/')  right:@ { Expr::binary(left, Div, right) }
+			left:(@) p('%')  right:@ { Expr::binary(left, Rem, right) }
 			--
-			"-" _ expr:@ { Expr::unary(Neg, expr) }
-			"!" _ expr:@ { Expr::unary(Not, expr) }
+			p('-') expr:@ { Expr::unary(Neg, expr) } // prefix operators
+			p('!') expr:@ { Expr::unary(Not, expr) }
 			--
-			expr:@ _ "(" _ args:args() _ ")" { Expr::apply(expr, args) }
-			expr:@ _ "[" _ index:expr() _ "]" { Expr::index(expr, index) }
+			expr:@ p('(') args:args() p(')') { Expr::apply(expr, args) } // postfix call and index
+			expr:@ p('[') index:expr() p(']') { Expr::index(expr, index) }
 			--
-			expr:@ _ "." _ field:field() { Expr::field(expr, field) }
+			expr:@ p('.') field:field() { Expr::field(expr, field) }
 			--
-			"(" _ e:expr() _ ")" { Expr::paren(e) }
+			p('(') e:expr() p(')') { Expr::paren(e) } // tried before atom(), so `(x)` is a parenthesised expression
 			e:atom() { e }
 		}
 
-		rule atom() -> Expr<'input>
+		rule atom() -> Expr<'a> // a literal, or failing that a path
 			= lit:literal() { Expr::Literal(lit) }
 			/ path:path() { Expr::Path(path) }
 
-		rule args() -> Vec<Arg<'input>>
-			= args:delimited(<arg()>, <_ "," _>) { args }
+		rule args() -> Vec<ast::Arg<'a>> // one or more `arg`s separated by `,`, optional trailing `,`
+			= args:delimited(<arg()>, <p(',')>) { args }
 
-		rule arg() -> Arg<'input>
-			= expr:expr() { Arg { expr } }
+		rule arg() -> ast::Arg<'a> // a single call argument: any expression
+			= expr:expr() { ast::Arg { expr } }
 
-		rule argtypes() -> Vec<ArgType<'input>>
-			= args:delimited(<argtype()>, <_ "," _>) { args }
+		rule argtypes() -> Vec<ast::ArgType<'a>> // one or more `argtype`s separated by `,`, optional trailing `,`
+			= args:delimited(<argtype()>, <p(',')>) { args }
 
-		rule argtype() -> ArgType<'input>
-			= expr:typed_expr() { ArgType { expr } }
+		rule argtype() -> ast::ArgType<'a> // `<name> : <expr>`; the expression after `:` is stored as `typ`
+			= name:ident() p(':') typ:expr() { ast::ArgType { name, typ } }
 
-		rule literal() -> Literal<'input>
-			= "true" { Literal::Boolean(true) }
-			/ "false" { Literal::Boolean(false) }
-			/ "0x" s:$((['0'..='9' | 'a'..='f' | 'A'..='F']+) ++ "_") { ?
-				Literal::integer(s, 16)
+		rule literal() -> ast::Literal<'a> // booleans, literal tokens, and the unit/tuple/array/map forms
+			= keyword_true() { ast::Literal::Boolean(true) }
+			/ keyword_false() { ast::Literal::Boolean(false) }
+			/ [Token::Literal(Literal { content, kind: LiteralKind::Number })] { ? // `{ ?` block: an Err from number() makes this alternative fail
+				ast::Literal::number(content)
+			}
+			/ [Token::Literal(Literal { content, kind: LiteralKind::String })] {
+				ast::Literal::String(content)
+			}
+			/ [Token::Literal(Literal { content, kind: LiteralKind::RawString })] {
+				ast::Literal::RawString(content)
 			}
-			/ "0o" s:$((['0'..='7']+) ++ "_") { ?
-				Literal::integer(s, 8)
+			/ [Token::Literal(Literal { content, kind: LiteralKind::ScriptString })] {
+				ast::Literal::ScriptString(content)
 			}
-			/ "0b" s:$((['0'..='1']+) ++ "_") { ?
-				Literal::integer(s, 2)
+			/ p('(') p(')') { ast::Literal::Unit }
+			/ p('(') elements:(expr() ** p(',')) p(',')? p(')') { // tuple: `,`-separated expressions, optional trailing `,`
+				ast::Literal::Tuple(elements)
 			}
-			/ s:$((['0'..='9']+) ++ "_") { ?
-				Literal::integer(s, 10)
+			/ p('[') elements:delimited(<expr()>, <p(',')>) p(']') {
+				ast::Literal::Array(elements)
 			}
-			/ "\"" s:$(string_char()*) "\"" { Literal::String(s) }
-			/ "r\"" s:$([^'"']*) "\"" { Literal::RawString(s) }
-			/ "```" newline() s:$((!"```" [_])+) "```" { Literal::ScriptString(s) }
-			/ "(" _ ")" { Literal::Unit }
-			/ "(" _ elements:(expr() ** (_ "," _)) (_ ",")? _ ")" { Literal::Tuple(elements) }
-			/ "[" _ elements:delimited(<expr()>, <_ "," _>) _ "]" { Literal::Array(elements) }
-			/ "{" _ entries:delimited(<map_entry()>, <_ "," _>) _ "}" { Literal::Map(entries) }
-
-		rule map_entry() -> MapEntry<'input>
-			= left:typed_expr() _ "=" _ right:expr() {
-				MapEntry { left, right }
+			/ p('{') entries:delimited(<map_entry()>, <p(',')>) p('}') {
+				ast::Literal::Map(entries)
 			}
 
-		rule string_char()
-			= [^'"' | '\\']
-			/ "\\" [_]
+		rule map_entry() -> ast::MapEntry<'a> // `<field> = <expr>`; only the field's name string is kept as the key
+			= key:field() p('=') value:expr() {
+				ast::MapEntry { key: key.name, value }
+			}
+
+		rule path() -> ast::Path<'a> // one or more identifiers separated by `::`
+			= components:ident() ++ p2(':', ':') { ast::Path { components } }
+
+		rule field() -> ast::Ident<'a> // a field name: an identifier, or a number token taken verbatim
+			= ident()
+			/ [Token::Literal(Literal {content, kind: LiteralKind::Number} )] { // presumably for positional access such as `.0` - confirm
+				ast::Ident { name: content }
+			}
 
-		rule path() -> Path<'input>
-			= components:ident() ++ (_ "::" _) { Path { components } }
+		rule p_(ch: char) // punctuation `ch` that the tokenizer marked Joint (directly followed by another punctuation char)
+			= [Token::Punct(Punct(c, Spacing::Joint)) if c == ch] {}
 
-		rule field() -> Ident<'input>
-			= name:$(
-				['a'..='z' | 'A' ..='Z' | '0'..='9' | '_']*
-			) { Ident { name } }
+		rule p(ch: char) -> () // punctuation `ch` with any spacing
+			= [Token::Punct(Punct(c, _)) if c == ch] {}
 
-		rule ident() -> Ident<'input>
-			= name:$(
-				['a'..='z' | 'A' ..='Z' | '_' ]
-				['a'..='z' | 'A' ..='Z' | '0'..='9' | '_']*
-			) { Ident { name } }
+		rule p2(ch1: char, ch2: char) // two-character operator: `ch1` must be Joint, then `ch2` follows as the next token
+			= p_(ch1) p(ch2)
 
-		/// Mandatory whitespace
-		rule __
-			= ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+
+		rule ident() -> ast::Ident<'a> // any identifier token that is not one of the keywords
+			= !keyword() [Token::Ident(Ident(name))] { ast::Ident { name } }
 
-		/// Optional whitespace
-		rule _
-			= quiet!{__?}
+		rule keyword() // matches any reserved word; used by ident() as a negative lookahead
+			= keyword_true()
+			/ keyword_false()
+			/ keyword_fetch()
+			/ keyword_task()
 
-		rule comment()
-			= "//" (!newline() [_])* (newline() / ![_])
-			/ "/*" (!"*/" [_])* "*/"
+		rule keyword_true() // each keyword_* rule matches an identifier token with exactly that text
+			= const_ident("true")
+		rule keyword_false()
+			= const_ident("false")
+		rule keyword_fetch()
+			= const_ident("fetch")
+		rule keyword_task()
+			= const_ident("task")
 
-		rule newline()
-			= ['\n' | '\r']
+		rule const_ident(keyword: &str) // an identifier token whose text equals `keyword`
+			= [Token::Ident(Ident(name)) if keyword == name]
 
 		rule delimited<T>(expr: rule<T>, delim: rule<()>) -> Vec<T>
 			= values:(expr() ++ delim()) delim()? { values }
diff --git a/crates/rebel-parse/src/grammar/tokenize.rs b/crates/rebel-parse/src/grammar/tokenize.rs
new file mode 100644
index 0000000..a30e299
--- /dev/null
+++ b/crates/rebel-parse/src/grammar/tokenize.rs
@@ -0,0 +1,64 @@
+use crate::token::*;
+
+pub use rules::*;
+
+peg::parser! {
+	pub grammar rules() for str { // tokenizer: splits source text into a flat list of tokens
+		pub rule token_stream() -> Vec<Token<'input>> // whole input; whitespace/comments may surround and separate tokens
+			= _ tokens:(token() ** _) _ { tokens }
+
+		pub rule token() -> Token<'input> // literal is tried first, so `r"` starts a raw string rather than identifier `r`
+			= literal:literal() { Token::Literal(literal) }
+			/ ident:ident() { Token::Ident(ident) }
+			/ punct:punct() { Token::Punct(punct) }
+
+		rule ident() -> Ident<'input> // ASCII letter or `_`, then ASCII letters, digits or `_`
+			= name:$(
+				['a'..='z' | 'A' ..='Z' | '_' ]
+				['a'..='z' | 'A' ..='Z' | '_' | '0'..='9']*
+			) { Ident(name) }
+
+		rule punct() -> Punct // one punctuation character plus whether another follows immediately
+			= ch:punct_char() spacing:spacing() { Punct(ch, spacing) }
+
+		rule punct_char() -> char // any single char that does not start a literal, identifier, whitespace or comment
+			= !literal() !ident() !__ ch:[_] { ch }
+
+		rule spacing() -> Spacing // lookahead only: Joint if the next char is punctuation, otherwise Alone
+			= &punct_char() { Spacing::Joint }
+			/ { Spacing::Alone }
+
+		rule literal() -> Literal<'input> // for the string kinds, `content` excludes the delimiters
+			= content:$(['0'..='9'] ['0'..='9' | 'a'..='z' | 'A'..='Z' | '_']*) { // digit-led run; not validated as a number here
+				Literal { content, kind: LiteralKind::Number }
+			}
+			/ "\"" content:$(string_char()*) "\"" {
+				Literal { content, kind: LiteralKind::String }
+			}
+			/ "r\"" content:$([^'"']*) "\"" {
+				Literal { content, kind: LiteralKind::RawString }
+			}
+			/ "```" newline() content:$((!"```" [_])+) "```" {
+				Literal { content, kind: LiteralKind::ScriptString }
+			}
+
+		rule string_char() // any char except `"` and `\`, or a backslash followed by any one char
+			= [^'"' | '\\']
+			/ "\\" [_]
+
+		/// Mandatory whitespace
+		rule __
+			= ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+
+
+		/// Optional whitespace
+		rule _
+			= quiet!{__?}
+
+		rule comment() // `//` up to a newline or end of input, or a `/* ... */` block
+			= "//" (!newline() [_])* (newline() / ![_])
+			/ "/*" (!"*/" [_])* "*/"
+
+		rule newline()
+			= ['\n' | '\r']
+	}
+}
diff --git a/crates/rebel-parse/src/lib.rs b/crates/rebel-parse/src/lib.rs
index 8019d00..4a8c431 100644
--- a/crates/rebel-parse/src/lib.rs
+++ b/crates/rebel-parse/src/lib.rs
@@ -1,5 +1,8 @@
 pub mod ast;
+pub mod token;
+
 mod grammar;
 
 pub use grammar::recipe;
 pub use grammar::task_ref;
+pub use grammar::tokenize;
diff --git a/crates/rebel-parse/src/token.rs b/crates/rebel-parse/src/token.rs
new file mode 100644
index 0000000..3147899
--- /dev/null
+++ b/crates/rebel-parse/src/token.rs
@@ -0,0 +1,32 @@
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Token<'a> { // one lexical token; `'a` is the lifetime of the borrowed token text
+	Ident(Ident<'a>), // identifier; keywords are also delivered as Ident
+	Punct(Punct), // a single punctuation character
+	Literal(Literal<'a>), // number or string literal
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Ident<'a>(pub &'a str); // identifier token wrapping its text
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Punct(pub char, pub Spacing); // punctuation character and its spacing relative to the next character
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Spacing { // whether a punctuation character is directly followed by another one
+	Alone, // next character is not punctuation (or there is none)
+	Joint, // next character is punctuation too, so the two can form a multi-char operator
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Literal<'a> { // a literal token: its text plus which kind of literal it is
+	pub content: &'a str, // literal text; how it is to be interpreted depends on `kind`
+	pub kind: LiteralKind,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum LiteralKind { // discriminates how a Literal's content is to be interpreted
+	Number,
+	String,
+	RawString,
+	ScriptString,
+}
-- 
cgit v1.2.3