diff options
Diffstat (limited to 'crates/rebel-parse/src/grammar/tokenize.rs')
-rw-r--r-- | crates/rebel-parse/src/grammar/tokenize.rs | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/crates/rebel-parse/src/grammar/tokenize.rs b/crates/rebel-parse/src/grammar/tokenize.rs new file mode 100644 index 0000000..ed1db39 --- /dev/null +++ b/crates/rebel-parse/src/grammar/tokenize.rs @@ -0,0 +1,136 @@ +use crate::token::*; + +pub use rules::*; + +static KEYWORDS: phf::Map<&'static str, Keyword> = phf::phf_map! { + "else" => Keyword::Else, + "false" => Keyword::False, + "fetch" => Keyword::Fetch, + "fn" => Keyword::Fn, + "for" => Keyword::For, + "if" => Keyword::If, + "let" => Keyword::Let, + "map" => Keyword::Map, + "mut" => Keyword::Mut, + "recipe" => Keyword::Recipe, + "set" => Keyword::Set, + "task" => Keyword::Task, + "true" => Keyword::True, +}; + +peg::parser! { + pub grammar rules() for str { + pub rule token_stream() -> TokenStream<'input> + = _ tokens:(token() ** _) _ { TokenStream(tokens) } + + pub rule token() -> Token<'input> + = number:number() { Token::Number(number) } + / string:string() { Token::Str(string) } + / token:ident_or_keyword() { token } + / punct:punct() { Token::Punct(punct) } + + rule ident_or_keyword() -> Token<'input> + = s:$( + ['a'..='z' | 'A' ..='Z' | '_' ] + ['a'..='z' | 'A' ..='Z' | '_' | '0'..='9']* + ) { + if let Some(kw) = KEYWORDS.get(s) { + Token::Keyword(*kw) + } else { + Token::Ident(s) + } + } + + rule punct() -> Punct + = ch:punct_char() spacing:spacing() { Punct(ch, spacing) } + + rule punct_char() -> char + = !comment_start() ch:[ + | '~' | '!' | '@' | '#' | '$' | '%' | '^' | '&' + | '*' | '-' | '=' | '+' | '|' | ';' | ':' | ',' + | '<' | '.' | '>' | '/' | '\'' | '?' | '(' | ')' + | '[' | ']' | '{' | '}' + ] { ch } + + rule spacing() -> Spacing + = &punct_char() { Spacing::Joint } + / { Spacing::Alone } + + rule number() -> &'input str + = $(['0'..='9'] ['0'..='9' | 'a'..='z' | 'A'..='Z' | '_']*) + + rule string() -> Str<'input> + = "\"" pieces:string_piece()* "\"" { + Str { + pieces, + kind: StrKind::Regular, + } + } + / "r\"" chars:$([^'"']*) "\"" { + Str { + pieces: vec![StrPiece::Chars(chars)], + kind: StrKind::Raw, + } + } + / "```" newline() pieces:script_string_piece()* "```" { + Str { + pieces, + kind: StrKind::Script, + } + } + + rule string_piece() -> StrPiece<'input> + = chars:$((!"{{" [^'"' | '\\'])+) { StrPiece::Chars(chars) } + / "\\" escape:string_escape() { StrPiece::Escape(escape) } + / string_interp() + + rule string_escape() -> char + = "n" { '\n' } + / "r" { '\r' } + / "t" { '\t' } + / "\\" { '\\' } + / "\"" { '"' } + / "{" { '{' } + / "0" { '\0' } + / "x" digits:$(['0'..='7'] hex_digit()) { + u8::from_str_radix(digits, 16).unwrap().into() + } + / "u{" digits:$(hex_digit()*<1,6>) "}" { ? + u32::from_str_radix(digits, 16).unwrap().try_into().or(Err("Invalid unicode escape")) + } + + rule script_string_piece() -> StrPiece<'input> + = chars:$((!"{{" !"```" [_])+) { StrPiece::Chars(chars) } + / string_interp() + + rule string_interp() -> StrPiece<'input> + = "{{" _ tokens:(subtoken() ++ _) _ "}}" { + StrPiece::Interp(TokenStream(tokens)) + } + + rule subtoken() -> Token<'input> + = !"}}" token:token() { token } + + rule hex_digit() + = ['0'..='9' | 'a'..='f' | 'A'..='F'] + + /// Mandatory whitespace + rule __ + = ([' ' | '\t'] / quiet!{newline()} / quiet!{comment()})+ + + /// Optional whitespace + rule _ + = quiet!{__?} + + rule comment_start() + = "//" + / "/*" + + rule comment() + = "//" (!newline() [_])* (newline() / ![_]) + / "/*" (!"*/" [_])* "*/" + + rule newline() + = ['\n' | '\r'] + } +} |