From 09972613c44cf9968afc17d7294aa5e77fc29b9c Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 25 Mar 2015 23:52:08 +0100 Subject: Some initial work --- lex.cpp | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 lex.cpp (limited to 'lex.cpp') diff --git a/lex.cpp b/lex.cpp new file mode 100644 index 0000000..59d17f2 --- /dev/null +++ b/lex.cpp @@ -0,0 +1,372 @@ +/* + Copyright (c) 2013-2014, Matthias Schiffer + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include "lex.hpp" + +#include + + +#define array_size(array) (sizeof(array)/sizeof((array)[0])) + + +namespace solar { + + +struct keyword_t { + const char *keyword; + int token; +}; + +/* the keyword list must be sorted */ +static const keyword_t keywords[] = { +}; + +static int compare_keywords(const void *v1, const void *v2) { + const keyword_t *k1 = static_cast(v1), *k2 = static_cast(v2); + return std::strcmp(k1->keyword, k2->keyword); +} + + +bool lex_t::advance() { + if (start > 0) { + std::memmove(buffer, buffer+start, end - start); + end -= start; + start = 0; + } + + if (end == sizeof(buffer)) + return false; + + size_t l = std::fread(buffer+end, 1, sizeof(buffer) - end, file); + + end += l; + return l; +} + +bool lex_t::next(bool move) { + if (start + tok_len >= end) + return false; + + if (current() == '\n') { + loc.last_column = 0; + loc.last_line++; + } + else { + loc.last_column++; + } + + if (move) + start++; + else + tok_len++; + + + if (start + tok_len >= end) + return advance(); + + return true; +} + +void lex_t::consume(bool consume_needspace) { + start += tok_len; + tok_len = 0; + + needspace = consume_needspace; +} + +int lex_t::io_error(value_t *value) { + value->error = "I/O error"; + return -1; +} + +int lex_t::syntax_error(value_t *value) { + if (std::ferror(file)) + return io_error(value); + + value->error = "syntax error"; + return -1; +} + +int lex_t::consume_comment(value_t *value) { + char prev = 0; + + while (next(true)) { + if (prev == '*' && current() == '/') { + next(true); + consume(false); + return 0; + } + + prev = current(); + } + + if (std::ferror(file)) + return io_error(value); + + value->error = "unterminated block comment"; + return -1; +} + +int lex_t::unterminated_string(value_t *value) { + if (ferror(file)) + return io_error(value); + + value->error = "unterminated string"; + return -1; +} + +int lex_t::lex_string(value_t *value) { + char *buf = NULL; + size_t len = 1024; + size_t pos = 0; + + if (needspace) + return syntax_error(value); + + buf = static_cast(std::malloc(len)); + + while (true) { + if (!next(true)) { + std::free(buf); + return unterminated_string(value); + } + + char cur = current(); + + if (cur == '"') + break; + + if (cur == '\\') { + if (!next(true)) { + free(buf); + return unterminated_string(value); + } + + cur = current(); + + if (cur == '\n') + continue; + } + + if (pos >= len) { + len *= 2; + buf = static_cast(std::realloc(buf, len)); + } + + buf[pos++] = cur; + } + + value->str = strndup(buf, pos); + std::free(buf); + + next(true); + consume(true); + + return TOK_STRING; +} + +int lex_t::lex_number(value_t *value) { + if (needspace) + return syntax_error(value); + + while (next(false)) { + char cur = current(); + + if (cur >= '0' && cur <= '9') + continue; + + if (cur == 'x' || (cur >= 'a' && cur <= 'f') || (cur >= 'A' && cur <= 'F')) + continue; + + break; + } + + char *endptr, *token = get_token(); + value->number = std::strtoull(token, &endptr, 0); + + bool ok = !*endptr; + free(token); + + if (!ok) + return syntax_error(value); + + consume(true); + + return TOK_UINT; +} + +int lex_t::lex_keyword(value_t *value) { + if (needspace) + return syntax_error(value); + + while (next(false)) { + char cur = current(); + + if (!((cur >= 'a' && cur <= 'z') || (cur >= '0' && cur <= '9') || cur == '-')) + break; + } + + char *token = get_token(); + const keyword_t key = { .keyword = token }; + const keyword_t *ret = static_cast(bsearch(&key, keywords, array_size(keywords), sizeof(keyword_t), compare_keywords)); + free(token); + + if (!ret) + return syntax_error(value); + + consume(true); + + return ret->token; +} + +int lex_t::lex_symbol(value_t *value, bool terminal) { + if (needspace) + return syntax_error(value); + + while (next(false)) { + char cur = current(); + + switch (cur) { + case 'A' ... 'Z': + if (!terminal) + break; + + continue; + + case 'a' ... 'z': + if (terminal) + break; + + continue; + + case '0' ... '9': + case '_': + continue; + } + + break; + } + + value->str = get_token(); + return terminal ? TOK_TERM : TOK_NONTERM; +} + +lex_t::lex_t(FILE *file0) : file(file0), needspace(false), start(0), end(0), tok_len(0) { + advance(); +} + +int lex_t::lex(value_t *value) { + int token; + + while (end > start) { + loc.first_line = loc.last_line; + loc.first_column = loc.last_column+1; + + switch (current()) { + case ' ': + case '\n': + case '\t': + case '\r': + next(true); + consume(false); + continue; + + case ';': + case ':': + case '{': + case '}': + case '|': + case '=': + token = current(); + next(true); + consume(false); + return token; + + case '/': + if (!next(true)) + return syntax_error(value); + + if (current() == '*') { + token = consume_comment(value); + if (token) + return token; + + continue; + } + + if (current() != '/') + return syntax_error(value); + + /* fall-through */ + case '#': + while (next(true)) { + if (current() == '\n') + break; + } + + next(true); + consume(false); + continue; + + case '\'': + if (!next(true)) + return syntax_error(value); + + value->number = current(); + + if (!next(true) || current() != '\'') + return syntax_error(value); + + next(true); + + consume(false); + + return TOK_CHAR; + + case '"': + return lex_string(value); + + case '0' ... '9': + return lex_number(value); + + case 'a' ... 'z': + return lex_symbol(value, false); + + case 'A' ... 'Z': + return lex_symbol(value, true); + + default: + return syntax_error(value); + } + } + + if (ferror(file)) + return io_error(value); + + return 0; +} + +} -- cgit v1.2.3