diff options
Diffstat (limited to 'src/Core/Tokenizer.cpp')
-rw-r--r-- | src/Core/Tokenizer.cpp | 50 |
1 files changed, 26 insertions, 24 deletions
diff --git a/src/Core/Tokenizer.cpp b/src/Core/Tokenizer.cpp index 4043b39..93c853a 100644 --- a/src/Core/Tokenizer.cpp +++ b/src/Core/Tokenizer.cpp @@ -18,30 +18,32 @@ */ #include "Tokenizer.h" +#include <boost/cstdint.hpp> +#include <unicode/schriter.h> namespace Mad { namespace Core { -const std::string Tokenizer::delimiters = " \t\n\"'\\"; +const UnicodeString Tokenizer::delimiters(" \t\n\"'\\"); -std::vector<std::string> Tokenizer::split(const std::string &str) { - std::vector<std::string> ret; +std::vector<UnicodeString> Tokenizer::split(const UnicodeString &str) { + std::vector<UnicodeString> ret; - for(size_t s = 0; s < str.length();) { - size_t index = str.find_first_of(delimiters, s); - size_t length = (index == std::string::npos) ? std::string::npos : index-s; + for(boost::int32_t s = 0; s < str.length();) { + boost::int32_t index = str.findFirstOf(delimiters, s); + boost::int32_t length = (index < 0) ? -1 : index-s; ret.push_back(str.substr(s, length)); - if(index != std::string::npos) { - size_t index2 = str.find_first_not_of(delimiters, index); + if(index >= 0) { + boost::int32_t index2 = str.findFirstNotOf(delimiters, index); - length = (index2 == std::string::npos) ? std::string::npos : index2-index; + length = (index2 < 0) ? -1 : index2-index; ret.push_back(str.substr(index, length)); - if(index2 != std::string::npos) + if(index2 >= 0) s = index2; else break; @@ -53,48 +55,48 @@ std::vector<std::string> Tokenizer::split(const std::string &str) { return ret; } -bool Tokenizer::tokenize(const std::string &str, std::vector<std::string> &out) { - std::vector<std::string> splitString = split(str); +bool Tokenizer::tokenize(const UnicodeString &str, std::vector<UnicodeString> &out) { + std::vector<UnicodeString> splitString = split(str); bool singleQuotes = false, doubleQuotes = false, escape = false; - std::string token; + UnicodeString token; bool forceToken = false; out.clear(); - for(std::vector<std::string>::iterator s = splitString.begin(); s != splitString.end(); ++s) { + for(std::vector<UnicodeString>::iterator s = splitString.begin(); s != splitString.end(); ++s) { token += *s; escape = false; if(++s == splitString.end()) break; - for(std::string::iterator c = s->begin(); c != s->end(); ++c) { - if(*c == '\n' && escape) { + for(icu::StringCharacterIterator c(*s); c.hasNext(); c.next()) { + if(c.current() == '\n' && escape) { escape = false; if(doubleQuotes) continue; } - if(escape || (singleQuotes && *c != '\'')) { - token += *c; + if(escape || (singleQuotes && c.current() != '\'')) { + token += c.current(); escape = false; continue; } - switch(*c) { + switch(c.current()) { case ' ': case '\t': case '\n': if(doubleQuotes) { - token += *c; + token += c.current(); } else { - if(!token.empty() || forceToken) { + if(!token.isEmpty() || forceToken) { out.push_back(token); - token.clear(); + token.remove(); forceToken = false; } } @@ -107,7 +109,7 @@ bool Tokenizer::tokenize(const std::string &str, std::vector<std::string> &out) case '\'': if(doubleQuotes) { - token += *c; + token += c.current(); } else { singleQuotes = !singleQuotes; @@ -121,7 +123,7 @@ bool Tokenizer::tokenize(const std::string &str, std::vector<std::string> &out) } } - if(!token.empty() || forceToken) + if(!token.isEmpty() || forceToken) out.push_back(token); return !(singleQuotes || doubleQuotes || escape); |