Diffstat (limited to 'src/Core/Tokenizer.cpp')
-rw-r--r--  src/Core/Tokenizer.cpp  50
1 file changed, 26 insertions(+), 24 deletions(-)
diff --git a/src/Core/Tokenizer.cpp b/src/Core/Tokenizer.cpp
index 4043b39..93c853a 100644
--- a/src/Core/Tokenizer.cpp
+++ b/src/Core/Tokenizer.cpp
@@ -18,30 +18,32 @@
*/
#include "Tokenizer.h"
+#include <boost/cstdint.hpp>
+#include <unicode/schriter.h>
namespace Mad {
namespace Core {
-const std::string Tokenizer::delimiters = " \t\n\"'\\";
+const UnicodeString Tokenizer::delimiters(" \t\n\"'\\");
-std::vector<std::string> Tokenizer::split(const std::string &str) {
- std::vector<std::string> ret;
+std::vector<UnicodeString> Tokenizer::split(const UnicodeString &str) {
+ std::vector<UnicodeString> ret;
- for(size_t s = 0; s < str.length();) {
- size_t index = str.find_first_of(delimiters, s);
- size_t length = (index == std::string::npos) ? std::string::npos : index-s;
+ for(boost::int32_t s = 0; s < str.length();) {
+ boost::int32_t index = str.findFirstOf(delimiters, s);
+ boost::int32_t length = (index < 0) ? -1 : index-s;
ret.push_back(str.substr(s, length));
- if(index != std::string::npos) {
- size_t index2 = str.find_first_not_of(delimiters, index);
+ if(index >= 0) {
+ boost::int32_t index2 = str.findFirstNotOf(delimiters, index);
- length = (index2 == std::string::npos) ? std::string::npos : index2-index;
+ length = (index2 < 0) ? -1 : index2-index;
ret.push_back(str.substr(index, length));
- if(index2 != std::string::npos)
+ if(index2 >= 0)
s = index2;
else
break;
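
Stock icu::UnicodeString has no std::string-style findFirstOf()/findFirstNotOf()/substr() members, so the calls in this hunk presumably rely on thin project-side helpers layered over ICU. A minimal sketch of what such a findFirstOf() helper could look like, built only from stock ICU calls; the name and signature are assumptions, not the project's actual code:

#include <boost/cstdint.hpp>
#include <unicode/unistr.h>

// Hypothetical stand-in for the project's findFirstOf() helper: return the
// index of the first UTF-16 code unit in str, at or after start, that also
// occurs in set, or -1 if there is none (mirroring std::string::find_first_of,
// but with ICU's int32_t/-1 conventions instead of size_t/npos).
static boost::int32_t findFirstOf(const icu::UnicodeString &str,
                                  const icu::UnicodeString &set,
                                  boost::int32_t start) {
	for (boost::int32_t i = start; i < str.length(); ++i) {
		if (set.indexOf(str[i]) >= 0)   // indexOf() yields -1 when the unit is absent
			return i;
	}
	return -1;
}
// findFirstNotOf() would be the same loop with the test inverted
// (set.indexOf(str[i]) < 0).

Since the delimiters here are plain ASCII, scanning 16-bit code units is sufficient and no surrogate-pair handling is needed.
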
@@ -53,48 +55,48 @@ std::vector<std::string> Tokenizer::split(const std::string &str) {
return ret;
}
-bool Tokenizer::tokenize(const std::string &str, std::vector<std::string> &out) {
- std::vector<std::string> splitString = split(str);
+bool Tokenizer::tokenize(const UnicodeString &str, std::vector<UnicodeString> &out) {
+ std::vector<UnicodeString> splitString = split(str);
bool singleQuotes = false, doubleQuotes = false, escape = false;
- std::string token;
+ UnicodeString token;
bool forceToken = false;
out.clear();
- for(std::vector<std::string>::iterator s = splitString.begin(); s != splitString.end(); ++s) {
+ for(std::vector<UnicodeString>::iterator s = splitString.begin(); s != splitString.end(); ++s) {
token += *s;
escape = false;
if(++s == splitString.end())
break;
- for(std::string::iterator c = s->begin(); c != s->end(); ++c) {
- if(*c == '\n' && escape) {
+ for(icu::StringCharacterIterator c(*s); c.hasNext(); c.next()) {
+ if(c.current() == '\n' && escape) {
escape = false;
if(doubleQuotes)
continue;
}
- if(escape || (singleQuotes && *c != '\'')) {
- token += *c;
+ if(escape || (singleQuotes && c.current() != '\'')) {
+ token += c.current();
escape = false;
continue;
}
- switch(*c) {
+ switch(c.current()) {
case ' ':
case '\t':
case '\n':
if(doubleQuotes) {
- token += *c;
+ token += c.current();
}
else {
- if(!token.empty() || forceToken) {
+ if(!token.isEmpty() || forceToken) {
out.push_back(token);
- token.clear();
+ token.remove();
forceToken = false;
}
}
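
The character loop above swaps byte-wise std::string iteration for ICU's icu::StringCharacterIterator. A self-contained sketch of that iteration pattern, independent of the tokenizer, showing how hasNext()/current()/next() cooperate; note that current() yields 16-bit UTF-16 code units, so code points outside the BMP would need current32()/next32() instead:

#include <unicode/schriter.h>
#include <unicode/unistr.h>
#include <iostream>

int main() {
	icu::UnicodeString s("a 'b c'");
	// current() reads the unit at the iterator position, next() advances,
	// and hasNext() turns false once the position moves past the last unit,
	// so every code unit is visited exactly once.
	for (icu::StringCharacterIterator it(s); it.hasNext(); it.next()) {
		UChar c = it.current();
		std::cout << static_cast<char>(c);  // fine here: the sample is ASCII-only
	}
	std::cout << '\n';
	return 0;
}
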
@@ -107,7 +109,7 @@ bool Tokenizer::tokenize(const std::string &str, std::vector<std::string> &out)
case '\'':
if(doubleQuotes) {
- token += *c;
+ token += c.current();
}
else {
singleQuotes = !singleQuotes;
@@ -121,7 +123,7 @@ bool Tokenizer::tokenize(const std::string &str, std::vector<std::string> &out)
}
}
- if(!token.empty() || forceToken)
+ if(!token.isEmpty() || forceToken)
out.push_back(token);
return !(singleQuotes || doubleQuotes || escape);
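
For context, a hedged usage sketch of the reworked interface, assuming tokenize() is callable as a static member and that Tokenizer.h makes the UnicodeString name visible, as the diff suggests; the expected results in the comments are inferred from the state machine above, not from project documentation:

#include <vector>
#include "Tokenizer.h"

void example() {
	std::vector<UnicodeString> words;

	// Balanced quoting: should yield the tokens "ab" and "c d" and return true.
	bool ok = Mad::Core::Tokenizer::tokenize(UnicodeString("ab 'c d'"), words);

	// An unterminated quote leaves singleQuotes set at the end of input,
	// so the call reports failure by returning false.
	bool bad = Mad::Core::Tokenizer::tokenize(UnicodeString("ab 'c d"), words);

	(void) ok;
	(void) bad;
}
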