/*
 * Tokenizer.cpp
 *
 * Copyright (C) 2008 Matthias Schiffer
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along
 * with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "Tokenizer.h"

// NOTE(review): the targets of the two #include directives below are missing
// in this copy of the file. Presumably they named the headers that declare
// icu::StringCharacterIterator and boost::int32_t, the only two external
// names used in this file - confirm against version control and restore them.
#include
#include

namespace Mad {
namespace Core {

// Characters that separate or structure tokens: space, tab, newline,
// double quote, single quote and backslash.
const String Tokenizer::delimiters(" \t\n\"'\\");

/*
 * Cuts str into an alternating sequence of runs:
 *
 *   [non-delimiters, delimiters, non-delimiters, delimiters, ...]
 *
 * Elements at even positions hold characters that are not in `delimiters`,
 * elements at odd positions hold characters that are. The sequence always
 * starts with a non-delimiter run; that first run is empty when str begins
 * with a delimiter. An empty str produces an empty vector.
 *
 * NOTE(review): every `std::vector` in this file appears without template
 * arguments; the element type is presumably String (the result of
 * str.substr() is pushed into it) - confirm against Tokenizer.h.
 */
std::vector Tokenizer::split(const String &str) {
  std::vector ret;

  // s is the start of the next non-delimiter run. It is only advanced inside
  // the body, hence the empty increment expression.
  for(boost::int32_t s = 0; s < str.length();) {
    // A negative index is treated as "no delimiter found". The length -1 that
    // is then passed to substr() presumably means "up to the end of the
    // string" - TODO confirm against String::substr.
    boost::int32_t index = str.findFirstOf(delimiters, s);
    boost::int32_t length = (index < 0) ? -1 : index-s;

    // Non-delimiter run [s, index); zero-length if a delimiter sits at s.
    ret.push_back(str.substr(s, length));

    if(index >= 0) {
      // Delimiter run [index, index2), or up to the end if no further
      // non-delimiter character exists.
      boost::int32_t index2 = str.findFirstNotOf(delimiters, index);

      length = (index2 < 0) ? -1 : index2-index;

      ret.push_back(str.substr(index, length));

      if(index2 >= 0)
        s = index2;   // continue with the next non-delimiter run
      else
        break;        // the string ends with delimiters
    }
    else
      break;          // the remainder contained no delimiter
  }

  return ret;
}

/*
 * Breaks str into tokens, honouring quotes and backslash escapes, and stores
 * them in out (which is cleared first).
 *
 * Rules implemented below:
 *  - Runs of non-delimiter characters are appended to the current token
 *    unchanged.
 *  - Space, tab and newline outside of quotes end the current token.
 *  - A double quote toggles double-quote mode. Inside it whitespace and single
 *    quotes are literal, while a backslash still escapes.
 *  - A single quote (outside double quotes) toggles single-quote mode. Inside
 *    it every delimiter except the closing single quote is literal,
 *    including backslash and double quote.
 *  - A backslash makes the following character literal. A backslash-newline
 *    pair is dropped inside double quotes; outside of them the newline is
 *    handled like an ordinary, unescaped newline.
 *  - Any quote character forces a token to be emitted even if it is empty,
 *    so "" and '' yield an empty token.
 *
 * Returns true if the input is complete, and false if it ended inside single
 * or double quotes or directly after a backslash. In the false case out
 * still receives everything parsed so far, including the unfinished last
 * token.
 */
bool Tokenizer::tokenize(const String &str, std::vector &out) {
  std::vector splitString = split(str);

  bool singleQuotes = false, doubleQuotes = false, escape = false;

  String token;
  // Set when a quote was seen, so that an empty quoted string is still
  // emitted as a token.
  bool forceToken = false;

  out.clear();

  // splitString alternates between non-delimiter and delimiter runs (see
  // split()), so each pass of this loop consumes one pair: the iterator is
  // advanced once inside the body and once by the loop header.
  for(std::vector::iterator s = splitString.begin(); s != splitString.end(); ++s) {
    // Non-delimiter run: always literal. If the preceding delimiter run ended
    // with a backslash, that escape is used up here.
    token += *s;
    escape = false;

    // No delimiter run follows the last non-delimiter run: done.
    if(++s == splitString.end())
      break;

    // Delimiter run: interpret it character by character.
    for(icu::StringCharacterIterator c(*s); c.hasNext(); c.next()) {
      // Escaped newline: swallowed inside double quotes, otherwise the escape
      // is cancelled and the newline falls through to the normal handling
      // below. This check has to come before the literal-copy check.
      if(c.current() == '\n' && escape) {
        escape = false;

        if(doubleQuotes)
          continue;
      }

      // Escaped character, or anything but the closing quote while inside
      // single quotes: copy verbatim.
      if(escape || (singleQuotes && c.current() != '\'')) {
        token += c.current();
        escape = false;
        continue;
      }

      switch(c.current()) {
        case ' ':
        case '\t':
        case '\n':
          if(doubleQuotes) {
            // Whitespace is part of the token inside double quotes.
            token += c.current();
          }
          else {
            // Token boundary. Consecutive separators do not create empty
            // tokens unless a quote forced one.
            if(!token.isEmpty() || forceToken) {
              out.push_back(token);
              // Start a fresh token (remove() without arguments presumably
              // empties the string - TODO confirm).
              token.remove();
              forceToken = false;
            }
          }
          break;

        case '"':
          doubleQuotes = !doubleQuotes;
          forceToken = true;
          break;

        case '\'':
          if(doubleQuotes) {
            // A single quote is an ordinary character inside double quotes.
            token += c.current();
          }
          else {
            singleQuotes = !singleQuotes;
            forceToken = true;
          }
          break;

        case '\\':
          // Applies to the next character, which may be the first character
          // of the following non-delimiter run.
          escape = true;
      }
    }
  }

  // Flush the last token; it is emitted even if the input is incomplete.
  if(!token.isEmpty() || forceToken)
    out.push_back(token);

  // Complete only if every quote was closed and no escape is pending.
  return !(singleQuotes || doubleQuotes || escape);
}

}
}