/* $Id$ */ /* Copyright (C) 2007 - 2012 by David White Part of the Silver Tree Project This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by or later. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY. See the COPYING file for more details. */ #include #include "foreach.hpp" #include "formula_tokenizer.hpp" namespace formula_tokenizer { namespace { void raise_exception(iterator& i1, iterator i2, std::string str) { std::ostringstream expr; while( (i1 != i2) && (*i1 != '\n') ) { if( (*i1 != '\t') ) expr << *i1; ++i1; } if( str.empty() ) throw token_error("Unrecognized token", expr.str() ); else throw token_error(str, expr.str() ); } } token get_token(iterator& i1, iterator i2) { iterator it = i1; if( *i1 >= 'A' ) { //current character is >= 'A', limit search to the upper-half of the ASCII table // check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) { while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) ) ++i1; int diff = i1 - it; TOKEN_TYPE t = TOKEN_IDENTIFIER; //check if this string matches any keyword or an operator //possible opearators and keywords: // d, or, def, and, not, fai, where, faiend, functions if( diff == 1 ) { if( *it == 'd' ) t = TOKEN_OPERATOR; } else if( diff == 2 ) { if( *it == 'o' && *(it+1) == 'r' ) t = TOKEN_OPERATOR; } else if( diff == 3 ) { if( *it == 'd' ) { //def if( *(it+1) == 'e' && *(it+2) == 'f' ) t = TOKEN_KEYWORD; } else if( *it == 'a' ) { //and if( *(it+1) == 'n' && *(it+2) == 'd' ) t = TOKEN_OPERATOR; } else if( *it == 'n' ) { //not if( *(it+1) == 'o' && *(it+2) == 't' ) t = TOKEN_OPERATOR; } else if( *it == 'f' ) { //fai if( *(it+1) == 'a' && *(it+2) == 'i' ) t = TOKEN_KEYWORD; } } else if( diff == 5 ) { std::string s(it, i1); if( s == "where" ) t = TOKEN_OPERATOR; } else if( diff == 6 ) { std::string s(it, i1); if( s == "faiend" ) t = TOKEN_KEYWORD; } else if( diff == 9 ) { std::string s(it, i1); if( s == "functions" ) t = TOKEN_KEYWORD; } return token( it, i1, t); } else { //at this point only 3 chars left to check: if( *i1 == '[' ) return token( it, ++i1, TOKEN_LSQUARE ); if( *i1 == ']' ) return token( it, ++i1, TOKEN_RSQUARE ); if( *i1 == '^' ) return token( it, ++i1, TOKEN_OPERATOR ); } } else { //limit search to the lower-half of the ASCII table //start by checking for whitespaces/end of line char if( *i1 <= ' ' ) { if( *i1 == '\n' ) { return token( it, ++i1, TOKEN_EOL); } else { while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' ) ++i1; return token( it, i1, TOKEN_WHITESPACE ); } //try to further limit number of characters that we need to check: } else if ( *i1 >= '0' ){ //current character is between '0' and '@' if( *i1 <= '9' ) { //we parse integer or decimal number ++i1; bool dot = false; while( i1 != i2 ) { if( *i1 >= '0' && *i1 <= '9' ) { //do nothing } else { //look for '.' in case of decimal numer if( *i1 == '.' ) { //allow only one dot in such expression if( !dot ) dot = true; else raise_exception(it, i2, "Multiple dots near decimal expression"); } else break; } ++i1; } if( dot ) return token( it, i1, TOKEN_DECIMAL ); else return token( it, i1, TOKEN_INTEGER ); } else { //current character is between ':' and '@' //possible tokens at this point that we are intersted with: // ; < = > <= >= if( *i1 == ';' ) { return token( it, ++i1, TOKEN_SEMICOLON); } else if( *i1 == '=' ) { return token( it, ++i1, TOKEN_OPERATOR); } else if( *i1 == '<' ) { ++i1; if( i1 != i2 ) { if( *i1 == '=' ) return token( it, ++i1, TOKEN_OPERATOR); else return token( it, i1, TOKEN_OPERATOR); } else return token( it, i1, TOKEN_OPERATOR); } else if( *i1 == '>' ) { ++i1; if( i1 != i2 ) { if( *i1 == '=' ) return token( it, ++i1, TOKEN_OPERATOR); else return token( it, i1, TOKEN_OPERATOR); } else return token( it, i1, TOKEN_OPERATOR); } } //current character is between '!' and '/' } else if ( *i1 == ',' ) { return token( it, ++i1, TOKEN_COMMA); } else if ( *i1 == '.' ) { ++i1; if( i1 != i2 ) { if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/') return token( it, ++i1, TOKEN_OPERATOR ); else return token( it, i1, TOKEN_OPERATOR ); } else { return token( it, i1, TOKEN_OPERATOR); } } else if ( *i1 == '(' ) { return token( it, ++i1, TOKEN_LPARENS); } else if ( *i1 == ')' ) { return token( it, ++i1, TOKEN_RPARENS); } else if ( *i1 == '\'' ) { ++i1; while( i1 != i2 && *i1 != '\'' ) ++i1; if( i1 != i2 ) { return token( it, ++i1, TOKEN_STRING_LITERAL ); } else { raise_exception(it, i2, "Missing closing ' for formula string"); } } else if ( *i1 == '#' ) { ++i1; while( i1 != i2 && *i1 != '#' ) ++i1; if( i1 != i2 ) { return token( it, ++i1, TOKEN_COMMENT ); } else { raise_exception(it, i2, "Missing closing # for formula comment"); } } else if ( *i1 == '+' ) { return token( it, ++i1, TOKEN_OPERATOR); } else if ( *i1 == '-' ) { ++i1; if( i1 != i2 ) { if( *i1 == '>' ) return token( it, ++i1, TOKEN_POINTER ); else return token( it, i1, TOKEN_OPERATOR ); } else { return token( it, i1, TOKEN_OPERATOR); } } else if ( *i1 == '*' ) { return token( it, ++i1, TOKEN_OPERATOR); } else if ( *i1 == '/' ) { return token( it, ++i1, TOKEN_OPERATOR); } else if ( *i1 == '%' ) { return token( it, ++i1, TOKEN_OPERATOR); } else if ( *i1 == '!' ) { ++i1; if( *i1 == '=' ) return token( it, ++i1, TOKEN_OPERATOR); else raise_exception(it, i2, std::string() ); } } raise_exception(it, i2, std::string() ); return token(); } } #ifdef UNIT_TEST_TOKENIZER int main() { using namespace formula_tokenizer; std::string test = "(abc + 4 * (5+3))^2"; std::string::const_iterator i1 = test.begin(); std::string::const_iterator i2 = test.end(); TOKEN_TYPE types[] = {TOKEN_LPARENS, TOKEN_IDENTIFIER, TOKEN_WHITESPACE, TOKEN_OPERATOR, TOKEN_WHITESPACE, TOKEN_INTEGER, TOKEN_WHITESPACE, TOKEN_OPERATOR, TOKEN_WHITESPACE, TOKEN_LPARENS, TOKEN_INTEGER, TOKEN_OPERATOR, TOKEN_INTEGER, TOKEN_RPARENS, TOKEN_RPARENS, TOKEN_KEYWORD, TOKEN_OPERATOR, TOKEN_INTEGER}; std::string tokens[] = {"(", "abc", " ", "+", " ", "4", " ", "*", " ", "(", "5", "+", "3", ")", ")", "functions"}; for(int n = 0; n != sizeof(types)/sizeof(*types); ++n) { token t = get_token(i1,i2); assert(std::string(t.begin,t.end) == tokens[n]); assert(t.type == types[n]); } return 0; } #endif