/*------------------------------------------------------------------------------ | |
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team | |
* | |
* Distributable under the terms of either the Apache License (Version 2.0) or | |
* the GNU Lesser General Public License, as specified in the COPYING file. | |
* | |
* Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). | |
------------------------------------------------------------------------------*/ | |
#include "CLucene/StdHeader.h" | |
#include "QueryParser.h" | |
#include "CLucene/analysis/AnalysisHeader.h" | |
#include "CLucene/util/Reader.h" | |
#include "CLucene/search/SearchHeader.h" | |
#include "CLucene/index/Term.h" | |
#include "TokenList.h" | |
#include "QueryToken.h" | |
#include "QueryParserBase.h" | |
#include "Lexer.h" | |
CL_NS_USE(util) | |
CL_NS_USE(index) | |
CL_NS_USE(analysis) | |
CL_NS_USE(search) | |
CL_NS_DEF(queryParser) | |
QueryParser::QueryParser(const TCHAR* _field, Analyzer* _analyzer) : QueryParserBase(_analyzer){
//Func - Constructor.
//       Builds a QueryParser whose default field is _field and which
//       forwards _analyzer to QueryParserBase.
//Pre  - _field is expected to be non-NULL; a NULL _field is tolerated and
//       simply leaves the default field unset.
//Post - The instance holds its own copy of _field (or NULL), has no token
//       list attached, and lower-cases expanded terms.

    // No token list exists until parse(Reader*) installs one.
    tokens = NULL;
    lowercaseExpandedTerms = true;

    // Keep a private copy of the field name so the caller's buffer may go away.
    field = NULL;
    if ( _field != NULL ){
        field = STRDUP_TtoT(_field);
    }
}
QueryParser::~QueryParser() {
//Func - Destructor.
//Pre  - true
//Post - The parser's copy of the default field name has been released.

    // field was duplicated in the constructor (or is NULL).
    _CLDELETE_CARRAY(field);
}
//static | |
Query* QueryParser::parse(const TCHAR* query, const TCHAR* field, Analyzer* analyzer){
//Func - Convenience entry point: parses query with a temporary QueryParser
//       built from field and analyzer.
//Pre  - query != NULL and holds the query text to parse
//       field != NULL and names the default field for query terms
//       analyzer is the Analyzer handed to the temporary parser
//Post - The Query produced by the temporary parser has been returned;
//       the temporary parser itself no longer exists.

    CND_PRECONDITION(query != NULL, "query is NULL");
    CND_PRECONDITION(field != NULL, "field is NULL");

    // The parser only needs to live for the duration of this call.
    QueryParser oneShotParser(field, analyzer);
    Query* parsed = oneShotParser.parse(query);
    return parsed;
}
Query* QueryParser::parse(const TCHAR* query){
//Func - Parses the query text held in query.
//Pre  - query != NULL and contains the query text to be parsed
//Post - The Query produced by parse(Reader*) has been returned. The
//       temporary reader is released whether or not parsing threw.

    CND_PRECONDITION(query != NULL, "query is NULL");

    // Wrap the query text in a reader so the Reader-based overload can consume it.
    Reader* queryReader = _CLNEW StringReader(query);
    CND_CONDITION(queryReader != NULL, "Could not allocate memory for StringReader r");

    Query* parsed = NULL;
    try{
        parsed = parse(queryReader);
    }_CLFINALLY (
        _CLDELETE(queryReader);
    );

    return parsed;
}
Query* QueryParser::parse(Reader* reader){
//Func - Lexes the query text supplied by reader and parses the resulting
//       tokens into a Query.
//Pre  - reader refers to a valid Reader that supplies the query string
//Post - A parsed Query instance has been returned, or an exception has been
//       raised (CL_ERR_Parse "No query given." when the text yields no
//       tokens, or whatever the lexer / Match* functions throw).
//       In both cases this->tokens has been reset to NULL.

    // The token list lives on this stack frame; this->tokens only borrows it
    // for the duration of the call.
    TokenList _tokens;
    this->tokens = &_tokens;

    Query* ret = NULL;
    try{
        //Lex the query text into the token list
        Lexer lexer(this, reader);
        lexer.Lex(tokens);

        //Peek at the first token and check whether it is an EOF
        if (tokens->peek()->Type == QueryToken::EOF_){
            // The query string failed to yield any tokens. Discard the EOF
            // token and raise an exception.
            QueryToken* token = this->tokens->extract();
            _CLDELETE(token);
            _CLTHROWA(CL_ERR_Parse, "No query given.");
        }

        ret = MatchQuery(field);
    }_CLFINALLY (
        // Never leave this->tokens pointing at the local list once this frame
        // is gone: reset it on the normal path and on every exception path.
        this->tokens = NULL;
    );

    return ret;
}
int32_t QueryParser::MatchConjunction(){
//Func - Matches an optional CONJUNCTION at the head of the token list.
//       CONJUNCTION ::= <AND> | <OR>
//Pre  - tokens != NULL
//Post - If the next token is an AND it has been consumed and CONJ_AND is
//       returned; if it is an OR it has been consumed and CONJ_OR is
//       returned; otherwise nothing is consumed and CONJ_NONE is returned.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    QueryToken* head = tokens->peek();

    if (head->Type == QueryToken::AND_){
        // Consume the AND token
        ExtractAndDeleteToken();
        return CONJ_AND;
    }

    if (head->Type == QueryToken::OR){
        // Consume the OR token
        ExtractAndDeleteToken();
        return CONJ_OR;
    }

    return CONJ_NONE;
}
int32_t QueryParser::MatchModifier(){
//Func - Matches an optional MODIFIER at the head of the token list.
//       MODIFIER ::= <PLUS> | <MINUS> | <NOT>
//Pre  - tokens != NULL
//Post - PLUS: the token has been consumed and MOD_REQ is returned.
//       MINUS or NOT: the token has been consumed and MOD_NOT is returned.
//       Anything else: nothing is consumed and MOD_NONE is returned.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    QueryToken* head = tokens->peek();
    const bool isRequired   = (head->Type == QueryToken::PLUS);
    const bool isProhibited = (head->Type == QueryToken::MINUS)
                           || (head->Type == QueryToken::NOT);

    if (!isRequired && !isProhibited){
        return MOD_NONE;
    }

    // Consume the modifier token (head must not be used after this point)
    ExtractAndDeleteToken();

    if (isRequired){
        return MOD_REQ;
    }
    return MOD_NOT;
}
Query* QueryParser::MatchQuery(const TCHAR* field){
//Func - Matches a QUERY.
//       QUERY ::= [MODIFIER] CLAUSE (<CONJUNCTION> [MODIFIER] CLAUSE)*
//Pre  - tokens != NULL
//       field names the field that unqualified clauses apply to
//Post - Clauses have been consumed up to (and including) EOF, or up to (but
//       not including) a right parenthesis. If exactly one clause was
//       collected its query is returned directly; otherwise the result of
//       GetBooleanQuery over the collected clauses is returned.
//       If matching throws, the clauses collected so far are deleted before
//       the exception propagates.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    CL_NS_STD(vector)<BooleanClause*> clauses;
    Query* q = NULL;
    int32_t mods = MOD_NONE;
    int32_t conj = CONJ_NONE;

    try{
        //match for MODIFIER
        mods = MatchModifier();
        //match for CLAUSE
        q = MatchClause(field);
        AddClause(clauses, CONJ_NONE, mods, q);

        // match for CLAUSE*
        while(true){
            QueryToken* p = tokens->peek();
            if(p->Type == QueryToken::EOF_){
                // Consume the EOF token; the query is complete.
                QueryToken* qt = MatchQueryToken(QueryToken::EOF_);
                _CLDELETE(qt);
                break;
            }
            if(p->Type == QueryToken::RPAREN){
                // Leave the RPAREN for the caller (MatchClause) to consume.
                break;
            }

            //match for a conjunction (AND / OR)
            conj = MatchConjunction();
            //match for a modifier
            mods = MatchModifier();

            q = MatchClause(field);
            if ( q != NULL )
                AddClause(clauses, conj, mods, q);
        }
    }catch(...){
        // A later clause failed to parse: release the clauses gathered so far
        // so they do not leak. Each clause is deleted the same way the
        // single-clause path below deletes its clause.
        for (size_t i = 0; i < clauses.size(); ++i){
            _CLDELETE(clauses[i]);
        }
        clauses.clear();
        throw;
    }

    // finalize query
    if(clauses.size() == 1){ //bvk: removed this && firstQuery != NULL
        BooleanClause* c = clauses[0];
        //Validate the clause before it is dereferenced
        CND_CONDITION(c != NULL, "c is NULL");

        Query* single = c->query;
        //Tell the boolean clause not to delete its query; it is handed to the caller
        c->deleteQuery = false;

        clauses.clear();
        _CLDELETE(c);
        return single;
    }

    return GetBooleanQuery(clauses);
}
Query* QueryParser::MatchClause(const TCHAR* field){
//Func - Matches a CLAUSE.
//       CLAUSE ::= [TERM <COLON>] ( TERM | (<LPAREN> QUERY <RPAREN>))
//Pre  - tokens != NULL
//       field names the field used when the clause carries no "name:" prefix
//Post - The clause has been consumed and the Query produced by MatchQuery
//       (parenthesised form) or MatchTerm is returned; this may be NULL.
//       If matching throws, the partially built query and the temporary
//       field-name copy are released before the exception propagates.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    Query* q = NULL;
    const TCHAR* sfield = field;   // field the clause applies to
    TCHAR* ownedField = NULL;      // non-NULL only when the clause names its own field
    QueryToken* DelToken = NULL;

    //match for [TERM <COLON>]
    QueryToken* term = tokens->extract();
    if(term->Type == QueryToken::TERM && tokens->peek()->Type == QueryToken::COLON){
        DelToken = MatchQueryToken(QueryToken::COLON);
        CND_CONDITION(DelToken != NULL,"DelToken is NULL");
        _CLDELETE(DelToken);

        // The term is a field name: copy and unescape it, then use it for this clause.
        ownedField = STRDUP_TtoT(term->Value);
        discardEscapeChar(ownedField);
        sfield = ownedField;
        _CLDELETE(term);
    }else{
        // Not a field prefix: put the token back for the matchers below.
        tokens->push(term);
        term = NULL;
    }

    // match for
    // TERM | (<LPAREN> QUERY <RPAREN>)
    try{
        if(tokens->peek()->Type == QueryToken::LPAREN){
            DelToken = MatchQueryToken(QueryToken::LPAREN);
            CND_CONDITION(DelToken != NULL,"DelToken is NULL");
            _CLDELETE(DelToken);

            q = MatchQuery(sfield);

            // May throw when the closing parenthesis is missing.
            DelToken = MatchQueryToken(QueryToken::RPAREN);
            CND_CONDITION(DelToken != NULL,"DelToken is NULL");
            _CLDELETE(DelToken);
        }else{
            q = MatchTerm(sfield);
        }
    }catch(...){
        //DSR:2004.11.01: prevent q from leaking on a parse error ...
        _CLDELETE(q);
        // ... and likewise the duplicated field name.
        _CLDELETE_CARRAY(ownedField);
        throw;
    }

    _CLDELETE_CARRAY(ownedField);
    return q;
}
Query* QueryParser::MatchTerm(const TCHAR* field){
//Func - Matches a TERM.
//       TERM ::= TERM | PREFIXTERM | WILDTERM | NUMBER
//                [ <FUZZY> ] [ <CARAT> <NUMBER> [<FUZZY>]]
//                | (<RANGEIN> | <RANGEEX>) [<CARAT> <NUMBER>]
//                | <QUOTED> [SLOP] [<CARAT> <NUMBER>]
//Pre  - tokens != NULL
//       field names the field the term applies to
//Post - The term token and any trailing fuzzy / slop / boost tokens have
//       been consumed and deleted. The Query built by the matching Get*Query
//       / ParseRangeQuery call is returned with its boost applied when one
//       was given. NULL is returned when that call yields NULL or when the
//       token is of a type not listed above.
//       If matching throws, every token held by this function is deleted
//       before the exception propagates.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    QueryToken* term = NULL;
    QueryToken* slop = NULL;
    QueryToken* boost = NULL;
    QueryToken* DelToken = NULL; //Token that is about to be deleted
    bool prefix = false;
    bool wildcard = false;
    bool fuzzy = false;
    bool rangein = false;
    Query* q = NULL;

    term = tokens->extract();

    try{
        switch(term->Type){
            case QueryToken::TERM:
            case QueryToken::NUMBER:
            case QueryToken::PREFIXTERM:
            case QueryToken::WILDTERM:
            {
                if(term->Type == QueryToken::PREFIXTERM){
                    prefix = true;
                }
                if(term->Type == QueryToken::WILDTERM){
                    wildcard = true;
                }

                //A fuzzy marker may directly follow the term ...
                if(tokens->peek()->Type == QueryToken::FUZZY){
                    DelToken = MatchQueryToken(QueryToken::FUZZY);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    fuzzy = true;
                }
                if(tokens->peek()->Type == QueryToken::CARAT){
                    DelToken = MatchQueryToken(QueryToken::CARAT);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    boost = MatchQueryToken(QueryToken::NUMBER);

                    //... or follow the boost
                    if(tokens->peek()->Type == QueryToken::FUZZY){
                        DelToken = MatchQueryToken(QueryToken::FUZZY);
                        CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                        _CLDELETE(DelToken);
                        fuzzy = true;
                    }
                } //end if type==CARAT

                discardEscapeChar(term->Value); //clean up

                // Wildcard takes precedence over prefix, which takes precedence over fuzzy.
                if(wildcard){
                    q = GetWildcardQuery(field,term->Value);
                }else if(prefix){
                    //discard the trailing * (guarded so an empty value is never indexed at -1)
                    const size_t len = _tcslen(term->Value);
                    if(len > 0){
                        term->Value[len-1] = 0;
                    }
                    q = GetPrefixQuery(field,term->Value);
                }else if(fuzzy){
                    //remove a trailing ~ if the value still carries one
                    const size_t len = _tcslen(term->Value);
                    if(len > 0 && term->Value[len-1] == '~'){
                        term->Value[len-1] = '\0';
                    }
                    q = GetFuzzyQuery(field,term->Value);
                }else{
                    q = GetFieldQuery(field, term->Value);
                }
                break;
            }

            case QueryToken::RANGEIN:
            case QueryToken::RANGEEX:
            {
                if(term->Type == QueryToken::RANGEIN){
                    rangein = true;
                }
                if(tokens->peek()->Type == QueryToken::CARAT){
                    DelToken = MatchQueryToken(QueryToken::CARAT);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    boost = MatchQueryToken(QueryToken::NUMBER);
                }

                //skip the opening bracket and cut off the closing one
                TCHAR* noBrackets = term->Value + 1;
                const size_t len = _tcslen(noBrackets);
                if(len > 0){
                    noBrackets[len-1] = 0;
                }
                q = ParseRangeQuery(field, noBrackets, rangein);
                break;
            }

            case QueryToken::QUOTED:
            {
                if(tokens->peek()->Type == QueryToken::SLOP){
                    slop = MatchQueryToken(QueryToken::SLOP);
                }
                if(tokens->peek()->Type == QueryToken::CARAT){
                    DelToken = MatchQueryToken(QueryToken::CARAT);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    boost = MatchQueryToken(QueryToken::NUMBER);
                }

                //skip the opening quote and cut off the closing one
                TCHAR* quotedValue = term->Value+1;
                const size_t len = _tcslen(quotedValue);
                if(len > 0){
                    quotedValue[len-1] = '\0';
                }

                int32_t islop = phraseSlop;
                if(slop != NULL ){
                    try {
                        TCHAR* end; //todo: should parse using float...
                        // +1 skips the first character of the slop token's text
                        islop = (int32_t)_tcstoi64(slop->Value+1, &end, 10);
                    }catch(...){
                        //ignored
                    }
                }

                q = GetFieldQuery(field, quotedValue, islop);
                _CLDELETE(slop);
                break;
            }

            default:
                // Any other token type is consumed without producing a query.
                break;
        } // end of switch
    }catch(...){
        // Parsing failed part-way: release every token still held here.
        _CLDELETE(slop);
        _CLDELETE(boost);
        _CLDELETE(term);
        throw;
    }

    _CLDELETE(term);

    // The boost token must be released even when no query was produced.
    if( boost != NULL ){
        if( q != NULL ){
            qreal f = 1.0F;
            try {
                TCHAR* tmp;
                f = _tcstod(boost->Value, &tmp);
            }catch(...){
                //ignored
            }
            q->setBoost( f);
        }
        _CLDELETE(boost);
    }

    return q;
}
QueryToken* QueryParser::MatchQueryToken(QueryToken::Types expectedType){
//Func - Takes the next token from the token list and checks that it is of
//       type expectedType.
//Pre  - tokens != NULL
//Post - On a match the extracted token is returned; the caller now holds it
//       and is responsible for deleting it.
//       On a mismatch the token is deleted and a parser exception is
//       reported through throwParserException. An empty token list is
//       reported the same way.

    CND_PRECONDITION(tokens != NULL,"tokens is NULL");

    if(tokens->count() == 0){
        throwParserException(_T("Error: Unexpected end of program"),' ',0,0);
    }

    //Take the next token off the list
    QueryToken* candidate = tokens->extract();

    if (candidate->Type == expectedType){
        return candidate;
    }

    //Wrong token type: build the message before the token is destroyed
    TCHAR message[200];
    _sntprintf(message,200,_T("Error: Unexpected QueryToken: %d, expected: %d"),candidate->Type,expectedType);
    _CLDELETE(candidate);
    throwParserException(message,' ',0,0);

    // Reached only if throwParserException returns (presumably it always
    // throws); candidate is NULL here after _CLDELETE.
    return candidate;
}
void QueryParser::ExtractAndDeleteToken(void){ | |
//Func - Extracts the first token from the Tokenlist tokenlist | |
// and destroys it | |
//Pre - true | |
//Post - The first token has been extracted and destroyed | |
CND_PRECONDITION(tokens != NULL, "tokens is NULL"); | |
//Extract the token from the TokenList tokens | |
QueryToken* t = tokens->extract(); | |
//Condition Check Token may not be NULL | |
CND_CONDITION(t != NULL, "Token is NULL"); | |
//Delete Token | |
_CLDELETE(t); | |
} | |
CL_NS_END |