blob: 04f018629fa88fa727691834066cb91a642e107f [file] [log] [blame]
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
*
* Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "QueryParser.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/util/Reader.h"
#include "CLucene/search/SearchHeader.h"
#include "CLucene/index/Term.h"
#include "TokenList.h"
#include "QueryToken.h"
#include "QueryParserBase.h"
#include "Lexer.h"
CL_NS_USE(util)
CL_NS_USE(index)
CL_NS_USE(analysis)
CL_NS_USE(search)
CL_NS_DEF(queryParser)
QueryParser::QueryParser(const TCHAR* _field, Analyzer* _analyzer) : QueryParserBase(_analyzer){
    //Func - Constructor.
    //       Creates a QueryParser whose default field is _field and which
    //       forwards _analyzer to QueryParserBase
    //Pre  - _field may be NULL; in that case no default field is stored
    //Post - field holds a duplicate of _field (or NULL), no token list is
    //       attached yet and lowercaseExpandedTerms is switched on

    //Keep a private duplicate so the caller's buffer is not referenced later
    field = (_field != NULL) ? STRDUP_TtoT(_field) : NULL;

    //The token list only exists for the duration of a parse(Reader*) call
    tokens = NULL;

    lowercaseExpandedTerms = true;
}
QueryParser::~QueryParser() {
//Func - Destructor
//Pre  - true
//Post - The field string duplicated by the constructor (if any) has been
//       released and the instance has been destroyed
_CLDELETE_CARRAY(field);
}
//static
Query* QueryParser::parse(const TCHAR* query, const TCHAR* field, Analyzer* analyzer){
    //Func - Convenience entry point: parses query with a QueryParser that
    //       lives only for the duration of this call
    //Pre  - query    != NULL and holds the query text to parse
    //       field    != NULL and names the default field for query terms
    //       analyzer is the Analyzer handed to the temporary parser
    //Post - The Query produced by the temporary parser has been returned

    CND_PRECONDITION(query != NULL, "query is NULL");
    CND_PRECONDITION(field != NULL, "field is NULL");

    //Build a short-lived parser on the stack and delegate to the instance overload
    QueryParser oneShot(field, analyzer);
    Query* parsed = oneShot.parse(query);
    return parsed;
}
Query* QueryParser::parse(const TCHAR* query){
    //Func - Parses the query string and returns the resulting Query
    //Pre  - query != NULL and contains the query text to be parsed
    //Post - The Query built by parse(Reader*) has been returned; the
    //       temporary reader wrapped around query has been deleted

    CND_PRECONDITION(query != NULL, "query is NULL");

    //Wrap the string in a reader so the Reader-based overload can consume it
    Reader* queryReader = _CLNEW StringReader(query);
    CND_CONDITION(queryReader != NULL, "Could not allocate memory for StringReader r");

    Query* parsed = NULL;
    try{
        parsed = parse(queryReader);
    }_CLFINALLY (
        //_CLFINALLY is expected to run this cleanup on both the normal and
        //the exceptional path, so the reader is not leaked on parse errors
        _CLDELETE(queryReader);
    );

    return parsed;
}
Query* QueryParser::parse(Reader* reader){
    //Func - Lexes the text supplied by reader into tokens and matches them
    //       against the query grammar
    //Pre  - reader is a valid Reader that supplies the query string
    //Post - A parsed Query instance has been returned, or an exception has
    //       been thrown (CL_ERR_Parse "No query given." when the input
    //       yields no tokens besides EOF). In both cases this->tokens is
    //       NULL again when the function is left.

    //The token list lives on the stack; this->tokens only borrows it for the
    //duration of this call.
    TokenList _tokens;
    this->tokens = &_tokens;

    //Instantiate a lexer
    Lexer lexer(this, reader);

    Query* ret = NULL;
    try{
        //Lex the tokens
        lexer.Lex(tokens);

        //Peek at the first token and check whether it is an EOF
        if (tokens->peek()->Type == QueryToken::EOF_){
            // The query string failed to yield any tokens. Discard the EOF
            // token and raise an exception.
            QueryToken* token = this->tokens->extract();
            _CLDELETE(token);
            _CLTHROWA(CL_ERR_Parse, "No query given.");
        }

        ret = MatchQuery(field);
    }catch(...){
        //_tokens is about to be destroyed by stack unwinding; do not leave the
        //member pointing at it, otherwise a later use of this parser would
        //touch a dangling pointer.
        this->tokens = NULL;
        throw;
    }

    this->tokens = NULL;
    return ret;
}
int32_t QueryParser::MatchConjunction(){
    //Func - matches for CONJUNCTION
    //       CONJUNCTION ::= <AND> | <OR>
    //Pre  - tokens != NULL
    //Post - When the next token is an AND or an OR it has been removed from
    //       the token list and deleted, and CONJ_AND respectively CONJ_OR is
    //       returned. Any other token is left in place and CONJ_NONE is
    //       returned.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    QueryToken* head = tokens->peek();

    if (head->Type == QueryToken::AND_){
        //Consume the AND token
        ExtractAndDeleteToken();
        return CONJ_AND;
    }

    if (head->Type == QueryToken::OR){
        //Consume the OR token
        ExtractAndDeleteToken();
        return CONJ_OR;
    }

    //Not a conjunction: nothing is consumed
    return CONJ_NONE;
}
int32_t QueryParser::MatchModifier(){
    //Func - matches for MODIFIER
    //       MODIFIER ::= <PLUS> | <MINUS> | <NOT>
    //Pre  - tokens != NULL
    //Post - A leading PLUS has been removed and deleted and MOD_REQ returned;
    //       a leading MINUS or NOT has been removed and deleted and MOD_NOT
    //       returned; any other token is left in place and MOD_NONE is
    //       returned.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    QueryToken* head = tokens->peek();

    if (head->Type == QueryToken::PLUS){
        //Consume the PLUS token
        ExtractAndDeleteToken();
        return MOD_REQ;
    }

    if (head->Type == QueryToken::MINUS || head->Type == QueryToken::NOT){
        //MINUS and NOT are treated alike: consume the token
        ExtractAndDeleteToken();
        return MOD_NOT;
    }

    //Not a modifier: nothing is consumed
    return MOD_NONE;
}
Query* QueryParser::MatchQuery(const TCHAR* field){
    //Func - matches for QUERY
    //       QUERY ::= [MODIFIER] CLAUSE (<CONJUNCTION> [MODIFIER] CLAUSE)*
    //Pre  - tokens != NULL
    //       field is the field name passed on to MatchClause
    //Post - Tokens have been consumed up to and including EOF, or up to but
    //       not including an RPAREN. If exactly one clause was collected its
    //       Query is returned directly, otherwise the result of
    //       GetBooleanQuery(clauses) is returned. If matching throws, the
    //       clauses collected so far are deleted before the exception is
    //       propagated.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    CL_NS_STD(vector)<BooleanClause*> clauses;
    Query* q = NULL;
    int32_t mods = MOD_NONE;
    int32_t conj = CONJ_NONE;

    try{
        //match for MODIFIER
        mods = MatchModifier();
        //match for CLAUSE
        q = MatchClause(field);
        AddClause(clauses, CONJ_NONE, mods, q);

        // match for CLAUSE*
        while(true){
            QueryToken* p = tokens->peek();
            if(p->Type == QueryToken::EOF_){
                QueryToken* qt = MatchQueryToken(QueryToken::EOF_);
                _CLDELETE(qt);
                break;
            }
            if(p->Type == QueryToken::RPAREN){
                //The closing parenthesis is left for MatchClause to consume
                break;
            }

            //match for a conjunction (AND OR NOT)
            conj = MatchConjunction();
            //match for a modifier
            mods = MatchModifier();

            q = MatchClause(field);
            if ( q != NULL )
                AddClause(clauses, conj, mods, q);
        }
    }catch(...){
        //A later clause failed to parse: release the clauses gathered so far
        //instead of leaking them. Deleting a clause is expected to delete its
        //query as well, since deleteQuery has not been cleared on them here.
        while(!clauses.empty()){
            BooleanClause* pending = clauses.back();
            clauses.pop_back();
            _CLDELETE(pending);
        }
        throw;
    }

    // finalize query
    if(clauses.size() == 1){ //bvk: removed this && firstQuery != NULL
        BooleanClause* c = clauses[0];

        //Condition check to be sure clauses[0] is valid; done before c is
        //dereferenced
        CND_CONDITION(c != NULL, "c is NULL");

        Query* single = c->query;

        //Tell the boolean clause not to delete its query, it is handed to the caller
        c->deleteQuery=false;

        //Clear the clauses list
        clauses.clear();
        _CLDELETE(c);

        return single;
    }

    return GetBooleanQuery(clauses);
}
Query* QueryParser::MatchClause(const TCHAR* field){
    //Func - matches for CLAUSE
    //       CLAUSE ::= [TERM <COLON>] ( TERM | (<LPAREN> QUERY <RPAREN>))
    //Pre  - tokens != NULL
    //       field is the field used when the clause carries no "name:" prefix
    //Post - The tokens that make up the clause have been consumed and the
    //       Query returned by MatchQuery (parenthesised form) or MatchTerm
    //       has been returned. If matching throws, the temporary field name
    //       and any sub-query built so far are released before the exception
    //       is propagated.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    Query* q = NULL;
    const TCHAR* sfield = field;
    bool delField = false;      //true when sfield is a copy owned by this function
    QueryToken *DelToken = NULL;

    //match for [TERM <COLON>]
    QueryToken* term = tokens->extract();
    if(term->Type == QueryToken::TERM && tokens->peek()->Type == QueryToken::COLON){
        DelToken = MatchQueryToken(QueryToken::COLON);
        CND_CONDITION(DelToken != NULL,"DelToken is NULL");
        _CLDELETE(DelToken);

        //The term in front of the colon is the field name for this clause
        TCHAR* tmp = STRDUP_TtoT(term->Value);
        discardEscapeChar(tmp);
        delField = true;
        sfield = tmp;
        _CLDELETE(term);
    }else{
        //Not a field prefix: put the token back for the matchers below
        tokens->push(term);
        term = NULL;
    }

    try{
        // match for
        // TERM | (<LPAREN> QUERY <RPAREN>)
        if(tokens->peek()->Type == QueryToken::LPAREN){
            DelToken = MatchQueryToken(QueryToken::LPAREN);
            CND_CONDITION(DelToken != NULL,"DelToken is NULL");
            _CLDELETE(DelToken);

            q = MatchQuery(sfield);

            //DSR:2004.11.01:
            //If exception is thrown while trying to match trailing parenthesis,
            //need to prevent q from leaking.
            try{
                DelToken = MatchQueryToken(QueryToken::RPAREN);
                CND_CONDITION(DelToken != NULL,"DelToken is NULL");
                _CLDELETE(DelToken);
            }catch(...) {
                _CLDELETE(q);
                throw;
            }
        }else{
            q = MatchTerm(sfield);
        }
    }catch(...){
        //The duplicated field name must not leak when the clause body fails
        //to parse.
        if ( delField ){
            _CLDELETE_CARRAY(sfield);
        }
        throw;
    }

    if ( delField ){
        _CLDELETE_CARRAY(sfield);
    }
    return q;
}
Query* QueryParser::MatchTerm(const TCHAR* field){
    //Func - matches for TERM
    //       TERM ::= TERM | PREFIXTERM | WILDTERM | NUMBER
    //                [ <FUZZY> ] [ <CARAT> <NUMBER> [<FUZZY>]]
    //              | (<RANGEIN> | <RANGEEX>) [<CARAT> <NUMBER>]
    //              | <QUOTED> [SLOP] [<CARAT> <NUMBER>]
    //Pre  - tokens != NULL
    //       field is the field name passed to the Get*Query / ParseRangeQuery
    //       factories
    //Post - The tokens of the term have been consumed and deleted. The Query
    //       produced by the matching factory is returned, with its boost set
    //       when a "^NUMBER" suffix was present; q stays NULL when the token
    //       type is not handled by the switch or the factory returned NULL.
    //       If matching throws, the term, slop and boost tokens are deleted
    //       before the exception is propagated.

    CND_PRECONDITION(tokens != NULL, "tokens is NULL");

    QueryToken* term = NULL;
    QueryToken* slop = NULL;
    QueryToken* boost = NULL;

    bool prefix = false;
    bool wildcard = false;
    bool fuzzy = false;
    bool rangein = false;

    Query* q = NULL;

    term = tokens->extract();
    QueryToken* DelToken = NULL; //Token that is about to be deleted

    try{
        switch(term->Type){
            case QueryToken::TERM:
            case QueryToken::NUMBER:
            case QueryToken::PREFIXTERM:
            case QueryToken::WILDTERM:
            { //start case
                //Check if type of QueryToken term is a prefix term
                if(term->Type == QueryToken::PREFIXTERM){
                    prefix = true;
                }
                //Check if type of QueryToken term is a wildcard term
                if(term->Type == QueryToken::WILDTERM){
                    wildcard = true;
                }
                //Peek to see if the type of the next token is fuzzy term
                if(tokens->peek()->Type == QueryToken::FUZZY){
                    DelToken = MatchQueryToken(QueryToken::FUZZY);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    fuzzy = true;
                }
                if(tokens->peek()->Type == QueryToken::CARAT){
                    DelToken = MatchQueryToken(QueryToken::CARAT);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);

                    boost = MatchQueryToken(QueryToken::NUMBER);

                    if(tokens->peek()->Type == QueryToken::FUZZY){
                        DelToken = MatchQueryToken(QueryToken::FUZZY);
                        CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                        _CLDELETE(DelToken);
                        fuzzy = true;
                    }
                } //end if type==CARAT

                discardEscapeChar(term->Value); //clean up

                if(wildcard){
                    q = GetWildcardQuery(field,term->Value);
                }else if(prefix){
                    //Create a PrefixQuery
                    //discard the trailing *; guarded so an empty value cannot
                    //index before the start of the buffer
                    const size_t len = _tcslen(term->Value);
                    if(len > 0){
                        term->Value[len-1] = 0;
                    }
                    q = GetPrefixQuery(field,term->Value);
                }else if(fuzzy){
                    //Create a FuzzyQuery
                    //Remove a trailing ~ if present
                    const size_t len = _tcslen(term->Value);
                    if(len > 0 && term->Value[len-1] == '~'){
                        term->Value[len-1] = '\0';
                    }
                    q = GetFuzzyQuery(field,term->Value);
                }else{
                    q = GetFieldQuery(field, term->Value);
                }
                break;
            }
            case QueryToken::RANGEIN:
            case QueryToken::RANGEEX:{
                if(term->Type == QueryToken::RANGEIN){
                    rangein = true;
                }
                if(tokens->peek()->Type == QueryToken::CARAT){
                    DelToken = MatchQueryToken(QueryToken::CARAT);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    boost = MatchQueryToken(QueryToken::NUMBER);
                }

                //Strip the opening and closing bracket from the range text
                TCHAR* noBrackets = term->Value + 1;
                const size_t len = _tcslen(noBrackets);
                if(len > 0){
                    noBrackets[len-1] = 0;
                }
                q = ParseRangeQuery(field, noBrackets, rangein);
                break;
            }
            case QueryToken::QUOTED:{
                if(tokens->peek()->Type == QueryToken::SLOP){
                    slop = MatchQueryToken(QueryToken::SLOP);
                }
                if(tokens->peek()->Type == QueryToken::CARAT){
                    DelToken = MatchQueryToken(QueryToken::CARAT);
                    CND_CONDITION(DelToken !=NULL, "DelToken is NULL");
                    _CLDELETE(DelToken);
                    boost = MatchQueryToken(QueryToken::NUMBER);
                }

                //remove the quotes
                TCHAR* quotedValue = term->Value+1;
                const size_t len = _tcslen(quotedValue);
                if(len > 0){
                    quotedValue[len-1] = '\0';
                }

                int32_t islop = phraseSlop;
                if(slop != NULL ){
                    try {
                        TCHAR* end; //todo: should parse using float...
                        //slop->Value+1 skips the first character of the slop token
                        islop = (int32_t)_tcstoi64(slop->Value+1, &end, 10);
                    }catch(...){
                        //ignored
                    }
                }
                q = GetFieldQuery(field, quotedValue, islop);
                break;
            }
            default:
                //Any other token type yields no query; the token is still
                //consumed and deleted below
                break;
        } // end of switch
    }catch(...){
        //A grammar error (or a failing query factory) must not leak the tokens
        //that were already pulled off the list. q is only assigned by the last
        //statement of each case, so there is no query to release here.
        _CLDELETE(term);
        _CLDELETE(slop);
        _CLDELETE(boost);
        throw;
    }

    _CLDELETE(slop);
    _CLDELETE(term);

    if( boost != NULL ){
        if( q != NULL ){
            qreal f = 1.0F;
            try {
                TCHAR* tmp;
                f = _tcstod(boost->Value, &tmp);
            }catch(...){
                //ignored
            }
            q->setBoost( f);
        }
        //Delete the boost token even when no query was produced, otherwise it
        //would leak
        _CLDELETE(boost);
    }

    return q;
}
QueryToken* QueryParser::MatchQueryToken(QueryToken::Types expectedType){
    //Func - Removes the next token from the token list and hands it to the
    //       caller when it has the expected type; otherwise reports a parser
    //       error through throwParserException
    //Pre  - tokens != NULL
    //Post - On a match the extracted token is returned and the caller is
    //       responsible for deleting it. On a mismatch the extracted token
    //       has been deleted before the error is reported.

    CND_PRECONDITION(tokens != NULL,"tokens is NULL");

    //Nothing left to match against
    if(tokens->count() == 0){
        throwParserException(_T("Error: Unexpected end of program"),' ',0,0);
    }

    //Take the next token off the list
    QueryToken* candidate = tokens->extract();

    //Happy path: the token is of the requested type
    if (candidate->Type == expectedType){
        return candidate;
    }

    //Mismatch: format the message first, because it needs the token's type,
    //then drop the token and report the error
    TCHAR message[200];
    _sntprintf(message,200,_T("Error: Unexpected QueryToken: %d, expected: %d"),candidate->Type,expectedType);
    _CLDELETE(candidate);
    throwParserException(message,' ',0,0);

    //Only reached if throwParserException returns, which is not expected
    return candidate;
}
void QueryParser::ExtractAndDeleteToken(void){
//Func - Extracts the first token from the Tokenlist tokenlist
// and destroys it
//Pre - true
//Post - The first token has been extracted and destroyed
CND_PRECONDITION(tokens != NULL, "tokens is NULL");
//Extract the token from the TokenList tokens
QueryToken* t = tokens->extract();
//Condition Check Token may not be NULL
CND_CONDITION(t != NULL, "Token is NULL");
//Delete Token
_CLDELETE(t);
}
CL_NS_END