/* | |
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team | |
* | |
* Distributable under the terms of either the Apache License (Version 2.0) or | |
* the GNU Lesser General Public License, as specified in the COPYING file. | |
* | |
* Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. | |
*/ | |
#include "CLucene/StdHeader.h" | |
#include "TermInfosReader.h" | |
#include "CLucene/store/Directory.h" | |
#include "CLucene/util/Misc.h" | |
#include "FieldInfos.h" | |
#include "Term.h" | |
#include "Terms.h" | |
#include "TermInfo.h" | |
#include "TermInfosWriter.h" | |
CL_NS_USE(store) | |
CL_NS_USE(util) | |
CL_NS_DEF(index) | |
TermInfosReader::TermInfosReader(Directory* dir, const QString& seg,
    FieldInfos* fis)
    : directory(dir)
    , fieldInfos(fis)
{
    //Func - Constructor.
    //       Opens the term infos file (.tis) and the term info index file (.tii)
    //       of the segment, each wrapped in its own SegmentTermEnum. The index
    //       arrays are not filled here; they start out as NULL.
    //Pre  - dir points to a valid Directory
    //       fis points to a valid FieldInfos instance
    //       seg is not empty and holds the name of the segment
    //Post - origEnum and indexEnum have been created and _size holds the size
    //       reported by the .tis enumeration. If opening either file throws,
    //       whatever was already opened is released through close() and the
    //       exception is rethrown.

    CND_PRECONDITION(!seg.isEmpty(), "seg is NULL");

    //Remember the name of the segment
    segment = seg;

    //No part of the term info index is held in memory yet
    indexTerms = NULL;
    indexInfos = NULL;
    indexPointers = NULL;
    indexTermsLength = 0;

    //Null both enumerators up front so that close() can tell which of them
    //exist when it is called from the error path below
    origEnum = NULL;
    indexEnum = NULL;

    //Build the file names of the term infos file and the term info index file
    QString tisFile = Misc::segmentname(segment, QLatin1String(".tis"));
    QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii"));

    try {
        //Enumerator over all the terms of the segment
        origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
        //Enumerator over the term info index of the segment
        indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);
    } catch (...) {
        //The destructor does not run for a partially constructed object, so
        //release an already opened enumerator (and its input) here
        close();
        throw;
    }

    //Check if the enumerators point to valid instances
    CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
    CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");

    //Store the size of the term enumeration
    _size = origEnum->size;
}
TermInfosReader::~TermInfosReader()
{
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed

    //All cleanup lives in close(): it releases both enumerators together with
    //their inputs as well as the indexTerms, indexInfos and indexPointers arrays
    close();
}
void TermInfosReader::close() | |
{ | |
//Func - Close the enumeration of TermInfos | |
//Pre - true | |
//Post - The _enumeration has been closed and the arrays | |
//Check if indexTerms and indexInfos exist | |
if (indexTerms && indexInfos){ | |
//Iterate through arrays indexTerms and indexPointer to | |
//destroy their elements | |
#ifdef _DEBUG | |
for (int32_t i = 0; i < indexTermsLength; ++i) { | |
if (indexTerms[i].__cl_refcount != 1) { | |
CND_PRECONDITION(indexTerms[i].__cl_refcount == 1, | |
"TermInfosReader term was references more than internally"); | |
} | |
// _CLDECDELETE(indexTerms[i]); | |
//_CLDELETE(indexInfos[i]); | |
} | |
#endif | |
//Delete the arrays | |
_CLDELETE_ARRAY(indexTerms); | |
_CLDELETE_ARRAY(indexInfos); | |
} | |
//Delete the arrays | |
_CLDELETE_ARRAY(indexPointers); | |
if (origEnum != NULL) { | |
origEnum->close(); | |
//Get a pointer to IndexInput used by the enumeration but | |
//instantiated in the constructor by directory.open( tisFile ) | |
IndexInput *is = origEnum->input; | |
//Delete the enumuration enumerator | |
_CLDELETE(origEnum); | |
//Delete the IndexInput | |
_CLDELETE(is); | |
} | |
if (indexEnum != NULL){ | |
indexEnum->close(); | |
//Get a pointer to IndexInput used by the enumeration but | |
//instantiated in the constructor by directory.open( tiiFile ) | |
IndexInput *is = indexEnum->input; | |
//Delete the enumuration enumerator | |
_CLDELETE(indexEnum); | |
//Delete the IndexInput | |
_CLDELETE(is); | |
} | |
} | |
int64_t TermInfosReader::size() const
{
    //Func - Reports the size of the enumeration of TermInfos
    //Pre  - true
    //Post - The value stored in _size by the constructor has been returned

    return _size;
}
Term* TermInfosReader::get(const int32_t position)
{
    //Func - Returns the nth term in the set
    //Pre  - position >= 0
    //Post - The term at the requested position has been returned, or NULL if
    //       the set is empty, no enumerator is available or the position lies
    //       past the last term

    //With a size of 0 there are no terms at all
    if (_size == 0)
        return NULL;

    SegmentTermEnum* enumerator = getEnum();
    if (enumerator == NULL)
        return NULL;

    //Sequential access: when the enumerator still has a current term and the
    //requested position lies within one index interval ahead of it, scan
    //forward instead of seeking
    if (enumerator->term(false) != NULL
        && position >= enumerator->position
        && position < (enumerator->position + enumerator->indexInterval)) {
        return scanEnum(position);
    }

    //Random access: seekEnum() reads indexTerms, indexInfos and indexPointers,
    //so the term info index has to be in memory before seeking
    ensureIndexIsRead();
    seekEnum(position / enumerator->indexInterval);

    //Scan from the index entry to the requested position
    return scanEnum(position);
}
// TODO: there is currently no way to clean up after a thread that has ended;
// the enumerator that was created for that thread stays around. Hopefully this
// won't be too big a problem... solutions anyone?
SegmentTermEnum* TermInfosReader::getEnum()
{
    //Func - Returns the enumerator stored in enumerators, creating and
    //       storing one through terms() when none has been stored yet
    //Pre  - true
    //Post - The stored enumerator has been returned

    SegmentTermEnum* cached = enumerators.get();
    if (cached != NULL)
        return cached;

    //Nothing stored yet: create an enumerator and remember it
    SegmentTermEnum* created = terms();
    enumerators.set(created);
    return created;
}
TermInfo* TermInfosReader::get(const Term* term)
{
    //Func - Looks up the TermInfo that belongs to a term
    //Pre  - term points to a valid Term
    //Post - The result of scanning the enumerator for term has been returned:
    //       its TermInfo when the term was found, otherwise NULL

    //An empty enumeration cannot contain the term
    if (_size == 0)
        return NULL;

    ensureIndexIsRead();

    SegmentTermEnum* enumerator = getEnum();

    //Sequential access optimisation: decide whether the term can be reached by
    //scanning forward from where the cached enumerator currently stands
    bool scanWithoutSeek = false;

    //The enumerator must not be at its end
    if (enumerator->term(false) != NULL) {
        //term lies after the previous term of the enumerator ...
        const bool afterPrevious = enumerator->prev != NULL
            && term->compareTo(enumerator->prev) > 0;

        //... or at/after its current term
        if (afterPrevious || term->compareTo(enumerator->term(false)) >= 0) {
            //Index entry that follows the block the enumerator is currently in
            const int32_t _enumOffset = (int32_t)
                (enumerator->position / enumerator->indexInterval) + 1;

            //Either there is no following index entry, or term sorts before it
            scanWithoutSeek = indexTermsLength == _enumOffset
                || term->compareTo(&indexTerms[_enumOffset]) < 0;
        }
    }

    //Otherwise reposition the enumerator on the index entry for term first
    if (!scanWithoutSeek)
        seekEnum(getIndexOffset(term));

    //Scan forward to term and hand back its TermInfo
    return scanEnum(term);
}
int64_t TermInfosReader::getPosition(const Term* term)
{
    //Func - Returns the position of a Term in the set
    //Pre  - term points to a valid Term
    //Post - If term was found its position has been returned, otherwise -1

    //An empty enumeration cannot contain the term
    if (_size == 0)
        return -1;

    ensureIndexIsRead();

    //Start at the index entry that belongs to term
    int32_t indexOffset = getIndexOffset(term);
    seekEnum(indexOffset);

    //Advance while the current term still sorts before the requested one
    SegmentTermEnum* enumerator = getEnum();
    while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}

    //When next() ran off the end there is no current term to compare with, so
    //check for NULL first (the same check scanEnum(const Term*) performs)
    if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)))
        return enumerator->position;

    return -1;
}
SegmentTermEnum* TermInfosReader::terms(const Term* term)
{
    //Func - Returns an enumeration of terms. With a term given, the clone is
    //       taken from the stored enumerator after it has been positioned by
    //       get(term); with NULL, it is taken from origEnum.
    //Pre  - term is NULL or points to a valid Term
    //Post - A clone of the chosen enumerator has been returned

    //Without a term the original enumerator is the one to clone
    SegmentTermEnum* source = origEnum;

    if (term != NULL) {
        //get() positions the stored enumerator; the TermInfo it returns is
        //not needed here, so delete it right away
        TermInfo* ti = get(term);
        _CLDELETE(ti);

        source = getEnum();
    }

    //Clone the chosen enumeration
    SegmentTermEnum* cln = source->clone();

    //Check if cln points to a valid instance
    CND_CONDITION(cln != NULL, "cln is NULL");

    return cln;
}
void TermInfosReader::ensureIndexIsRead() | |
{ | |
//Func - Reads the term info index file or .tti file. | |
// This file contains every IndexInterval-th entry from the .tis file, | |
// along with its location in the "tis" file. This is designed to be | |
// read entirely into memory and used to provide random access to the | |
// "tis" file. | |
//Pre - indexTerms = NULL | |
// indexInfos = NULL | |
// indexPointers = NULL | |
//Post - The term info index file has been read into memory | |
SCOPED_LOCK_MUTEX(THIS_LOCK) | |
if ( indexTerms != NULL ) | |
return; | |
try { | |
indexTermsLength = (size_t)indexEnum->size; | |
// Instantiate an block of Term's,so that each one doesn't have to be new'd | |
indexTerms = _CL_NEWARRAY(Term,indexTermsLength); | |
// Check if is indexTerms is a valid array | |
CND_CONDITION(indexTerms != NULL, | |
"No memory could be allocated for indexTerms"); | |
// Instantiate an big block of TermInfo's, so that each one doesn't | |
// have to be new'd | |
indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength); | |
// Check if is indexInfos is a valid array | |
CND_CONDITION(indexInfos != NULL, | |
"No memory could be allocated for indexInfos"); | |
// Instantiate an array indexPointers that contains pointers to the | |
// term info index file | |
indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength); | |
// Check if is indexPointers is a valid array | |
CND_CONDITION(indexPointers != NULL, | |
"No memory could be allocated for indexPointers"); | |
//Iterate through the terms of indexEnum | |
for (int32_t i = 0; indexEnum->next(); ++i) { | |
indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text()); | |
indexEnum->getTermInfo(&indexInfos[i]); | |
indexPointers[i] = indexEnum->indexPointer; | |
} | |
} _CLFINALLY ( | |
indexEnum->close(); | |
// Close and delete the IndexInput is. The close is done by the destructor. | |
_CLDELETE( indexEnum->input ); | |
_CLDELETE( indexEnum ); | |
); | |
} | |
int32_t TermInfosReader::getIndexOffset(const Term* term)
{
    //Func - Binary search over indexTerms for the greatest index entry that
    //       is less than or equal to term
    //Pre  - term points to a valid Term
    //       indexTerms != NULL
    //Post - The offset of that entry has been returned; this is one below the
    //       first entry when term sorts before all entries

    CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");

    int32_t low = 0;
    int32_t high = indexTermsLength - 1;

    while (low <= high) {
        //Probe the entry halfway between low and high
        const int32_t mid = (low + high) >> 1;

        CND_PRECONDITION(&indexTerms[mid] != NULL, "indexTerms[mid] is NULL");
        CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength");

        const int32_t order = term->compareTo(&indexTerms[mid]);

        //Exact hit: mid is the offset
        if (order == 0)
            return mid;

        if (order < 0) {
            //term sorts before the probe: continue in the lower half
            high = mid - 1;
        } else {
            //term sorts after the probe: continue in the upper half
            low = mid + 1;
        }
    }

    //No exact hit: high now stands on the last entry that sorts before term
    return high;
}
void TermInfosReader::seekEnum(const int32_t indexOffset) | |
{ | |
//Func - Reposition the current Term and TermInfo to indexOffset | |
//Pre - indexOffset >= 0 | |
// indexTerms != NULL | |
// indexInfos != NULL | |
// indexPointers != NULL | |
//Post - The current Term and Terminfo have been repositioned to indexOffset | |
CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); | |
CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); | |
CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); | |
CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); | |
SegmentTermEnum* enumerator = getEnum(); | |
enumerator->seek(indexPointers[indexOffset], | |
(indexOffset * enumerator->indexInterval) - 1, | |
&indexTerms[indexOffset], &indexInfos[indexOffset]); | |
} | |
TermInfo* TermInfosReader::scanEnum(const Term* term)
{
    //Func - Scans the stored enumerator forward from its current term to the
    //       requested term and returns the matching TermInfo
    //Pre  - term points to a valid Term
    //Post - If the enumerator ended up on a term equal to the requested one,
    //       its TermInfo has been returned, otherwise NULL

    SegmentTermEnum* enumerator = getEnum();
    enumerator->scanTo(term);

    //The scan ran off the end of the enumeration
    if (enumerator->term(false) == NULL)
        return NULL;

    //The scan stopped on a different term, so the requested one is not there
    if (!term->equals(enumerator->term(false)))
        return NULL;

    //Found: return the TermInfo of the current term
    return enumerator->getTermInfo();
}
Term* TermInfosReader::scanEnum(const int32_t position)
{
    //Func - Advances the stored enumerator until it has reached the requested
    //       position and returns the Term found there
    //Pre  - position >= 0
    //Post - The Term at the requested position has been returned, or NULL if
    //       the enumeration ended before that position was reached

    SegmentTermEnum* enumerator = getEnum();

    //Step forward while the enumerator is still short of the position
    while (enumerator->position < position) {
        if (enumerator->next())
            continue;

        //No further term: the requested position was too big
        return NULL;
    }

    //The enumerator now stands on the requested position
    return enumerator->term();
}
CL_NS_END |