blob: 43e152380e6f4cbac21b023e3a389e5151e64f88 [file] [log] [blame]
/*
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
*
* Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#include "CLucene/StdHeader.h"
#include "TermInfosReader.h"
#include "CLucene/store/Directory.h"
#include "CLucene/util/Misc.h"
#include "FieldInfos.h"
#include "Term.h"
#include "Terms.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"
CL_NS_USE(store)
CL_NS_USE(util)
CL_NS_DEF(index)
TermInfosReader::TermInfosReader(Directory* dir, const QString& seg,
    FieldInfos* fis)
    : directory(dir)
    , fieldInfos (fis)
{
    //Func - Constructor.
    //       Opens the Term Infos file (.tis) and the Term Info Index file (.tii)
    //       of the segment and wraps each one in a SegmentTermEnum. The index
    //       arrays themselves are only filled later by ensureIndexIsRead().
    //Pre  - dir is a reference to a valid Directory
    //       fis contains a valid reference to a FieldInfos instance
    //       seg is not empty and contains the name of the segment
    //Post - An instance has been created with both enumerators opened.
    //       If opening either file throws, everything opened so far has been
    //       released through close() and the exception has been rethrown.

    CND_PRECONDITION(!seg.isEmpty(), "seg is NULL");

    //Initialize the name of the segment
    segment = seg;

    //Nothing of the index has been loaded yet
    indexTerms = NULL;
    indexInfos = NULL;
    indexPointers = NULL;
    indexTermsLength = 0;

    //Start from a well defined state so that close() can be called safely
    //when one of the files can not be opened
    origEnum = NULL;
    indexEnum = NULL;
    _size = 0;

    //Create the file names of the Term Infos file and its index
    QString tisFile = Misc::segmentname(segment, QLatin1String(".tis"));
    QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii"));

    try {
        //Create the enumerator over all the terms of the segment
        origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
        //Create the enumerator over the term index of the segment
        indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);

        //Check if the enumerators point to valid instances
        CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
        CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");

        //Get the size of the enumeration and store it in size
        _size = origEnum->size;
    } catch (...) {
        //The destructor does not run for a constructor that throws, so the
        //enumerator (and its IndexInput) that was already opened must be
        //released here
        close();
        throw;
    }
}
TermInfosReader::~TermInfosReader()
{
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed

    //All teardown lives in close(): it releases the enumerators together with
    //the arrays indexTerms, indexInfos and indexPointers
    this->close();
}
void TermInfosReader::close()
{
    //Func - Close the enumeration of TermInfos
    //Pre  - true
    //Post - The arrays indexTerms, indexInfos and indexPointers have been
    //       deleted and both enumerators, together with the IndexInputs they
    //       were created on, have been closed and deleted.
    //       Calling close() again afterwards is harmless.

#ifdef _DEBUG
    //Nobody outside this reader is expected to still hold a reference to one
    //of the index terms at this point
    if (indexTerms != NULL) {
        for (int32_t i = 0; i < indexTermsLength; ++i) {
            CND_PRECONDITION(indexTerms[i].__cl_refcount == 1,
                "TermInfosReader term was references more than internally");
        }
    }
#endif

    //Delete the arrays. Each array is released on its own: when loading the
    //index failed halfway only some of them exist, and requiring all of them
    //to be present would leak the ones that were allocated.
    //The elements are allocated as one block per array, so they are destroyed
    //together with the array.
    _CLDELETE_ARRAY(indexTerms);
    _CLDELETE_ARRAY(indexInfos);
    _CLDELETE_ARRAY(indexPointers);

    if (origEnum != NULL) {
        origEnum->close();

        //Get a pointer to the IndexInput used by the enumeration but
        //instantiated in the constructor by directory->openInput( tisFile )
        IndexInput *is = origEnum->input;

        //Delete the enumerator first, then the IndexInput it was reading from
        _CLDELETE(origEnum);
        _CLDELETE(is);
    }

    if (indexEnum != NULL) {
        indexEnum->close();

        //Get a pointer to the IndexInput used by the enumeration but
        //instantiated in the constructor by directory->openInput( tiiFile )
        IndexInput *is = indexEnum->input;

        //Delete the enumerator first, then the IndexInput it was reading from
        _CLDELETE(indexEnum);
        _CLDELETE(is);
    }
}
int64_t TermInfosReader::size() const
{
    //Func - Reports how many terms the enumeration of TermInfos holds
    //Pre  - true
    //Post - The value cached in _size by the constructor has been returned

    return this->_size;
}
Term* TermInfosReader::get(const int32_t position)
{
    //Func - Returns the nth term in the set
    //Pre  - position >= 0
    //Post - The n-th term in the set has been returned, or NULL when the set is
    //       empty or position lies past the last term

    //Check if the size is 0 because then there are no terms
    if (_size == 0)
        return NULL;

    //seekEnum() indexes into indexTerms, indexInfos and indexPointers. These
    //only exist once the term index has been loaded, so make sure it is -
    //just like get(const Term*) and getPosition() do
    ensureIndexIsRead();

    SegmentTermEnum* enumerator = getEnum();
    //Without an enumerator there is nothing to scan or seek in
    if (enumerator == NULL)
        return NULL;

    if (enumerator->term(false) != NULL // term is at or past current
        && position >= enumerator->position
        && position < (enumerator->position + enumerator->indexInterval)) {
        return scanEnum(position); // can avoid seek
    }

    //random-access: must seek to the index entry in front of position
    seekEnum(position / enumerator->indexInterval);

    //Get the Term at position
    return scanEnum(position);
}
// TODO: currently there is no way of cleaning up a thread, if the thread ends.
// we are stuck with the terminfosreader of that thread. Hopefully this won't
// be too big a problem... solutions anyone?
SegmentTermEnum* TermInfosReader::getEnum()
{
    //Func - Returns the enumerator stored in enumerators, creating it on first use
    //Pre  - true
    //Post - The stored enumerator has been returned. If none was stored yet a
    //       new one was obtained from terms() and stored before returning it.
    // NOTE(review): enumerators looks like a per-thread store - confirm in the header

    SegmentTermEnum* cached = enumerators.get();
    if (cached != NULL)
        return cached;

    SegmentTermEnum* fresh = terms();
    enumerators.set(fresh);
    return fresh;
}
TermInfo* TermInfosReader::get(const Term* term)
{
    //Func - Returns a TermInfo for a term
    //Pre  - term holds a valid reference to term
    //Post - if term can be found its TermInfo has been returned otherwise NULL

    //An empty enumeration can not contain term
    if (_size == 0)
        return NULL;

    ensureIndexIsRead();

    SegmentTermEnum* cursor = getEnum();

    //Optimize sequential access: find out whether term can be reached by
    //scanning forward from where the cached enumerator currently stands
    bool reachableByScan = false;
    if (cursor->term(false) != NULL) {
        //term lies behind the previous term of the enumerator ...
        const bool behindPrev =
            cursor->prev != NULL && term->compareTo(cursor->prev) > 0;
        //... or at/behind the current term of the enumerator
        if (behindPrev || term->compareTo(cursor->term(false)) >= 0) {
            //Index entry that starts the block following the current one
            const int32_t nextBlock = (int32_t)
                (cursor->position / cursor->indexInterval) + 1;
            //Either there is no following block, or term sorts in front of
            //the term that starts it
            reachableByScan = indexTermsLength == nextBlock
                || term->compareTo(&indexTerms[nextBlock]) < 0;
        }
    }

    //Otherwise reposition the enumerator at the index entry for term first
    if (!reachableByScan)
        seekEnum(getIndexOffset(term));

    //Return the TermInfo for term
    return scanEnum(term);
}
int64_t TermInfosReader::getPosition(const Term* term)
{
    //Func - Returns the position of a Term in the set
    //Pre  - term holds a valid reference to a Term
    //Post - If term was found then its position is returned otherwise -1

    //if the enumeration is empty then return -1
    if (_size == 0)
        return -1;

    ensureIndexIsRead();

    //Retrieve the indexOffset for term and start scanning from there
    int32_t indexOffset = getIndexOffset(term);
    seekEnum(indexOffset);

    SegmentTermEnum* enumerator = getEnum();

    //Skip forward over every term that sorts in front of term
    while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}

    //When next() ran off the end of the enumeration there may be no current
    //term left to compare with, so check for NULL first - the same guard
    //scanEnum(const Term*) uses
    if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)))
        return enumerator->position;

    return -1;
}
SegmentTermEnum* TermInfosReader::terms(const Term* term)
{
    //Func - Returns an enumeration of terms starting at or after the named term.
    //       If term is NULL the enumeration is a copy of origEnum
    //Pre  - term is NULL or holds a valid reference to a Term
    //Post - A clone of the selected enumeration has been returned

    //Without a term the clone is taken from the original enumeration
    SegmentTermEnum* source = origEnum;

    if (term != NULL) {
        //get() is only called to move the cached enumerator to term;
        //the TermInfo it hands back is not needed and is deleted right away
        TermInfo* unused = get(term);
        _CLDELETE(unused);
        source = getEnum();
    }

    //Clone the entire enumeration
    SegmentTermEnum* copy = source->clone();

    //Check if copy points to a valid instance
    CND_CONDITION(copy != NULL, "cln is NULL");

    return copy;
}
void TermInfosReader::ensureIndexIsRead()
{
//Func - Reads the term info index file or .tii file, unless that was already done.
// This file contains every IndexInterval-th entry from the .tis file,
// along with its location in the "tis" file. This is designed to be
// read entirely into memory and used to provide random access to the
// "tis" file.
//Pre - true (when indexTerms is already set the call returns immediately)
//Post - indexTerms, indexInfos and indexPointers have been allocated with
// indexEnum->size entries and filled from indexEnum.
// indexEnum and its IndexInput have been closed and deleted.
// NOTE(review): the lock presumably serializes concurrent first calls - confirm
// what SCOPED_LOCK_MUTEX(THIS_LOCK) expands to
SCOPED_LOCK_MUTEX(THIS_LOCK)
// The index has been read before, nothing left to do
if ( indexTerms != NULL )
return;
try {
// The number of index entries is taken from the index enumerator
indexTermsLength = (size_t)indexEnum->size;
// Instantiate a block of Terms, so that each one doesn't have to be new'd
indexTerms = _CL_NEWARRAY(Term,indexTermsLength);
// Check if indexTerms is a valid array
CND_CONDITION(indexTerms != NULL,
"No memory could be allocated for indexTerms");
// Instantiate a big block of TermInfos, so that each one doesn't
// have to be new'd
indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength);
// Check if indexInfos is a valid array
CND_CONDITION(indexInfos != NULL,
"No memory could be allocated for indexInfos");
// Instantiate the array indexPointers; each entry receives the
// indexPointer that indexEnum reports for the matching index term
indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);
// Check if indexPointers is a valid array
CND_CONDITION(indexPointers != NULL,
"No memory could be allocated for indexPointers");
// Iterate through the terms of indexEnum and copy the term, its TermInfo
// and its pointer into slot i of the three arrays
for (int32_t i = 0; indexEnum->next(); ++i) {
indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text());
indexEnum->getTermInfo(&indexInfos[i]);
indexPointers[i] = indexEnum->indexPointer;
}
} _CLFINALLY (
// The index enumerator is not needed any more once the arrays are filled
// NOTE(review): _CLFINALLY presumably runs this on the exception path too - confirm
indexEnum->close();
// Delete the IndexInput of the enumerator, then the enumerator itself
_CLDELETE( indexEnum->input );
_CLDELETE( indexEnum );
);
}
int32_t TermInfosReader::getIndexOffset(const Term* term)
{
    //Func - Binary search over indexTerms for the offset of the greatest index
    //       entry which is less than or equal to term.
    //Pre  - term holds a reference to a valid term
    //       indexTerms != NULL
    //Post - The offset of an equal entry has been returned if there is one,
    //       otherwise the upper bound the search ended on

    //Check if indexTerms is a valid array
    CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");

    int32_t low = 0;
    int32_t high = indexTermsLength - 1;

    while (low <= high) {
        //Probe halfway between low and high
        const int32_t probe = (low + high) >> 1;

        //Check if indexTerms[probe] is a valid instance of Term
        CND_PRECONDITION(&indexTerms[probe] != NULL, "indexTerms[mid] is NULL");
        CND_PRECONDITION(probe < indexTermsLength, "mid >= indexTermsLength");

        //Negative: term sorts before the probe, positive: after it
        const int32_t order = term->compareTo(&indexTerms[probe]);

        //term has been found so return its position
        if (order == 0)
            return probe;

        if (order < 0) {
            //Continue in the lower half
            high = probe - 1;
        } else {
            //Continue in the upper half
            low = probe + 1;
        }
    }

    //No exact match: high now sits on the last entry in front of term
    return high;
}
void TermInfosReader::seekEnum(const int32_t indexOffset)
{
//Func - Reposition the current Term and TermInfo to indexOffset
//Pre - indexOffset >= 0
// indexTerms != NULL
// indexInfos != NULL
// indexPointers != NULL
//Post - The current Term and Terminfo have been repositioned to indexOffset
CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL");
CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");
SegmentTermEnum* enumerator = getEnum();
enumerator->seek(indexPointers[indexOffset],
(indexOffset * enumerator->indexInterval) - 1,
&indexTerms[indexOffset], &indexInfos[indexOffset]);
}
TermInfo* TermInfosReader::scanEnum(const Term* term)
{
    //Func - Scans the cached enumeration forward, starting at its current
    //       term, and returns the TermInfo of term when the scan ends on it.
    //Pre  - term contains a valid reference to a Term
    //Post - if term has been found the corresponding TermInfo has been returned
    //       otherwise NULL has been returned

    SegmentTermEnum* cursor = getEnum();
    cursor->scanTo(term);

    //No current term, or a current term other than the requested one:
    //term is not in the enumeration so no TermInfo can be returned
    if (cursor->term(false) == NULL || !term->equals(cursor->term(false)))
        return NULL;

    //Return the TermInfo instance about term
    return cursor->getTermInfo();
}
Term* TermInfosReader::scanEnum(const int32_t position)
{
    //Func - Steps the cached enumeration forward to the requested position and
    //       returns the Term located there
    //Pre  - position >= 0
    //Post - The Term at the requested position has been returned, or NULL when
    //       the enumeration ended before that position was reached

    SegmentTermEnum* cursor = getEnum();

    //Advance one term at a time until the enumerator is no longer in front
    //of the requested position
    for (; cursor->position < position; ) {
        const bool advanced = cursor->next();
        //Running out of terms means the requested position was too big
        if (!advanced)
            return NULL;
    }

    //Return the Term at the requested position
    return cursor->term();
}
CL_NS_END