blob: 116fa6d30184a3a37b450e0722a56f90eed532f4 [file] [log] [blame]
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "SegmentTermEnum.h"
#include "Terms.h"
#include "FieldInfos.h"
#include "Term.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"

#include <new>
CL_NS_USE(store)
CL_NS_DEF(index)
SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi):
	fieldInfos(fis){
//Func - Constructor. Reads the header of the term stream from i.
//Pre  - i holds a reference to an instance of IndexInput
//       fis holds a reference to an instance of FieldInfos
//       isi is stored in isIndex; when it is true next() also reads an
//       index pointer for every term
//Post - An instance of SegmentTermEnum has been created, or a CL_ERR_Runtime
//       error has been thrown because the stored format version is lower
//       than TermInfosWriter::FORMAT

	input = i;
	position = -1;
	isIndex = isi;
	indexPointer = 0;
	buffer = NULL;
	bufferLength = 0;
	prev = NULL;
	formatM1SkipInterval = 0;
	//Set isClone to false as the instance is not clone of another instance
	isClone = false;
	//The header of a format -1 index stream does not carry an interval, so
	//give indexInterval a defined value instead of leaving it uninitialized
	indexInterval = 0;

	int32_t firstInt = input->readInt();
	if (firstInt >= 0) {
		// original-format file, without explicit format version number
		format = 0;
		size = firstInt;
		// back-compatible settings
		indexInterval = 128;
		skipInterval = LUCENE_INT32_MAX_SHOULDBE; // switch off skipTo optimization
	} else {
		// we have a format version number
		format = firstInt;
		// check that it is a format we can understand
		if (format < TermInfosWriter::FORMAT){
			// The prefix is 24 characters and %d can expand to 11 more
			// ("-2147483648"), so 30 characters were not enough. Terminate
			// explicitly as well, because some _sntprintf implementations
			// do not write a terminator when the output is truncated.
			TCHAR err[64];
			_sntprintf(err,64,_T("Unknown format version: %d"), format);
			err[63] = 0;
			_CLTHROWT(CL_ERR_Runtime,err);
		}
		size = input->readLong(); // read the size
		if(format == -1){
			if (!isIndex) {
				indexInterval = input->readInt();
				formatM1SkipInterval = input->readInt();
			}
			// switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in
			// skipTo implementation of these versions
			skipInterval = LUCENE_INT32_MAX_SHOULDBE;
		}else{
			indexInterval = input->readInt();
			skipInterval = input->readInt();
		}
	}

	//Allocate the owned objects only after the header has been read
	//successfully: the destructor does not run when a constructor throws,
	//so allocating them earlier would leak them on a bad format or read error.
	//Instantiate a Term with empty field, empty text and which is interned (see term.h what interned means)
	_term = _CLNEW Term;
	termInfo = _CLNEW TermInfo();
}
SegmentTermEnum::SegmentTermEnum(const SegmentTermEnum& clone):
	fieldInfos(clone.fieldInfos)
{
//Func - Copy constructor
//Pre  - clone holds a valid reference to SegmentTermEnum
//Post - This instance carries the same state as clone, but works on its own
//       cloned input stream and its own copies of the terms, the TermInfo
//       and the text buffer

	//Mark this instance as a clone; the destructor then closes and deletes input
	isClone = true;

	//Scalar state is taken over unchanged
	position             = clone.position;
	isIndex              = clone.isIndex;
	indexPointer         = clone.indexPointer;
	size                 = clone.size;
	format               = clone.format;
	indexInterval        = clone.indexInterval;
	skipInterval         = clone.skipInterval;
	formatM1SkipInterval = clone.formatM1SkipInterval;
	bufferLength         = clone.bufferLength;

	//A private input stream
	input = clone.input->clone();

	//A private copy of the current term, if there is one
	if ( clone._term == NULL ){
		_term = NULL;
	}else{
		_term = _CLNEW Term;
		_term->set(clone._term,clone._term->text());
	}

	termInfo = _CLNEW TermInfo(clone.termInfo);

	//A private copy of the text buffer: bufferLength+1 characters are
	//allocated and the first bufferLength characters are copied
	if ( clone.buffer == NULL ){
		buffer = NULL;
	}else{
		buffer = (TCHAR*)malloc(sizeof(TCHAR) * (clone.bufferLength+1));
		memcpy(buffer,clone.buffer,bufferLength * sizeof(TCHAR));
	}

	//A private copy of the previous term, if there is one
	if ( clone.prev == NULL )
		prev = NULL;
	else
		prev = _CLNEW Term(clone.prev->field(),clone.prev->text(),false);
}
SegmentTermEnum::~SegmentTermEnum(){
//Func - Destructor
//Pre  - true
//Post - prev, _term, buffer and termInfo have been released. If this instance
//       is a clone, input has been closed and deleted as well; otherwise
//       input is left untouched.
//todo: revisit this... close() should clean up most of everything.

	//Release the references held on the previous and the current term
	_CLDECDELETE(prev);
	_CLDECDELETE(_term);

	//Release the text buffer (free accepts NULL)
	free(buffer);

	_CLDELETE(termInfo);

	//Only a clone closes and deletes its input stream
	if ( !isClone )
		return;

	input->close();
	_CLDELETE(input);
}
bool SegmentTermEnum::next(){
//Func - Moves the current of the set to the next in the set
//Pre - true
//Post - If the end has been reached NULL is returned otherwise the term has
// become the next Term in the enumeration
//Increase position by and and check if the end has been reached
if (position++ >= size-1) {
//delete term
_CLDECDELETE(_term);
return false;
}
//delete the previous enumerated term
Term* tmp=NULL;
if ( prev != NULL ){
int32_t usage = prev->__cl_refcount;
if ( usage > 1 ){
_CLDECDELETE(prev); //todo: tune other places try and delete its term
}else
tmp = prev; //we are going to re-use this term
}
//prev becomes the current enumerated term
prev = _term;
//term becomes the next term read from inputStream input
_term = readTerm(tmp);
//Read docFreq, the number of documents which contain the term.
termInfo->docFreq = input->readVInt();
//Read freqPointer, a pointer into the TermFreqs file (.frq)
termInfo->freqPointer += input->readVLong();
//Read proxPointer, a pointer into the TermPosition file (.prx).
termInfo->proxPointer += input->readVLong();
if(format == -1){
// just read skipOffset in order to increment file pointer;
// value is never used since skipTo is switched off
if (!isIndex) {
if (termInfo->docFreq > formatM1SkipInterval) {
termInfo->skipOffset = input->readVInt();
}
}
}else{
if (termInfo->docFreq >= skipInterval)
termInfo->skipOffset = input->readVInt();
}
//Check if the enumeration is an index
if (isIndex)
//read index pointer
indexPointer += input->readVLong();
return true;
}
Term* SegmentTermEnum::term() {
//Func - Returns the current term
//Pre  - next() must have been called once
//Post - The current term has been returned through _CL_POINTER (per the
//       original notes this hands it out with an increased reference counter)

	Term* current = _CL_POINTER(_term);
	return current;
}
Term* SegmentTermEnum::term(bool pointer) {
//Func - Returns the current term
//Pre  - pointer indicates whether the term must be handed out through
//       _CL_POINTER (true) or as the plain member pointer (false)
//Post - The current term has been returned

	if ( !pointer )
		return _term;

	return _CL_POINTER(_term);
}
void SegmentTermEnum::scanTo(const Term *term){
//Func - Scan for Term without allocating new Terms
//Pre  - term != NULL
//Post - The enumeration has been advanced until term no longer compares
//       greater than the current term, or until next() reported the end

	for (;;){
		//Stop as soon as the current term is not smaller than the wanted one
		if ( term->compareTo(this->_term) <= 0 )
			break;
		//Stop when the enumeration is exhausted
		if ( !next() )
			break;
	}
}
void SegmentTermEnum::close() {
//Func - Closes the enumeration to further activity, freeing resources.
//Pre - true
//Post - The inputStream input has been closed
input->close();
}
int32_t SegmentTermEnum::docFreq() const {
//Func - Returns the document frequency of the current term
//Pre  - termInfo != NULL
//       next() must have been called once
//Post - The docFreq value held by termInfo has been returned

	const int32_t frequency = termInfo->docFreq;
	return frequency;
}
void SegmentTermEnum::seek(const int64_t pointer, const int32_t p, Term* t, TermInfo* ti) {
//Func - Repositions term and termInfo within the enumeration
//Pre  - pointer >= 0
//       p >= 0 and contains the new position within the enumeration
//       t is a valid reference to a Term and is the new current term
//       ti is a valid reference to the TermInfo that belongs to t
//Post - input, position, _term, termInfo and buffer reflect the new position;
//       prev has been released

	//Move the stream and the logical position
	input->seek(pointer);
	position = p;

	//The current Term object can only be overwritten in place when it exists
	//and nobody else holds a reference to it; otherwise start with a new one
	const bool needFreshTerm = ( _term == NULL || _term->__cl_refcount > 1 );
	if ( needFreshTerm ){
		_CLDECDELETE(_term);
		_term = _CLNEW Term; //cannot use reference, because TermInfosReader uses non ref-counted array
	}
	//Copy the contents of t into our own term
	_term->set(t,t->text());

	//The previous term is no longer meaningful after a seek
	_CLDECDELETE(prev);

	//Change the current termInfo so it matches the new current term
	termInfo->set(ti);

	//Bring the text buffer in line with the new term
	if ( _term->textLength() < bufferLength )
		_tcsncpy(buffer,_term->text(),bufferLength); //buffer is large enough, just copy
	else
		growBuffer(_term->textLength(), true ); //grow and copy term text into buffer
}
TermInfo* SegmentTermEnum::getTermInfo()const {
//Func - Returns a newly allocated copy of the current termInfo
//Pre  - termInfo != NULL
//       next() must have been called once
//Post - A TermInfo constructed from *termInfo has been returned

	TermInfo* copy = _CLNEW TermInfo(*termInfo);
	return copy;
}
void SegmentTermEnum::getTermInfo(TermInfo* ti)const {
//Func - Copies the current termInfo into the caller supplied ti
//Pre  - ti contains a valid reference to TermInfo
//       termInfo != NULL
//       next() must have been called once
//Post - ti->set has been called with the current termInfo

	ti->set(this->termInfo);
}
int64_t SegmentTermEnum::freqPointer()const {
//Func - Returns the freqPointer of the current termInfo
//Pre  - termInfo != NULL
//       next() must have been called once
//Post - The freqPointer value held by termInfo has been returned

	const int64_t pointer = termInfo->freqPointer;
	return pointer;
}
int64_t SegmentTermEnum::proxPointer()const {
//Func - Returns the proxPointer of the current termInfo
//Pre  - termInfo != NULL
//       next() must have been called once
//Post - The proxPointer value held by termInfo has been returned

	const int64_t pointer = termInfo->proxPointer;
	return pointer;
}
SegmentTermEnum* SegmentTermEnum::clone() const {
//Func - Returns a clone of this instance
//Pre  - true
//Post - A new SegmentTermEnum created by the copy constructor from this
//       instance has been returned

	SegmentTermEnum* copy = _CLNEW SegmentTermEnum(*this);
	return copy;
}
Term* SegmentTermEnum::readTerm(Term* reuse) {
//Func - Reads the next term in the enumeration
//Pre  - reuse is either NULL or a Term that may be overwritten
//Post - The next term has been read from input. It has been stored in reuse
//       when reuse was given, otherwise in a newly created Term, and that
//       Term has been returned

	//Number of leading characters of buffer that are kept, followed by the
	//number of characters that are read from input
	int32_t keptChars = input->readVInt();
	int32_t newChars  = input->readVInt();

	//Length of the complete term text; the buffer needs room for it plus a terminator
	uint32_t textLength = keptChars + newChars;
	if ( textLength+1 > static_cast<uint32_t>(bufferLength) )
		growBuffer(textLength, false);

	//Read newChars characters into buffer starting at offset keptChars, then terminate
	input->readChars(buffer, keptChars, newChars);
	buffer[textLength] = 0;

	//The field is stored as a number and translated to its name through fieldInfos
	int32_t fieldNumber = input->readVInt();
	const TCHAR* fieldname = fieldInfos->fieldName(fieldNumber);

	//Fill the recycled Term, or a new one when nothing was handed in
	Term* result = ( reuse != NULL ) ? reuse : _CLNEW Term;
	result->set(fieldname, buffer, false);
	return result;
}
void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) {
//Func - Instantiate a buffer of length length+1
//Pre - length > 0
//Post - pre(buffer) has been deleted with its contents. A new buffer
// has been allocated of length length+1 and the text of term has been copied
// to buffer
//todo: we could guess that we will need to re-grow this
//buffer a few times...so start off with a reasonable grow
//value...
if ( bufferLength > length )
return;
//Store the new bufferLength
if ( length - bufferLength < LUCENE_SEGMENTTERMENUM_GROWSIZE )
bufferLength = length+LUCENE_SEGMENTTERMENUM_GROWSIZE;
else
bufferLength = length+1;
bool copy = buffer==NULL;
//Instantiate the new buffer + 1 is needed for terminator '\0'
if ( buffer == NULL )
buffer = (TCHAR*)malloc(sizeof(TCHAR) * (bufferLength+1));
else
buffer = (TCHAR*)realloc(buffer, sizeof(TCHAR) * (bufferLength+1));
if ( copy || force_copy){
//Copy the text of term into buffer
_tcsncpy(buffer,_term->text(),bufferLength);
}
}
CL_NS_END