blob: 11b1af1013caf4d4d7af6cbc6100da08666a9b6e [file] [log] [blame]
/*
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
*
* Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
*/
#include "CLucene/StdHeader.h"
#include "SegmentHeader.h"
#include "FieldInfos.h"
#include "FieldsReader.h"
#include "IndexReader.h"
#include "TermInfosReader.h"
#include "Terms.h"
#include "CLucene/search/Similarity.h"
CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(document)
CL_NS_USE(search)
CL_NS_DEF(index)
SegmentReader::Norm::Norm(IndexInput* instrm, int32_t n, SegmentReader* r,
const QString& seg)
: number(n)
, reader(r)
, segment(seg)
, in(instrm)
, bytes(NULL)
, dirty(false)
{
//Func - Constructor
//Pre - instrm is a valid reference to an IndexInput
//Post - A Norm instance has been created with an empty bytes array
bytes = NULL;
dirty = false;
}
SegmentReader::Norm::~Norm()
{
//Func - Destructor
//Pre - true
//Post - The IndexInput in has been deleted (and closed by its destructor)
// and the array too.
//Close and destroy the inputstream in-> The inputstream will be closed
// by its destructor. Note that the IndexInput 'in' actually is a pointer!!!!!
_CLDELETE(in);
//Delete the bytes array
_CLDELETE_ARRAY(bytes);
}
void SegmentReader::Norm::reWrite()
{
QString buf(segment + QLatin1String(".tmp"));
// NOTE: norms are re-written in regular directory, not cfs
IndexOutput* out = reader->getDirectory()->createOutput(buf);
try {
out->writeBytes(bytes, reader->maxDoc());
} _CLFINALLY (
out->close();
_CLDELETE(out)
);
QString fileName(segment);
if (reader->cfsReader == NULL)
fileName.append(QLatin1String(".f%1")).arg(number);
else // use a different file name if we have compound format
fileName.append(QLatin1String(".s%1")).arg(number);
reader->getDirectory()->renameFile(buf, fileName);
this->dirty = false;
}
SegmentReader::SegmentReader(SegmentInfo* si)
: IndexReader(si->getDir())
, _norms(false, false)
{
initialize(si);
}
SegmentReader::SegmentReader(SegmentInfos* sis, SegmentInfo* si)
: IndexReader(si->getDir(), sis, false)
, _norms(false, false)
{
initialize(si);
}
void SegmentReader::initialize(SegmentInfo* si)
{
//Pre - si-> is a valid reference to SegmentInfo instance
// identified by si->
//Post - All files of the segment have been read
deletedDocs = NULL;
ones = NULL;
//There are no documents yet marked as deleted
deletedDocsDirty = false;
normsDirty=false;
undeleteAll=false;
//Duplicate the name of the segment from SegmentInfo to segment
segment = si->name;
// make sure that all index files have been read or are kept open
// so that if an index update removes them we'll still have them
freqStream = NULL;
proxStream = NULL;
//instantiate a buffer large enough to hold a directory path
QString buf;
// Use compound file directory for some files, if it exists
Directory* cfsDir = getDirectory();
SegmentName(buf, CL_MAX_PATH, QLatin1String(".cfs"));
if (cfsDir->fileExists(buf)) {
cfsReader = _CLNEW CompoundFileReader(cfsDir, buf);
cfsDir = cfsReader;
}else
cfsReader = NULL;
// Create the name of the field info file with suffix .fnm in buf
SegmentName(buf, CL_MAX_PATH, QLatin1String(".fnm"));
fieldInfos = _CLNEW FieldInfos(cfsDir, buf );
// Condition check to see if fieldInfos points to a valid instance
CND_CONDITION(fieldInfos != NULL,
"No memory could be allocated for fieldInfos");
// Create the name of the frequence file with suffix .frq in buf
SegmentName(buf ,CL_MAX_PATH, QLatin1String(".frq"));
// Open an IndexInput freqStream to the frequency file
freqStream = cfsDir->openInput( buf );
// Condition check to see if freqStream points to a valid instance and was
// able to open the frequency file
CND_CONDITION(freqStream != NULL,
"IndexInput freqStream could not open the frequency file");
// Create the name of the prox file with suffix .prx in buf
SegmentName(buf, CL_MAX_PATH, QLatin1String(".prx"));
// Open an IndexInput proxStream to the prox file
proxStream = cfsDir->openInput( buf );
// Condition check to see if proxStream points to a valid instance and was
// able to open the prox file
CND_CONDITION(proxStream != NULL,
"IndexInput proxStream could not open proximity file");
// Instantiate a FieldsReader for reading the Field Info File
fieldsReader = _CLNEW FieldsReader(cfsDir, segment, fieldInfos);
// Condition check to see if fieldsReader points to a valid instance
CND_CONDITION(fieldsReader != NULL,
"No memory could be allocated for fieldsReader");
//Instantiate a TermInfosReader for reading the Term Dictionary .tis file
tis = _CLNEW TermInfosReader(cfsDir, segment, fieldInfos);
//Condition check to see if tis points to a valid instance
CND_CONDITION(tis != NULL,"No memory could be allocated for tis");
// Check if the segment has deletion according to the SegmentInfo instance
// si-> NOTE: the bitvector is stored using the regular directory, not cfs
if (hasDeletions(si)) {
//Create a deletion file with suffix .del
SegmentName(buf, CL_MAX_PATH, QLatin1String(".del"));
// Instantiate a BitVector that manages which documents have been deleted
deletedDocs = _CLNEW BitSet(getDirectory(), buf);
}
// Open the norm file. There's a norm file for each indexed field with a
// byte for each document. The .f[0-9]* file contains, for each document,
// a byte that encodes a value that is multiplied into the score for hits
// on that field
openNorms(cfsDir);
termVectorsReaderOrig = NULL;
if (fieldInfos->hasVectors()) // open term vector files only as needed
termVectorsReaderOrig = _CLNEW TermVectorsReader(cfsDir, segment, fieldInfos);
}
SegmentReader::~SegmentReader()
{
//Func - Destructor.
//Pre - doClose has been invoked!
//Post - the instance has been destroyed
doClose(); //this means that index reader doesn't need to be closed manually
_CLDELETE(fieldInfos);
_CLDELETE(fieldsReader);
_CLDELETE(tis);
_CLDELETE(freqStream);
_CLDELETE(proxStream);
_CLDELETE(deletedDocs);
_CLDELETE_ARRAY(ones);
_CLDELETE(termVectorsReaderOrig);
_CLDECDELETE(cfsReader);
}
void SegmentReader::doCommit()
{
QString bufdel(segment + QLatin1String(".del"));
if (deletedDocsDirty) { // re-write deleted
QString buftmp(segment + QLatin1String(".tmp"));
deletedDocs->write(getDirectory(), buftmp);
getDirectory()->renameFile(buftmp, bufdel);
}
if(undeleteAll && getDirectory()->fileExists(bufdel))
getDirectory()->deleteFile(bufdel, true);
if (normsDirty) { // re-write norms
NormsType::iterator itr = _norms.begin();
while (itr != _norms.end()) {
Norm* norm = itr->second;
if (norm->dirty) {
norm->reWrite();
}
++itr;
}
}
deletedDocsDirty = false;
normsDirty = false;
undeleteAll = false;
}
void SegmentReader::doClose()
{
//Func - Closes all streams to the files of a single segment
//Pre - fieldsReader != NULL
// tis != NULL
//Post - All streams to files have been closed
CND_PRECONDITION(fieldsReader != NULL, "fieldsReader is NULL");
CND_PRECONDITION(tis != NULL, "tis is NULL");
//Close the fieldsReader
fieldsReader->close();
//Close the TermInfosReader
tis->close();
//Close the frequency stream
if (freqStream != NULL){
freqStream->close();
}
//Close the prox stream
if (proxStream != NULL){
proxStream->close();
}
//Close the norm file
closeNorms();
if (termVectorsReaderOrig != NULL)
termVectorsReaderOrig->close();
if (cfsReader != NULL)
cfsReader->close();
}
bool SegmentReader::hasDeletions() const
{
return deletedDocs != NULL;
}
//static
bool SegmentReader::usesCompoundFile(SegmentInfo* si)
{
return si->getDir()->fileExists(si->name + QLatin1String(".cfs"));
}
//static
bool SegmentReader::hasSeparateNorms(SegmentInfo* si)
{
QString pattern(si->name);
pattern.append(QLatin1String(".s"));
size_t patternLength = pattern.length();
QStringList names = si->getDir()->list();
foreach (const QString& name, names) {
int length = name.length();
if (length > patternLength && name.left(patternLength) == pattern) {
if (name.at(patternLength) >= QLatin1Char('0')
&& name.at(patternLength) <= QLatin1Char('9')) {
return true;
}
}
}
return false;
}
bool SegmentReader::hasDeletions(const SegmentInfo* si)
{
//Func - Static method
// Checks if a segment managed by SegmentInfo si-> has deletions
//Pre - si-> holds a valid reference to an SegmentInfo instance
//Post - if the segement contains deleteions true is returned otherwise flas
//Check if the deletion file exists and return the result
QString f;
Misc::segmentname(f, CL_MAX_PATH, si->name, QLatin1String(".del"), -1);
return si->getDir()->fileExists(f);
}
//synchronized
void SegmentReader::doDelete(const int32_t docNum)
{
//Func - Marks document docNum as deleted
//Pre - docNum >=0 and DocNum < maxDoc()
// docNum contains the number of the document that must be
// marked deleted
//Post - The document identified by docNum has been marked deleted
SCOPED_LOCK_MUTEX(THIS_LOCK)
CND_PRECONDITION(docNum >= 0, "docNum is a negative number");
CND_PRECONDITION(docNum < maxDoc(),
"docNum is bigger than the total number of documents");
//Check if deletedDocs exists
if (deletedDocs == NULL) {
deletedDocs = _CLNEW BitSet(maxDoc());
//Condition check to see if deletedDocs points to a valid instance
CND_CONDITION(deletedDocs != NULL,
"No memory could be allocated for deletedDocs");
}
//Flag that there are documents marked deleted
deletedDocsDirty = true;
undeleteAll = false;
//Mark document identified by docNum as deleted
deletedDocs->set(docNum);
}
void SegmentReader::doUndeleteAll()
{
_CLDELETE(deletedDocs);
deletedDocsDirty = false;
undeleteAll = true;
}
void SegmentReader::files(QStringList& retarray)
{
//Func - Returns all file names managed by this SegmentReader
//Pre - segment != NULL
//Post - All filenames managed by this SegmentRead have been returned
CND_PRECONDITION(segment != NULL, "segment is NULL");
QString temp;
#define _ADD_SEGMENT(ext) \
temp = SegmentName(ext); \
if (getDirectory()->fileExists(temp)) \
retarray.push_back(temp);
//Add the name of the Field Info file
_ADD_SEGMENT(QLatin1String(".cfs"));
_ADD_SEGMENT(QLatin1String(".fnm"));
_ADD_SEGMENT(QLatin1String(".fdx"));
_ADD_SEGMENT(QLatin1String(".fdt"));
_ADD_SEGMENT(QLatin1String(".tii"));
_ADD_SEGMENT(QLatin1String(".tis"));
_ADD_SEGMENT(QLatin1String(".frq"));
_ADD_SEGMENT(QLatin1String(".prx"));
_ADD_SEGMENT(QLatin1String(".del"));
_ADD_SEGMENT(QLatin1String(".tvx"));
_ADD_SEGMENT(QLatin1String(".tvd"));
_ADD_SEGMENT(QLatin1String(".tvf"));
_ADD_SEGMENT(QLatin1String(".tvp"));
//iterate through the field infos
for (int32_t i = 0; i < fieldInfos->size(); ++i) {
//Get the field info for the i-th field
FieldInfo* fi = fieldInfos->fieldInfo(i);
//Check if the field has been indexed
if (fi->isIndexed && !fi->omitNorms) {
QString name;
if (cfsReader == NULL)
name = SegmentName(QLatin1String(".f"), i);
else
name = SegmentName(QLatin1String(".s"), i);
//The field has been indexed so add its norm file
if (getDirectory()->fileExists(name))
retarray.push_back(name);
}
}
}
TermEnum* SegmentReader::terms() const
{
//Func - Returns an enumeration of all the Terms and TermInfos in the set.
//Pre - tis != NULL
//Post - An enumeration of all the Terms and TermInfos in the set has been returned
CND_PRECONDITION(tis != NULL, "tis is NULL");
return tis->terms();
}
TermEnum* SegmentReader::terms(const Term* t) const
{
//Func - Returns an enumeration of terms starting at or after the named term t
//Pre - t != NULL
// tis != NULL
//Post - An enumeration of terms starting at or after the named term t
CND_PRECONDITION(t != NULL, "t is NULL");
CND_PRECONDITION(tis != NULL, "tis is NULL");
return tis->terms(t);
}
bool SegmentReader::document(int32_t n, Document* doc)
{
//Func - Returns a document identified by n
//Pre - n >=0 and identifies the document n
//Post - if the document has been deleted then an exception has been thrown
// otherwise a reference to the found document has been returned
SCOPED_LOCK_MUTEX(THIS_LOCK)
CND_PRECONDITION(n >= 0, "n is a negative number");
//Check if the n-th document has been marked deleted
if (isDeleted(n))
_CLTHROWA(CL_ERR_InvalidState, "attempt to access a deleted document" );
//Retrieve the n-th document
return fieldsReader->doc(n, doc);
}
bool SegmentReader::isDeleted(const int32_t n)
{
//Func - Checks if the n-th document has been marked deleted
//Pre - n >=0 and identifies the document n
//Post - true has been returned if document n has been deleted otherwise fralse
SCOPED_LOCK_MUTEX(THIS_LOCK)
CND_PRECONDITION(n >= 0, "n is a negative number");
//Is document n deleted
return (deletedDocs != NULL && deletedDocs->get(n));
}
TermDocs* SegmentReader::termDocs() const
{
//Func - Returns an unpositioned TermDocs enumerator.
//Pre - true
//Post - An unpositioned TermDocs enumerator has been returned
return _CLNEW SegmentTermDocs(this);
}
TermPositions* SegmentReader::termPositions() const
{
//Func - Returns an unpositioned TermPositions enumerator.
//Pre - true
//Post - An unpositioned TermPositions enumerator has been returned
return _CLNEW SegmentTermPositions(this);
}
int32_t SegmentReader::docFreq(const Term* t) const
{
//Func - Returns the number of documents which contain the term t
//Pre - t holds a valid reference to a Term
//Post - The number of documents which contain term t has been returned
//Get the TermInfo ti for Term t in the set
TermInfo* ti = tis->get(t);
//Check if an TermInfo has been returned
if (ti) {
//Get the frequency of the term
int32_t ret = ti->docFreq;
//TermInfo ti is not needed anymore so delete it
_CLDELETE( ti );
//return the number of documents which containt term t
return ret;
}
//No TermInfo returned so return 0
return 0;
}
int32_t SegmentReader::numDocs()
{
//Func - Returns the actual number of documents in the segment
//Pre - true
//Post - The actual number of documents in the segments
//Get the number of all the documents in the segment including the ones that have
//been marked deleted
int32_t n = maxDoc();
//Check if there any deleted docs
if (deletedDocs != NULL)
//Substract the number of deleted docs from the number returned by maxDoc
n -= deletedDocs->count();
//return the actual number of documents in the segment
return n;
}
int32_t SegmentReader::maxDoc() const
{
//Func - Returns the number of all the documents in the segment including
// the ones that have been marked deleted
//Pre - true
//Post - The total number of documents in the segment has been returned
return fieldsReader->size();
}
void SegmentReader::getFieldNames(FieldOption fldOption,
StringArrayWithDeletor& retarray)
{
size_t len = fieldInfos->size();
for (size_t i = 0; i < len; i++) {
bool v = false;
FieldInfo* fi = fieldInfos->fieldInfo(i);
if (fldOption & IndexReader::ALL) {
v = true;
} else {
if (!fi->isIndexed && (fldOption & IndexReader::UNINDEXED)) {
v = true;
}
if (fi->isIndexed && (fldOption & IndexReader::INDEXED)) {
v = true;
}
if (fi->isIndexed && fi->storeTermVector == false
&& (fldOption & IndexReader::INDEXED_NO_TERMVECTOR)) {
v = true;
}
if ((fldOption & IndexReader::TERMVECTOR)
&& fi->storeTermVector == true
&& fi->storePositionWithTermVector == false
&& fi->storeOffsetWithTermVector == false) {
v = true;
}
if (fi->isIndexed && fi->storeTermVector
&& (fldOption & IndexReader::INDEXED_WITH_TERMVECTOR)) {
v = true;
}
if (fi->storePositionWithTermVector
&& fi->storeOffsetWithTermVector == false
&& (fldOption & IndexReader::TERMVECTOR_WITH_POSITION)) {
v = true;
}
if (fi->storeOffsetWithTermVector
&& fi->storePositionWithTermVector == false
&& (fldOption & IndexReader::TERMVECTOR_WITH_OFFSET)) {
v = true;
}
if ((fi->storeOffsetWithTermVector && fi->storePositionWithTermVector)
&& (fldOption & IndexReader::TERMVECTOR_WITH_POSITION_OFFSET)) {
v = true;
}
}
if (v)
retarray.push_back(STRDUP_TtoT(fi->name));
}
}
bool SegmentReader::hasNorms(const TCHAR* field) const
{
return _norms.find(field) != _norms.end();
}
void SegmentReader::norms(const TCHAR* field, uint8_t* bytes)
{
//Func - Reads the Norms for field from disk starting at offset in the inputstream
//Pre - field != NULL
// bytes != NULL is an array of bytes which is to be used to read the norms into.
// it is advisable to have bytes initalized by zeroes!
//Post - The if an inputstream to the norm file could be retrieved the bytes have been read
// You are never sure whether or not the norms have been read into bytes properly!!!!!!!!!!!!!!!!!
CND_PRECONDITION(field != NULL, "field is NULL");
CND_PRECONDITION(bytes != NULL, "field is NULL");
SCOPED_LOCK_MUTEX(THIS_LOCK)
Norm* norm = _norms.get(field);
if ( norm == NULL ){
memcpy(bytes, fakeNorms(), maxDoc());
return;
}
if (norm->bytes != NULL) { // can copy from cache
memcpy(bytes, norm->bytes, maxDoc());
return;
}
IndexInput* _normStream = norm->in->clone();
CND_PRECONDITION(_normStream != NULL, "normStream==NULL")
// read from disk
try {
_normStream->seek(0);
_normStream->readBytes(bytes, maxDoc());
} _CLFINALLY (
//Have the normstream closed
_normStream->close();
//Destroy the normstream
_CLDELETE( _normStream );
);
}
uint8_t* SegmentReader::createFakeNorms(int32_t size)
{
uint8_t* ones = _CL_NEWARRAY(uint8_t,size);
memset(ones, DefaultSimilarity::encodeNorm(1.0f), size);
return ones;
}
uint8_t* SegmentReader::fakeNorms()
{
if (ones == NULL)
ones = createFakeNorms(maxDoc());
return ones;
}
// can return null if norms aren't stored
uint8_t* SegmentReader::getNorms(const TCHAR* field)
{
SCOPED_LOCK_MUTEX(THIS_LOCK)
Norm* norm = _norms.get(field);
if (norm == NULL)
return NULL; // not indexed, or norms not stored
if (norm->bytes == NULL) { // value not yet read
uint8_t* bytes = _CL_NEWARRAY(uint8_t, maxDoc());
norms(field, bytes);
norm->bytes = bytes; // cache it
}
return norm->bytes;
}
uint8_t* SegmentReader::norms(const TCHAR* field)
{
//Func - Returns the bytes array that holds the norms of a named field
//Pre - field != NULL and contains the name of the field for which the norms
// must be retrieved
//Post - If there was norm for the named field then a bytes array has been allocated
// and returned containing the norms for that field. If the named field is unknown NULL is returned.
CND_PRECONDITION(field != NULL, "field is NULL");
SCOPED_LOCK_MUTEX(THIS_LOCK)
uint8_t* bytes = getNorms(field);
if (bytes == NULL)
bytes = fakeNorms();
return bytes;
}
void SegmentReader::doSetNorm(int32_t doc, const TCHAR* field, uint8_t value)
{
Norm* norm = _norms.get(field);
if (norm == NULL) // not an indexed field
return;
norm->dirty = true; // mark it dirty
normsDirty = true;
uint8_t* bits = norms(field);
bits[doc] = value; // set the value
}
QString SegmentReader::SegmentName(const QString& ext, const int32_t x)
{
//Func - Returns an allocated buffer in which it creates a filename by
// concatenating segment with ext and x
//Pre ext != NULL and holds the extension
// x contains a number
//Post - A buffer has been instantiated an when x = -1 buffer contains the concatenation of
// segment and ext otherwise buffer contains the contentation of segment, ext and x
CND_PRECONDITION(!ext.isEmpty(), "ext is NULL");
QString buf;
SegmentName(buf, CL_MAX_PATH, ext, x);
return buf;
}
void SegmentReader::SegmentName(QString& buffer, int32_t bufferLen,
const QString& ext, const int32_t x)
{
//Func - Creates a filename in buffer by concatenating segment with ext and x
//Pre - buffer != NULL
// ext != NULL
// x contains a number
//Post - When x = -1 buffer contains the concatenation of segment and ext otherwise
// buffer contains the contentation of segment, ext and x
CND_PRECONDITION(!segment.isEmpty(), "Segment is NULL");
Misc::segmentname(buffer, bufferLen, segment, ext, x);
}
void SegmentReader::openNorms(Directory* cfsDir)
{
//Func - Open all norms files for all fields
// Creates for each field a norm Instance with an open inputstream to
// a corresponding norm file ready to be read
//Pre - true
//Post - For each field a norm instance has been created with an open inputstream to
// a corresponding norm file ready to be read
//Iterate through all the fields
for (int32_t i = 0; i < fieldInfos->size(); i++) {
//Get the FieldInfo for the i-th field
FieldInfo* fi = fieldInfos->fieldInfo(i);
//Check if the field is indexed
if (fi->isIndexed && !fi->omitNorms ) {
//Allocate a buffer
QString fileName;
// look first if there are separate norms in compound format
SegmentName(fileName, CL_MAX_PATH, QLatin1String(".s"), fi->number);
Directory* d = getDirectory();
if(!d->fileExists(fileName)){
SegmentName(fileName, CL_MAX_PATH, QLatin1String(".f"), fi->number);
d = cfsDir;
}
_norms.put(fi->name, _CLNEW Norm(d->openInput(fileName),
fi->number, this, segment));
}
}
}
void SegmentReader::closeNorms()
{
//Func - Close all the norms stored in norms
//Pre - true
//Post - All the norms have been destroyed
SCOPED_LOCK_MUTEX(_norms.THIS_LOCK)
//Create an interator initialized at the beginning of norms
NormsType::iterator itr = _norms.begin();
//Iterate through all the norms
while (itr != _norms.end()) {
// Get, delete the norm
_CLDELETE(itr->second);
// Move the interator to the next norm in the norms collection.
// Note ++ is an overloaded operator
++itr;
}
_norms.clear(); //bvk: they're deleted, so clear them so that they are not re-used
}
TermVectorsReader* SegmentReader::getTermVectorsReader()
{
TermVectorsReader* tvReader = termVectorsLocal.get();
if (tvReader == NULL) {
tvReader = termVectorsReaderOrig->clone();
termVectorsLocal.set(tvReader);
}
return tvReader;
}
TermFreqVector* SegmentReader::getTermFreqVector(int32_t docNumber,
const TCHAR* field)
{
if (field) {
FieldInfo* fi = fieldInfos->fieldInfo(field);
// Check if this field is invalid or has no stored term vector
if (fi == NULL || !fi->storeTermVector || termVectorsReaderOrig == NULL)
return NULL;
}
TermVectorsReader* termVectorsReader = getTermVectorsReader();
if (termVectorsReader == NULL)
return NULL;
return termVectorsReader->get(docNumber, field);
}
bool SegmentReader::getTermFreqVectors(int32_t docNumber,
Array<TermFreqVector*>& result)
{
if (termVectorsReaderOrig == NULL)
return false;
TermVectorsReader* termVectorsReader = getTermVectorsReader();
if (termVectorsReader == NULL)
return false;
return termVectorsReader->get(docNumber, result);
}
CL_NS_END