blob: caa2d06f02f3d87ab3a05278c510e5ae328e6f73 [file] [log] [blame]
/*
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
*
* Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#ifndef _lucene_index_termvector_h
#define _lucene_index_termvector_h
#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif
#include <QtCore/QString>
#include "CLucene/store/Directory.h"
#include "CLucene/store/IndexOutput.h"
#include "FieldInfos.h"
CL_NS_DEF(index)
struct TermVectorOffsetInfo;
class TermPositionVector;
// Provides access to stored term vector of a document field.
// This is an abstract interface: every accessor below is pure virtual and is
// implemented by the concrete vector classes declared further down in this
// header.
class TermFreqVector : LUCENE_BASE
{
public:
// Virtual so that implementations can be destroyed through a TermFreqVector
// pointer.
virtual ~TermFreqVector() {}
// @return The field this vector is associated with.
virtual const TCHAR* getField() = 0;
// @return The number of terms in the term vector.
virtual int32_t size() = 0;
// @return An Array of term texts in ascending order.
virtual const TCHAR** getTerms() = 0;
/* Array of term frequencies. Locations of the array correspond one to one
* to the terms in the array obtained from <code>getTerms</code>
* method. Each location in the array contains the number of times this
* term occurs in the document or the document field.
*
* The size of the returned array is size()
* @memory Returning a pointer to internal data. Do not delete.
*/
virtual const Array<int32_t>* getTermFrequencies() = 0;
/* Return the index, within the array returned from
* <code>getTerms</code>, at which the specified
* <code>term</code> appears. If this term does not appear in the array,
* return -1.
*/
virtual int32_t indexOf(const TCHAR* term) = 0;
/* Just like <code>indexOf(const TCHAR*)</code> but searches for a number of
* terms at the same time. The function itself returns nothing; the results
* are delivered through <code>ret</code>, which per this contract has one
* slot per term searched for, each slot containing the result of searching
* for that term.
*
* @param terms array containing terms to look for
* @param start index in the array where the list of terms starts
* @param len the number of terms in the list
* @param ret output array that receives the per-term indexes
*/
virtual void indexesOf(const TCHAR** terms, const int32_t start,
const int32_t len, Array<int32_t>& ret) = 0;
// Solve the diamond inheritance problem by providing a reinterpret function.
// No dynamic casting is required and no RTTI data is needed to do this.
// NOTE(review): presumably yields NULL when the vector carries no position
// data -- confirm against the implementations.
virtual TermPositionVector* __asTermPositionVector() = 0;
};
/**
 * Writes the term vectors of documents to the term vector output streams.
 *
 * The writer is driven one document at a time: open the document, then for
 * every field open the field, add its terms and close the field, and finally
 * close the document.
 *
 * Usage sketch:
 *
 * <CODE>
 *   for each document
 *   {
 *       writer.openDocument();
 *       for each field on the document
 *       {
 *           writer.openField(field);
 *           for all of the terms
 *           {
 *               writer.addTerm(...)
 *           }
 *           writer.closeField
 *       }
 *       writer.closeDocument()
 *   }
 * </CODE>
 */
class TermVectorsWriter : LUCENE_BASE
{
private:
    // Bookkeeping record for one field of the document being written.
    class TVField : LUCENE_BASE
    {
    public:
        int32_t number;
        int64_t tvfPointer;
        int32_t length; // number of distinct term positions
        bool storePositions;
        bool storeOffsets;

        // Starts with a zero tvf pointer and zero length; the remaining
        // members are taken from the arguments.
        TVField(int32_t number, bool storePos, bool storeOff)
            : number(number)
            , tvfPointer(0)
            , length(0)
            , storePositions(storePos)
            , storeOffsets(storeOff)
        {
        }

        ~TVField() {}
    };

    // Record for one term of the currently open field.
    class TVTerm : LUCENE_BASE
    {
        const TCHAR* termText;
        int32_t termTextLen; // cached length of termText

    public:
        TVTerm();
        ~TVTerm();

        int32_t freq;
        Array<int32_t>* positions;
        Array<TermVectorOffsetInfo>* offsets;

        const TCHAR* getTermText() const;
        size_t getTermTextLen();
        void setTermText(const TCHAR* val);
    };

    // Output streams, one per term vector file extension declared below.
    CL_NS(store)::IndexOutput* tvx;
    CL_NS(store)::IndexOutput* tvd;
    CL_NS(store)::IndexOutput* tvf;

    CL_NS(util)::CLVector<TVField*, CL_NS(util)::Deletor::Object<TVField> > fields;
    CL_NS(util)::CLVector<TVTerm*, CL_NS(util)::Deletor::Object<TVTerm> > terms;

    FieldInfos* fieldInfos;
    TVField* currentField;
    int64_t currentDocPointer;

    void addTermInternal(const TCHAR* termText,
                         const int32_t freq,
                         Array<int32_t>* positions,
                         Array<TermVectorOffsetInfo>* offsets);
    void writeField();
    void writeDoc();
    void openField(int32_t fieldNumber,
                   bool storePositionWithTermVector,
                   bool storeOffsetWithTermVector);

public:
    LUCENE_STATIC_CONSTANT(int32_t, FORMAT_VERSION = 2);

    // Number of bytes the FORMAT_VERSION occupies at the start of each file.
    LUCENE_STATIC_CONSTANT(int32_t, FORMAT_SIZE = 4);

    LUCENE_STATIC_CONSTANT(uint8_t, STORE_POSITIONS_WITH_TERMVECTOR = 0x1);
    LUCENE_STATIC_CONSTANT(uint8_t, STORE_OFFSET_WITH_TERMVECTOR = 0x2);

    static const QLatin1String LUCENE_TVX_EXTENSION;
    static const QLatin1String LUCENE_TVD_EXTENSION;
    static const QLatin1String LUCENE_TVF_EXTENSION;

    TermVectorsWriter(CL_NS(store)::Directory* directory,
                      const QString& segment,
                      FieldInfos* fieldInfos);
    ~TermVectorsWriter();

    void openDocument();
    void closeDocument();

    /** Close all streams. */
    void close();

    bool isDocumentOpen() const;

    /**
     * Begin processing a field. Any number of addTerm calls may follow, ended
     * by a closeField call that marks the end of the field. A field that was
     * still open beforehand is closed automatically.
     */
    void openField(const TCHAR* field);

    /**
     * End processing of the current field. openField must be called again
     * before any further addTerm call.
     */
    void closeField();

    /** Return true if a field is currently open. */
    bool isFieldOpen() const;

    /**
     * Add a complete document given as the full set of its term vectors. When
     * the document has no term vectors, a value is still added for tvx.
     *
     * @param vectors
     * @throws IOException
     */
    void addAllDocVectors(Array<TermFreqVector*>& vectors);

    /**
     * Add a term to the term vector of the open field. Terms should be added
     * in increasing order, one call per unique term. Freq is the number of
     * times the term appears in this field of this document; positions and
     * offsets are optional and default to NULL.
     *
     * @throws IllegalStateException if document or field is not open
     */
    void addTerm(const TCHAR* termText,
                 int32_t freq,
                 Array<int32_t>* positions = NULL,
                 Array<TermVectorOffsetInfo>* offsets = NULL);
};
// Concrete TermFreqVector declared with a field, a term array and a frequency
// array. It inherits TermFreqVector virtually so that it can be combined with
// TermPositionVector (see SegmentTermPositionVector in this header).
class SegmentTermVector : public virtual TermFreqVector
{
private:
const TCHAR* field;
TCHAR** terms;
int32_t termsLen; //cache
Array<int32_t>* termFreqs;
// NOTE(review): presumably a binary search for key over the first arraylen
// entries of a -- behaviour is defined out of line, confirm there.
int32_t binarySearch(TCHAR** a, const int32_t arraylen, const TCHAR* key) const;
public:
//note: termFreqs must be the same length as terms
// NOTE(review): ownership of the pointers passed in is not visible from this
// header -- check the destructor implementation before freeing them.
SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs);
virtual ~SegmentTermVector();
/**
*
* @return The field this vector is associated with
*/
const TCHAR* getField();
// Not part of the TermFreqVector interface; returns a TCHAR* description.
// NOTE(review): presumably the caller owns the returned buffer -- confirm.
TCHAR* toString() const;
// The members below implement the TermFreqVector interface.
int32_t size();
const TCHAR** getTerms();
const Array<int32_t>* getTermFrequencies();
int32_t indexOf(const TCHAR* termText);
void indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret);
virtual TermPositionVector* __asTermPositionVector();
};
// Reader counterpart for stored term vectors. It holds three input streams
// (tvx, tvd, tvf) plus the FieldInfos passed to its constructor.
class TermVectorsReader : LUCENE_BASE
{
private:
FieldInfos* fieldInfos;
CL_NS(store)::IndexInput* tvx;
CL_NS(store)::IndexInput* tvd;
CL_NS(store)::IndexInput* tvf;
int64_t _size;
int32_t tvdFormat;
int32_t tvfFormat;
// NOTE(review): presumably reads the format version from the given stream,
// validates it and returns it -- confirm against the implementation.
int32_t checkValidFormat(CL_NS(store)::IndexInput* in);
// Reads len term vectors, described by the parallel arrays fields and
// tvfPointers; the vectors are delivered through _return.
void readTermVectors(const TCHAR** fields, const int64_t* tvfPointers,
const int32_t len, Array<TermFreqVector*>& _return);
/**
*
* @param field The field to read in
* @param tvfPointer The pointer within the tvf file where we should start reading
* @return The TermVector located at that position
* @throws IOException
*/
SegmentTermVector* readTermVector(const TCHAR* field, const int64_t tvfPointer);
int64_t size();
DEFINE_MUTEX(THIS_LOCK)
// Copy construction is private, so outside code cannot copy-construct a
// reader; the public clone() is offered instead.
TermVectorsReader(const TermVectorsReader& copy);
public:
TermVectorsReader(CL_NS(store)::Directory* d, const QString& segment,
FieldInfos* fieldInfos);
~TermVectorsReader();
void close();
// @return A new reader object. NOTE(review): presumably owned by the caller
// -- confirm against the implementation.
TermVectorsReader* clone() const;
/**
* Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
* @throws IOException if there is an error reading the term vector files
*/
TermFreqVector* get(const int32_t docNum, const TCHAR* field);
/**
* Retrieve all term vectors stored for this document. The vectors are
* delivered through <code>result</code>; the function itself returns a bool.
* NOTE(review): presumably false means the vectors could not be read in --
* confirm against the implementation.
*
* @param docNum The document number to retrieve the vector for
* @param result Output array receiving all term frequency vectors
* @throws IOException if there is an error reading the term vector files
*/
bool get(int32_t docNum, Array<TermFreqVector*>& result);
};
// Pair of start/end offsets, exposed both as public data members and through
// the getter/setter pairs below.
struct TermVectorOffsetInfo
{
    // Declared as int32_t so the stored type agrees with the constructor
    // parameters and the accessor signatures of this struct.
    int32_t startOffset;
    int32_t endOffset;

    // Shared array instance; defined out of line.
    static Array<TermVectorOffsetInfo> EMPTY_OFFSET_INFO;

    TermVectorOffsetInfo();
    ~TermVectorOffsetInfo();
    TermVectorOffsetInfo(int32_t startOffset, int32_t endOffset);

    int32_t getEndOffset() const;
    void setEndOffset(int32_t endOffset);
    int32_t getStartOffset() const;
    void setStartOffset(int32_t startOffset);

    // NOTE(review): presumably compares both offsets with those of o --
    // confirm against the implementation.
    bool equals(TermVectorOffsetInfo* o);
    size_t hashCode() const;
};
/* Extends <code>TermFreqVector</code> to provide additional information about
* positions in which each of the terms is found. A TermPositionVector does not
* necessarily contain both positions and offsets, but at least one of these
* arrays exists.
*
* TermFreqVector is inherited virtually so that this interface can be combined
* with SegmentTermVector in a single class (see SegmentTermPositionVector).
*/
class TermPositionVector : public virtual TermFreqVector
{
public:
/** Returns an array of positions in which the term is found.
* Terms are identified by the index at which its number appears in the
* term String array obtained from the <code>indexOf</code> method.
* May return null if positions have not been stored.
*/
virtual Array<int32_t>* getTermPositions(int32_t index) = 0;
/**
* Returns an array of TermVectorOffsetInfo in which the term is found.
* May return null if offsets have not been stored.
*
* @see org.apache.lucene.analysis.Token
*
* @param index The position in the array to get the offsets from
* @return An array of TermVectorOffsetInfo objects or the empty list
*/
virtual Array<TermVectorOffsetInfo>* getOffsets(int32_t index) = 0;
// Virtual, empty destructor for this interface class.
virtual ~TermPositionVector(){
}
};
// Term vector that combines the SegmentTermVector implementation with the
// TermPositionVector interface, adding per-term position and offset arrays.
class SegmentTermPositionVector : public SegmentTermVector, public TermPositionVector
{
protected:
    Array< Array<int32_t> >* positions;
    Array< Array<TermVectorOffsetInfo> >* offsets;
    static Array<int32_t> EMPTY_TERM_POS;

public:
    SegmentTermPositionVector(const TCHAR* field,
                              TCHAR** terms,
                              Array<int32_t>* termFreqs,
                              Array< Array<int32_t> >* positions,
                              Array< Array<TermVectorOffsetInfo> >* offsets);
    ~SegmentTermPositionVector();

    /**
     * Offsets recorded for the term at the given index.
     *
     * @param index The position in the array to get the offsets from
     * @return An array of TermVectorOffsetInfo objects or the empty list
     * @see org.apache.lucene.analysis.Token
     */
    Array<TermVectorOffsetInfo>* getOffsets(int32_t index);

    /**
     * Positions at which the term at the given index is found. A term is
     * identified by the index at which it appears in the term array, as
     * obtained from the <code>indexOf</code> method.
     */
    Array<int32_t>* getTermPositions(int32_t index);

    // Each member below forwards explicitly to the SegmentTermVector
    // implementation of the same name.
    const TCHAR* getField()
    {
        return SegmentTermVector::getField();
    }

    TCHAR* toString() const
    {
        return SegmentTermVector::toString();
    }

    int32_t size()
    {
        return SegmentTermVector::size();
    }

    const TCHAR** getTerms()
    {
        return SegmentTermVector::getTerms();
    }

    const Array<int32_t>* getTermFrequencies()
    {
        return SegmentTermVector::getTermFrequencies();
    }

    int32_t indexOf(const TCHAR* termText)
    {
        return SegmentTermVector::indexOf(termText);
    }

    void indexesOf(const TCHAR** termNumbers,
                   const int32_t start,
                   const int32_t len,
                   Array<int32_t>& ret)
    {
        SegmentTermVector::indexesOf(termNumbers, start, len, ret);
    }

    virtual TermPositionVector* __asTermPositionVector();
};
CL_NS_END
#endif