// blob: 428a5e7d63f3865e18ce14acef4fe0e0be94507f [file] [log] [blame]
/*
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
*
* Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#ifndef _lucene_index_DocumentWriter_
#define _lucene_index_DocumentWriter_
#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif
#include <QtCore/QString>
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/document/Document.h"
#include "CLucene/store/Directory.h"
#include "FieldInfos.h"
#include "IndexWriter.h"
#include "CLucene/util/VoidMap.h"
#include "CLucene/document/Field.h"
#include "TermInfo.h"
#include "CLucene/search/Similarity.h"
#include "TermInfosWriter.h"
#include "FieldsWriter.h"
#include "Term.h"
CL_NS_DEF(index)
// Buffers the terms of a single document as Postings before the document is
// written to the index. Only the declarations are visible in this header; the
// member definitions live in the corresponding implementation file.
class DocumentWriter : LUCENE_BASE
{
public:
// Info about a Term in a doc: the term itself, how often it occurs and
// where it occurs.
class Posting : LUCENE_BASE
{
public:
Term* term; // the Term
int32_t freq; // its frequency in doc
Array<int32_t> positions; // positions it occurs at
// NOTE(review): presumably one offset entry per recorded position, kept
// for term vectors -- confirm against the implementation.
Array<TermVectorOffsetInfo> offsets;
// Builds a posting for term t from a position and an offset.
// NOTE(review): whether `offset` may be NULL and who owns `t` is not
// visible in this header -- confirm against the implementation.
Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset);
~Posting();
};
private:
// Collaborators handed in through the constructors.
// NOTE(review): ownership of these pointers is not visible here -- confirm.
CL_NS(analysis)::Analyzer* analyzer;
CL_NS(store)::Directory* directory;
FieldInfos* fieldInfos; //array
// The maximum number of tokens a field may have (see the ctor docs below).
const int32_t maxFieldLength;
CL_NS(search)::Similarity* similarity;
// NOTE(review): presumably forwarded to the term infos writer -- confirm.
int32_t termIndexInterval;
// Keys are Terms, values are Postings.
// Used to buffer a document before it is written to the index.
typedef CL_NS(util)::CLHashtable<Term*, Posting*, Term::Compare,
Term::Equals> PostingTableType;
PostingTableType postingTable;
// Per-field bookkeeping arrays.
// NOTE(review): presumably indexed by field number and sized to the number
// of fields in fieldInfos -- confirm against the implementation.
int32_t* fieldLengths; //array
int32_t* fieldPositions; //array
int32_t* fieldOffsets; //array
qreal* fieldBoosts; //array
// NOTE(review): looks like a scratch Term reused for posting table
// lookups -- confirm against the implementation.
Term* termBuffer;
public:
/** This ctor used by test code only.
*
* @param d The directory to write the document information to
* @param a The analyzer to use for the document
* @param similarity The Similarity function
* @param maxFieldLength The maximum number of tokens a field may have
*/
DocumentWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a,
CL_NS(search)::Similarity* similarity, const int32_t maxFieldLength);
/** Ctor taking an IndexWriter instead of explicit settings.
*
* @param directory The directory to write the document information to
* @param analyzer The analyzer to use for the document
* @param writer NOTE(review): presumably the source of the similarity,
* maxFieldLength and termIndexInterval settings -- confirm
*/
DocumentWriter(CL_NS(store)::Directory* directory,
CL_NS(analysis)::Analyzer* analyzer, IndexWriter* writer);
~DocumentWriter();
/** Adds a document under the given segment name.
*
* @param segment Name of the segment the document belongs to
* @param doc The document to add
*/
void addDocument(const QString& segment, CL_NS(document)::Document* doc);
private:
// Tokenizes the fields of a document into Postings.
void invertDocument(const CL_NS(document)::Document* doc);
// Records one occurrence of `text` in `field` at `position`.
// NOTE(review): presumably creates or updates the matching entry in
// postingTable -- confirm against the implementation.
void addPosition(const TCHAR* field, const TCHAR* text,
const int32_t position, TermVectorOffsetInfo* offset);
// Hands back the postings as a sorted array through the two out-parameters
// (array and its length).
// NOTE(review): who frees `array` is not visible here -- confirm.
void sortPostingTable(Posting**& array, int32_t& arraySize);
// Sort helper working on postings[lo..hi].
// NOTE(review): bounds look inclusive on both ends -- confirm.
static void quickSort(Posting**& postings, const int32_t lo, const int32_t hi);
// Writes postingsLength postings for the given segment.
void writePostings(Posting** postings, const int32_t postingsLength,
const QString& segment);
// Writes the norms for the given segment.
void writeNorms(const QString& segment);
// Empties postingTable.
// NOTE(review): presumably also releases the buffered Postings -- confirm.
void clearPostingTable();
};
CL_NS_END
#endif