/* | |
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team | |
* | |
* Distributable under the terms of either the Apache License (Version 2.0) or | |
* the GNU Lesser General Public License, as specified in the COPYING file. | |
* | |
* Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. | |
*/ | |
#ifndef _lucene_index_IndexWriter_ | |
#define _lucene_index_IndexWriter_ | |
#if defined(_LUCENE_PRAGMA_ONCE) | |
# pragma once | |
#endif | |
#include <QtCore/QString> | |
#include <QtCore/QStringList> | |
#include "CLucene/analysis/AnalysisHeader.h" | |
#include "CLucene/util/VoidList.h" | |
#include "CLucene/search/Similarity.h" | |
#include "CLucene/store/Lock.h" | |
#include "CLucene/store/TransactionalRAMDirectory.h" | |
#include "SegmentHeader.h" | |
CL_NS_DEF(index) | |
/** | |
An IndexWriter creates and maintains an index. | |
The third argument to the | |
<a href="#IndexWriter(org.apache.lucene.store.Directory, org.apache.lucene.analysis.Analyzer, boolean)"><b>constructor</b></a> | |
determines whether a new index is created, or whether an existing index is | |
opened for the addition of new documents. | |
In either case, documents are added with the <a | |
href="#addDocument(org.apache.lucene.document.Document)"><b>addDocument</b></a> method. | |
When finished adding documents, <a href="#close()"><b>close</b></a> should be called. | |
<p>If an index will not have more documents added for a while and optimal search | |
performance is desired, then the <a href="#optimize()"><b>optimize</b></a> | |
method should be called before the index is closed. | |
<p>Opening an IndexWriter creates a lock file for the directory in use. Trying to open | |
another IndexWriter on the same directory will lead to an IOException. The IOException | |
is also thrown if an IndexReader on the same directory is used to delete documents | |
from the index. | |
@see IndexModifier IndexModifier supports the important methods of IndexWriter plus deletion | |
*/ | |
class IndexWriter : LUCENE_BASE | |
{ | |
class LockWith2 : public CL_NS(store)::LuceneLockWith<void> | |
{ | |
public: | |
LockWith2(CL_NS(store)::LuceneLock* lock, | |
int64_t lockWaitTimeout, | |
IndexWriter* wr, | |
CL_NS(util)::CLVector<SegmentReader*>* std, | |
bool create); | |
~LockWith2() {} | |
void doBody(); | |
private: | |
bool create; | |
IndexWriter* writer; | |
CL_NS(util)::CLVector<SegmentReader*>* segmentsToDelete; | |
}; | |
friend class LockWith2; | |
class LockWithCFS : public CL_NS(store)::LuceneLockWith<void> | |
{ | |
public: | |
LockWithCFS(CL_NS(store)::LuceneLock* lock, | |
int64_t lockWaitTimeout, | |
CL_NS(store)::Directory* dir, | |
IndexWriter* wr, | |
const QString& segName, | |
const QStringList& ftd); | |
~LockWithCFS() {} | |
void doBody(); | |
private: | |
QString segName; | |
IndexWriter* writer; | |
CL_NS(store)::Directory* directory; | |
QStringList filesToDelete; | |
}; | |
friend class IndexWriter::LockWithCFS; | |
// indicates if the writers is open - this way close can be called multiple | |
// times | |
bool isOpen; | |
// how to analyze text | |
CL_NS(analysis)::Analyzer* analyzer; | |
CL_NS(search)::Similarity* similarity; // how to normalize | |
/** Use compound file setting. Normally defaults to true, except when | |
* using a RAMDirectory. This minimizes the number of files used. | |
* Setting this to false may improve indexing performance, but | |
* may also cause file handle problems. | |
*/ | |
bool useCompoundFile; | |
bool closeDir; | |
// for temp segs | |
CL_NS(store)::TransactionalRAMDirectory* ramDirectory; | |
CL_NS(store)::LuceneLock* writeLock; | |
void _IndexWriter(const bool create); | |
void _finalize(); | |
// where this index resides | |
CL_NS(store)::Directory* directory; | |
int32_t getSegmentsCounter() { return segmentInfos.counter; } | |
int32_t maxFieldLength; | |
int32_t mergeFactor; | |
int32_t minMergeDocs; | |
int32_t maxMergeDocs; | |
int32_t termIndexInterval; | |
int64_t writeLockTimeout; | |
int64_t commitLockTimeout; | |
public: | |
DEFINE_MUTEX(THIS_LOCK) | |
// Release the write lock, if needed. | |
SegmentInfos segmentInfos; | |
// Release the write lock, if needed. | |
~IndexWriter(); | |
/** | |
* The Java implementation of Lucene silently truncates any tokenized | |
* field if the number of tokens exceeds a certain threshold. Although | |
* that threshold is adjustable, it is easy for the client programmer | |
* to be unaware that such a threshold exists, and to become its | |
* unwitting victim. | |
* CLucene implements a less insidious truncation policy. Up to | |
* DEFAULT_MAX_FIELD_LENGTH tokens, CLucene behaves just as JLucene | |
* does. If the number of tokens exceeds that threshold without any | |
* indication of a truncation preference by the client programmer, | |
* CLucene raises an exception, prompting the client programmer to | |
* explicitly set a truncation policy by adjusting maxFieldLength. | |
*/ | |
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_FIELD_LENGTH = 10000); | |
LUCENE_STATIC_CONSTANT(int32_t, FIELD_TRUNC_POLICY__WARN = -1); | |
int32_t getMaxFieldLength() const{ return maxFieldLength; } | |
void setMaxFieldLength(int32_t val){ maxFieldLength = val; } | |
/** | |
* Default value is 10. Change using {@link #setMaxBufferedDocs(int)}. | |
*/ | |
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_BUFFERED_DOCS = 10); | |
/** Determines the minimal number of documents required before the buffered | |
* in-memory documents are merging and a new Segment is created. | |
* Since Documents are merged in a {@link RAMDirectory}, | |
* large value gives faster indexing. At the same time, mergeFactor limits | |
* the number of files open in a FSDirectory. | |
* | |
* <p> The default value is DEFAULT_MAX_BUFFERED_DOCS.*/ | |
void setMaxBufferedDocs(int32_t val){ minMergeDocs = val; } | |
/** | |
* @see #setMaxBufferedDocs | |
*/ | |
int32_t getMaxBufferedDocs(){ return minMergeDocs; } | |
/** | |
* Default value for the write lock timeout (1,000). | |
*/ | |
LUCENE_STATIC_CONSTANT(int64_t, WRITE_LOCK_TIMEOUT = 1000); | |
/** | |
* Sets the maximum time to wait for a write lock (in milliseconds). | |
*/ | |
void setWriteLockTimeout(int64_t writeLockTimeout) | |
{ this->writeLockTimeout = writeLockTimeout; } | |
/** | |
* @see #setWriteLockTimeout | |
*/ | |
int64_t getWriteLockTimeout() { return writeLockTimeout; } | |
/** | |
* Default value for the commit lock timeout (10,000). | |
*/ | |
LUCENE_STATIC_CONSTANT(int64_t, COMMIT_LOCK_TIMEOUT = 10000); | |
/** | |
* Sets the maximum time to wait for a commit lock (in milliseconds). | |
*/ | |
void setCommitLockTimeout(int64_t commitLockTimeout) | |
{ this->commitLockTimeout = commitLockTimeout; } | |
/** | |
* @see #setCommitLockTimeout | |
*/ | |
int64_t getCommitLockTimeout() { return commitLockTimeout; } | |
static const QLatin1String WRITE_LOCK_NAME; //"write.lock"; | |
static const QLatin1String COMMIT_LOCK_NAME; //"commit.lock"; | |
/** | |
* Default value is 10. Change using {@link #setMergeFactor(int)}. | |
*/ | |
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MERGE_FACTOR = 10); | |
/* Determines how often segment indices are merged by addDocument(). With | |
* smaller values, less RAM is used while indexing, and searches on | |
* unoptimized indices are faster, but indexing speed is slower. With larger | |
* values more RAM is used while indexing and searches on unoptimized indices | |
* are slower, but indexing is faster. Thus larger values (> 10) are best | |
* for batched index creation, and smaller values (< 10) for indices that are | |
* interactively maintained. | |
* | |
* <p>This must never be less than 2. The default value is 10. | |
*/ | |
int32_t getMergeFactor() const{ return mergeFactor; } | |
void setMergeFactor(int32_t val){ mergeFactor = val; } | |
/** Expert: The fraction of terms in the "dictionary" which should be stored | |
* in RAM. Smaller values use more memory, but make searching slightly | |
* faster, while larger values use less memory and make searching slightly | |
* slower. Searching is typically not dominated by dictionary lookup, so | |
* tweaking this is rarely useful. | |
*/ | |
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERM_INDEX_INTERVAL = 128); | |
/** Expert: Set the interval between indexed terms. Large values cause less | |
* memory to be used by IndexReader, but slow random-access to terms. Small | |
* values cause more memory to be used by an IndexReader, and speed | |
* random-access to terms. | |
* | |
* This parameter determines the amount of computation required per query | |
* term, regardless of the number of documents that contain that term. In | |
* particular, it is the maximum number of other terms that must be | |
* scanned before a term is located and its frequency and position information | |
* may be processed. In a large index with user-entered query terms, query | |
* processing time is likely to be dominated not by term lookup but rather | |
* by the processing of frequency and positional data. In a small index | |
* or when many uncommon query terms are generated (e.g., by wildcard | |
* queries) term lookup may become a dominant cost. | |
* | |
* In particular, <code>numUniqueTerms/interval</code> terms are read into | |
* memory by an IndexReader, and, on average, <code>interval/2</code> terms | |
* must be scanned for each random term access. | |
* | |
* @see #DEFAULT_TERM_INDEX_INTERVAL | |
*/ | |
void setTermIndexInterval(int32_t interval) { termIndexInterval = interval; } | |
/** Expert: Return the interval between indexed terms. | |
* | |
* @see #setTermIndexInterval(int) | |
*/ | |
int32_t getTermIndexInterval() { return termIndexInterval; } | |
/** Determines the minimal number of documents required before the buffered | |
* in-memory documents are merging and a new Segment is created. | |
* Since Documents are merged in a {@link RAMDirectory}, | |
* large value gives faster indexing. At the same time, mergeFactor limits | |
* the number of files open in a FSDirectory. | |
* | |
* <p> The default value is 10.*/ | |
int32_t getMinMergeDocs() const{ return minMergeDocs; } | |
void setMinMergeDocs(int32_t val){ minMergeDocs = val; } | |
/** Determines the largest number of documents ever merged by addDocument(). | |
* Small values (e.g., less than 10,000) are best for interactive indexing, | |
* as this limits the length of pauses while indexing to a few seconds. | |
* Larger values are best for batched indexing and speedier searches. | |
* | |
* <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}. | |
*/ | |
LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_MERGE_DOCS = LUCENE_INT32_MAX_SHOULDBE); | |
/**Determines the largest number of documents ever merged by addDocument(). | |
* Small values (e.g., less than 10,000) are best for interactive indexing, | |
* as this limits the length of pauses while indexing to a few seconds. | |
* Larger values are best for batched indexing and speedier searches. | |
* | |
* <p>The default value is {@link Integer#MAX_VALUE}. | |
*/ | |
int32_t getMaxMergeDocs() const{ return maxMergeDocs; } | |
void setMaxMergeDocs(int32_t val){ maxMergeDocs = val; } | |
/** | |
* Constructs an IndexWriter for the index in <code>path</code>. | |
* Text will be analyzed with <code>a</code>. If <code>create</code> | |
* is true, then a new, empty index will be created in | |
* <code>path</code>, replacing the index already there, if any. | |
* | |
* @param path the path to the index directory | |
* @param a the analyzer to use | |
* @param create <code>true</code> to create the index or overwrite | |
* the existing one; <code>false</code> to append to the existing | |
* index | |
* @throws IOException if the directory cannot be read/written to, or | |
* if it does not exist, and <code>create</code> is | |
* <code>false</code> | |
*/ | |
IndexWriter(const QString& path, CL_NS(analysis)::Analyzer* a, | |
const bool create, const bool closeDir = true); | |
/**Constructs an IndexWriter for the index in <code>d</code>. Text will be | |
* analyzed with <code>a</code>. If <code>create</code> is true, then a new, | |
* empty index will be created in <code>d</code>, replacing the index already | |
* there, if any. | |
*/ | |
IndexWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, | |
const bool create, const bool closeDir = false); | |
// Flushes all changes to an index, closes all associated files, and closes | |
// the directory that the index is stored in. | |
void close(); | |
// Returns the number of documents currently in this index. synchronized | |
int32_t docCount(); | |
// Adds a document to this index, using the provided analyzer instead of | |
// the value of {@link #getAnalyzer()}. If the document contains more than | |
// {@link #setMaxFieldLength(int)} terms for a given field, the remainder | |
// are discarded. | |
void addDocument(CL_NS(document)::Document* doc, | |
CL_NS(analysis)::Analyzer* analyzer = NULL); | |
// Merges all segments together into a single segment, optimizing an index | |
// for search. synchronized | |
void optimize(); | |
/**Merges all segments from an array of indices into this index. | |
* | |
* <p>This may be used to parallelize batch indexing. A large document | |
* collection can be broken into sub-collections. Each sub-collection can be | |
* indexed in parallel, on a different thread, process or machine. The | |
* complete index can then be created by merging sub-collection indices | |
* with this method. | |
* | |
* <p>After this completes, the index is optimized. | |
*@synchronized | |
*/ | |
void addIndexes(CL_NS(store)::Directory** dirs); | |
/** Merges the provided indexes into this index. | |
* <p>After this completes, the index is optimized. </p> | |
* <p>The provided IndexReaders are not closed.</p> | |
*/ | |
void addIndexes(IndexReader** readers); | |
/** Returns the directory this index resides in. */ | |
CL_NS(store)::Directory* getDirectory() { return directory; } | |
/** Get the current setting of whether to use the compound file format. | |
* Note that this just returns the value you set with setUseCompoundFile(boolean) | |
* or the default. You cannot use this to query the status of an existing index. | |
* @see #setUseCompoundFile(boolean) | |
*/ | |
bool getUseCompoundFile() { return useCompoundFile; } | |
/** Setting to turn on usage of a compound file. When on, multiple files | |
* for each segment are merged into a single file once the segment creation | |
* is finished. This is done regardless of what directory is in use. | |
*/ | |
void setUseCompoundFile(bool value) { useCompoundFile = value; } | |
/** Expert: Set the Similarity implementation used by this IndexWriter. | |
* | |
* @see Similarity#setDefault(Similarity) | |
*/ | |
void setSimilarity(CL_NS(search)::Similarity* similarity) | |
{ this->similarity = similarity; } | |
/** Expert: Return the Similarity implementation used by this IndexWriter. | |
* | |
* <p>This defaults to the current value of {@link Similarity#getDefault()}. | |
*/ | |
CL_NS(search)::Similarity* getSimilarity() { return this->similarity; } | |
/** Returns the analyzer used by this index. */ | |
CL_NS(analysis)::Analyzer* getAnalyzer() { return analyzer; } | |
private: | |
/** Merges all RAM-resident segments. */ | |
void flushRamSegments(); | |
/** Incremental segment merger. */ | |
void maybeMergeSegments(); | |
// Pops segments off of segmentInfos stack down to minSegment, merges them, | |
// and pushes the merged index onto the top of the segmentInfos stack. | |
void mergeSegments(const uint32_t minSegment); | |
// Merges the named range of segments, replacing them in the stack with a | |
// single segment. | |
void mergeSegments(const uint32_t minSegment, const uint32_t end); | |
// Some operating systems (e.g. Windows) don't permit a file to be deleted | |
// while it is opened for read (e.g. by another process or thread). So we | |
// assume that when a delete fails it is because the file is open in another | |
// process, and queue the file for subsequent deletion. | |
void deleteSegments(CL_NS(util)::CLVector<SegmentReader*>* segments); | |
void deleteFiles(const QStringList& files); | |
void readDeleteableFiles(QStringList& files); | |
void deleteFiles(const QStringList& files, QStringList& deletable); | |
void deleteFiles(const QStringList& files, CL_NS(store)::Directory* directory); | |
void writeDeleteableFiles(const QStringList& files); | |
// synchronized | |
QString newSegmentName(); | |
}; | |
CL_NS_END | |
#endif |