blob: 32ea97bc738bf54f7ec8c937cf51529e5657b307 [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ICING_INDEX_INDEX_H_
#define ICING_INDEX_INDEX_H_
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/main-index-merger.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
#include "icing/store/suggestion-result-checker.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
// The class representing the Icing search index. This index maps terms to hits
// (document_ids, section_ids).
// Content is added to the index through the Editor class - which also dedupes
// hits (calling Editor::BufferTerm with the same term more than once will only
// result in the creation of a single hit).
// Ex.
//   ICING_ASSIGN_OR_RETURN(
//       std::unique_ptr<Index> index,
//       Index::Create(MakeIndexOptions(), &filesystem, &icing_filesystem));
//   Index::Editor editor = index->Edit(document_id, section_id,
//                                      TermMatchType::EXACT_ONLY, namespace_id);
//   ICING_RETURN_IF_ERROR(editor.BufferTerm("foo"));
//   ICING_RETURN_IF_ERROR(editor.BufferTerm("baz"));
//   ICING_RETURN_IF_ERROR(editor.IndexAllBufferedTerms());
//
// Content is retrieved from the index through a DocHitInfoIterator.
// Ex.
//   ICING_ASSIGN_OR_RETURN(
//       std::unique_ptr<Index> index,
//       Index::Create(MakeIndexOptions(), &filesystem, &icing_filesystem));
//   ICING_ASSIGN_OR_RETURN(
//       std::unique_ptr<DocHitInfoIterator> iterator,
//       index->GetIterator("foo", term_start_index, unnormalized_term_length,
//                          kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
//   while (iterator->Advance().ok())
//     ProcessResult(iterator->value());
class Index {
public:
// Configuration used when creating an Index.
//
// NOTE: the constructor receives the two size arguments as uint32_t while the
// matching fields are int32_t, so each value is converted on construction.
struct Options {
  explicit Options(const std::string& base_dir, uint32_t index_merge_size,
                   bool lite_index_sort_at_indexing,
                   uint32_t lite_index_sort_size)
      : base_dir(base_dir),
        // Same unsigned -> signed conversion the implicit form performs.
        index_merge_size(static_cast<int32_t>(index_merge_size)),
        lite_index_sort_at_indexing(lite_index_sort_at_indexing),
        lite_index_sort_size(static_cast<int32_t>(lite_index_sort_size)) {}

  // Base directory of the index.
  std::string base_dir;
  // Merge size setting for the index.
  int32_t index_merge_size;
  // Whether sorting of the lite index is enabled during indexing.
  bool lite_index_sort_at_indexing;
  // Sort size setting for the lite index.
  int32_t lite_index_sort_size;
};
// Creates an instance of Index from the given options.
//
// NOTE(review): this comment previously said "in the directory pointed by
// file_dir", but no file_dir parameter exists. Presumably options.base_dir is
// the directory that was meant - confirm against the implementation.
//
// Returns:
//   Valid Index on success
//   DATA_LOSS if the index was corrupt and had to be cleared
//   INVALID_ARGUMENT if options have invalid values
//   INTERNAL on I/O error
static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create(
const Options& options, const Filesystem* filesystem,
const IcingFilesystem* icing_filesystem);
// Reads magic from existing flash (main) index file header. We need this
// during Icing initialization phase to determine the version.
//
// Returns:
//   Valid magic on success
//   NOT_FOUND if the lite index doesn't exist
//     NOTE(review): the description above talks about the flash (main) index
//     while this line says "lite index" - confirm which file is actually
//     checked.
//   INTERNAL on I/O error
static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
const Filesystem* filesystem, const std::string& base_dir);
// Clears all files created by the index. Returns OK if all files were
// cleared.
//
// The lite index is reset first; if that fails its error is returned and the
// main index is not touched. Otherwise the result of resetting the main index
// is returned.
libtextclassifier3::Status Reset() {
  ICING_RETURN_IF_ERROR(lite_index_->Reset());
  libtextclassifier3::Status main_status = main_index_->Reset();
  return main_status;
}
// Brings components of the index into memory in anticipation of a query in
// order to reduce latency. The lite index is warmed before the main index.
void Warm() {
  (*lite_index_).Warm();
  (*main_index_).Warm();
}
// Syncs all the data and metadata changes to disk.
//
// The lite index is persisted first; if that fails its error is returned and
// the main index is not persisted.
//
// Returns:
//   OK on success
//   INTERNAL on I/O errors
libtextclassifier3::Status PersistToDisk() {
  ICING_RETURN_IF_ERROR(lite_index_->PersistToDisk());
  libtextclassifier3::Status main_status = main_index_->PersistToDisk();
  return main_status;
}
// Discards parts of the index if they contain data for document ids greater
// than document_id.
//
// NOTE: This means that TruncateTo(kInvalidDocumentId) will have no effect.
//
// Returns:
//   OK on success
//   INTERNAL on I/O errors
libtextclassifier3::Status TruncateTo(DocumentId document_id);
// DocumentIds are always inserted in increasing order. Returns the largest
// document_id added to the index.
//
// The lite index's value is used whenever it is valid; only when it is
// kInvalidDocumentId is the main index's value returned instead.
DocumentId last_added_document_id() const {
  const DocumentId lite_last_id = lite_index_->last_added_document_id();
  return (lite_last_id != kInvalidDocumentId)
             ? lite_last_id
             : main_index_->last_added_document_id();
}
// Sets the lite index's last added document id to document_id, unless the
// lite index already holds a valid last added document id that is greater
// than document_id (in which case nothing happens).
//
// NOTE: only the lite index's value is consulted here, not the combined value
// returned by last_added_document_id().
void set_last_added_document_id(DocumentId document_id) {
  const DocumentId current_lite_id = lite_index_->last_added_document_id();
  if (current_lite_id != kInvalidDocumentId && document_id < current_lite_id) {
    return;
  }
  lite_index_->set_last_added_document_id(document_id);
}
// Builds and returns debug information for the index.
//   verbosity = BASIC, simplest debug information - just the lexicons and lite
//     index.
//   verbosity = DETAILED, more detailed debug information including raw
//     postings lists.
//
// The returned proto carries the index storage info plus the debug info
// reported by the lite index and the main index for the given verbosity.
IndexDebugInfoProto GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
  IndexDebugInfoProto result;

  auto storage_info = GetStorageInfo();
  *result.mutable_index_storage_info() = std::move(storage_info);

  auto lite_info = lite_index_->GetDebugInfo(verbosity);
  *result.mutable_lite_index_info() = std::move(lite_info);

  auto main_info = main_index_->GetDebugInfo(verbosity);
  *result.mutable_main_index_info() = std::move(main_info);

  return result;
}
// Returns the byte size of all the elements held in the index, i.e. the sum
// of the lite index's and the main index's element sizes. This excludes the
// size of any internal metadata of the index, e.g. the index's header.
//
// Returns:
//   Byte size on success
//   INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetElementsSize() const {
  ICING_ASSIGN_OR_RETURN(int64_t lite_bytes, lite_index_->GetElementsSize());
  ICING_ASSIGN_OR_RETURN(int64_t main_bytes, main_index_->GetElementsSize());
  return lite_bytes + main_bytes;
}
// Calculates the StorageInfo for the Index and returns it by value.
//
// If an IO error occurs while trying to calculate the value for a field, then
// that field will be set to -1.
IndexStorageInfoProto GetStorageInfo() const;
// Creates an iterator to iterate through all doc hit infos in the index that
// match the term.
//
//   term: the term to look up.
//   term_start_index: the start index of the given term in the search query.
//   unnormalized_term_length: the length of the given unnormalized term in
//     the search query.
//   section_id_mask: restricts which sections hits may come from.
//     Eg. section_id_mask = 1U << 3; would only return hits that occur in
//     section 3.
//   term_match_type: how the term is matched; an invalid value yields
//     INVALID_ARGUMENT.
//   need_hit_term_frequency: defaults to true. NOTE(review): undocumented -
//     presumably controls whether term frequency is retrieved for each hit;
//     confirm against the implementation.
//
// Returns:
//   unique ptr to a valid DocHitInfoIterator that matches the term
//   INVALID_ARGUMENT if given an invalid term_match_type
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
const std::string& term, int term_start_index,
int unnormalized_term_length, SectionIdMask section_id_mask,
TermMatchType::Code term_match_type, bool need_hit_term_frequency = true);
// Finds terms with the given prefix. Results are sorted in decreasing order
// of hit count. Number of results are no more than 'num_to_return'.
//
// NOTE(review): this comment previously described a 'namespace_ids' argument
// ("if empty, returns results from all the namespaces"), but the signature
// has no such parameter. Filtering is presumably delegated to
// 'suggestion_result_checker' - confirm against the implementation.
// 'scoring_match_type' and 'rank_by' are likewise undocumented here.
//
// Returns:
//   A list of TermMetadata on success
//   INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
const std::string& prefix, int num_to_return,
TermMatchType::Code scoring_match_type,
SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
const SuggestionResultChecker* suggestion_result_checker);
// A class that can be used to add hits to the index.
//
// An editor groups hits from a particular section within a document together
// and dedupes hits for the same term within a section. This removes the
// burden of deduping from the caller and direct access to the index
// implementation allows for more efficient deduping.
//
// Typical use: obtain an Editor from Index::Edit(), call BufferTerm() for
// each term, then call IndexAllBufferedTerms().
class Editor {
public:
// Stores the given pointers and ids; seen_tokens_ starts out empty.
//
// Does not take any ownership, and all pointers must refer to valid objects
// that outlive the one constructed.
// TODO(b/141180665): Add nullptr checks for the raw pointers
Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index,
DocumentId document_id, SectionId section_id,
TermMatchType::Code term_match_type, NamespaceId namespace_id)
: term_id_codec_(term_id_codec),
lite_index_(lite_index),
document_id_(document_id),
term_match_type_(term_match_type),
namespace_id_(namespace_id),
section_id_(section_id) {}
// Buffers the term in seen_tokens_. Returns a Status describing the outcome.
libtextclassifier3::Status BufferTerm(const char* term);
// Indexes all the terms stored in seen_tokens_. Returns a Status describing
// the outcome.
libtextclassifier3::Status IndexAllBufferedTerms();
private:
// The Editor is able to store previously seen terms as TermIds. This is
// more efficient than a client doing this externally because TermIds are
// not exposed to clients.
// Maps a uint32_t key (the TermId, per the note above) to a
// Hit::TermFrequency.
std::unordered_map<uint32_t, Hit::TermFrequency> seen_tokens_;
// Not owned (see the constructor comment).
const TermIdCodec* term_id_codec_;
// Not owned (see the constructor comment).
LiteIndex* lite_index_;
// Document, match type, namespace and section this editor was created for.
DocumentId document_id_;
TermMatchType::Code term_match_type_;
NamespaceId namespace_id_;
SectionId section_id_;
};
// Returns an Editor for the given document, section, match type and
// namespace. The Editor is handed raw, non-owning pointers to this index's
// term id codec and lite index.
Editor Edit(DocumentId document_id, SectionId section_id,
            TermMatchType::Code term_match_type, NamespaceId namespace_id) {
  const TermIdCodec* codec = term_id_codec_.get();
  LiteIndex* lite = lite_index_.get();
  return Editor(codec, lite, document_id, section_id, term_match_type,
                namespace_id);
}
// Returns whatever the lite index reports from its own WantsMerge().
bool WantsMerge() const {
  const bool lite_wants_merge = lite_index_->WantsMerge();
  return lite_wants_merge;
}
// Merges newly-added hits in the LiteIndex into the MainIndex.
//
// Each step below returns early with the failing status, so later steps
// (including the final reset of the lite index) only run if every earlier
// step succeeded.
//
// RETURNS:
//   - INTERNAL on IO error while writing to the MainIndex.
//   - RESOURCE_EXHAUSTED error if unable to grow the index.
libtextclassifier3::Status Merge() {
  // 1. Merge the lite index's lexicon into the main index.
  ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs lexicon_outputs,
                         main_index_->MergeLexicon(lite_index_->lexicon()));

  // 2. Translate (and expand) the lite index's hits using those outputs.
  ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> translated_hits,
                         MainIndexMerger::TranslateAndExpandLiteHits(
                             *lite_index_, *term_id_codec_, lexicon_outputs));

  // 3. Add the translated hits to the main index, together with the lite
  //    index's last added document id.
  const DocumentId lite_last_added_id = lite_index_->last_added_document_id();
  ICING_RETURN_IF_ERROR(main_index_->AddHits(
      *term_id_codec_, std::move(lexicon_outputs.backfill_map),
      std::move(translated_hits), lite_last_added_id));

  // 4. Persist the main index before resetting the lite index.
  ICING_RETURN_IF_ERROR(main_index_->PersistToDisk());
  return lite_index_->Reset();
}
// Whether the LiteIndex HitBuffer requires sorting. This is only true if
// Icing has enabled sorting during indexing time, and the HitBuffer's
// unsorted tail has exceeded the lite_index_sort_size.
//
// The lite index is only queried when sorting at indexing time is enabled.
bool LiteIndexNeedSort() const {
  if (!options_.lite_index_sort_at_indexing) {
    return false;
  }
  return lite_index_->HasUnsortedHitsExceedingSortThreshold();
}
// Sorts the LiteIndex HitBuffer by forwarding to the lite index's SortHits().
void SortLiteIndex() { (*lite_index_).SortHits(); }
// Reduces internal file sizes by reclaiming space of deleted documents.
//
//   document_id_old_to_new: NOTE(review): presumably indexed by old document
//     id, holding the corresponding new document id - confirm against the
//     implementation.
//   new_last_added_document_id: will be used to update the last added
//     document id in the lite index.
//
// Returns:
//   OK on success
//   INTERNAL_ERROR on IO error, this indicates that the index may be in an
//     invalid state and should be cleared.
libtextclassifier3::Status Optimize(
const std::vector<DocumentId>& document_id_old_to_new,
DocumentId new_last_added_document_id);
private:
// Private constructor; the public way to obtain an Index is the static
// Create() factory.
//
// Takes ownership of term_id_codec, lite_index and main_index by moving them
// into the corresponding members, copies options, and stores filesystem as a
// raw pointer (presumably it must outlive this Index - confirm with callers).
Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec,
std::unique_ptr<LiteIndex> lite_index,
std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem)
: lite_index_(std::move(lite_index)),
main_index_(std::move(main_index)),
options_(options),
term_id_codec_(std::move(term_id_codec)),
filesystem_(filesystem) {}
// Private helper with the same return type as FindTermsByPrefix, but without
// the num_to_return and scoring_match_type parameters.
// NOTE(review): judging by the name, this looks up prefix matches in the lite
// index only - confirm against the implementation.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
const std::string& prefix,
SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
const SuggestionResultChecker* suggestion_result_checker);
// Owned lite index; Merge() moves its hits into the main index.
std::unique_ptr<LiteIndex> lite_index_;
// Owned main index.
std::unique_ptr<MainIndex> main_index_;
// Copy of the options this Index was constructed with.
const Options options_;
// Owned codec, shared (as a raw pointer) with Editors created by Edit().
std::unique_ptr<TermIdCodec> term_id_codec_;
// Raw pointer supplied at construction; never deleted in this header.
const Filesystem* filesystem_;
};
} // namespace lib
} // namespace icing
#endif // ICING_INDEX_INDEX_H_