// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/icing-search-engine.h"
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/annotate.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/destructible-file.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
#include "icing/file/version-util.h"
#include "icing/index/data-indexing-handler.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
#include "icing/index/integer-section-indexing-handler.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/integer-index.h"
#include "icing/index/string-section-indexing-handler.h"
#include "icing/join/join-processor.h"
#include "icing/join/qualified-id-join-index.h"
#include "icing/join/qualified-id-join-indexing-handler.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/portable/endian.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/internal/optimize.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/proto/usage.pb.h"
#include "icing/query/advanced_query_parser/lexer.h"
#include "icing/query/query-features.h"
#include "icing/query/query-processor.h"
#include "icing/query/query-results.h"
#include "icing/query/suggestion-processor.h"
#include "icing/result/page-result.h"
#include "icing/result/projection-tree.h"
#include "icing/result/projector.h"
#include "icing/result/result-adjustment-info.h"
#include "icing/result/result-retriever-v2.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
#include "icing/scoring/advanced_scoring/score-expression.h"
#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
#include "icing/scoring/scored-document-hit.h"
#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/scoring/scoring-processor.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
#include "icing/util/tokenized-document.h"
#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
constexpr std::string_view kVersionFilename = "version";
constexpr std::string_view kDocumentSubfolderName = "document_dir";
constexpr std::string_view kIndexSubfolderName = "index_dir";
constexpr std::string_view kIntegerIndexSubfolderName = "integer_index_dir";
constexpr std::string_view kQualifiedIdJoinIndexSubfolderName =
"qualified_id_join_index_dir";
constexpr std::string_view kSchemaSubfolderName = "schema_dir";
constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker";
constexpr std::string_view kInitMarkerFilename = "init_marker";
constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
// The maximum number of unsuccessful initialization attempts from the current
// state that we will tolerate before deleting all data and starting from a
// fresh state.
constexpr int kMaxUnsuccessfulInitAttempts = 5;
// A pair that holds namespace and type.
struct NamespaceTypePair {
std::string namespace_;
std::string type;
bool operator==(const NamespaceTypePair& other) const {
return namespace_ == other.namespace_ && type == other.type;
}
};
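// Hasher for NamespaceTypePair so that it can be used as a key in unordered
// containers. Combines the hashes of the namespace and type with XOR.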
struct NamespaceTypePairHasher {
std::size_t operator()(const NamespaceTypePair& pair) const {
return std::hash<std::string>()(pair.namespace_) ^
std::hash<std::string>()(pair.type);
}
};
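// Validates the given ResultSpecProto: page size, byte threshold, joined
// children limit, and num_to_score must be in range, each result grouping
// must have max_results > 0, and every (namespace, schema) entry must map to
// a unique result grouping entry id across result groups.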
libtextclassifier3::Status ValidateResultSpec(
const DocumentStore* document_store, const ResultSpecProto& result_spec) {
if (result_spec.num_per_page() < 0) {
return absl_ports::InvalidArgumentError(
"ResultSpecProto.num_per_page cannot be negative.");
}
if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
return absl_ports::InvalidArgumentError(
"ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
"non-positive.");
}
if (result_spec.max_joined_children_per_parent_to_return() < 0) {
return absl_ports::InvalidArgumentError(
"ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
"negative.");
}
if (result_spec.num_to_score() <= 0) {
return absl_ports::InvalidArgumentError(
"ResultSpecProto.num_to_score cannot be non-positive.");
}
// Validate ResultGroupings.
std::unordered_set<int32_t> unique_entry_ids;
ResultSpecProto::ResultGroupingType result_grouping_type =
result_spec.result_group_type();
for (const ResultSpecProto::ResultGrouping& result_grouping :
result_spec.result_groupings()) {
if (result_grouping.max_results() <= 0) {
return absl_ports::InvalidArgumentError(
"Cannot specify a result grouping with max results <= 0.");
}
for (const ResultSpecProto::ResultGrouping::Entry& entry :
result_grouping.entry_groupings()) {
const std::string& name_space = entry.namespace_();
const std::string& schema = entry.schema();
auto entry_id_or = document_store->GetResultGroupingEntryId(
result_grouping_type, name_space, schema);
if (!entry_id_or.ok()) {
continue;
}
int32_t entry_id = entry_id_or.ValueOrDie();
if (unique_entry_ids.find(entry_id) != unique_entry_ids.end()) {
return absl_ports::InvalidArgumentError(
"Entry Ids must be unique across result groups.");
}
unique_entry_ids.insert(entry_id);
}
}
return libtextclassifier3::Status::OK;
}
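// Validates the given SearchSpecProto against the performance configuration:
// rejects queries longer than the maximum allowed query length and any entry
// in enabled_features that is not a known query feature.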
libtextclassifier3::Status ValidateSearchSpec(
const SearchSpecProto& search_spec,
const PerformanceConfiguration& configuration) {
if (search_spec.query().size() > configuration.max_query_length) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("SearchSpecProto.query is longer than the maximum "
"allowed query length: ",
std::to_string(configuration.max_query_length)));
}
// Check that no unknown features have been enabled in the search spec.
std::unordered_set<Feature> query_features_set = GetQueryFeaturesSet();
for (const Feature feature : search_spec.enabled_features()) {
if (query_features_set.find(feature) == query_features_set.end()) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("Unknown feature in "
"SearchSpecProto.enabled_features: ",
feature));
}
}
return libtextclassifier3::Status::OK;
}
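// Validates the given SuggestionSpecProto: the prefix must be non-empty and
// within the maximum allowed query length, the scoring match type must be
// known, and num_to_return must be positive.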
libtextclassifier3::Status ValidateSuggestionSpec(
const SuggestionSpecProto& suggestion_spec,
const PerformanceConfiguration& configuration) {
if (suggestion_spec.prefix().empty()) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
}
if (suggestion_spec.scoring_spec().scoring_match_type() ==
TermMatchType::UNKNOWN) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
}
if (suggestion_spec.num_to_return() <= 0) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"SuggestionSpecProto.num_to_return must be positive."));
}
if (suggestion_spec.prefix().size() > configuration.max_query_length) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
"maximum allowed prefix length: ",
std::to_string(configuration.max_query_length)));
}
return libtextclassifier3::Status::OK;
}
// Version file is a single file under base_dir containing version info of the
// existing data.
std::string MakeVersionFilePath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kVersionFilename);
}
// Document store files are in a standalone subfolder for easier file
// management. We can delete and recreate the subfolder and not touch/affect
// anything else.
std::string MakeDocumentDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName);
}
// Makes a temporary folder path for the document store which will be used
// during full optimization.
std::string MakeDocumentTemporaryDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName,
"_optimize_tmp");
}
// Index files are in a standalone subfolder for easier file management. We
// can delete and recreate the subfolder and not touch/affect anything else.
std::string MakeIndexDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kIndexSubfolderName);
}
// Working path for the integer index. The integer index is derived from
// PersistentStorage and takes full ownership of this working path, including
// creation/deletion. See PersistentStorage for more details about working
// paths.
std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kIntegerIndexSubfolderName);
}
// Working path for the qualified id join index. It is derived from
// PersistentStorage and takes full ownership of this working path, including
// creation/deletion. See PersistentStorage for more details about working
// paths.
std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName);
}
// SchemaStore files are in a standalone subfolder for easier file management.
// We can delete and recreate the subfolder and not touch/affect anything
// else.
std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
}
std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
}
std::string MakeInitMarkerFilePath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
}
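// Maps an internal libtextclassifier3::Status onto the StatusProto returned
// through the public API. Non-OK statuses are logged, and canonical codes
// without an external equivalent are reported as StatusProto::UNKNOWN.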
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
StatusProto::Code code;
if (!internal_status.ok()) {
ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
<< ", Message: " << internal_status.error_message();
}
switch (internal_status.CanonicalCode()) {
case libtextclassifier3::StatusCode::OK:
code = StatusProto::OK;
break;
case libtextclassifier3::StatusCode::DATA_LOSS:
code = StatusProto::WARNING_DATA_LOSS;
break;
case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
code = StatusProto::INVALID_ARGUMENT;
break;
case libtextclassifier3::StatusCode::NOT_FOUND:
code = StatusProto::NOT_FOUND;
break;
case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
code = StatusProto::FAILED_PRECONDITION;
break;
case libtextclassifier3::StatusCode::ABORTED:
code = StatusProto::ABORTED;
break;
case libtextclassifier3::StatusCode::INTERNAL:
// TODO(b/147699081): Cleanup our internal use of INTERNAL since it
// doesn't match with what it *should* indicate as described in
// go/icing-library-apis.
code = StatusProto::INTERNAL;
break;
case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
// TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
// (e.g. if the document log is full). And we use RESOURCE_EXHAUSTED
// internally to indicate other resources are exhausted (e.g.
// DocHitInfos) - although none of these are exposed through the API.
// Consider separating the two cases out more clearly.
code = StatusProto::OUT_OF_SPACE;
break;
case libtextclassifier3::StatusCode::ALREADY_EXISTS:
code = StatusProto::ALREADY_EXISTS;
break;
case libtextclassifier3::StatusCode::CANCELLED:
[[fallthrough]];
case libtextclassifier3::StatusCode::UNKNOWN:
[[fallthrough]];
case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
[[fallthrough]];
case libtextclassifier3::StatusCode::PERMISSION_DENIED:
[[fallthrough]];
case libtextclassifier3::StatusCode::OUT_OF_RANGE:
[[fallthrough]];
case libtextclassifier3::StatusCode::UNIMPLEMENTED:
[[fallthrough]];
case libtextclassifier3::StatusCode::UNAVAILABLE:
[[fallthrough]];
case libtextclassifier3::StatusCode::UNAUTHENTICATED:
// Other internal status codes aren't supported externally yet. If one
// needs to be supported, add another switch-case above.
ICING_LOG(ERROR) << "Internal status code "
<< internal_status.error_code()
<< " not supported in the external API";
code = StatusProto::UNKNOWN;
break;
}
status_proto->set_code(code);
status_proto->set_message(internal_status.error_message());
}
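// Retrieves the document with the given document_id and appends its uri to
// the (namespace, type) group in result_proto, creating a new group and
// caching it in info_map if one does not exist yet.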
libtextclassifier3::Status RetrieveAndAddDocumentInfo(
const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
std::unordered_map<NamespaceTypePair,
DeleteByQueryResultProto::DocumentGroupInfo*,
NamespaceTypePairHasher>& info_map,
DocumentId document_id) {
ICING_ASSIGN_OR_RETURN(DocumentProto document,
document_store->Get(document_id));
NamespaceTypePair key = {document.namespace_(), document.schema()};
auto iter = info_map.find(key);
if (iter == info_map.end()) {
auto entry = result_proto.add_deleted_documents();
entry->set_namespace_(std::move(document.namespace_()));
entry->set_schema(std::move(document.schema()));
entry->add_uris(std::move(document.uri()));
info_map[key] = entry;
} else {
iter->second->add_uris(std::move(document.uri()));
}
return libtextclassifier3::Status::OK;
}
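// Returns true if the number of deleted and expired documents is at least
// optimize_rebuild_index_threshold times the number of original documents, in
// which case a full index rebuild is expected to be cheaper than translating
// the existing index.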
bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats,
float optimize_rebuild_index_threshold) {
int num_invalid_documents = optimize_stats.num_deleted_documents() +
optimize_stats.num_expired_documents();
return num_invalid_documents >= optimize_stats.num_original_documents() *
optimize_rebuild_index_threshold;
}
// Returns the RankingStrategy to use for the given ScoringSpecProto. If no
// advanced scoring expression is set, the spec's rank_by strategy is returned
// as-is. Otherwise, when the "RelevanceScore" function appears in the advanced
// scoring expression, RELEVANCE_SCORE is returned so that the information
// needed for calculating relevance scores is prepared; if it does not appear,
// NONE is returned.
libtextclassifier3::StatusOr<ScoringSpecProto::RankingStrategy::Code>
GetRankingStrategyFromScoringSpec(const ScoringSpecProto& scoring_spec) {
if (scoring_spec.advanced_scoring_expression().empty()) {
return scoring_spec.rank_by();
}
// TODO(b/261474063) The Lexer will be called again when creating the
// AdvancedScorer instance. Consider refactoring the code to allow the Lexer
// to be called only once.
Lexer lexer(scoring_spec.advanced_scoring_expression(),
Lexer::Language::SCORING);
ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
lexer.ExtractTokens());
for (const Lexer::LexerToken& token : lexer_tokens) {
if (token.type == Lexer::TokenType::FUNCTION_NAME &&
token.text == RelevanceScoreFunctionScoreExpression::kFunctionName) {
return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
}
}
return ScoringSpecProto::RankingStrategy::NONE;
}
} // namespace
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
std::unique_ptr<const JniCache> jni_cache)
: IcingSearchEngine(options, std::make_unique<Filesystem>(),
std::make_unique<IcingFilesystem>(),
std::make_unique<Clock>(), std::move(jni_cache)) {}
IcingSearchEngine::IcingSearchEngine(
IcingSearchEngineOptions options,
std::unique_ptr<const Filesystem> filesystem,
std::unique_ptr<const IcingFilesystem> icing_filesystem,
std::unique_ptr<Clock> clock, std::unique_ptr<const JniCache> jni_cache)
: options_(std::move(options)),
filesystem_(std::move(filesystem)),
icing_filesystem_(std::move(icing_filesystem)),
clock_(std::move(clock)),
jni_cache_(std::move(jni_cache)) {
ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
}
IcingSearchEngine::~IcingSearchEngine() {
if (initialized_) {
if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) {
ICING_LOG(ERROR)
<< "Error persisting to disk in IcingSearchEngine destructor";
}
}
}
InitializeResultProto IcingSearchEngine::Initialize() {
// This method does both reads and writes, so we need a writer lock for the
// entire call. Acquiring a reader lock first and then switching to a writer
// lock could be interrupted between the two acquisitions.
absl_ports::unique_lock l(&mutex_);
return InternalInitialize();
}
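// Resets all members that depend on data in the base directory so that their
// file handles are released, e.g. before the base directory is wiped.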
void IcingSearchEngine::ResetMembers() {
schema_store_.reset();
document_store_.reset();
language_segmenter_.reset();
normalizer_.reset();
index_.reset();
integer_index_.reset();
qualified_id_join_index_.reset();
}
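// Reads the init marker file that tracks consecutive failed initialization
// attempts. If the count exceeds kMaxUnsuccessfulInitAttempts, all data in the
// base directory is discarded and a DATA_LOSS status is returned. The
// incremented attempt count is written back and synced before initialization
// proceeds.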
libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
InitializeStatsProto* initialize_stats) {
// Check to see if the marker file exists and if we've already passed our max
// number of init attempts.
std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
int network_init_attempts = 0;
int host_init_attempts = 0;
// Read the number of previous failed init attempts from the file. If the
// read fails, then just assume the value is zero (the most likely reason for
// failure would be non-existence because the last init was successful
// anyway).
std::unique_ptr<ScopedFd> marker_file_fd = std::make_unique<ScopedFd>(
filesystem_->OpenForWrite(marker_filepath.c_str()));
libtextclassifier3::Status status;
if (file_exists &&
filesystem_->PRead(marker_file_fd->get(), &network_init_attempts,
sizeof(network_init_attempts), /*offset=*/0)) {
host_init_attempts = GNetworkToHostL(network_init_attempts);
if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
// We've tried and failed to init too many times. We need to throw
// everything out and start from scratch.
ResetMembers();
marker_file_fd.reset();
// Delete the entire base directory.
if (!filesystem_->DeleteDirectoryRecursively(
options_.base_dir().c_str())) {
return absl_ports::InternalError("Failed to delete icing base dir!");
}
// Create the base directory again and reopen marker file.
if (!filesystem_->CreateDirectoryRecursively(
options_.base_dir().c_str())) {
return absl_ports::InternalError("Failed to create icing base dir!");
}
marker_file_fd = std::make_unique<ScopedFd>(
filesystem_->OpenForWrite(marker_filepath.c_str()));
status = absl_ports::DataLossError(
"Encountered failed initialization limit. Cleared all data.");
host_init_attempts = 0;
}
}
// Use network_init_attempts here because we might have set host_init_attempts
// to 0 if it exceeded the max threshold.
initialize_stats->set_num_previous_init_failures(
GNetworkToHostL(network_init_attempts));
++host_init_attempts;
network_init_attempts = GHostToNetworkL(host_init_attempts);
// Write the updated number of attempts before we get started.
if (!filesystem_->PWrite(marker_file_fd->get(), /*offset=*/0,
&network_init_attempts,
sizeof(network_init_attempts)) ||
!filesystem_->DataSync(marker_file_fd->get())) {
return absl_ports::InternalError(
"Failed to write and sync init marker file");
}
return status;
}
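// Does the actual work of Initialize(). Assumes the caller holds the writer
// lock. On success (including recoverable DATA_LOSS), deletes the init marker
// file and marks the engine as initialized.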
InitializeResultProto IcingSearchEngine::InternalInitialize() {
ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
<< options_.base_dir();
// Measure the latency of the initialization process.
std::unique_ptr<Timer> initialize_timer = clock_->GetNewTimer();
InitializeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
InitializeStatsProto* initialize_stats =
result_proto.mutable_initialize_stats();
if (initialized_) {
// Already initialized.
result_status->set_code(StatusProto::OK);
initialize_stats->set_latency_ms(
initialize_timer->GetElapsedMilliseconds());
initialize_stats->set_num_documents(document_store_->num_documents());
return result_proto;
}
// Now go ahead and try to initialize.
libtextclassifier3::Status status = InitializeMembers(initialize_stats);
if (status.ok() || absl_ports::IsDataLoss(status)) {
// We successfully initialized. We should delete the init marker file to
// indicate a successful init.
std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
status = absl_ports::InternalError("Failed to delete init marker file!");
} else {
initialized_ = true;
}
}
TransformStatus(status, result_status);
initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
return result_proto;
}
libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
// Make sure the base directory exists
if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
return absl_ports::InternalError(absl_ports::StrCat(
"Could not create directory: ", options_.base_dir()));
}
// Check to see if the marker file exists and if we've already passed our max
// number of init attempts.
libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
if (!status.ok() && !absl_ports::IsDataLoss(status)) {
return status;
}
// Read version file and determine the state change.
const std::string version_filepath = MakeVersionFilePath(options_.base_dir());
const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
ICING_ASSIGN_OR_RETURN(
version_util::VersionInfo version_info,
version_util::ReadVersion(*filesystem_, version_filepath, index_dir));
version_util::StateChange version_state_change =
version_util::GetVersionStateChange(version_info);
if (version_state_change != version_util::StateChange::kCompatible) {
// Step 1: migrate schema according to the version state change.
ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()),
version_state_change, version_util::kVersion));
// Step 2: discard all derived data if a rebuild is needed.
if (version_util::ShouldRebuildDerivedFiles(version_info)) {
ICING_RETURN_IF_ERROR(DiscardDerivedFiles());
}
// Step 3: update version file
version_util::VersionInfo new_version_info(
version_util::kVersion,
std::max(version_info.max_version, version_util::kVersion));
ICING_RETURN_IF_ERROR(version_util::WriteVersion(
*filesystem_, version_filepath, new_version_info));
}
ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
// TODO(b/156383798) : Resolve how to specify the locale.
language_segmenter_factory::SegmenterOptions segmenter_options(
ULOC_US, jni_cache_.get());
TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
std::move(segmenter_options)));
TC3_ASSIGN_OR_RETURN(normalizer_,
normalizer_factory::Create(options_.max_token_length()));
std::string marker_filepath =
MakeSetSchemaMarkerFilePath(options_.base_dir());
libtextclassifier3::Status index_init_status;
if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
// The schema was either lost or never set before. Wipe out the doc store
// and index directories and initialize them from scratch.
const std::string doc_store_dir =
MakeDocumentDirectoryPath(options_.base_dir());
const std::string integer_index_dir =
MakeIntegerIndexWorkingPath(options_.base_dir());
const std::string qualified_id_join_index_dir =
MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
!IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() ||
!QualifiedIdJoinIndex::Discard(*filesystem_,
qualified_id_join_index_dir)
.ok()) {
return absl_ports::InternalError(absl_ports::StrCat(
"Could not delete directories: ", index_dir, ", ", integer_index_dir,
", ", qualified_id_join_index_dir, " and ", doc_store_dir));
}
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/false, initialize_stats));
index_init_status = InitializeIndex(initialize_stats);
if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
return index_init_status;
}
} else if (filesystem_->FileExists(marker_filepath.c_str())) {
// If the marker file is still around then something wonky happened when we
// last tried to set the schema.
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/true, initialize_stats));
// We're going to need to build the index from scratch. So just delete its
// directory now.
// Discard index directory and instantiate a new one.
Index::Options index_options(index_dir, options_.index_merge_size(),
options_.lite_index_sort_at_indexing(),
options_.lite_index_sort_size());
if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
return absl_ports::InternalError(
absl_ports::StrCat("Could not recreate directory: ", index_dir));
}
ICING_ASSIGN_OR_RETURN(index_,
Index::Create(index_options, filesystem_.get(),
icing_filesystem_.get()));
// Discard integer index directory and instantiate a new one.
std::string integer_index_dir =
MakeIntegerIndexWorkingPath(options_.base_dir());
ICING_RETURN_IF_ERROR(
IntegerIndex::Discard(*filesystem_, integer_index_dir));
ICING_ASSIGN_OR_RETURN(
integer_index_,
IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
options_.integer_index_bucket_split_threshold(),
options_.pre_mapping_fbv()));
// Discard qualified id join index directory and instantiate a new one.
std::string qualified_id_join_index_dir =
MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
*filesystem_, qualified_id_join_index_dir));
ICING_ASSIGN_OR_RETURN(
qualified_id_join_index_,
QualifiedIdJoinIndex::Create(
*filesystem_, std::move(qualified_id_join_index_dir),
options_.pre_mapping_fbv(), options_.use_persistent_hash_map()));
std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
IndexRestorationResult restore_result = RestoreIndexIfNeeded();
index_init_status = std::move(restore_result.status);
// DATA_LOSS means that we have successfully initialized and re-added
// content to the index. Some indexed content was lost, but otherwise the
// index is in a valid state and can be queried.
if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
return index_init_status;
}
// Delete the marker file to indicate that everything is now in sync with
// whatever changes were made to the schema.
filesystem_->DeleteFile(marker_filepath.c_str());
initialize_stats->set_index_restoration_latency_ms(
restore_timer->GetElapsedMilliseconds());
initialize_stats->set_index_restoration_cause(
InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
initialize_stats->set_integer_index_restoration_cause(
InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
initialize_stats->set_qualified_id_join_index_restoration_cause(
InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
} else if (version_state_change != version_util::StateChange::kCompatible) {
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/true, initialize_stats));
index_init_status = InitializeIndex(initialize_stats);
if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
return index_init_status;
}
initialize_stats->set_schema_store_recovery_cause(
InitializeStatsProto::VERSION_CHANGED);
initialize_stats->set_document_store_recovery_cause(
InitializeStatsProto::VERSION_CHANGED);
initialize_stats->set_index_restoration_cause(
InitializeStatsProto::VERSION_CHANGED);
initialize_stats->set_integer_index_restoration_cause(
InitializeStatsProto::VERSION_CHANGED);
initialize_stats->set_qualified_id_join_index_restoration_cause(
InitializeStatsProto::VERSION_CHANGED);
} else {
ICING_RETURN_IF_ERROR(InitializeDocumentStore(
/*force_recovery_and_revalidate_documents=*/false, initialize_stats));
index_init_status = InitializeIndex(initialize_stats);
if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
return index_init_status;
}
}
if (status.ok()) {
status = index_init_status;
}
result_state_manager_ = std::make_unique<ResultStateManager>(
performance_configuration_.max_num_total_hits, *document_store_);
return status;
}
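// Ensures the schema subdirectory exists and creates schema_store_ from it.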
libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
const std::string schema_store_dir =
MakeSchemaDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
if (!filesystem_->CreateDirectoryRecursively(schema_store_dir.c_str())) {
return absl_ports::InternalError(
absl_ports::StrCat("Could not create directory: ", schema_store_dir));
}
ICING_ASSIGN_OR_RETURN(
schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir,
clock_.get(), initialize_stats));
return libtextclassifier3::Status::OK;
}
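// Ensures the document subdirectory exists and creates document_store_ from
// it, optionally forcing recovery and revalidation of all documents.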
libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore(
bool force_recovery_and_revalidate_documents,
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
const std::string document_dir =
MakeDocumentDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
if (!filesystem_->CreateDirectoryRecursively(document_dir.c_str())) {
return absl_ports::InternalError(
absl_ports::StrCat("Could not create directory: ", document_dir));
}
ICING_ASSIGN_OR_RETURN(
DocumentStore::CreateResult create_result,
DocumentStore::Create(
filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
force_recovery_and_revalidate_documents,
options_.document_store_namespace_id_fingerprint(),
options_.pre_mapping_fbv(), options_.use_persistent_hash_map(),
options_.compression_level(), initialize_stats));
document_store_ = std::move(create_result.document_store);
return libtextclassifier3::Status::OK;
}
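// Creates the term index, integer index, and qualified id join index. Any
// index that fails to load is discarded and recreated from scratch, and
// RestoreIndexIfNeeded() re-adds its content from the document store.
// Restoration causes and latency are recorded in initialize_stats.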
libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(initialize_stats);
const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
return absl_ports::InternalError(
absl_ports::StrCat("Could not create directory: ", index_dir));
}
Index::Options index_options(index_dir, options_.index_merge_size(),
options_.lite_index_sort_at_indexing(),
options_.lite_index_sort_size());
// Term index
InitializeStatsProto::RecoveryCause index_recovery_cause;
auto index_or =
Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
if (!index_or.ok()) {
if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
return absl_ports::InternalError(
absl_ports::StrCat("Could not recreate directory: ", index_dir));
}
index_recovery_cause = InitializeStatsProto::IO_ERROR;
// Try recreating it from scratch and re-indexing everything.
ICING_ASSIGN_OR_RETURN(index_,
Index::Create(index_options, filesystem_.get(),
icing_filesystem_.get()));
} else {
// Index was created fine.
index_ = std::move(index_or).ValueOrDie();
// If a recovery does have to happen, then it must be because the index is
// out of sync with the document store.
index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
}
// Integer index
std::string integer_index_dir =
MakeIntegerIndexWorkingPath(options_.base_dir());
InitializeStatsProto::RecoveryCause integer_index_recovery_cause;
auto integer_index_or =
IntegerIndex::Create(*filesystem_, integer_index_dir,
options_.integer_index_bucket_split_threshold(),
options_.pre_mapping_fbv());
if (!integer_index_or.ok()) {
ICING_RETURN_IF_ERROR(
IntegerIndex::Discard(*filesystem_, integer_index_dir));
integer_index_recovery_cause = InitializeStatsProto::IO_ERROR;
// Try recreating it from scratch and re-indexing everything.
ICING_ASSIGN_OR_RETURN(
integer_index_,
IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
options_.integer_index_bucket_split_threshold(),
options_.pre_mapping_fbv()));
} else {
// Integer index was created fine.
integer_index_ = std::move(integer_index_or).ValueOrDie();
// If a recovery does have to happen, then it must be because the index is
// out of sync with the document store.
integer_index_recovery_cause =
InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
}
// Qualified id join index
std::string qualified_id_join_index_dir =
MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause;
auto qualified_id_join_index_or = QualifiedIdJoinIndex::Create(
*filesystem_, qualified_id_join_index_dir, options_.pre_mapping_fbv(),
options_.use_persistent_hash_map());
if (!qualified_id_join_index_or.ok()) {
ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
*filesystem_, qualified_id_join_index_dir));
qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR;
// Try recreating it from scratch and rebuilding everything.
ICING_ASSIGN_OR_RETURN(
qualified_id_join_index_,
QualifiedIdJoinIndex::Create(
*filesystem_, std::move(qualified_id_join_index_dir),
options_.pre_mapping_fbv(), options_.use_persistent_hash_map()));
} else {
// Qualified id join index was created fine.
qualified_id_join_index_ =
std::move(qualified_id_join_index_or).ValueOrDie();
// If a recovery does have to happen, then it must be because the index is
// out of sync with the document store.
qualified_id_join_index_recovery_cause =
InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
}
std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
IndexRestorationResult restore_result = RestoreIndexIfNeeded();
if (restore_result.index_needed_restoration ||
restore_result.integer_index_needed_restoration ||
restore_result.qualified_id_join_index_needed_restoration) {
initialize_stats->set_index_restoration_latency_ms(
restore_timer->GetElapsedMilliseconds());
if (restore_result.index_needed_restoration) {
initialize_stats->set_index_restoration_cause(index_recovery_cause);
}
if (restore_result.integer_index_needed_restoration) {
initialize_stats->set_integer_index_restoration_cause(
integer_index_recovery_cause);
}
if (restore_result.qualified_id_join_index_needed_restoration) {
initialize_stats->set_qualified_id_join_index_restoration_cause(
qualified_id_join_index_recovery_cause);
}
}
return restore_result.status;
}
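// Copying overload of SetSchema(); forwards a copy of new_schema to the move
// overload below.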
SetSchemaResultProto IcingSearchEngine::SetSchema(
const SchemaProto& new_schema, bool ignore_errors_and_delete_documents) {
return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
}
SetSchemaResultProto IcingSearchEngine::SetSchema(
SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) {
ICING_VLOG(1) << "Setting new Schema";
SetSchemaResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
result_proto.set_latency_ms(t);
});
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto lost_previous_schema_or = LostPreviousSchema();
if (!lost_previous_schema_or.ok()) {
TransformStatus(lost_previous_schema_or.status(), result_status);
return result_proto;
}
bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
std::string marker_filepath =
MakeSetSchemaMarkerFilePath(options_.base_dir());
// Create the marker file indicating that we are going to apply a schema
// change. No need to write anything to the marker file - its existence is
// the only thing that matters. The marker file is used to indicate if we
// encountered a crash or a power loss while updating the schema and other
// files. So set it up to be deleted whenever we return from this function.
DestructibleFile marker_file(marker_filepath, filesystem_.get());
auto set_schema_result_or = schema_store_->SetSchema(
std::move(new_schema), ignore_errors_and_delete_documents,
options_.allow_circular_schema_definitions());
if (!set_schema_result_or.ok()) {
TransformStatus(set_schema_result_or.status(), result_status);
return result_proto;
}
SchemaStore::SetSchemaResult set_schema_result =
std::move(set_schema_result_or).ValueOrDie();
for (const std::string& deleted_type :
set_schema_result.schema_types_deleted_by_name) {
result_proto.add_deleted_schema_types(deleted_type);
}
for (const std::string& incompatible_type :
set_schema_result.schema_types_incompatible_by_name) {
result_proto.add_incompatible_schema_types(incompatible_type);
}
for (const std::string& new_type :
set_schema_result.schema_types_new_by_name) {
result_proto.add_new_schema_types(std::move(new_type));
}
for (const std::string& compatible_type :
set_schema_result.schema_types_changed_fully_compatible_by_name) {
result_proto.add_fully_compatible_changed_schema_types(
std::move(compatible_type));
}
bool index_incompatible =
!set_schema_result.schema_types_index_incompatible_by_name.empty();
for (const std::string& index_incompatible_type :
set_schema_result.schema_types_index_incompatible_by_name) {
result_proto.add_index_incompatible_changed_schema_types(
std::move(index_incompatible_type));
}
bool join_incompatible =
!set_schema_result.schema_types_join_incompatible_by_name.empty();
for (const std::string& join_incompatible_type :
set_schema_result.schema_types_join_incompatible_by_name) {
result_proto.add_join_incompatible_changed_schema_types(
std::move(join_incompatible_type));
}
libtextclassifier3::Status status;
if (set_schema_result.success) {
if (lost_previous_schema) {
// No previous schema to calculate a diff against. We have to go through
// and revalidate all the Documents in the DocumentStore.
status = document_store_->UpdateSchemaStore(schema_store_.get());
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
} else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
!set_schema_result.schema_types_incompatible_by_id.empty() ||
!set_schema_result.schema_types_deleted_by_id.empty()) {
status = document_store_->OptimizedUpdateSchemaStore(schema_store_.get(),
set_schema_result);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
}
if (lost_previous_schema || index_incompatible) {
// Clears search indices
status = ClearSearchIndices();
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
}
if (lost_previous_schema || join_incompatible) {
// Clears join indices
status = ClearJoinIndices();
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
}
if (lost_previous_schema || index_incompatible || join_incompatible) {
IndexRestorationResult restore_result = RestoreIndexIfNeeded();
// DATA_LOSS means that we have successfully re-added content to the
// index. Some indexed content was lost, but otherwise the index is in a
// valid state and can be queried.
if (!restore_result.status.ok() &&
!absl_ports::IsDataLoss(restore_result.status)) {
TransformStatus(restore_result.status, result_status);
return result_proto;
}
}
result_status->set_code(StatusProto::OK);
} else {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("Schema is incompatible.");
}
return result_proto;
}
GetSchemaResultProto IcingSearchEngine::GetSchema() {
GetSchemaResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto schema_or = schema_store_->GetSchema();
if (!schema_or.ok()) {
TransformStatus(schema_or.status(), result_status);
return result_proto;
}
result_status->set_code(StatusProto::OK);
*result_proto.mutable_schema() = *std::move(schema_or).ValueOrDie();
return result_proto;
}
GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType(
std::string_view schema_type) {
GetSchemaTypeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
if (!type_config_or.ok()) {
TransformStatus(type_config_or.status(), result_status);
return result_proto;
}
result_status->set_code(StatusProto::OK);
*result_proto.mutable_schema_type_config() = *(type_config_or.ValueOrDie());
return result_proto;
}
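// Copying overload of Put(); forwards a copy of the document to the move
// overload below.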
PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
return Put(DocumentProto(document));
}
PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
ICING_VLOG(1) << "Writing document to document store";
PutResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
PutDocumentStatsProto* put_document_stats =
result_proto.mutable_put_document_stats();
ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
put_document_stats->set_latency_ms(t);
});
// Lock must be acquired before validation because the DocumentStore uses
// the schema file to validate, and the schema could be changed in
// SetSchema() which is protected by the same mutex.
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto tokenized_document_or = TokenizedDocument::Create(
schema_store_.get(), language_segmenter_.get(), std::move(document));
if (!tokenized_document_or.ok()) {
TransformStatus(tokenized_document_or.status(), result_status);
return result_proto;
}
TokenizedDocument tokenized_document(
std::move(tokenized_document_or).ValueOrDie());
auto document_id_or = document_store_->Put(
tokenized_document.document(), tokenized_document.num_string_tokens(),
put_document_stats);
if (!document_id_or.ok()) {
TransformStatus(document_id_or.status(), result_status);
return result_proto;
}
DocumentId document_id = document_id_or.ValueOrDie();
auto data_indexing_handlers_or = CreateDataIndexingHandlers();
if (!data_indexing_handlers_or.ok()) {
TransformStatus(data_indexing_handlers_or.status(), result_status);
return result_proto;
}
IndexProcessor index_processor(
std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
auto index_status = index_processor.IndexDocument(
tokenized_document, document_id, put_document_stats);
// Getting an internal error from the index could possibly mean that the index
// is broken. Try to rebuild the indices to recover.
if (absl_ports::IsInternal(index_status)) {
ICING_LOG(ERROR) << "Got an internal error from the index. Trying to "
"rebuild the index!\n"
<< index_status.error_message();
index_status = ClearAllIndices();
if (index_status.ok()) {
index_status = RestoreIndexIfNeeded().status;
if (!index_status.ok()) {
ICING_LOG(ERROR) << "Failed to reindex documents after a failure of "
"indexing a document.";
}
} else {
ICING_LOG(ERROR)
<< "Failed to clear indices after a failure of indexing a document.";
}
}
if (!index_status.ok()) {
// If we encountered a failure or cannot resolve an internal error while
// indexing this document, then mark it as deleted.
int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
libtextclassifier3::Status delete_status =
document_store_->Delete(document_id, current_time_ms);
if (!delete_status.ok()) {
// This is pretty dire (and, hopefully, unlikely). We can't roll back the
// document that we just added. Wipe out the whole Icing search engine.
ICING_LOG(ERROR) << "Cannot delete the document that failed to index. "
"Wiping out the whole Icing search engine.";
ResetInternal();
}
}
TransformStatus(index_status, result_status);
return result_proto;
}
GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
const std::string_view uri,
const GetResultSpecProto& result_spec) {
GetResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto document_or = document_store_->Get(name_space, uri);
if (!document_or.ok()) {
TransformStatus(document_or.status(), result_status);
return result_proto;
}
DocumentProto document = std::move(document_or).ValueOrDie();
std::unique_ptr<ProjectionTree> type_projection_tree;
std::unique_ptr<ProjectionTree> wildcard_projection_tree;
for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
schema_store_->ExpandTypePropertyMasks(
result_spec.type_property_masks())) {
if (type_field_mask.schema_type == document.schema()) {
type_projection_tree = std::make_unique<ProjectionTree>(type_field_mask);
} else if (type_field_mask.schema_type ==
SchemaStore::kSchemaTypeWildcard) {
wildcard_projection_tree =
std::make_unique<ProjectionTree>(type_field_mask);
}
}
// Apply projection
if (type_projection_tree != nullptr) {
projector::Project(type_projection_tree->root().children, &document);
} else if (wildcard_projection_tree != nullptr) {
projector::Project(wildcard_projection_tree->root().children, &document);
}
result_status->set_code(StatusProto::OK);
*result_proto.mutable_document() = std::move(document);
return result_proto;
}
ReportUsageResultProto IcingSearchEngine::ReportUsage(
const UsageReport& usage_report) {
ReportUsageResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
libtextclassifier3::Status status =
document_store_->ReportUsage(usage_report);
TransformStatus(status, result_status);
return result_proto;
}
GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
GetAllNamespacesResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
for (const std::string& namespace_ : namespaces) {
result_proto.add_namespaces(namespace_);
}
result_status->set_code(StatusProto::OK);
return result_proto;
}
DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
const std::string_view uri) {
ICING_VLOG(1) << "Deleting document from doc store";
DeleteResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
// TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
libtextclassifier3::Status status =
document_store_->Delete(name_space, uri, current_time_ms);
if (!status.ok()) {
LogSeverity::Code severity = ERROR;
if (absl_ports::IsNotFound(status)) {
severity = DBG;
}
ICING_LOG(severity) << status.error_message()
<< "Failed to delete Document. namespace: "
<< name_space << ", uri: " << uri;
TransformStatus(status, result_status);
return result_proto;
}
result_status->set_code(StatusProto::OK);
delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
delete_stats->set_num_documents_deleted(1);
return result_proto;
}
DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
const std::string_view name_space) {
ICING_VLOG(1) << "Deleting namespace from doc store";
DeleteByNamespaceResultProto delete_result;
StatusProto* result_status = delete_result.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return delete_result;
}
DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
// TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteByNamespace(name_space);
if (!doc_store_result.status.ok()) {
ICING_LOG(ERROR) << doc_store_result.status.error_message()
<< "Failed to delete Namespace: " << name_space;
TransformStatus(doc_store_result.status, result_status);
return delete_result;
}
result_status->set_code(StatusProto::OK);
delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
return delete_result;
}
DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
const std::string_view schema_type) {
ICING_VLOG(1) << "Deleting type from doc store";
DeleteBySchemaTypeResultProto delete_result;
StatusProto* result_status = delete_result.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return delete_result;
}
DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
// TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteBySchemaType(schema_type);
if (!doc_store_result.status.ok()) {
ICING_LOG(ERROR) << doc_store_result.status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
TransformStatus(doc_store_result.status, result_status);
return delete_result;
}
result_status->set_code(StatusProto::OK);
delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
return delete_result;
}
DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
const SearchSpecProto& search_spec, bool return_deleted_document_info) {
ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
<< " from doc store";
DeleteByQueryResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
DeleteByQueryStatsProto* delete_stats =
result_proto.mutable_delete_by_query_stats();
delete_stats->set_query_length(search_spec.query().length());
delete_stats->set_num_namespaces_filtered(
search_spec.namespace_filters_size());
delete_stats->set_num_schema_types_filtered(
search_spec.schema_type_filters_size());
ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
delete_stats->set_latency_ms(t);
});
libtextclassifier3::Status status =
ValidateSearchSpec(search_spec, performance_configuration_);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
// Gets unordered results from query processor
auto query_processor_or = QueryProcessor::Create(
index_.get(), integer_index_.get(), language_segmenter_.get(),
normalizer_.get(), document_store_.get(), schema_store_.get());
if (!query_processor_or.ok()) {
TransformStatus(query_processor_or.status(), result_status);
delete_stats->set_parse_query_latency_ms(
component_timer->GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<QueryProcessor> query_processor =
std::move(query_processor_or).ValueOrDie();
int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
auto query_results_or = query_processor->ParseSearch(
search_spec, ScoringSpecProto::RankingStrategy::NONE, current_time_ms);
if (!query_results_or.ok()) {
TransformStatus(query_results_or.status(), result_status);
delete_stats->set_parse_query_latency_ms(
component_timer->GetElapsedMilliseconds());
return result_proto;
}
QueryResults query_results = std::move(query_results_or).ValueOrDie();
delete_stats->set_parse_query_latency_ms(
component_timer->GetElapsedMilliseconds());
ICING_VLOG(2) << "Deleting the docs that matched the query.";
int num_deleted = 0;
// A map used to group deleted documents.
// From the (namespace, type) pair to a list of uris.
std::unordered_map<NamespaceTypePair,
DeleteByQueryResultProto::DocumentGroupInfo*,
NamespaceTypePairHasher>
deleted_info_map;
component_timer = clock_->GetNewTimer();
while (query_results.root_iterator->Advance().ok()) {
ICING_VLOG(3) << "Deleting doc "
<< query_results.root_iterator->doc_hit_info().document_id();
++num_deleted;
if (return_deleted_document_info) {
status = RetrieveAndAddDocumentInfo(
document_store_.get(), result_proto, deleted_info_map,
query_results.root_iterator->doc_hit_info().document_id());
if (!status.ok()) {
TransformStatus(status, result_status);
delete_stats->set_document_removal_latency_ms(
component_timer->GetElapsedMilliseconds());
return result_proto;
}
}
status = document_store_->Delete(
query_results.root_iterator->doc_hit_info().document_id(),
current_time_ms);
if (!status.ok()) {
TransformStatus(status, result_status);
delete_stats->set_document_removal_latency_ms(
component_timer->GetElapsedMilliseconds());
return result_proto;
}
}
delete_stats->set_document_removal_latency_ms(
component_timer->GetElapsedMilliseconds());
int term_count = 0;
for (const auto& section_and_terms : query_results.query_terms) {
term_count += section_and_terms.second.size();
}
delete_stats->set_num_terms(term_count);
if (num_deleted > 0) {
result_proto.mutable_status()->set_code(StatusProto::OK);
} else {
result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
result_proto.mutable_status()->set_message(
"No documents matched the query to delete by!");
}
delete_stats->set_num_documents_deleted(num_deleted);
return result_proto;
}
PersistToDiskResultProto IcingSearchEngine::PersistToDisk(
PersistType::Code persist_type) {
ICING_VLOG(1) << "Persisting data to disk";
PersistToDiskResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto status = InternalPersistToDisk(persist_type);
TransformStatus(status, result_status);
return result_proto;
}
// Optimizes Icing's storage
//
// Steps:
// 1. Flush data to disk.
// 2. Copy data needed to a tmp directory.
// 3. Swap current directory and tmp directory.
OptimizeResultProto IcingSearchEngine::Optimize() {
ICING_VLOG(1) << "Optimizing icing storage";
OptimizeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
ScopedTimer optimize_timer(
clock_->GetNewTimer(),
[optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
// Flushes data to disk before doing optimization
auto status = InternalPersistToDisk(PersistType::FULL);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
optimize_stats->set_storage_size_before(
Filesystem::SanitizeFileSize(before_size));
// TODO(b/143646633): figure out if we need to optimize index and doc store
// at the same time.
std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
libtextclassifier3::StatusOr<std::vector<DocumentId>>
document_id_old_to_new_or = OptimizeDocumentStore(optimize_stats);
optimize_stats->set_document_store_optimize_latency_ms(
optimize_doc_store_timer->GetElapsedMilliseconds());
if (!document_id_old_to_new_or.ok() &&
!absl_ports::IsDataLoss(document_id_old_to_new_or.status())) {
// The status now is either ABORTED_ERROR or INTERNAL_ERROR.
// If ABORTED_ERROR, Icing should still be working.
// If INTERNAL_ERROR, we're having IO errors or other errors that we can't
// recover from.
TransformStatus(document_id_old_to_new_or.status(), result_status);
return result_proto;
}
  // The status is either OK or DATA_LOSS. The optimized document store is
  // guaranteed to work, so we update the index according to the new document
  // store.
std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
bool should_rebuild_index =
!document_id_old_to_new_or.ok() ||
ShouldRebuildIndex(*optimize_stats,
options_.optimize_rebuild_index_threshold());
if (!should_rebuild_index) {
optimize_stats->set_index_restoration_mode(
OptimizeStatsProto::INDEX_TRANSLATION);
libtextclassifier3::Status index_optimize_status =
index_->Optimize(document_id_old_to_new_or.ValueOrDie(),
document_store_->last_added_document_id());
if (!index_optimize_status.ok()) {
ICING_LOG(WARNING) << "Failed to optimize index. Error: "
<< index_optimize_status.error_message();
should_rebuild_index = true;
}
libtextclassifier3::Status integer_index_optimize_status =
integer_index_->Optimize(document_id_old_to_new_or.ValueOrDie(),
document_store_->last_added_document_id());
if (!integer_index_optimize_status.ok()) {
ICING_LOG(WARNING) << "Failed to optimize integer index. Error: "
<< integer_index_optimize_status.error_message();
should_rebuild_index = true;
}
libtextclassifier3::Status qualified_id_join_index_optimize_status =
qualified_id_join_index_->Optimize(
document_id_old_to_new_or.ValueOrDie(),
document_store_->last_added_document_id());
if (!qualified_id_join_index_optimize_status.ok()) {
ICING_LOG(WARNING)
<< "Failed to optimize qualified id join index. Error: "
<< qualified_id_join_index_optimize_status.error_message();
should_rebuild_index = true;
}
}
// If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
// valid document store, but it might be the old one or the new one. So throw
// out the index data and rebuild from scratch.
// Likewise, if Index::Optimize failed, then attempt to recover the index by
// rebuilding from scratch.
// If ShouldRebuildIndex() returns true, we will also rebuild the index for
// better performance.
if (should_rebuild_index) {
optimize_stats->set_index_restoration_mode(
OptimizeStatsProto::FULL_INDEX_REBUILD);
ICING_LOG(WARNING) << "Clearing the entire index!";
libtextclassifier3::Status index_clear_status = ClearAllIndices();
if (!index_clear_status.ok()) {
status = absl_ports::Annotate(
absl_ports::InternalError("Failed to clear index."),
index_clear_status.error_message());
TransformStatus(status, result_status);
optimize_stats->set_index_restoration_latency_ms(
optimize_index_timer->GetElapsedMilliseconds());
return result_proto;
}
IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
// DATA_LOSS means that we have successfully re-added content to the index.
// Some indexed content was lost, but otherwise the index is in a valid
// state and can be queried.
if (!index_restoration_status.status.ok() &&
!absl_ports::IsDataLoss(index_restoration_status.status)) {
status = absl_ports::Annotate(
absl_ports::InternalError(
"Failed to reindex documents after optimization."),
index_restoration_status.status.error_message());
TransformStatus(status, result_status);
optimize_stats->set_index_restoration_latency_ms(
optimize_index_timer->GetElapsedMilliseconds());
return result_proto;
}
}
optimize_stats->set_index_restoration_latency_ms(
optimize_index_timer->GetElapsedMilliseconds());
// Read the optimize status to get the time that we last ran.
std::string optimize_status_filename =
absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
FileBackedProto<OptimizeStatusProto> optimize_status_file(
*filesystem_, optimize_status_filename);
auto optimize_status_or = optimize_status_file.Read();
int64_t current_time = clock_->GetSystemTimeMilliseconds();
if (optimize_status_or.ok()) {
    // Only set this field when a previous optimize status could be read. If we
    // have trouble reading the status or this is the first time we've ever
    // run, it is left unset.
optimize_stats->set_time_since_last_optimize_ms(
current_time - optimize_status_or.ValueOrDie()
->last_successful_optimize_run_time_ms());
}
// Update the status for this run and write it.
auto optimize_status = std::make_unique<OptimizeStatusProto>();
optimize_status->set_last_successful_optimize_run_time_ms(current_time);
optimize_status_file.Write(std::move(optimize_status));
// Flushes data to disk after doing optimization
status = InternalPersistToDisk(PersistType::FULL);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
optimize_stats->set_storage_size_after(
Filesystem::SanitizeFileSize(after_size));
TransformStatus(document_id_old_to_new_or.status(), result_status);
return result_proto;
}
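// Reports how much storage Optimize() could reclaim. The estimate combines the
// document store's own estimate with the term index size prorated by the
// fraction of optimizable documents:
//   estimated_optimizable_bytes =
//       index_elements_size * optimizable_docs / total_docs
//       + doc_store_optimize_info.estimated_optimizable_bytes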
GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
GetOptimizeInfoResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
// Read the optimize status to get the time that we last ran.
std::string optimize_status_filename =
absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
FileBackedProto<OptimizeStatusProto> optimize_status_file(
*filesystem_, optimize_status_filename);
auto optimize_status_or = optimize_status_file.Read();
int64_t current_time = clock_->GetSystemTimeMilliseconds();
if (optimize_status_or.ok()) {
    // Only set this field when a previous optimize status could be read. If we
    // have trouble reading the status or this is the first time we've ever
    // run, it is left unset.
result_proto.set_time_since_last_optimize_ms(
current_time - optimize_status_or.ValueOrDie()
->last_successful_optimize_run_time_ms());
}
// Get stats from DocumentStore
auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
if (!doc_store_optimize_info_or.ok()) {
TransformStatus(doc_store_optimize_info_or.status(), result_status);
return result_proto;
}
DocumentStore::OptimizeInfo doc_store_optimize_info =
doc_store_optimize_info_or.ValueOrDie();
result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
if (doc_store_optimize_info.optimizable_docs == 0) {
// Can return early since there's nothing to calculate on the index side
result_proto.set_estimated_optimizable_bytes(0);
result_status->set_code(StatusProto::OK);
return result_proto;
}
// Get stats from Index.
auto index_elements_size_or = index_->GetElementsSize();
if (!index_elements_size_or.ok()) {
TransformStatus(index_elements_size_or.status(), result_status);
return result_proto;
}
int64_t index_elements_size = index_elements_size_or.ValueOrDie();
// TODO(b/259744228): add stats for integer index
// Sum up the optimizable sizes from DocumentStore and Index
result_proto.set_estimated_optimizable_bytes(
index_elements_size * doc_store_optimize_info.optimizable_docs /
doc_store_optimize_info.total_docs +
doc_store_optimize_info.estimated_optimizable_bytes);
result_status->set_code(StatusProto::OK);
return result_proto;
}
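// Reports the total disk usage of the base directory along with per-component
// storage breakdowns for the document store, schema store, and term index.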
StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
StorageInfoResultProto result;
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
result.mutable_status()->set_message(
"IcingSearchEngine has not been initialized!");
return result;
}
int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
result.mutable_storage_info()->set_total_storage_size(
Filesystem::SanitizeFileSize(index_size));
*result.mutable_storage_info()->mutable_document_storage_info() =
document_store_->GetStorageInfo();
*result.mutable_storage_info()->mutable_schema_store_storage_info() =
schema_store_->GetStorageInfo();
*result.mutable_storage_info()->mutable_index_storage_info() =
index_->GetStorageInfo();
// TODO(b/259744228): add stats for integer index
result.mutable_status()->set_code(StatusProto::OK);
return result;
}
DebugInfoResultProto IcingSearchEngine::GetDebugInfo(
DebugInfoVerbosity::Code verbosity) {
DebugInfoResultProto debug_info;
StatusProto* result_status = debug_info.mutable_status();
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
debug_info.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
debug_info.mutable_status()->set_message(
"IcingSearchEngine has not been initialized!");
return debug_info;
}
// Index
*debug_info.mutable_debug_info()->mutable_index_info() =
index_->GetDebugInfo(verbosity);
// TODO(b/259744228): add debug info for integer index
// Document Store
libtextclassifier3::StatusOr<DocumentDebugInfoProto> document_debug_info =
document_store_->GetDebugInfo(verbosity);
if (!document_debug_info.ok()) {
TransformStatus(document_debug_info.status(), result_status);
return debug_info;
}
*debug_info.mutable_debug_info()->mutable_document_info() =
std::move(document_debug_info).ValueOrDie();
// Schema Store
libtextclassifier3::StatusOr<SchemaDebugInfoProto> schema_debug_info =
schema_store_->GetDebugInfo();
if (!schema_debug_info.ok()) {
TransformStatus(schema_debug_info.status(), result_status);
return debug_info;
}
*debug_info.mutable_debug_info()->mutable_schema_info() =
std::move(schema_debug_info).ValueOrDie();
result_status->set_code(StatusProto::OK);
return debug_info;
}
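// Flushes components to disk. PersistType::LITE persists only the document
// store's ground truth; any other persist type also flushes the schema store,
// term index, integer index, and qualified id join index.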
libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk(
PersistType::Code persist_type) {
if (persist_type == PersistType::LITE) {
return document_store_->PersistToDisk(persist_type);
}
ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL));
ICING_RETURN_IF_ERROR(index_->PersistToDisk());
ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk());
ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk());
return libtextclassifier3::Status::OK;
}
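// Entry point for queries. Dispatches to the shared-lock (read-only) or the
// exclusive-lock implementation based on SearchSpecProto::use_read_only_search.
// Illustrative caller flow (sketch only; icing is a hypothetical
// IcingSearchEngine instance, and the scoring/result specs use defaults):
//   SearchSpecProto spec;
//   spec.set_query("foo");
//   spec.set_use_read_only_search(true);
//   SearchResultProto page =
//       icing.Search(spec, ScoringSpecProto(), ResultSpecProto());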
SearchResultProto IcingSearchEngine::Search(
const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec) {
if (search_spec.use_read_only_search()) {
return SearchLockedShared(search_spec, scoring_spec, result_spec);
} else {
return SearchLockedExclusive(search_spec, scoring_spec, result_spec);
}
}
SearchResultProto IcingSearchEngine::SearchLockedShared(
const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec) {
std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
// Only acquire an overall read-lock for this implementation. Finer-grained
// locks are implemented around code paths that write changes to Icing's data
// members.
absl_ports::shared_lock l(&mutex_);
int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
SearchResultProto result_proto =
InternalSearch(search_spec, scoring_spec, result_spec);
result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
lock_acquisition_latency);
result_proto.mutable_query_stats()->set_latency_ms(
overall_timer->GetElapsedMilliseconds());
return result_proto;
}
SearchResultProto IcingSearchEngine::SearchLockedExclusive(
const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec) {
std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
// Acquire the overall write-lock for this locked implementation.
absl_ports::unique_lock l(&mutex_);
int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
SearchResultProto result_proto =
InternalSearch(search_spec, scoring_spec, result_spec);
result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
lock_acquisition_latency);
result_proto.mutable_query_stats()->set_latency_ms(
overall_timer->GetElapsedMilliseconds());
return result_proto;
}
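// Shared implementation behind SearchLockedShared/SearchLockedExclusive:
// validates the specs, runs the nested (child) query when a join spec is
// present, runs and scores the parent query, ranks the hits, and caches the
// first page of results. Both locked variants call into this after acquiring
// mutex_.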
SearchResultProto IcingSearchEngine::InternalSearch(
const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec) {
SearchResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
QueryStatsProto* query_stats = result_proto.mutable_query_stats();
query_stats->set_query_length(search_spec.query().length());
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
libtextclassifier3::Status status =
ValidateResultSpec(document_store_.get(), result_spec);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
status = ValidateSearchSpec(search_spec, performance_configuration_);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
query_stats->set_num_namespaces_filtered(
search_spec.namespace_filters_size());
query_stats->set_num_schema_types_filtered(
search_spec.schema_type_filters_size());
query_stats->set_ranking_strategy(scoring_spec.rank_by());
query_stats->set_is_first_page(true);
query_stats->set_requested_page_size(result_spec.num_per_page());
const JoinSpecProto& join_spec = search_spec.join_spec();
std::unique_ptr<JoinChildrenFetcher> join_children_fetcher;
std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info;
int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
if (!join_spec.parent_property_expression().empty() &&
!join_spec.child_property_expression().empty()) {
// Process child query
QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore(
join_spec.nested_spec().search_spec(),
join_spec.nested_spec().scoring_spec(),
join_spec.nested_spec().result_spec(),
/*join_children_fetcher=*/nullptr, current_time_ms);
    // TODO(b/256022027): set different kinds of latency for the 2nd query.
if (!nested_query_scoring_results.status.ok()) {
TransformStatus(nested_query_scoring_results.status, result_status);
return result_proto;
}
JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
qualified_id_join_index_.get(),
current_time_ms);
// Building a JoinChildrenFetcher where child documents are grouped by
// their joinable values.
libtextclassifier3::StatusOr<JoinChildrenFetcher> join_children_fetcher_or =
join_processor.GetChildrenFetcher(
search_spec.join_spec(),
std::move(nested_query_scoring_results.scored_document_hits));
if (!join_children_fetcher_or.ok()) {
TransformStatus(join_children_fetcher_or.status(), result_status);
return result_proto;
}
join_children_fetcher = std::make_unique<JoinChildrenFetcher>(
std::move(join_children_fetcher_or).ValueOrDie());
// Assign child's ResultAdjustmentInfo.
child_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
join_spec.nested_spec().search_spec(),
join_spec.nested_spec().scoring_spec(),
join_spec.nested_spec().result_spec(), schema_store_.get(),
std::move(nested_query_scoring_results.query_terms));
}
// Process parent query
QueryScoringResults query_scoring_results =
ProcessQueryAndScore(search_spec, scoring_spec, result_spec,
join_children_fetcher.get(), current_time_ms);
int term_count = 0;
for (const auto& section_and_terms : query_scoring_results.query_terms) {
term_count += section_and_terms.second.size();
}
query_stats->set_num_terms(term_count);
query_stats->set_parse_query_latency_ms(
query_scoring_results.parse_query_latency_ms);
query_stats->set_scoring_latency_ms(query_scoring_results.scoring_latency_ms);
if (!query_scoring_results.status.ok()) {
TransformStatus(query_scoring_results.status, result_status);
return result_proto;
}
query_stats->set_num_documents_scored(
query_scoring_results.scored_document_hits.size());
// Returns early for empty result
if (query_scoring_results.scored_document_hits.empty()) {
result_status->set_code(StatusProto::OK);
return result_proto;
}
// Construct parent's result adjustment info.
auto parent_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
search_spec, scoring_spec, result_spec, schema_store_.get(),
std::move(query_scoring_results.query_terms));
std::unique_ptr<ScoredDocumentHitsRanker> ranker;
if (join_children_fetcher != nullptr) {
std::unique_ptr<Timer> join_timer = clock_->GetNewTimer();
    // Join the parent scored document hits with their children.
JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
qualified_id_join_index_.get(),
current_time_ms);
libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
joined_result_document_hits_or = join_processor.Join(
join_spec, std::move(query_scoring_results.scored_document_hits),
*join_children_fetcher);
if (!joined_result_document_hits_or.ok()) {
TransformStatus(joined_result_document_hits_or.status(), result_status);
return result_proto;
}
std::vector<JoinedScoredDocumentHit> joined_result_document_hits =
std::move(joined_result_document_hits_or).ValueOrDie();
query_stats->set_join_latency_ms(join_timer->GetElapsedMilliseconds());
std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
// Ranks results
ranker = std::make_unique<
PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
std::move(joined_result_document_hits),
/*is_descending=*/scoring_spec.order_by() ==
ScoringSpecProto::Order::DESC);
query_stats->set_ranking_latency_ms(
component_timer->GetElapsedMilliseconds());
} else {
// Non-join query
std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
// Ranks results
ranker = std::make_unique<
PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
std::move(query_scoring_results.scored_document_hits),
/*is_descending=*/scoring_spec.order_by() ==
ScoringSpecProto::Order::DESC);
query_stats->set_ranking_latency_ms(
component_timer->GetElapsedMilliseconds());
}
std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
  // Caches the ranked results and retrieves the first page, including the
  // document protos and snippets if requested.
auto result_retriever_or =
ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
TransformStatus(result_retriever_or.status(), result_status);
query_stats->set_document_retrieval_latency_ms(
component_timer->GetElapsedMilliseconds());
return result_proto;
}
std::unique_ptr<ResultRetrieverV2> result_retriever =
std::move(result_retriever_or).ValueOrDie();
libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
std::move(ranker), std::move(parent_result_adjustment_info),
std::move(child_result_adjustment_info), result_spec,
*document_store_, *result_retriever, current_time_ms);
if (!page_result_info_or.ok()) {
TransformStatus(page_result_info_or.status(), result_status);
query_stats->set_document_retrieval_latency_ms(
component_timer->GetElapsedMilliseconds());
return result_proto;
}
std::pair<uint64_t, PageResult> page_result_info =
std::move(page_result_info_or).ValueOrDie();
// Assembles the final search result proto
result_proto.mutable_results()->Reserve(
page_result_info.second.results.size());
int32_t child_count = 0;
for (SearchResultProto::ResultProto& result :
page_result_info.second.results) {
child_count += result.joined_results_size();
result_proto.mutable_results()->Add(std::move(result));
}
result_status->set_code(StatusProto::OK);
if (page_result_info.first != kInvalidNextPageToken) {
result_proto.set_next_page_token(page_result_info.first);
}
query_stats->set_document_retrieval_latency_ms(
component_timer->GetElapsedMilliseconds());
query_stats->set_num_results_returned_current_page(
result_proto.results_size());
query_stats->set_num_joined_results_returned_current_page(child_count);
query_stats->set_num_results_with_snippets(
page_result_info.second.num_results_with_snippets);
return result_proto;
}
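// Parses the query and scores (but does not rank) the matching documents.
// Parse and scoring latencies are reported via the returned
// QueryScoringResults.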
IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore(
const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec,
const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms) {
std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
// Gets unordered results from query processor
auto query_processor_or = QueryProcessor::Create(
index_.get(), integer_index_.get(), language_segmenter_.get(),
normalizer_.get(), document_store_.get(), schema_store_.get());
if (!query_processor_or.ok()) {
return QueryScoringResults(
std::move(query_processor_or).status(), /*query_terms_in=*/{},
/*scored_document_hits_in=*/{},
/*parse_query_latency_ms_in=*/component_timer->GetElapsedMilliseconds(),
/*scoring_latency_ms_in=*/0);
}
std::unique_ptr<QueryProcessor> query_processor =
std::move(query_processor_or).ValueOrDie();
auto ranking_strategy_or = GetRankingStrategyFromScoringSpec(scoring_spec);
libtextclassifier3::StatusOr<QueryResults> query_results_or;
if (ranking_strategy_or.ok()) {
query_results_or = query_processor->ParseSearch(
search_spec, ranking_strategy_or.ValueOrDie(), current_time_ms);
} else {
query_results_or = ranking_strategy_or.status();
}
if (!query_results_or.ok()) {
return QueryScoringResults(
std::move(query_results_or).status(), /*query_terms_in=*/{},
/*scored_document_hits_in=*/{},
/*parse_query_latency_ms_in=*/component_timer->GetElapsedMilliseconds(),
/*scoring_latency_ms_in=*/0);
}
QueryResults query_results = std::move(query_results_or).ValueOrDie();
int64_t parse_query_latency_ms = component_timer->GetElapsedMilliseconds();
component_timer = clock_->GetNewTimer();
// Scores but does not rank the results.
libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
scoring_processor_or = ScoringProcessor::Create(
scoring_spec, document_store_.get(), schema_store_.get(),
current_time_ms, join_children_fetcher);
if (!scoring_processor_or.ok()) {
return QueryScoringResults(std::move(scoring_processor_or).status(),
std::move(query_results.query_terms),
/*scored_document_hits_in=*/{},
parse_query_latency_ms,
/*scoring_latency_ms_in=*/0);
}
std::unique_ptr<ScoringProcessor> scoring_processor =
std::move(scoring_processor_or).ValueOrDie();
std::vector<ScoredDocumentHit> scored_document_hits =
scoring_processor->Score(std::move(query_results.root_iterator),
result_spec.num_to_score(),
&query_results.query_term_iterators);
int64_t scoring_latency_ms = component_timer->GetElapsedMilliseconds();
return QueryScoringResults(libtextclassifier3::Status::OK,
std::move(query_results.query_terms),
std::move(scored_document_hits),
parse_query_latency_ms, scoring_latency_ms);
}
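// Returns the next page for a previously cached search. If the token is no
// longer valid (e.g. expired or invalidated), the internal NOT_FOUND is mapped
// to an OK status with an empty result.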
SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
SearchResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
QueryStatsProto* query_stats = result_proto.mutable_query_stats();
query_stats->set_is_first_page(false);
std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
// ResultStateManager has its own writer lock, so here we only need a reader
// lock for other components.
absl_ports::shared_lock l(&mutex_);
query_stats->set_lock_acquisition_latency_ms(
overall_timer->GetElapsedMilliseconds());
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
auto result_retriever_or =
ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
TransformStatus(result_retriever_or.status(), result_status);
return result_proto;
}
std::unique_ptr<ResultRetrieverV2> result_retriever =
std::move(result_retriever_or).ValueOrDie();
int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
page_result_info_or = result_state_manager_->GetNextPage(
next_page_token, *result_retriever, current_time_ms);
if (!page_result_info_or.ok()) {
if (absl_ports::IsNotFound(page_result_info_or.status())) {
// NOT_FOUND means an empty result.
result_status->set_code(StatusProto::OK);
} else {
// Real error, pass up.
TransformStatus(page_result_info_or.status(), result_status);
}
return result_proto;
}
std::pair<uint64_t, PageResult> page_result_info =
std::move(page_result_info_or).ValueOrDie();
query_stats->set_requested_page_size(
page_result_info.second.requested_page_size);
// Assembles the final search result proto
result_proto.mutable_results()->Reserve(
page_result_info.second.results.size());
int32_t child_count = 0;
for (SearchResultProto::ResultProto& result :
page_result_info.second.results) {
child_count += result.joined_results_size();
result_proto.mutable_results()->Add(std::move(result));
}
result_status->set_code(StatusProto::OK);
if (page_result_info.first != kInvalidNextPageToken) {
result_proto.set_next_page_token(page_result_info.first);
}
// The only thing that we're doing is document retrieval. So document
// retrieval latency and overall latency are the same and can use the same
// timer.
query_stats->set_document_retrieval_latency_ms(
overall_timer->GetElapsedMilliseconds());
query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
query_stats->set_num_results_returned_current_page(
result_proto.results_size());
query_stats->set_num_results_with_snippets(
page_result_info.second.num_results_with_snippets);
query_stats->set_num_joined_results_returned_current_page(child_count);
return result_proto;
}
void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
absl_ports::shared_lock l(&mutex_);
if (!initialized_) {
ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
return;
}
result_state_manager_->InvalidateResultState(next_page_token);
}
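// Rewrites the document store into a temporary directory, swaps it with the
// current document directory, and recreates the DocumentStore (and the
// dependent ResultStateManager). On success, returns the mapping from old
// document ids to new document ids; may also return DATA_LOSS if the swap
// failed but a usable document store could be rebuilt.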
libtextclassifier3::StatusOr<std::vector<DocumentId>>
IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) {
// Gets the current directory path and an empty tmp directory path for
// document store optimization.
const std::string current_document_dir =
MakeDocumentDirectoryPath(options_.base_dir());
const std::string temporary_document_dir =
MakeDocumentTemporaryDirectoryPath(options_.base_dir());
if (!filesystem_->DeleteDirectoryRecursively(
temporary_document_dir.c_str()) ||
!filesystem_->CreateDirectoryRecursively(
temporary_document_dir.c_str())) {
return absl_ports::AbortedError(absl_ports::StrCat(
"Failed to create a tmp directory: ", temporary_document_dir));
}
// Copies valid document data to tmp directory
libtextclassifier3::StatusOr<std::vector<DocumentId>>
document_id_old_to_new_or = document_store_->OptimizeInto(
temporary_document_dir, language_segmenter_.get(), optimize_stats);
// Handles error if any
if (!document_id_old_to_new_or.ok()) {
filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
return absl_ports::Annotate(
absl_ports::AbortedError("Failed to optimize document store"),
document_id_old_to_new_or.status().error_message());
}
// result_state_manager_ depends on document_store_. So we need to reset it at
// the same time that we reset the document_store_.
result_state_manager_.reset();
document_store_.reset();
  // When swapping files, always pass the current working directory as the
  // second argument, because it is renamed last and that leaves us less
  // vulnerable to errors.
if (!filesystem_->SwapFiles(temporary_document_dir.c_str(),
current_document_dir.c_str())) {
ICING_LOG(ERROR) << "Failed to swap files";
// Ensures that current directory is still present.
if (!filesystem_->CreateDirectoryRecursively(
current_document_dir.c_str())) {
// Can't even create the old directory. Mark as uninitialized and return
// INTERNAL.
initialized_ = false;
return absl_ports::InternalError(
"Failed to create file directory for document store");
}
    // Tries to rebuild the document store if swapping fails, to avoid leaving
    // the system in a broken state for future operations.
auto create_result_or = DocumentStore::Create(
filesystem_.get(), current_document_dir, clock_.get(),
schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
options_.document_store_namespace_id_fingerprint(),
options_.pre_mapping_fbv(), options_.use_persistent_hash_map(),
options_.compression_level(), /*initialize_stats=*/nullptr);
// TODO(b/144458732): Implement a more robust version of
// TC_ASSIGN_OR_RETURN that can support error logging.
if (!create_result_or.ok()) {
// Unable to create DocumentStore from the old file. Mark as uninitialized
// and return INTERNAL.
initialized_ = false;
ICING_LOG(ERROR) << "Failed to create document store instance";
return absl_ports::Annotate(
absl_ports::InternalError("Failed to create document store instance"),
create_result_or.status().error_message());
}
document_store_ = std::move(create_result_or.ValueOrDie().document_store);
result_state_manager_ = std::make_unique<ResultStateManager>(
performance_configuration_.max_num_total_hits, *document_store_);
// Potential data loss
// TODO(b/147373249): Find a way to detect true data loss error
return absl_ports::DataLossError(
"Failed to optimize document store, there might be data loss");
}
// Recreates the doc store instance
auto create_result_or = DocumentStore::Create(
filesystem_.get(), current_document_dir, clock_.get(),
schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
options_.document_store_namespace_id_fingerprint(),
options_.pre_mapping_fbv(), options_.use_persistent_hash_map(),
options_.compression_level(), /*initialize_stats=*/nullptr);
if (!create_result_or.ok()) {
// Unable to create DocumentStore from the new file. Mark as uninitialized
// and return INTERNAL.
initialized_ = false;
return absl_ports::InternalError(
"Document store has been optimized, but a valid document store "
"instance can't be created");
}
document_store_ = std::move(create_result_or.ValueOrDie().document_store);
result_state_manager_ = std::make_unique<ResultStateManager>(
performance_configuration_.max_num_total_hits, *document_store_);
// Deletes tmp directory
if (!filesystem_->DeleteDirectoryRecursively(
temporary_document_dir.c_str())) {
ICING_LOG(ERROR) << "Document store has been optimized, but it failed to "
"delete temporary file directory";
}
return document_id_old_to_new_or;
}
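// Brings the term index, integer index, and qualified id join index back in
// sync with the document store: truncates (or clears) any index that is ahead
// of the document store, then replays documents that the indices are missing.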
IcingSearchEngine::IndexRestorationResult
IcingSearchEngine::RestoreIndexIfNeeded() {
DocumentId last_stored_document_id =
document_store_->last_added_document_id();
if (last_stored_document_id == index_->last_added_document_id() &&
last_stored_document_id == integer_index_->last_added_document_id() &&
last_stored_document_id ==
qualified_id_join_index_->last_added_document_id()) {
// No need to recover.
return {libtextclassifier3::Status::OK, false, false, false};
}
if (last_stored_document_id == kInvalidDocumentId) {
// Document store is empty but index is not. Clear the index.
return {ClearAllIndices(), false, false, false};
}
// Truncate indices first.
auto truncate_result_or = TruncateIndicesTo(last_stored_document_id);
if (!truncate_result_or.ok()) {
return {std::move(truncate_result_or).status(), false, false, false};
}
TruncateIndexResult truncate_result =
std::move(truncate_result_or).ValueOrDie();
if (truncate_result.first_document_to_reindex > last_stored_document_id) {
// Nothing to restore. Just return.
return {libtextclassifier3::Status::OK, false, false, false};
}
auto data_indexing_handlers_or = CreateDataIndexingHandlers();
if (!data_indexing_handlers_or.ok()) {
return {data_indexing_handlers_or.status(),
truncate_result.index_needed_restoration,
truncate_result.integer_index_needed_restoration,
truncate_result.qualified_id_join_index_needed_restoration};
}
  // By using recovery_mode for IndexProcessor, we can replay documents starting
  // from a smaller document id; documents that have already been indexed are
  // skipped.
IndexProcessor index_processor(
std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
/*recovery_mode=*/true);
ICING_VLOG(1) << "Restoring index by replaying documents from document id "
<< truncate_result.first_document_to_reindex
<< " to document id " << last_stored_document_id;
libtextclassifier3::Status overall_status;
for (DocumentId document_id = truncate_result.first_document_to_reindex;
document_id <= last_stored_document_id; ++document_id) {
libtextclassifier3::StatusOr<DocumentProto> document_or =
document_store_->Get(document_id);
if (!document_or.ok()) {
if (absl_ports::IsInvalidArgument(document_or.status()) ||
absl_ports::IsNotFound(document_or.status())) {
// Skips invalid and non-existing documents.
continue;
} else {
// Returns other errors
return {document_or.status(), truncate_result.index_needed_restoration,
truncate_result.integer_index_needed_restoration,
truncate_result.qualified_id_join_index_needed_restoration};
}
}
DocumentProto document(std::move(document_or).ValueOrDie());
libtextclassifier3::StatusOr<TokenizedDocument> tokenized_document_or =
TokenizedDocument::Create(schema_store_.get(),
language_segmenter_.get(),
std::move(document));
if (!tokenized_document_or.ok()) {
return {tokenized_document_or.status(),
truncate_result.index_needed_restoration,
truncate_result.integer_index_needed_restoration,
truncate_result.qualified_id_join_index_needed_restoration};
}
TokenizedDocument tokenized_document(
std::move(tokenized_document_or).ValueOrDie());
libtextclassifier3::Status status =
index_processor.IndexDocument(tokenized_document, document_id);
if (!status.ok()) {
if (!absl_ports::IsDataLoss(status)) {
// Real error. Stop recovering and pass it up.
return {status, truncate_result.index_needed_restoration,
truncate_result.integer_index_needed_restoration,
truncate_result.qualified_id_join_index_needed_restoration};
}
      // DATA_LOSS is not fatal here: the index is still in a valid, queryable
      // state. Keep trying to add the remaining docs, but report the data loss
      // when we're done.
overall_status = status;
}
}
return {overall_status, truncate_result.index_needed_restoration,
truncate_result.integer_index_needed_restoration,
truncate_result.qualified_id_join_index_needed_restoration};
}
libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
auto status_or = schema_store_->GetSchema();
if (status_or.ok()) {
// Found a schema.
return false;
}
if (!absl_ports::IsNotFound(status_or.status())) {
// Any other type of error
return status_or.status();
}
// We know: We don't have a schema now.
//
// We know: If no documents have been added, then the last_added_document_id
// will be invalid.
//
  // So: if documents have been added before and we don't have a schema now,
  // then we must have had a schema at some point, since we wouldn't have
  // accepted documents without a schema to validate them against.
return document_store_->last_added_document_id() != kInvalidDocumentId;
}
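// Creates the per-data-type indexing handlers (string sections, integer
// sections, qualified id joins) used by IndexProcessor during indexing and
// index restoration.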
libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
IcingSearchEngine::CreateDataIndexingHandlers() {
std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
// Term index handler
ICING_ASSIGN_OR_RETURN(std::unique_ptr<StringSectionIndexingHandler>
string_section_indexing_handler,
StringSectionIndexingHandler::Create(
clock_.get(), normalizer_.get(), index_.get()));
handlers.push_back(std::move(string_section_indexing_handler));
// Integer index handler
ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
integer_section_indexing_handler,
IntegerSectionIndexingHandler::Create(
clock_.get(), integer_index_.get()));
handlers.push_back(std::move(integer_section_indexing_handler));
// Qualified id join index handler
ICING_ASSIGN_OR_RETURN(std::unique_ptr<QualifiedIdJoinIndexingHandler>
qualified_id_join_indexing_handler,
QualifiedIdJoinIndexingHandler::Create(
clock_.get(), qualified_id_join_index_.get()));
handlers.push_back(std::move(qualified_id_join_indexing_handler));
return handlers;
}
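// Truncates each index so that it holds no data for documents beyond
// last_stored_document_id, and computes the first document id that must be
// replayed to restore the indices. Indices that cannot be truncated in place
// (integer index, qualified id join index) are cleared and flagged for a full
// rebuild.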
libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
// Attempt to truncate term index.
// TruncateTo ensures that the index does not hold any data that is not
// present in the ground truth. If the document store lost some documents,
// TruncateTo will ensure that the index does not contain any hits from those
// lost documents. If the index does not contain any hits for documents with
// document id greater than last_stored_document_id, then TruncateTo will have
// no effect.
ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
// Get last indexed document id for term index after truncating.
DocumentId term_index_last_added_document_id =
index_->last_added_document_id();
DocumentId first_document_to_reindex =
(term_index_last_added_document_id != kInvalidDocumentId)
? term_index_last_added_document_id + 1
: kMinDocumentId;
bool index_needed_restoration =
(last_stored_document_id != term_index_last_added_document_id);
// Attempt to truncate integer index.
bool integer_index_needed_restoration = false;
DocumentId integer_index_last_added_document_id =
integer_index_->last_added_document_id();
if (integer_index_last_added_document_id == kInvalidDocumentId ||
last_stored_document_id > integer_index_last_added_document_id) {
// If last_stored_document_id is greater than
// integer_index_last_added_document_id, then we only have to replay docs
// starting from integer_index_last_added_document_id + 1. Also use std::min
// since we might need to replay even smaller doc ids for term index.
integer_index_needed_restoration = true;
if (integer_index_last_added_document_id != kInvalidDocumentId) {
first_document_to_reindex = std::min(
first_document_to_reindex, integer_index_last_added_document_id + 1);
} else {
first_document_to_reindex = kMinDocumentId;
}
} else if (last_stored_document_id < integer_index_last_added_document_id) {
// Clear the entire integer index if last_stored_document_id is smaller than
// integer_index_last_added_document_id, because there is no way to remove
// data with doc_id > last_stored_document_id from integer index and we have
// to rebuild.
ICING_RETURN_IF_ERROR(integer_index_->Clear());
// Since the entire integer index is discarded, we start to rebuild it by
// setting first_document_to_reindex to kMinDocumentId.
integer_index_needed_restoration = true;
first_document_to_reindex = kMinDocumentId;
}
// Attempt to truncate qualified id join index
bool qualified_id_join_index_needed_restoration = false;
DocumentId qualified_id_join_index_last_added_document_id =
qualified_id_join_index_->last_added_document_id();
if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId ||
last_stored_document_id >
qualified_id_join_index_last_added_document_id) {
// If last_stored_document_id is greater than
// qualified_id_join_index_last_added_document_id, then we only have to
// replay docs starting from (qualified_id_join_index_last_added_document_id
// + 1). Also use std::min since we might need to replay even smaller doc
// ids for other components.
qualified_id_join_index_needed_restoration = true;
if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) {
first_document_to_reindex =
std::min(first_document_to_reindex,
qualified_id_join_index_last_added_document_id + 1);
} else {
first_document_to_reindex = kMinDocumentId;
}
} else if (last_stored_document_id <
qualified_id_join_index_last_added_document_id) {
// Clear the entire qualified id join index if last_stored_document_id is
// smaller than qualified_id_join_index_last_added_document_id, because
// there is no way to remove data with doc_id > last_stored_document_id from
// join index efficiently and we have to rebuild.
ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear());
// Since the entire qualified id join index is discarded, we start to
// rebuild it by setting first_document_to_reindex to kMinDocumentId.
qualified_id_join_index_needed_restoration = true;
first_document_to_reindex = kMinDocumentId;
}
return TruncateIndexResult(first_document_to_reindex,
index_needed_restoration,
integer_index_needed_restoration,
qualified_id_join_index_needed_restoration);
}
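// Deletes the derived files of every component (schema store, document store,
// term index, integer index, qualified id join index). Only callable while no
// component instances exist, i.e. before they are (re)created during
// initialization.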
libtextclassifier3::Status IcingSearchEngine::DiscardDerivedFiles() {
if (schema_store_ != nullptr || document_store_ != nullptr ||
index_ != nullptr || integer_index_ != nullptr ||
qualified_id_join_index_ != nullptr) {
return absl_ports::FailedPreconditionError(
"Cannot discard derived files while having valid instances");
}
// Schema store
ICING_RETURN_IF_ERROR(
SchemaStore::DiscardDerivedFiles(filesystem_.get(), options_.base_dir()));
// Document store
ICING_RETURN_IF_ERROR(DocumentStore::DiscardDerivedFiles(
filesystem_.get(), options_.base_dir()));
// Term index
if (!filesystem_->DeleteDirectoryRecursively(
MakeIndexDirectoryPath(options_.base_dir()).c_str())) {
return absl_ports::InternalError("Failed to discard index");
}
// Integer index
if (!filesystem_->DeleteDirectoryRecursively(
MakeIntegerIndexWorkingPath(options_.base_dir()).c_str())) {
return absl_ports::InternalError("Failed to discard integer index");
}
// Qualified id join index
if (!filesystem_->DeleteDirectoryRecursively(
MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()).c_str())) {
return absl_ports::InternalError(
"Failed to discard qualified id join index");
}
return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() {
ICING_RETURN_IF_ERROR(index_->Reset());
ICING_RETURN_IF_ERROR(integer_index_->Clear());
return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() {
return qualified_id_join_index_->Clear();
}
libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() {
ICING_RETURN_IF_ERROR(ClearSearchIndices());
ICING_RETURN_IF_ERROR(ClearJoinIndices());
return libtextclassifier3::Status::OK;
}
ResetResultProto IcingSearchEngine::Reset() {
absl_ports::unique_lock l(&mutex_);
return ResetInternal();
}
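// Deletes everything under the base directory and re-initializes Icing from
// scratch. Any failure is reported as INTERNAL (see comments below).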
ResetResultProto IcingSearchEngine::ResetInternal() {
ICING_VLOG(1) << "Resetting IcingSearchEngine";
ResetResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
initialized_ = false;
ResetMembers();
if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
result_status->set_code(StatusProto::INTERNAL);
return result_proto;
}
if (InternalInitialize().status().code() != StatusProto::OK) {
// We shouldn't hit the following Initialize errors:
// NOT_FOUND: all data was cleared, we aren't expecting anything
// DATA_LOSS: all data was cleared, we aren't expecting anything
// RESOURCE_EXHAUSTED: just deleted files, shouldn't run out of space
//
// We can't tell if Initialize failed and left Icing in an inconsistent
// state or if it was a temporary I/O error. Group everything under INTERNAL
// to be safe.
//
// TODO(b/147699081): Once Initialize returns the proper ABORTED/INTERNAL
// status code, we can just propagate it up from here.
result_status->set_code(StatusProto::INTERNAL);
return result_proto;
}
result_status->set_code(StatusProto::OK);
return result_proto;
}
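// Returns term suggestions for the given SuggestionSpecProto. Currently takes
// the write lock; see the TODO below about making this read-only.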
SuggestionResponse IcingSearchEngine::SearchSuggestions(
const SuggestionSpecProto& suggestion_spec) {
// TODO(b/146008613) Explore ideas to make this function read-only.
absl_ports::unique_lock l(&mutex_);
SuggestionResponse response;
StatusProto* response_status = response.mutable_status();
if (!initialized_) {
response_status->set_code(StatusProto::FAILED_PRECONDITION);
response_status->set_message("IcingSearchEngine has not been initialized!");
return response;
}
libtextclassifier3::Status status =
ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
if (!status.ok()) {
TransformStatus(status, response_status);
return response;
}
// Create the suggestion processor.
auto suggestion_processor_or = SuggestionProcessor::Create(
index_.get(), integer_index_.get(), language_segmenter_.get(),
normalizer_.get(), document_store_.get(), schema_store_.get());
if (!suggestion_processor_or.ok()) {
TransformStatus(suggestion_processor_or.status(), response_status);
return response;
}
std::unique_ptr<SuggestionProcessor> suggestion_processor =
std::move(suggestion_processor_or).ValueOrDie();
// Run suggestion based on given SuggestionSpec.
int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
suggestion_processor->QuerySuggestions(suggestion_spec, current_time_ms);
if (!terms_or.ok()) {
TransformStatus(terms_or.status(), response_status);
return response;
}
  // Convert vector<TermMetadata> into the final SuggestionResponse proto.
for (TermMetadata& term : terms_or.ValueOrDie()) {
SuggestionResponse::Suggestion suggestion;
suggestion.set_query(std::move(term.content));
response.mutable_suggestions()->Add(std::move(suggestion));
}
response_status->set_code(StatusProto::OK);
return response;
}
} // namespace lib
} // namespace icing