// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/query/query-processor.h"
#include <cstdint>
#include <deque>
#include <memory>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
#include "icing/index/iterator/doc-hit-info-iterator-and.h"
#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
#include "icing/index/iterator/doc-hit-info-iterator-not.h"
#include "icing/index/iterator/doc-hit-info-iterator-or.h"
#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
#include "icing/query/advanced_query_parser/lexer.h"
#include "icing/query/advanced_query_parser/parser.h"
#include "icing/query/advanced_query_parser/query-visitor.h"
#include "icing/query/query-features.h"
#include "icing/query/query-results.h"
#include "icing/query/query-terms.h"
#include "icing/query/query-utils.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
namespace {
// State of a single frame in the raw-query parser. This is specific to how
// the raw query is parsed/stored.
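// A new frame is pushed for each QUERY_LEFT_PARENTHESES token and popped at
// the matching QUERY_RIGHT_PARENTHESES.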
struct ParserStateFrame {
std::vector<std::unique_ptr<DocHitInfoIterator>> and_iterators;
std::vector<std::unique_ptr<DocHitInfoIterator>> or_iterators;
// If the last independent token was an OR, then we need to treat the next
// resulting iterator as part of an or_iterator.
bool saw_or = false;
// If the last independent token was an exclusion, then we need to treat the
// next resulting iterator as being excluded.
bool saw_exclude = false;
// If the last independent token was a property/section filter, then we need
// to save the section name so we can create a section filter iterator.
std::string section_restrict;
};
// Combines any OR and AND iterators together into one iterator.
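// For example, given the raw query [a b OR c], this function receives
// and_iterators={a} and or_iterators={b, c} and returns the iterator
// (a AND (b OR c)).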
std::unique_ptr<DocHitInfoIterator> ProcessParserStateFrame(
ParserStateFrame parser_state_frame,
const DocumentId last_added_document_id) {
if (parser_state_frame.and_iterators.empty() &&
parser_state_frame.or_iterators.empty()) {
// No terms were specified, so treat the empty query as retrieving all
// documents.
//
// We don't use the index_.last_added_document_id here because it's possible
// that documents exist in the DocumentStore, but were not successfully
// indexed. So to return *all* documents and not just *all indexed*
// documents, we use the DocumentStore's last_added_document_id.
return std::make_unique<DocHitInfoIteratorAllDocumentId>(
last_added_document_id);
}
if (!parser_state_frame.or_iterators.empty()) {
// Combine all the ORs first since they have higher priority, then add the
// combined OR iterator to the ANDs.
parser_state_frame.and_iterators.push_back(
CreateOrIterator(std::move(parser_state_frame.or_iterators)));
}
return CreateAndIterator(std::move(parser_state_frame.and_iterators));
}
} // namespace
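// Factory function that validates its dependencies and returns a
// QueryProcessor. Returns an error if any of the pointers is null. All
// dependencies are stored as references, so they must outlive the returned
// instance.
//
// Example usage (a sketch; assumes the dependencies have already been
// initialized elsewhere):
//   ICING_ASSIGN_OR_RETURN(
//       std::unique_ptr<QueryProcessor> query_processor,
//       QueryProcessor::Create(index, numeric_index, language_segmenter,
//                              normalizer, document_store, schema_store,
//                              clock));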
libtextclassifier3::StatusOr<std::unique_ptr<QueryProcessor>>
QueryProcessor::Create(Index* index, const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
const SchemaStore* schema_store, const Clock* clock) {
ICING_RETURN_ERROR_IF_NULL(index);
ICING_RETURN_ERROR_IF_NULL(numeric_index);
ICING_RETURN_ERROR_IF_NULL(language_segmenter);
ICING_RETURN_ERROR_IF_NULL(normalizer);
ICING_RETURN_ERROR_IF_NULL(document_store);
ICING_RETURN_ERROR_IF_NULL(schema_store);
ICING_RETURN_ERROR_IF_NULL(clock);
return std::unique_ptr<QueryProcessor>(
new QueryProcessor(index, numeric_index, language_segmenter, normalizer,
document_store, schema_store, clock));
}
QueryProcessor::QueryProcessor(Index* index,
const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
const SchemaStore* schema_store,
const Clock* clock)
: index_(*index),
numeric_index_(*numeric_index),
language_segmenter_(*language_segmenter),
normalizer_(*normalizer),
document_store_(*document_store),
schema_store_(*schema_store),
clock_(*clock) {}
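// Parses the given SearchSpec into QueryResults: dispatches to the advanced
// query parser or the raw query parser based on search_spec.search_type(),
// verifies that every advanced-query feature used by the query is listed in
// search_spec.enabled_features(), and wraps the resulting root iterator with
// the document filters (and any type property restricts) from the spec.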
libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseSearch(
const SearchSpecProto& search_spec,
ScoringSpecProto::RankingStrategy::Code ranking_strategy,
int64_t current_time_ms, QueryStatsProto::SearchStats* search_stats) {
if (search_spec.search_type() == SearchSpecProto::SearchType::UNDEFINED) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"Search type ",
SearchSpecProto::SearchType::Code_Name(search_spec.search_type()),
" is not supported."));
}
QueryResults results;
if (search_spec.search_type() ==
SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
ICING_VLOG(1) << "Using EXPERIMENTAL_ICING_ADVANCED_QUERY parser!";
ICING_ASSIGN_OR_RETURN(results,
ParseAdvancedQuery(search_spec, ranking_strategy,
current_time_ms, search_stats));
} else {
ICING_ASSIGN_OR_RETURN(
results, ParseRawQuery(search_spec, ranking_strategy, current_time_ms));
}
// Check that all new features used in the search have been enabled in the
// SearchSpec.
const std::unordered_set<Feature> enabled_features(
search_spec.enabled_features().begin(),
search_spec.enabled_features().end());
for (const Feature feature : results.features_in_use) {
if (enabled_features.find(feature) == enabled_features.end()) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"Attempted use of unenabled feature ", feature,
". Please make sure that you have explicitly set all advanced query "
"features used in this query as enabled in the SearchSpec."));
}
}
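// Wrap the root iterator in a filter iterator so that hits from deleted or
// expired documents, and from documents outside the spec's namespace and
// schema-type filters, are not returned.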
DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
results.root_iterator = std::make_unique<DocHitInfoIteratorFilter>(
std::move(results.root_iterator), &document_store_, &schema_store_,
options, current_time_ms);
if (!search_spec.type_property_filters().empty()) {
results.root_iterator =
DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
std::move(results.root_iterator), &document_store_, &schema_store_,
search_spec, current_time_ms);
}
return results;
}
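// Handles EXPERIMENTAL_ICING_ADVANCED_QUERY: lexes the query string into
// tokens, parses the tokens into an abstract syntax tree, and then walks the
// tree with a QueryVisitor to build the iterator tree. Each stage's latency
// is recorded in search_stats when it is non-null.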
libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseAdvancedQuery(
const SearchSpecProto& search_spec,
ScoringSpecProto::RankingStrategy::Code ranking_strategy,
int64_t current_time_ms, QueryStatsProto::SearchStats* search_stats) const {
std::unique_ptr<Timer> lexer_timer = clock_.GetNewTimer();
Lexer lexer(search_spec.query(), Lexer::Language::QUERY);
ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
lexer.ExtractTokens());
if (search_stats != nullptr) {
search_stats->set_query_processor_lexer_extract_token_latency_ms(
lexer_timer->GetElapsedMilliseconds());
}
std::unique_ptr<Timer> parser_timer = clock_.GetNewTimer();
Parser parser = Parser::Create(std::move(lexer_tokens));
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
parser.ConsumeQuery());
if (search_stats != nullptr) {
search_stats->set_query_processor_parser_consume_query_latency_ms(
parser_timer->GetElapsedMilliseconds());
}
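// An empty query parses to a null AST root; treat it as matching all
// documents.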
if (tree_root == nullptr) {
QueryResults results;
results.root_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
document_store_.last_added_document_id());
return results;
}
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
StringIndexingConfig::TokenizerType::PLAIN, &language_segmenter_));
DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
bool needs_term_frequency_info =
ranking_strategy == ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
std::unique_ptr<Timer> query_visitor_timer = clock_.GetNewTimer();
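// The QueryVisitor traverses the AST and builds the corresponding
// DocHitInfoIterator tree; ConsumeResults() then hands back the root iterator
// along with the query terms that were encountered.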
QueryVisitor query_visitor(&index_, &numeric_index_, &document_store_,
&schema_store_, &normalizer_,
plain_tokenizer.get(), search_spec.query(),
std::move(options), search_spec.term_match_type(),
needs_term_frequency_info, current_time_ms);
tree_root->Accept(&query_visitor);
ICING_ASSIGN_OR_RETURN(QueryResults results,
std::move(query_visitor).ConsumeResults());
if (search_stats != nullptr) {
search_stats->set_query_processor_query_visitor_latency_ms(
query_visitor_timer->GetElapsedMilliseconds());
}
return results;
}
// TODO(cassiewang): Collect query stats to populate the SearchResultsProto
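// Parses the raw query with a stack of ParserStateFrames: parentheses push
// and pop frames, OR/exclusion/property tokens set pending state on the
// current frame, and each REGULAR token is turned into an index iterator that
// is combined into the frame according to that pending state.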
libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseRawQuery(
const SearchSpecProto& search_spec,
ScoringSpecProto::RankingStrategy::Code ranking_strategy,
int64_t current_time_ms) {
DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
// Tokenize the incoming raw query.
//
// TODO(cassiewang): Consider caching/creating a tokenizer factory that will
// cache the n most recently used tokenizers. So we don't have to recreate
// this on every new query, if they'll all be raw queries.
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
&language_segmenter_));
ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
raw_query_tokenizer->TokenizeAll(search_spec.query()));
std::stack<ParserStateFrame> frames;
frames.emplace();
QueryResults results;
// Process all the tokens.
for (size_t i = 0; i < tokens.size(); i++) {
const Token& token = tokens.at(i);
std::unique_ptr<DocHitInfoIterator> result_iterator;
// TODO(b/202076890): Handle negation tokens
switch (token.type) {
case Token::Type::QUERY_LEFT_PARENTHESES: {
frames.emplace();
break;
}
case Token::Type::QUERY_RIGHT_PARENTHESES: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
result_iterator = ProcessParserStateFrame(
std::move(frames.top()), document_store_.last_added_document_id());
frames.pop();
break;
}
case Token::Type::QUERY_EXCLUSION: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
frames.top().saw_exclude = true;
break;
}
case Token::Type::QUERY_OR: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
frames.top().saw_or = true;
break;
}
case Token::Type::QUERY_PROPERTY: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
frames.top().section_restrict = std::string(token.text);
break;
}
case Token::Type::REGULAR: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
std::string normalized_text = normalizer_.NormalizeTerm(token.text);
// TODO(cassiewang): Consider removing the use of a section mask in the
// term iterator, or constructing a best-effort SectionIdMask based on
// the section filter. For some combination of schema type filters and
// section filters, we can't encapsulate the perfect
// SchemaTypeId-SectionId sets with just a SectionIdMask. So we
// over-retrieve hits and have to do a post-filter anyways. With a
// SectionIdMask, we might be able to narrow down our SectionIds, but
// we'll still over-retrieve hits a bit. So at that point, it's a
// tradeoff of
//
// 1.1 Go to SchemaStore and iterate over the schema to calculate a
// SectionIdMask
// 1.2 Use SectionIdMask and save some hit buffer memory
// 1.3 Do a post-filter to double check SchemaTypeId-SectionId combo
//
// vs
//
// 2.1 Use SectionIdMaskAll and use more hit buffer memory
// 2.2 Do a post-filter to double check SchemaTypeId-SectionId combo
//
// We do the same amount of disk reads, so it may be dependent on how
// big the schema is and/or how popular schema type filtering and
// section filtering is.
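// token.text is a view into search_spec.query(), so the pointer subtraction
// below yields the term's byte offset within the original query string.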
ICING_ASSIGN_OR_RETURN(
result_iterator,
index_.GetIterator(
normalized_text,
token.text.data() - search_spec.query().c_str(),
token.text.length(), kSectionIdMaskAll,
search_spec.term_match_type(),
/*need_hit_term_frequency=*/ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
// Add term iterator and terms to match if this is not a negation term.
// WARNING: setting query terms at this point is not compatible with
// group-level excludes, group-level sections restricts or excluded
// section restricts. Those are not currently supported. If they became
// supported, this handling for query terms would need to be altered.
if (!frames.top().saw_exclude) {
if (ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE) {
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<DocHitInfoIterator> term_iterator,
index_.GetIterator(
normalized_text,
token.text.data() - search_spec.query().c_str(),
token.text.length(), kSectionIdMaskAll,
search_spec.term_match_type(),
/*need_hit_term_frequency=*/ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
results.query_term_iterators[normalized_text] =
std::make_unique<DocHitInfoIteratorFilter>(
std::move(term_iterator), &document_store_, &schema_store_,
options, current_time_ms);
}
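// Record the term under its section restrict; the empty-string key holds
// terms that have no section restrict.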
results.query_terms[frames.top().section_restrict].insert(
std::move(normalized_text));
}
break;
}
case Token::Type::INVALID:
[[fallthrough]];
default:
// This shouldn't happen if the tokenizer and the query processor agree on
// token types. An unknown token indicates an inconsistency between the
// two, so we return an internal error here.
return absl_ports::InternalError(absl_ports::StrCat(
"Encountered unknown token while processing query: ", token.text));
}
// Did we get an iterator out of this token?
if (result_iterator) {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
// NOTE: Order matters!! We must apply the section restrict first, then
// the NOT operator.
//
// Imagine a query like [-subject:foo] which means we
// want to get documents that don't have the term 'foo' in section
// 'subject'.
//
// Assume some Document_0:
// { "subject": "foo" }
//
// And assume some Document_1:
// { "subject": "bar" }
//
// If we use the IteratorNot first, then we'll get DocHitInfos that
// represent DocumentIds without any section hits like
// DocHitInfo(document_id_1, kSectionIdMaskNone). Then, when we try to
// apply the IteratorSectionRestrict, no SectionIds in the mask will match
// the SectionId of 'subject' and we won't return any results.
//
// If we use the IteratorSectionRestrict first, then we'll get a
// DocHitInfo for Document_0. Then with the IteratorNot, we can get the
// rest of the Documents excluding Document_0, and get Document_1 as a
// correct result.
//
// TODO(cassiewang): The point is a bit moot right now since we don't even
// support this functionality. But add tests for this once we do support
// more advanced section restricts with grouping, negation, etc.
if (!frames.top().section_restrict.empty()) {
// We saw a section restrict earlier, so wrap the result iterator in a
// section restrict iterator.
std::set<std::string> section_restricts;
section_restricts.insert(std::move(frames.top().section_restrict));
result_iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
std::move(result_iterator), &document_store_, &schema_store_,
std::move(section_restricts), current_time_ms);
frames.top().section_restrict.clear();
}
// Check if we need to NOT/exclude this iterator
if (frames.top().saw_exclude) {
result_iterator = std::make_unique<DocHitInfoIteratorNot>(
std::move(result_iterator),
document_store_.last_added_document_id());
frames.top().saw_exclude = false;
}
if (i < tokens.size() - 1 &&
tokens.at(i + 1).type == Token::Type::QUERY_OR) {
// This isn't the last token, and the next token is an OR, so we should
// OR this iterator with the next one (e.g., if the query was "A OR B",
// we would be processing "A" right now).
frames.top().or_iterators.push_back(std::move(result_iterator));
} else if (frames.top().saw_or) {
// This isn't the first token, and the previous token was an OR, so we
// should OR this iterator with the previous one (e.g., if the query was
// "A OR (B C)", we would be processing the iterator for "(B C)" right
// now).
frames.top().or_iterators.push_back(std::move(result_iterator));
frames.top().saw_or = false;
} else {
// If we're not trying to OR this iterator, we AND everything else.
if (!frames.top().or_iterators.empty()) {
// Accumulate the previous OR iterators if there were any.
frames.top().and_iterators.push_back(
CreateOrIterator(std::move(frames.top().or_iterators)));
frames.top().or_iterators =
std::vector<std::unique_ptr<DocHitInfoIterator>>();
}
frames.top().and_iterators.push_back(std::move(result_iterator));
}
}
}
// Only the base frame should remain at this point. Do one last combine,
// since the frame could hold both ORs and ANDs.
if (frames.size() != 1) {
return absl_ports::InternalError(
"Encountered invalid state of ParserStateFrames stack");
}
results.root_iterator = ProcessParserStateFrame(
std::move(frames.top()), document_store_.last_added_document_id());
return results;
}
} // namespace lib
} // namespace icing