// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/query/query-processor.h"
#include <cstdint>
#include <deque>
#include <memory>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
#include "icing/index/iterator/doc-hit-info-iterator-and.h"
#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
#include "icing/index/iterator/doc-hit-info-iterator-not.h"
#include "icing/index/iterator/doc-hit-info-iterator-or.h"
#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
#include "icing/query/advanced_query_parser/lexer.h"
#include "icing/query/advanced_query_parser/parser.h"
#include "icing/query/advanced_query_parser/query-visitor.h"
#include "icing/query/query-features.h"
#include "icing/query/query-results.h"
#include "icing/query/query-terms.h"
#include "icing/query/query-utils.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
namespace {
// State of a single frame in the raw-query parser. This is specific to how
// the raw query is parsed/stored.
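// A new frame is pushed for each QUERY_LEFT_PARENTHESES token and popped at
// the matching QUERY_RIGHT_PARENTHESES.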
struct ParserStateFrame {
std::vector<std::unique_ptr<DocHitInfoIterator>> and_iterators;
std::vector<std::unique_ptr<DocHitInfoIterator>> or_iterators;
// If the last independent token was an OR, then we need to treat the next
// resulting iterator as part of an or_iterator.
bool saw_or = false;
// If the last independent token was an exclusion, then we need to treat the
// next resulting iterator as being excluded.
bool saw_exclude = false;
// If the last independent token was a property/section filter, then we need
// to save the section name so we can create a section filter iterator.
std::string section_restrict;
};
// Combines any OR and AND iterators together into one iterator.
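// For example, given the raw query [a b OR c], this function receives
// and_iterators={a} and or_iterators={b, c} and returns the iterator
// (a AND (b OR c)).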
std::unique_ptr<DocHitInfoIterator> ProcessParserStateFrame(
ParserStateFrame parser_state_frame,
const DocumentId last_added_document_id) {
if (parser_state_frame.and_iterators.empty() &&
parser_state_frame.or_iterators.empty()) {
// No terms were specified, so treat the empty query as retrieving all
// documents.
//
// We don't use the index_.last_added_document_id here because it's possible
// that documents exist in the DocumentStore, but were not successfully
// indexed. So to return *all* documents and not just *all indexed*
// documents, we use the DocumentStore's last_added_document_id.
return std::make_unique<DocHitInfoIteratorAllDocumentId>(
last_added_document_id);
}
if (!parser_state_frame.or_iterators.empty()) {
// Combine all the ORs first since they have higher priority, then add the
// combined OR iterator to the ANDs.
parser_state_frame.and_iterators.push_back(
CreateOrIterator(std::move(parser_state_frame.or_iterators)));
}
return CreateAndIterator(std::move(parser_state_frame.and_iterators));
}
} // namespace
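// Factory function that validates its dependencies and returns a
// QueryProcessor. Returns an error if any of the pointers is null. All
// dependencies are stored as references, so they must outlive the returned
// instance.
//
// Example usage (a sketch; assumes the dependencies have already been
// initialized elsewhere):
//   ICING_ASSIGN_OR_RETURN(
//       std::unique_ptr<QueryProcessor> query_processor,
//       QueryProcessor::Create(index, numeric_index, language_segmenter,
//                              normalizer, document_store, schema_store,
//                              clock));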
libtextclassifier3::StatusOr<std::unique_ptr<QueryProcessor>>
QueryProcessor::Create(Index* index, const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
const SchemaStore* schema_store, const Clock* clock) {
ICING_RETURN_ERROR_IF_NULL(index);
ICING_RETURN_ERROR_IF_NULL(numeric_index);
ICING_RETURN_ERROR_IF_NULL(language_segmenter);
ICING_RETURN_ERROR_IF_NULL(normalizer);
ICING_RETURN_ERROR_IF_NULL(document_store);
ICING_RETURN_ERROR_IF_NULL(schema_store);
ICING_RETURN_ERROR_IF_NULL(clock);
return std::unique_ptr<QueryProcessor>(
new QueryProcessor(index, numeric_index, language_segmenter, normalizer,
document_store, schema_store, clock));
}
QueryProcessor::QueryProcessor(Index* index,
const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
const SchemaStore* schema_store,
const Clock* clock)
: index_(*index),
numeric_index_(*numeric_index),
language_segmenter_(*language_segmenter),
normalizer_(*normalizer),
document_store_(*document_store),
schema_store_(*schema_store),
clock_(*clock) {}
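// Parses the given SearchSpec into QueryResults: dispatches to the advanced
// query parser or the raw query parser based on search_spec.search_type(),
// verifies that every advanced-query feature used by the query is listed in
// search_spec.enabled_features(), and wraps the resulting root iterator with
// the document filters (and any type property restricts) from the spec.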
libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseSearch(
const SearchSpecProto& search_spec,
ScoringSpecProto::RankingStrategy::Code ranking_strategy,
int64_t current_time_ms, QueryStatsProto::SearchStats* search_stats) {
if (search_spec.search_type() == SearchSpecProto::SearchType::UNDEFINED) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"Search type ",
SearchSpecProto::SearchType::Code_Name(search_spec.search_type()),
" is not supported."));
}
QueryResults results;
if (search_spec.search_type() ==
SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
ICING_VLOG(1) << "Using EXPERIMENTAL_ICING_ADVANCED_QUERY parser!";
ICING_ASSIGN_OR_RETURN(results,
ParseAdvancedQuery(search_spec, ranking_strategy,
current_time_ms, search_stats));
} else {
ICING_ASSIGN_OR_RETURN(
results, ParseRawQuery(search_spec, ranking_strategy, current_time_ms));
}
// Check that all new features used in the search have been enabled in the
// SearchSpec.
const std::unordered_set<Feature> enabled_features(
search_spec.enabled_features().begin(),
search_spec.enabled_features().end());
for (const Feature feature : results.features_in_use) {
if (enabled_features.find(feature) == enabled_features.end()) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"Attempted use of unenabled feature ", feature,
". Please make sure that you have explicitly set all advanced query "
"features used in this query as enabled in the SearchSpec."));
}
}
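// Wrap the root iterator in a filter iterator so that hits from deleted or
// expired documents, and from documents outside the spec's namespace and
// schema-type filters, are not returned.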
DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
results.root_iterator = std::make_unique<DocHitInfoIteratorFilter>(
std::move(results.root_iterator), &document_store_, &schema_store_,
options, current_time_ms);
if (!search_spec.type_property_filters().empty()) {
results.root_iterator =
DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
std::move(results.root_iterator), &document_store_, &schema_store_,
search_spec, current_time_ms);
}
return results;
}
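// Handles EXPERIMENTAL_ICING_ADVANCED_QUERY: lexes the query string into
// tokens, parses the tokens into an abstract syntax tree, and then walks the
// tree with a QueryVisitor to build the iterator tree. Each stage's latency
// is recorded in search_stats when it is non-null.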
libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseAdvancedQuery(
const SearchSpecProto& search_spec,
ScoringSpecProto::RankingStrategy::Code ranking_strategy,
int64_t current_time_ms, QueryStatsProto::SearchStats* search_stats) const {
std::unique_ptr<Timer> lexer_timer = clock_.GetNewTimer();
Lexer lexer(search_spec.query(), Lexer::Language::QUERY);
ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
lexer.ExtractTokens());
if (search_stats != nullptr) {
search_stats->set_query_processor_lexer_extract_token_latency_ms(
lexer_timer->GetElapsedMilliseconds());
}
std::unique_ptr<Timer> parser_timer = clock_.GetNewTimer();
Parser parser = Parser::Create(std::move(lexer_tokens));
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
parser.ConsumeQuery());
if (search_stats != nullptr) {
search_stats->set_query_processor_parser_consume_query_latency_ms(
parser_timer->GetElapsedMilliseconds());
}
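// An empty query parses to a null AST root; treat it as matching all
// documents.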
if (tree_root == nullptr) {
QueryResults results;
results.root_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
document_store_.last_added_document_id());
return results;
}
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<Tokenizer> plain_tokenizer,
tokenizer_factory::CreateIndexingTokenizer(
StringIndexingConfig::TokenizerType::PLAIN, &language_segmenter_));
DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
bool needs_term_frequency_info =
ranking_strategy == ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
std::unique_ptr<Timer> query_visitor_timer = clock_.GetNewTimer();
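// The QueryVisitor traverses the AST and builds the corresponding
// DocHitInfoIterator tree; ConsumeResults() then hands back the root iterator
// along with the query terms that were encountered.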
QueryVisitor query_visitor(&index_, &numeric_index_, &document_store_,
&schema_store_, &normalizer_,
plain_tokenizer.get(), search_spec.query(),
std::move(options), search_spec.term_match_type(),
needs_term_frequency_info, current_time_ms);
tree_root->Accept(&query_visitor);
ICING_ASSIGN_OR_RETURN(QueryResults results,
std::move(query_visitor).ConsumeResults());
if (search_stats != nullptr) {
search_stats->set_query_processor_query_visitor_latency_ms(
query_visitor_timer->GetElapsedMilliseconds());
}
return results;
}
// TODO(cassiewang): Collect query stats to populate the SearchResultsProto
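// Parses the raw query with a stack of ParserStateFrames: parentheses push
// and pop frames, OR/exclusion/property tokens set pending state on the
// current frame, and each REGULAR token is turned into an index iterator that
// is combined into the frame according to that pending state.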
libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseRawQuery(
const SearchSpecProto& search_spec,
ScoringSpecProto::RankingStrategy::Code ranking_strategy,
int64_t current_time_ms) {
DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
// Tokenize the incoming raw query.
//
// TODO(cassiewang): Consider caching/creating a tokenizer factory that will
// cache the n most recently used tokenizers. So we don't have to recreate
// this on every new query, if they'll all be raw queries.
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
&language_segmenter_));
ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
raw_query_tokenizer->TokenizeAll(search_spec.query()));
std::stack<ParserStateFrame> frames;
frames.emplace();
QueryResults results;
// Process all the tokens.
for (size_t i = 0; i < tokens.size(); i++) {
const Token& token = tokens.at(i);
std::unique_ptr<DocHitInfoIterator> result_iterator;
// TODO(b/202076890): Handle negation tokens
switch (token.type) {
case Token::Type::QUERY_LEFT_PARENTHESES: {
frames.emplace();
break;
}
case Token::Type::QUERY_RIGHT_PARENTHESES: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
result_iterator = ProcessParserStateFrame(
std::move(frames.top()), document_store_.last_added_document_id());
frames.pop();
break;
}
case Token::Type::QUERY_EXCLUSION: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
frames.top().saw_exclude = true;
break;
}
case Token::Type::QUERY_OR: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
frames.top().saw_or = true;
break;
}
case Token::Type::QUERY_PROPERTY: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
frames.top().section_restrict = std::string(token.text);
break;
}
case Token::Type::REGULAR: {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
std::string normalized_text = normalizer_.NormalizeTerm(token.text);
// TODO(cassiewang): Consider removing the use of a section mask in the
// term iterator, or constructing a best-effort SectionIdMask based on
// the section filter. For some combination of schema type filters and
// section filters, we can't encapsulate the perfect
// SchemaTypeId-SectionId sets with just a SectionIdMask. So we
// over-retrieve hits and have to do a post-filter anyways. With a
// SectionIdMask, we might be able to narrow down our SectionIds, but
// we'll still over-retrieve hits a bit. So at that point, it's a
// tradeoff of
//
// 1.1 Go to SchemaStore and iterate over the schema to calculate a
// SectionIdMask
// 1.2 Use SectionIdMask and save some hit buffer memory
// 1.3 Do a post-filter to double check SchemaTypeId-SectionId combo
//
// vs
//
// 2.1 Use SectionIdMaskAll and use more hit buffer memory
// 2.2 Do a post-filter to double check SchemaTypeId-SectionId combo
//
// We do the same amount of disk reads, so it may be dependent on how
// big the schema is and/or how popular schema type filtering and
// section filtering is.
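// token.text is a view into search_spec.query(), so the pointer subtraction
// below yields the term's byte offset within the original query string.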
ICING_ASSIGN_OR_RETURN(
result_iterator,
index_.GetIterator(
normalized_text,
token.text.data() - search_spec.query().c_str(),
token.text.length(), kSectionIdMaskAll,
search_spec.term_match_type(),
/*need_hit_term_frequency=*/ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
// Add term iterator and terms to match if this is not a negation term.
// WARNING: setting query terms at this point is not compatible with
// group-level excludes, group-level sections restricts or excluded
// section restricts. Those are not currently supported. If they became
// supported, this handling for query terms would need to be altered.
if (!frames.top().saw_exclude) {
if (ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE) {
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<DocHitInfoIterator> term_iterator,
index_.GetIterator(
normalized_text,
token.text.data() - search_spec.query().c_str(),
token.text.length(), kSectionIdMaskAll,
search_spec.term_match_type(),
/*need_hit_term_frequency=*/ranking_strategy ==
ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
results.query_term_iterators[normalized_text] =
std::make_unique<DocHitInfoIteratorFilter>(
std::move(term_iterator), &document_store_, &schema_store_,
options, current_time_ms);
}
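// Record the term under its section restrict; the empty-string key holds
// terms that have no section restrict.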
results.query_terms[frames.top().section_restrict].insert(
std::move(normalized_text));
}
break;
}
case Token::Type::INVALID:
[[fallthrough]];
default:
// This shouldn't happen if the tokenizer and the query processor agree on
// token types. An unknown token indicates an inconsistency between the
// two, so we return an internal error here.
return absl_ports::InternalError(absl_ports::StrCat(
"Encountered unknown token while processing query: ", token.text));
}
// Did we get an iterator out of this token?
if (result_iterator) {
if (frames.empty()) {
return absl_ports::InternalError(
"Encountered empty stack of ParserStateFrames");
}
// NOTE: Order matters!! We must apply the section restrict first, then
// the NOT operator.
//
// Imagine a query like [-subject:foo] which means we
// want to get documents that don't have the term 'foo' in section
// 'subject'.
//
// Assume some Document_0:
// { "subject": "foo" }
//
// And assume some Document_1:
// { "subject": "bar" }
//
// If we use the IteratorNot first, then we'll get DocHitInfos that
// represent DocumentIds without any section hits like
// DocHitInfo(document_id_1, kSectionIdMaskNone). Then, when we try to
// apply the IteratorSectionRestrict, no SectionIds in the mask will match
// the SectionId of 'subject' and we won't return any results.
//
// If we use the IteratorSectionRestrict first, then we'll get a
// DocHitInfo for Document_0. Then with the IteratorNot, we can get the
// rest of the Documents excluding Document_0, and get Document_1 as a
// correct result.
//
// TODO(cassiewang): The point is a bit moot right now since we don't even
// support this functionality. But add tests for this once we do support
// more advanced section restricts with grouping, negation, etc.
if (!frames.top().section_restrict.empty()) {
// We saw a section restrict earlier, so wrap the result iterator in a
// section restrict iterator.
std::set<std::string> section_restricts;
section_restricts.insert(std::move(frames.top().section_restrict));
result_iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
std::move(result_iterator), &document_store_, &schema_store_,
std::move(section_restricts), current_time_ms);
frames.top().section_restrict.clear();
}
// Check if we need to NOT/exclude this iterator
if (frames.top().saw_exclude) {
result_iterator = std::make_unique<DocHitInfoIteratorNot>(
std::move(result_iterator),
document_store_.last_added_document_id());
frames.top().saw_exclude = false;
}
if (i < tokens.size() - 1 &&
tokens.at(i + 1).type == Token::Type::QUERY_OR) {
// This isn't the last token, and the next token is an OR, so we should
// OR this iterator with the next one (e.g., if the query was "A OR B",
// we would be processing "A" right now).
frames.top().or_iterators.push_back(std::move(result_iterator));
} else if (frames.top().saw_or) {
// This isn't the first token, and the previous token was an OR, so we
// should OR this iterator with the previous one (e.g., if the query was
// "A OR (B C)", we would be processing the iterator for "(B C)" right
// now).
frames.top().or_iterators.push_back(std::move(result_iterator));
frames.top().saw_or = false;
} else {
// If we're not trying to OR this iterator, we AND everything else.
if (!frames.top().or_iterators.empty()) {
// Accumulate the previous OR iterators if there were any.
frames.top().and_iterators.push_back(
CreateOrIterator(std::move(frames.top().or_iterators)));
frames.top().or_iterators =
std::vector<std::unique_ptr<DocHitInfoIterator>>();
}
frames.top().and_iterators.push_back(std::move(result_iterator));
}
}
}
// Only the base frame should remain at this point. Do one last combine,
// since the frame could hold both ORs and ANDs.
if (frames.size() != 1) {
return absl_ports::InternalError(
"Encountered invalid state of ParserStateFrames stack");
}
results.root_iterator = ProcessParserStateFrame(
std::move(frames.top()), document_store_.last_added_document_id());
return results;
}
} // namespace lib
} // namespace icing