| // Copyright (C) 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "icing/index/index-processor.h" |
| |
| #include <cstdint> |
| #include <limits> |
| #include <memory> |
| #include <string> |
| #include <string_view> |
| #include <unordered_map> |
| #include <utility> |
| #include <vector> |
| |
| #include "icing/text_classifier/lib3/utils/base/status.h" |
| #include "gmock/gmock.h" |
| #include "gtest/gtest.h" |
| #include "icing/absl_ports/str_cat.h" |
| #include "icing/absl_ports/str_join.h" |
| #include "icing/document-builder.h" |
| #include "icing/file/filesystem.h" |
| #include "icing/index/data-indexing-handler.h" |
| #include "icing/index/hit/doc-hit-info.h" |
| #include "icing/index/index.h" |
| #include "icing/index/integer-section-indexing-handler.h" |
| #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" |
| #include "icing/index/iterator/doc-hit-info-iterator.h" |
| #include "icing/index/numeric/integer-index.h" |
| #include "icing/index/numeric/numeric-index.h" |
| #include "icing/index/string-section-indexing-handler.h" |
| #include "icing/index/term-property-id.h" |
| #include "icing/join/qualified-id-join-index.h" |
| #include "icing/join/qualified-id-join-indexing-handler.h" |
| #include "icing/legacy/index/icing-filesystem.h" |
| #include "icing/legacy/index/icing-mock-filesystem.h" |
| #include "icing/portable/platform.h" |
| #include "icing/proto/document.pb.h" |
| #include "icing/proto/schema.pb.h" |
| #include "icing/proto/term.pb.h" |
| #include "icing/schema-builder.h" |
| #include "icing/schema/schema-store.h" |
| #include "icing/schema/schema-util.h" |
| #include "icing/schema/section.h" |
| #include "icing/store/document-id.h" |
| #include "icing/store/document-store.h" |
| #include "icing/testing/common-matchers.h" |
| #include "icing/testing/fake-clock.h" |
| #include "icing/testing/icu-data-file-helper.h" |
| #include "icing/testing/random-string.h" |
| #include "icing/testing/test-data.h" |
| #include "icing/testing/tmp-directory.h" |
| #include "icing/tokenization/language-segmenter-factory.h" |
| #include "icing/tokenization/language-segmenter.h" |
| #include "icing/transform/normalizer-factory.h" |
| #include "icing/transform/normalizer.h" |
| #include "icing/util/tokenized-document.h" |
| #include "unicode/uloc.h" |
| |
| namespace icing { |
| namespace lib { |
| |
| namespace { |
| |
// Multi-sentence Latin filler text available to tests that need a long body of
// ordinary words.
constexpr std::string_view kIpsumText{
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis "
    "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida "
    "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam "
    "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo "
    "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, "
    "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula "
    "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et "
    "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, "
    "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis "
    "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. "
    "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. "
    "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur "
    "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
    "placerat semper."};
| |
| // schema types |
| constexpr std::string_view kFakeType = "FakeType"; |
| constexpr std::string_view kNestedType = "NestedType"; |
| |
| // Indexable properties and section Id. Section Id is determined by the |
| // lexicographical order of indexable property path. |
| constexpr std::string_view kExactProperty = "exact"; |
| constexpr std::string_view kIndexableIntegerProperty = "indexableInteger"; |
| constexpr std::string_view kPrefixedProperty = "prefixed"; |
| constexpr std::string_view kRepeatedProperty = "repeated"; |
| constexpr std::string_view kRfc822Property = "rfc822"; |
| constexpr std::string_view kSubProperty = "submessage"; // submessage.nested |
| constexpr std::string_view kNestedProperty = "nested"; // submessage.nested |
| // TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export |
| // to Android. |
| #ifdef ENABLE_URL_TOKENIZER |
| constexpr std::string_view kUrlExactProperty = "urlExact"; |
| constexpr std::string_view kUrlPrefixedProperty = "urlPrefixed"; |
| #endif // ENABLE_URL_TOKENIZER |
| constexpr std::string_view kVerbatimExactProperty = "verbatimExact"; |
| constexpr std::string_view kVerbatimPrefixedProperty = "verbatimPrefixed"; |
| |
| constexpr SectionId kExactSectionId = 0; |
| constexpr SectionId kIndexableIntegerSectionId = 1; |
| constexpr SectionId kPrefixedSectionId = 2; |
| constexpr SectionId kRepeatedSectionId = 3; |
| constexpr SectionId kRfc822SectionId = 4; |
| constexpr SectionId kNestedSectionId = 5; // submessage.nested |
| #ifdef ENABLE_URL_TOKENIZER |
| constexpr SectionId kUrlExactSectionId = 6; |
| constexpr SectionId kUrlPrefixedSectionId = 7; |
| constexpr SectionId kVerbatimExactSectionId = 8; |
| constexpr SectionId kVerbatimPrefixedSectionId = 9; |
| #else // !ENABLE_URL_TOKENIZER |
| constexpr SectionId kVerbatimExactSectionId = 6; |
| constexpr SectionId kVerbatimPrefixedSectionId = 7; |
| #endif // ENABLE_URL_TOKENIZER |
| |
| // Other non-indexable properties. |
| constexpr std::string_view kUnindexedProperty1 = "unindexed1"; |
| constexpr std::string_view kUnindexedProperty2 = "unindexed2"; |
| |
| constexpr DocumentId kDocumentId0 = 0; |
| constexpr DocumentId kDocumentId1 = 1; |
| |
| using Cardinality = PropertyConfigProto::Cardinality; |
| using DataType = PropertyConfigProto::DataType; |
| using ::testing::ElementsAre; |
| using ::testing::Eq; |
| using ::testing::IsEmpty; |
| using ::testing::IsTrue; |
| using ::testing::SizeIs; |
| using ::testing::Test; |
| |
| #ifdef ENABLE_URL_TOKENIZER |
| constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL = |
| StringIndexingConfig::TokenizerType::URL; |
| #endif // ENABLE_URL_TOKENIZER |
| |
| class IndexProcessorTest : public Test { |
| protected: |
| void SetUp() override { |
| if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { |
| ICING_ASSERT_OK( |
| // File generated via icu_data_file rule in //icing/BUILD. |
| icu_data_file_helper::SetUpICUDataFile( |
| GetTestFilePath("icing/icu.dat"))); |
| } |
| |
| base_dir_ = GetTestTempDir() + "/index_processor_test"; |
| ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), |
| IsTrue()); |
| |
| index_dir_ = base_dir_ + "/index"; |
| integer_index_dir_ = base_dir_ + "/integer_index"; |
| qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index"; |
| schema_store_dir_ = base_dir_ + "/schema_store"; |
| doc_store_dir_ = base_dir_ + "/doc_store"; |
| |
| Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024, |
| /*lite_index_sort_at_indexing=*/true, |
| /*lite_index_sort_size=*/1024 * 8); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| index_, Index::Create(options, &filesystem_, &icing_filesystem_)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| integer_index_, |
| IntegerIndex::Create( |
| filesystem_, integer_index_dir_, |
| IntegerIndex::kDefaultNumDataThresholdForBucketSplit, |
| /*pre_mapping_fbv=*/false)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| qualified_id_join_index_, |
| QualifiedIdJoinIndex::Create(filesystem_, qualified_id_join_index_dir_, |
| /*pre_mapping_fbv=*/false, |
| /*use_persistent_hash_map=*/false)); |
| |
| language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| lang_segmenter_, |
| language_segmenter_factory::Create(std::move(segmenter_options))); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| normalizer_, |
| normalizer_factory::Create( |
| /*max_term_byte_size=*/std::numeric_limits<int32_t>::max())); |
| |
| ASSERT_TRUE( |
| filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str())); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| schema_store_, |
| SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); |
| SchemaProto schema = |
| SchemaBuilder() |
| .AddType( |
| SchemaTypeConfigBuilder() |
| .SetType(kFakeType) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kExactProperty) |
| .SetDataTypeString(TERM_MATCH_EXACT, |
| TOKENIZER_PLAIN) |
| .SetCardinality(CARDINALITY_OPTIONAL)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kPrefixedProperty) |
| .SetDataTypeString(TERM_MATCH_PREFIX, |
| TOKENIZER_PLAIN) |
| .SetCardinality(CARDINALITY_OPTIONAL)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kUnindexedProperty1) |
| .SetDataType(TYPE_STRING) |
| .SetCardinality(CARDINALITY_OPTIONAL)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kUnindexedProperty2) |
| .SetDataType(TYPE_BYTES) |
| .SetCardinality(CARDINALITY_OPTIONAL)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kRepeatedProperty) |
| .SetDataTypeString(TERM_MATCH_PREFIX, |
| TOKENIZER_PLAIN) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kVerbatimExactProperty) |
| .SetDataTypeString(TERM_MATCH_EXACT, |
| TOKENIZER_VERBATIM) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kVerbatimPrefixedProperty) |
| .SetDataTypeString(TERM_MATCH_PREFIX, |
| TOKENIZER_VERBATIM) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kRfc822Property) |
| .SetDataTypeString(TERM_MATCH_PREFIX, |
| TOKENIZER_RFC822) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| #ifdef ENABLE_URL_TOKENIZER |
| .AddProperty( |
| PropertyConfigBuilder() |
| .SetName(kUrlExactProperty) |
| .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_URL) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| .AddProperty( |
| PropertyConfigBuilder() |
| .SetName(kUrlPrefixedProperty) |
| .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| #endif // ENABLE_URL_TOKENIZER |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kIndexableIntegerProperty) |
| .SetDataTypeInt64(NUMERIC_MATCH_RANGE) |
| .SetCardinality(CARDINALITY_REPEATED)) |
| .AddProperty( |
| PropertyConfigBuilder() |
| .SetName(kSubProperty) |
| .SetDataTypeDocument( |
| kNestedType, /*index_nested_properties=*/true) |
| .SetCardinality(CARDINALITY_OPTIONAL))) |
| .AddType( |
| SchemaTypeConfigBuilder() |
| .SetType(kNestedType) |
| .AddProperty(PropertyConfigBuilder() |
| .SetName(kNestedProperty) |
| .SetDataTypeString(TERM_MATCH_PREFIX, |
| TOKENIZER_PLAIN) |
| .SetCardinality(CARDINALITY_OPTIONAL))) |
| .Build(); |
| ICING_ASSERT_OK(schema_store_->SetSchema( |
| schema, /*ignore_errors_and_delete_documents=*/false, |
| /*allow_circular_schema_definitions=*/false)); |
| |
| ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str())); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| DocumentStore::CreateResult create_result, |
| DocumentStore::Create( |
| &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(), |
| /*force_recovery_and_revalidate_documents=*/false, |
| /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false, |
| /*use_persistent_hash_map=*/false, |
| PortableFileBackedProtoLog< |
| DocumentWrapper>::kDeflateCompressionLevel, |
| /*initialize_stats=*/nullptr)); |
| doc_store_ = std::move(create_result.document_store); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<StringSectionIndexingHandler> |
| string_section_indexing_handler, |
| StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), |
| index_.get())); |
| ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler> |
| integer_section_indexing_handler, |
| IntegerSectionIndexingHandler::Create( |
| &fake_clock_, integer_index_.get())); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<QualifiedIdJoinIndexingHandler> |
| qualified_id_join_indexing_handler, |
| QualifiedIdJoinIndexingHandler::Create(&fake_clock_, |
| qualified_id_join_index_.get())); |
| std::vector<std::unique_ptr<DataIndexingHandler>> handlers; |
| handlers.push_back(std::move(string_section_indexing_handler)); |
| handlers.push_back(std::move(integer_section_indexing_handler)); |
| handlers.push_back(std::move(qualified_id_join_indexing_handler)); |
| |
| index_processor_ = |
| std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); |
| |
| mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>(); |
| } |
| |
| void TearDown() override { |
| index_processor_.reset(); |
| doc_store_.reset(); |
| schema_store_.reset(); |
| normalizer_.reset(); |
| lang_segmenter_.reset(); |
| qualified_id_join_index_.reset(); |
| integer_index_.reset(); |
| index_.reset(); |
| |
| filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); |
| } |
| |
| std::unique_ptr<IcingMockFilesystem> mock_icing_filesystem_; |
| |
| Filesystem filesystem_; |
| IcingFilesystem icing_filesystem_; |
| FakeClock fake_clock_; |
| std::string base_dir_; |
| std::string index_dir_; |
| std::string integer_index_dir_; |
| std::string qualified_id_join_index_dir_; |
| std::string schema_store_dir_; |
| std::string doc_store_dir_; |
| |
| std::unique_ptr<Index> index_; |
| std::unique_ptr<NumericIndex<int64_t>> integer_index_; |
| std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_; |
| std::unique_ptr<LanguageSegmenter> lang_segmenter_; |
| std::unique_ptr<Normalizer> normalizer_; |
| std::unique_ptr<SchemaStore> schema_store_; |
| std::unique_ptr<DocumentStore> doc_store_; |
| |
| std::unique_ptr<IndexProcessor> index_processor_; |
| }; |
| |
| std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { |
| std::vector<DocHitInfo> infos; |
| while (iterator->Advance().ok()) { |
| infos.push_back(iterator->doc_hit_info()); |
| } |
| return infos; |
| } |
| |
| std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency( |
| std::unique_ptr<DocHitInfoIterator> iterator) { |
| std::vector<DocHitInfoTermFrequencyPair> infos; |
| while (iterator->Advance().ok()) { |
| std::vector<TermMatchInfo> matched_terms_stats; |
| iterator->PopulateMatchedTermsStats(&matched_terms_stats); |
| for (const TermMatchInfo& term_match_info : matched_terms_stats) { |
| infos.push_back(DocHitInfoTermFrequencyPair( |
| iterator->doc_hit_info(), term_match_info.term_frequencies)); |
| } |
| } |
| return infos; |
| } |
| |
| TEST_F(IndexProcessorTest, NoTermMatchTypeContent) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kUnindexedProperty1), "foo bar baz") |
| .AddBytesProperty(std::string(kUnindexedProperty2), |
| "attachment bytes") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| } |
| |
| TEST_F(IndexProcessorTest, NoValidContent) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "?...!") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| } |
| |
| TEST_F(IndexProcessorTest, OneDoc) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "hello world") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("hello", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> hits = |
| GetHitsWithTermFrequency(std::move(itr)); |
| std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ |
| {kExactSectionId, 1}}; |
| EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expectedMap))); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator( |
| "hello", /*term_start_index=*/0, /*unnormalized_term_length=*/0, |
| 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); |
| } |
| |
| TEST_F(IndexProcessorTest, MultipleDocs) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "hello world") |
| .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| std::string coffeeRepeatedString = "coffee"; |
| for (int i = 0; i < Hit::kMaxTermFrequency + 1; i++) { |
| coffeeRepeatedString += " coffee"; |
| } |
| |
| document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/2") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), coffeeRepeatedString) |
| .AddStringProperty(std::string(kPrefixedProperty), |
| "mr. world world wide") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("world", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> hits = |
| GetHitsWithTermFrequency(std::move(itr)); |
| std::unordered_map<SectionId, Hit::TermFrequency> expectedMap1{ |
| {kPrefixedSectionId, 2}}; |
| std::unordered_map<SectionId, Hit::TermFrequency> expectedMap2{ |
| {kExactSectionId, 1}}; |
| EXPECT_THAT( |
| hits, ElementsAre( |
| EqualsDocHitInfoWithTermFrequency(kDocumentId1, expectedMap1), |
| EqualsDocHitInfoWithTermFrequency(kDocumentId0, expectedMap2))); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator( |
| "world", /*term_start_index=*/0, /*unnormalized_term_length=*/0, |
| 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY)); |
| hits = GetHitsWithTermFrequency(std::move(itr)); |
| std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{ |
| {kPrefixedSectionId, 2}}; |
| EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId1, expectedMap))); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator("coffee", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, |
| kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); |
| hits = GetHitsWithTermFrequency(std::move(itr)); |
| expectedMap = {{kExactSectionId, Hit::kMaxTermFrequency}}; |
| EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId1, expectedMap))); |
| } |
| |
| TEST_F(IndexProcessorTest, DocWithNestedProperty) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "hello world") |
| .AddDocumentProperty( |
| std::string(kSubProperty), |
| DocumentBuilder() |
| .SetKey("icing", "nested_type/1") |
| .SetSchema(std::string(kNestedType)) |
| .AddStringProperty(std::string(kNestedProperty), |
| "rocky raccoon") |
| .Build()) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("rocky", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT(GetHits(std::move(itr)), |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId0, std::vector<SectionId>{kNestedSectionId}))); |
| } |
| |
| TEST_F(IndexProcessorTest, DocWithRepeatedProperty) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "hello world") |
| .AddStringProperty(std::string(kRepeatedProperty), "rocky", |
| "italian stallion") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("italian", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT(GetHits(std::move(itr)), |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId0, std::vector<SectionId>{kRepeatedSectionId}))); |
| } |
| |
// TODO(b/196771754): The tests in this block are disabled on Android because
// generating all of the unique terms takes too long and the tests time out.
// Try storing these unique terms in a file that the tests can read from.
| #ifndef __ANDROID__ |
| |
| TEST_F(IndexProcessorTest, HitBufferExhaustedTest) { |
| // Testing has shown that adding ~600,000 hits will fill up the hit buffer. |
| std::vector<std::string> unique_terms_ = GenerateUniqueTerms(200000); |
| std::string content = absl_ports::StrJoin(unique_terms_, " "); |
| |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), content) |
| .AddStringProperty(std::string(kPrefixedProperty), content) |
| .AddStringProperty(std::string(kRepeatedProperty), content) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED, |
| testing::HasSubstr("Hit buffer is full!"))); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| } |
| |
| TEST_F(IndexProcessorTest, LexiconExhaustedTest) { |
| // Testing has shown that adding ~300,000 terms generated this way will |
| // fill up the lexicon. |
| std::vector<std::string> unique_terms_ = GenerateUniqueTerms(300000); |
| std::string content = absl_ports::StrJoin(unique_terms_, " "); |
| |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), content) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| } |
| |
| #endif // __ANDROID__ |
| |
| TEST_F(IndexProcessorTest, TooLongTokens) { |
| // Only allow the tokens of length four, truncating "hello", "world" and |
| // "night". |
| ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer, |
| normalizer_factory::Create( |
| /*max_term_byte_size=*/4)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<StringSectionIndexingHandler> |
| string_section_indexing_handler, |
| StringSectionIndexingHandler::Create( |
| &fake_clock_, normalizer.get(), index_.get())); |
| std::vector<std::unique_ptr<DataIndexingHandler>> handlers; |
| handlers.push_back(std::move(string_section_indexing_handler)); |
| |
| index_processor_ = |
| std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); |
| |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "hello world") |
| .AddStringProperty(std::string(kPrefixedProperty), "good night moon!") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // "good" should have been indexed normally. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("good", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT(GetHits(std::move(itr)), |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId0, std::vector<SectionId>{kPrefixedSectionId}))); |
| |
| // "night" should not have been. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator("night", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, |
| kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); |
| |
| // "night" should have been truncated to "nigh". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator("nigh", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, |
| kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT(GetHits(std::move(itr)), |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId0, std::vector<SectionId>{kPrefixedSectionId}))); |
| } |
| |
| TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "best rocky movies") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/2") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| |
| // Only document_id 1 should surface in a prefix query for "Rock" |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("rock", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| EXPECT_THAT(GetHits(std::move(itr)), |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId1, std::vector<SectionId>{kPrefixedSectionId}))); |
| } |
| |
| TEST_F(IndexProcessorTest, TokenNormalization) { |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/2") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "all lower case") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("case", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| EXPECT_THAT( |
| GetHits(std::move(itr)), |
| ElementsAre(EqualsDocHitInfo(kDocumentId1, |
| std::vector<SectionId>{kExactSectionId}), |
| EqualsDocHitInfo(kDocumentId0, |
| std::vector<SectionId>{kExactSectionId}))); |
| } |
| |
| // Verifies that the fixture's index_processor_ rejects any document id that is |
| // less than or equal to the last added document id, and that a rejected call |
| // leaves both index_ and integer_index_ unmodified. |
| TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) { |
| // Seed both indices with a document stored under kDocumentId1. |
| DocumentProto first_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE") |
| .AddInt64Property(std::string(kIndexableIntegerProperty), 123) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument first_tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| first_document)); |
| EXPECT_THAT(index_processor_->IndexDocument(first_tokenized, kDocumentId1), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| |
| // Snapshot the state of both indices so that any later change is detected. |
| ICING_ASSERT_OK_AND_ASSIGN(int64_t baseline_elements_size, |
| index_->GetElementsSize()); |
| ICING_ASSERT_OK_AND_ASSIGN(Crc32 baseline_integer_crc, |
| integer_index_->UpdateChecksums()); |
| |
| // A second document with different content; it is only ever indexed under |
| // ids that are not greater than the last added document id. |
| DocumentProto second_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/2") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "all lower case") |
| .AddInt64Property(std::string(kIndexableIntegerProperty), 456) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument second_tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| second_document)); |
| |
| // Case 1: document_id < last_added_document_id must be rejected. |
| EXPECT_THAT(index_processor_->IndexDocument(second_tokenized, kDocumentId0), |
| StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); |
| // Neither index may have changed. |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(baseline_elements_size)); |
| EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(integer_index_->UpdateChecksums(), |
| IsOkAndHolds(baseline_integer_crc)); |
| |
| // Case 2: document_id == last_added_document_id must be rejected as well. |
| EXPECT_THAT(index_processor_->IndexDocument(second_tokenized, kDocumentId1), |
| StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); |
| // Neither index may have changed. |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(baseline_elements_size)); |
| EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(integer_index_->UpdateChecksums(), |
| IsOkAndHolds(baseline_integer_crc)); |
| } |
| |
| // Verifies that an IndexProcessor constructed with recovery_mode=true returns |
| // OK for document ids that are less than or equal to the last added document |
| // id, while still leaving index_ and integer_index_ unmodified. |
| TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { |
| // Build one handler per index and collect them in the order string, |
| // integer, qualified-id join. |
| std::vector<std::unique_ptr<DataIndexingHandler>> indexing_handlers; |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<StringSectionIndexingHandler> string_handler, |
| StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), |
| index_.get())); |
| indexing_handlers.push_back(std::move(string_handler)); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<IntegerSectionIndexingHandler> integer_handler, |
| IntegerSectionIndexingHandler::Create(&fake_clock_, |
| integer_index_.get())); |
| indexing_handlers.push_back(std::move(integer_handler)); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<QualifiedIdJoinIndexingHandler> join_handler, |
| QualifiedIdJoinIndexingHandler::Create(&fake_clock_, |
| qualified_id_join_index_.get())); |
| indexing_handlers.push_back(std::move(join_handler)); |
| |
| // Local processor under test; the fixture's index_processor_ is not used. |
| IndexProcessor recovery_processor(std::move(indexing_handlers), &fake_clock_, |
| /*recovery_mode=*/true); |
| |
| // Seed both indices with a document stored under kDocumentId1. |
| DocumentProto first_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE") |
| .AddInt64Property(std::string(kIndexableIntegerProperty), 123) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument first_tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| first_document)); |
| EXPECT_THAT(recovery_processor.IndexDocument(first_tokenized, kDocumentId1), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| |
| // Snapshot the state of both indices so that any later change is detected. |
| ICING_ASSERT_OK_AND_ASSIGN(int64_t baseline_elements_size, |
| index_->GetElementsSize()); |
| ICING_ASSERT_OK_AND_ASSIGN(Crc32 baseline_integer_crc, |
| integer_index_->UpdateChecksums()); |
| |
| // In recovery mode, a document whose id is <= last_added_document_id is |
| // expected to be skipped silently: no error is returned and no index data |
| // is written. |
| DocumentProto second_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/2") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), "all lower case") |
| .AddInt64Property(std::string(kIndexableIntegerProperty), 456) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument second_tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| second_document)); |
| |
| // Case 1: document_id < last_added_document_id. |
| EXPECT_THAT(recovery_processor.IndexDocument(second_tokenized, kDocumentId0), |
| IsOk()); |
| // Neither index may have changed. |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(baseline_elements_size)); |
| EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(integer_index_->UpdateChecksums(), |
| IsOkAndHolds(baseline_integer_crc)); |
| |
| // Case 2: document_id == last_added_document_id. |
| EXPECT_THAT(recovery_processor.IndexDocument(second_tokenized, kDocumentId1), |
| IsOk()); |
| // Neither index may have changed. |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(baseline_elements_size)); |
| EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1)); |
| EXPECT_THAT(integer_index_->UpdateChecksums(), |
| IsOkAndHolds(baseline_integer_crc)); |
| } |
| |
| // Indexes a Chinese sentence with a segmenter created for |
| // ULOC_SIMPLIFIED_CHINESE and checks that an exact query for one of its words |
| // finds the document in the exact section. |
| TEST_F(IndexProcessorTest, NonAsciiIndexing) { |
| // Replace the fixture's segmenter with one built for simplified Chinese. |
| language_segmenter_factory::SegmenterOptions chinese_options( |
| ULOC_SIMPLIFIED_CHINESE); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| lang_segmenter_, |
| language_segmenter_factory::Create(std::move(chinese_options))); |
| |
| DocumentProto chinese_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), |
| "你好,世界!你好:世界。“你好”世界?") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| chinese_document)); |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // The word should be retrievable as an exact term from the exact section. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> hit_iterator, |
| index_->GetIterator("你好", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfo> found_hits = GetHits(std::move(hit_iterator)); |
| EXPECT_THAT(found_hits, |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId0, std::vector<SectionId>{kExactSectionId}))); |
| } |
| |
| // Indexes a document containing a token that is too long for the lexicon and |
| // checks that IndexDocument reports RESOURCE_EXHAUSTED while the document id |
| // is still recorded as the last added one. |
| TEST_F(IndexProcessorTest, |
| LexiconFullIndexesSmallerTokensReturnsResourceExhausted) { |
| // Longest token that an empty lexicon, constructed for a lite index with a |
| // merge size of 1MiB, can hold. |
| constexpr int kMaxTokenLength = 16777217; |
| // A run of 'p' characters that is one character longer than that limit, so |
| // it cannot fit into the lexicon. |
| std::string oversized_token(kMaxTokenLength + 1, 'p'); |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), |
| absl_ports::StrCat(oversized_token, " foo")) |
| .AddStringProperty(std::string(kPrefixedProperty), "bar baz") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); |
| // The failed call is still expected to have advanced the last added id. |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| } |
| |
| // Verifies that IndexDocument keeps succeeding past the point at which the |
| // LiteIndex would fill up. Per the comments below, that only works if the |
| // index processor merges the LiteIndex into the MainIndex on its own. |
| TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) { |
| // Create the index with a smaller index_merge_size - merging every time we |
| // add 101 documents. This will result in a small LiteIndex, which will be |
| // easier to fill up. The LiteIndex itself will have a size larger than the |
| // index_merge_size because it adds extra buffer to ensure that it always has |
| // room to fit whatever document will trigger the merge. |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kExactProperty), kIpsumText) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| // The merge size is derived from the serialized size of the document above, |
| // so the document has to be built before the options. |
| Index::Options options(index_dir_, |
| /*index_merge_size=*/document.ByteSizeLong() * 100, |
| /*lite_index_sort_at_indexing=*/true, |
| /*lite_index_sort_size=*/64); |
| // Replace the fixture's index_ with one created from these options. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| index_, Index::Create(options, &filesystem_, &icing_filesystem_)); |
| |
| // Rebuild index_processor_ with a single string-section handler that is |
| // bound to the new index_. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<StringSectionIndexingHandler> |
| string_section_indexing_handler, |
| StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), |
| index_.get())); |
| std::vector<std::unique_ptr<DataIndexingHandler>> handlers; |
| handlers.push_back(std::move(string_section_indexing_handler)); |
| |
| index_processor_ = |
| std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); |
| |
| DocumentId doc_id = 0; |
| // Have determined experimentally that indexing 3373 documents with this text |
| // will cause the LiteIndex to fill up. Further indexing will fail unless the |
| // index processor properly merges the LiteIndex into the MainIndex and |
| // empties the LiteIndex. |
| constexpr int kNumDocsLiteIndexExhaustion = 3373; |
| // The same tokenized document is indexed repeatedly under increasing ids. |
| for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) { |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); |
| } |
| // doc_id now equals kNumDocsLiteIndexExhaustion; indexing one more document |
| // must still succeed. |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); |
| } |
| |
| // Verifies the behavior when a merge fails: IndexDocument returns DATA_LOSS, |
| // index_ reports kInvalidDocumentId as its last added document id, and a |
| // subsequent IndexDocument call succeeds again. |
| TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) { |
| // 1. Setup a mock filesystem to fail to grow the main index. |
| // The lambda returns -1 for any filename that ends with the main lexicon |
| // property-file suffix and forwards every other filename to filesystem_. |
| auto open_write_lambda = [this](const char* filename) { |
| std::string main_lexicon_suffix = |
| "/main-lexicon.prop." + |
| std::to_string(GetHasHitsInPrefixSectionPropertyId()); |
| std::string filename_string(filename); |
| // Hand-written "ends with" check; the length guard keeps substr in range. |
| if (filename_string.length() >= main_lexicon_suffix.length() && |
| filename_string.substr( |
| filename_string.length() - main_lexicon_suffix.length(), |
| main_lexicon_suffix.length()) == main_lexicon_suffix) { |
| return -1; |
| } |
| return this->filesystem_.OpenForWrite(filename); |
| }; |
| ON_CALL(*mock_icing_filesystem_, OpenForWrite) |
| .WillByDefault(open_write_lambda); |
| |
| DocumentProto document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kPrefixedProperty), kIpsumText) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| |
| // 2. Recreate the index with the mock filesystem and a merge size that will |
| // only allow one document to be added before requiring a merge. |
| Index::Options options(index_dir_, |
| /*index_merge_size=*/document.ByteSizeLong(), |
| /*lite_index_sort_at_indexing=*/true, |
| /*lite_index_sort_size=*/16); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| index_, |
| Index::Create(options, &filesystem_, mock_icing_filesystem_.get())); |
| |
| // Rebuild index_processor_ with a single string-section handler that is |
| // bound to the recreated index_. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<StringSectionIndexingHandler> |
| string_section_indexing_handler, |
| StringSectionIndexingHandler::Create(&fake_clock_, normalizer_.get(), |
| index_.get())); |
| std::vector<std::unique_ptr<DataIndexingHandler>> handlers; |
| handlers.push_back(std::move(string_section_indexing_handler)); |
| |
| index_processor_ = |
| std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); |
| |
| // 3. Index one document. This should fit in the LiteIndex without requiring a |
| // merge. |
| DocumentId doc_id = 0; |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); |
| |
| // 4. Add one more document to trigger a merge, which should fail and result |
| // in a Reset. |
| ++doc_id; |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), |
| StatusIs(libtextclassifier3::StatusCode::DATA_LOSS)); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); |
| |
| // 5. Indexing a new document should succeed. |
| // The same doc_id is reused here; it is accepted because the last added |
| // document id was just observed to be kInvalidDocumentId. |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id)); |
| } |
| |
| // Indexes a verbatim exact property and checks that the whole property value |
| // is a single token that can be found with an exact query. |
| TEST_F(IndexProcessorTest, ExactVerbatimProperty) { |
| DocumentProto verbatim_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kVerbatimExactProperty), |
| "Hello, world!") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| verbatim_document)); |
| // The entire string is expected to be one token. |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(1)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // One hit in the verbatim exact section with a term frequency of 1. |
| std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{ |
| {kVerbatimExactSectionId, 1}}; |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> hit_iterator, |
| index_->GetIterator("Hello, world!", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> found_hits = |
| GetHitsWithTermFrequency(std::move(hit_iterator)); |
| |
| EXPECT_THAT(found_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| } |
| |
| // Indexes a verbatim prefixed property and checks that a prefix of the whole |
| // property value matches the document in a prefix query. |
| TEST_F(IndexProcessorTest, PrefixVerbatimProperty) { |
| DocumentProto verbatim_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kVerbatimPrefixedProperty), |
| "Hello, world!") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| verbatim_document)); |
| // The entire string is expected to be one token. |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(1)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // One hit in the verbatim prefixed section with a term frequency of 1. |
| std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{ |
| {kVerbatimPrefixedSectionId, 1}}; |
| |
| // "Hello, w" is a prefix of "Hello, world!", so the indexed document is |
| // expected to match. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> hit_iterator, |
| index_->GetIterator("Hello, w", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> found_hits = |
| GetHitsWithTermFrequency(std::move(hit_iterator)); |
| |
| EXPECT_THAT(found_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| } |
| |
| // Checks that a word inside a verbatim property value is not searchable on |
| // its own, because the whole value is indexed as a single token. |
| TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) { |
| DocumentProto verbatim_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kVerbatimPrefixedProperty), |
| "Hello, world!") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| verbatim_document)); |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(1)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> hit_iterator, |
| index_->GetIterator("world", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| |
| // There must be no hits for the term "world": the only token created for |
| // the document is the complete string "Hello, world!". |
| EXPECT_THAT(GetHits(std::move(hit_iterator)), IsEmpty()); |
| } |
| |
| // Exact-match queries against the tokens generated from an RFC822 property. |
| // Tokens are normalized, so the mixed-case address stored in the property is |
| // expected to be found with lowercase query terms. |
| TEST_F(IndexProcessorTest, Rfc822PropertyExact) { |
| DocumentProto email_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kRfc822Property), |
| "<AlexSav@GOOGLE.com>") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| email_document)); |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(7)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // Expected per-section term frequencies for the queries below. |
| std::unordered_map<SectionId, Hit::TermFrequency> frequency_of_two{ |
| {kRfc822SectionId, 2}}; |
| std::unordered_map<SectionId, Hit::TermFrequency> frequency_of_one{ |
| {kRfc822SectionId, 1}}; |
| |
| // "alexsav" is expected with a term frequency of 2. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> local_part_itr, |
| index_->GetIterator("alexsav", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> local_part_hits = |
| GetHitsWithTermFrequency(std::move(local_part_itr)); |
| EXPECT_THAT(local_part_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, frequency_of_two))); |
| |
| // "com" is expected with a term frequency of 1. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> tld_itr, |
| index_->GetIterator("com", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> tld_hits = |
| GetHitsWithTermFrequency(std::move(tld_itr)); |
| EXPECT_THAT(tld_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, frequency_of_one))); |
| |
| // The full lowercase address is expected with a term frequency of 1. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> address_itr, |
| index_->GetIterator("alexsav@google.com", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> address_hits = |
| GetHitsWithTermFrequency(std::move(address_itr)); |
| EXPECT_THAT(address_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, frequency_of_one))); |
| } |
| |
| // Checks that an EXACT_ONLY query for a strict prefix of an RFC822 token |
| // ("alexsa" instead of "alexsav") returns no hits. |
| TEST_F(IndexProcessorTest, Rfc822PropertyExactShouldNotReturnPrefix) { |
| DocumentProto document = DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kRfc822Property), |
| "<AlexSav@GOOGLE.com>") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // No expected term-frequency map is needed here: the only assertion is that |
| // the result set is empty. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("alexsa", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfo> hits = GetHits(std::move(itr)); |
| EXPECT_THAT(hits, IsEmpty()); |
| } |
| |
| // Some prefixes of generated RFC822 tokens. |
| #ifdef ENABLE_RFC822_PROPERTY_PREFIX_TEST |
| // ENABLE_RFC822_PROPERTY_PREFIX_TEST won't be defined, so this test will not be |
| // compiled. |
| // TODO(b/250648165): Remove #ifdef to enable this test after fixing the |
| // nondeterministic behavior of prefix query term frequency in |
| // lite index. |
| // |
| // Indexes an RFC822 address and issues PREFIX queries for prefixes of several |
| // of its generated tokens, expecting a term frequency of 1 for each. |
| TEST_F(IndexProcessorTest, Rfc822PropertyPrefix) { |
| DocumentProto document = DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kRfc822Property), |
| "<alexsav@google.com>") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // The same expectation is reused for all three prefix queries below. |
| std::unordered_map<SectionId, Hit::TermFrequency> expected_map{ |
| {kRfc822SectionId, 1}}; |
| |
| // Prefix query "alexsav@". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("alexsav@", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> hits = |
| GetHitsWithTermFrequency(std::move(itr)); |
| EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_map))); |
| |
| // Prefix query "goog". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator("goog", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, |
| kSectionIdMaskAll, TermMatchType::PREFIX)); |
| hits = GetHitsWithTermFrequency(std::move(itr)); |
| EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_map))); |
| |
| // Prefix query "ale". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| itr, index_->GetIterator("ale", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, |
| kSectionIdMaskAll, TermMatchType::PREFIX)); |
| hits = GetHitsWithTermFrequency(std::move(itr)); |
| EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_map))); |
| } |
| #endif // ENABLE_RFC822_PROPERTY_PREFIX_TEST |
| |
| // Checks that a PREFIX query for a term that is unrelated to the indexed |
| // RFC822 address returns no hits. |
| TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) { |
| DocumentProto document = DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kRfc822Property), |
| "<alexsav@google.com>") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized_document, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| document)); |
| EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // No expected term-frequency map is needed here: the only assertion is that |
| // the result set is empty. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> itr, |
| index_->GetIterator("abc.xyz", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfo> hits = GetHits(std::move(itr)); |
| |
| EXPECT_THAT(hits, IsEmpty()); |
| } |
| |
| #ifdef ENABLE_URL_TOKENIZER |
| // Indexes a URL into the exact URL property and checks that several of its |
| // tokens, as well as the complete URL, are found by exact queries with a term |
| // frequency of 1. |
| TEST_F(IndexProcessorTest, ExactUrlProperty) { |
| DocumentProto url_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kUrlExactProperty), |
| "http://www.google.com") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| url_document)); |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(7)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // Every query below expects the same single hit in the exact URL section. |
| std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{ |
| {kUrlExactSectionId, 1}}; |
| |
| // Query "google". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> google_itr, |
| index_->GetIterator("google", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> google_hits = |
| GetHitsWithTermFrequency(std::move(google_itr)); |
| EXPECT_THAT(google_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| |
| // Query "http". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> scheme_itr, |
| index_->GetIterator("http", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> scheme_hits = |
| GetHitsWithTermFrequency(std::move(scheme_itr)); |
| EXPECT_THAT(scheme_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| |
| // Query "www.google.com". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> host_itr, |
| index_->GetIterator("www.google.com", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> host_hits = |
| GetHitsWithTermFrequency(std::move(host_itr)); |
| EXPECT_THAT(host_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| |
| // Query for the complete URL. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> full_url_itr, |
| index_->GetIterator("http://www.google.com", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> full_url_hits = |
| GetHitsWithTermFrequency(std::move(full_url_itr)); |
| EXPECT_THAT(full_url_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| } |
| |
| // Indexes a URL into the exact URL property and checks that three EXACT_ONLY |
| // queries ("co", "mail.go" and "mail.google.com") return no hits. |
| TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) { |
| DocumentProto url_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kUrlExactProperty), |
| "https://mail.google.com/calendar/render") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| url_document)); |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(8)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // Query "co". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> short_itr, |
| index_->GetIterator("co", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> short_hits = |
| GetHitsWithTermFrequency(std::move(short_itr)); |
| EXPECT_THAT(short_hits, IsEmpty()); |
| |
| // Query "mail.go". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> partial_host_itr, |
| index_->GetIterator("mail.go", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> partial_host_hits = |
| GetHitsWithTermFrequency(std::move(partial_host_itr)); |
| EXPECT_THAT(partial_host_hits, IsEmpty()); |
| |
| // Query "mail.google.com". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> host_itr, |
| index_->GetIterator("mail.google.com", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::EXACT_ONLY)); |
| std::vector<DocHitInfoTermFrequencyPair> host_hits = |
| GetHitsWithTermFrequency(std::move(host_itr)); |
| EXPECT_THAT(host_hits, IsEmpty()); |
| } |
| |
| // Indexes a URL into the prefixed URL property and checks that prefixes of |
| // its tokens match in PREFIX queries with a term frequency of 1. |
| TEST_F(IndexProcessorTest, PrefixUrlProperty) { |
| DocumentProto url_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kUrlPrefixedProperty), |
| "http://www.google.com") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| url_document)); |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(7)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // Every query below expects the same single hit in the prefixed URL section. |
| std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{ |
| {kUrlPrefixedSectionId, 1}}; |
| |
| // "goo" is a prefix of both "google" and "google.com". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> goo_itr, |
| index_->GetIterator("goo", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> goo_hits = |
| GetHitsWithTermFrequency(std::move(goo_itr)); |
| EXPECT_THAT(goo_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| |
| // "http" is a prefix of both "http" and "http://www.google.com". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> scheme_itr, |
| index_->GetIterator("http", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> scheme_hits = |
| GetHitsWithTermFrequency(std::move(scheme_itr)); |
| EXPECT_THAT(scheme_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| |
| // "www.go" is a prefix of "www.google.com". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> host_itr, |
| index_->GetIterator("www.go", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> host_hits = |
| GetHitsWithTermFrequency(std::move(host_itr)); |
| EXPECT_THAT(host_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency( |
| kDocumentId0, expected_frequencies))); |
| } |
| |
| // Indexes a URL into the prefixed URL property and checks that PREFIX queries |
| // which do not start at the beginning of any token return no hits. |
| TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) { |
| DocumentProto url_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddStringProperty(std::string(kUrlPrefixedProperty), |
| "https://mail.google.com/calendar/render") |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| url_document)); |
| EXPECT_THAT(tokenized.num_string_tokens(), Eq(8)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); |
| |
| // No token begins with "gle", so nothing should be returned. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> suffix_itr, |
| index_->GetIterator("gle", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> suffix_hits = |
| GetHitsWithTermFrequency(std::move(suffix_itr)); |
| EXPECT_THAT(suffix_hits, IsEmpty()); |
| |
| // Query "w.goo". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> mid_host_itr, |
| index_->GetIterator("w.goo", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> mid_host_hits = |
| GetHitsWithTermFrequency(std::move(mid_host_itr)); |
| EXPECT_THAT(mid_host_hits, IsEmpty()); |
| |
| // Separators are removed from tokens, so ".com" yields no hits. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> dotted_itr, |
| index_->GetIterator(".com", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> dotted_hits = |
| GetHitsWithTermFrequency(std::move(dotted_itr)); |
| EXPECT_THAT(dotted_hits, IsEmpty()); |
| |
| // Query "calendar/render". |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> path_itr, |
| index_->GetIterator("calendar/render", /*term_start_index=*/0, |
| /*unnormalized_term_length=*/0, kSectionIdMaskAll, |
| TermMatchType::PREFIX)); |
| std::vector<DocHitInfoTermFrequencyPair> path_hits = |
| GetHitsWithTermFrequency(std::move(path_itr)); |
| EXPECT_THAT(path_hits, IsEmpty()); |
| } |
| #endif // ENABLE_URL_TOKENIZER |
| |
| // Indexes a document with the integer values 1 through 5 and checks that a |
| // range query over [1, 5] on integer_index_ finds it in the integer section. |
| TEST_F(IndexProcessorTest, IndexableIntegerProperty) { |
| DocumentProto integer_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4, |
| 5) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| integer_document)); |
| // Exactly one integer section is expected. |
| EXPECT_THAT(tokenized.integer_sections(), SizeIs(1)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| |
| // Range query [1, 5] on the indexable integer property. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> range_itr, |
| integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1, |
| /*key_upper=*/5, *doc_store_, *schema_store_, |
| fake_clock_.GetSystemTimeMilliseconds())); |
| std::vector<DocHitInfo> range_hits = GetHits(std::move(range_itr)); |
| |
| EXPECT_THAT( |
| range_hits, |
| ElementsAre(EqualsDocHitInfo( |
| kDocumentId0, std::vector<SectionId>{kIndexableIntegerSectionId}))); |
| } |
| |
| // Indexes a document with the integer values 1 through 5 and checks that a |
| // range query over [-1, 0] on integer_index_ returns no hits. |
| TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) { |
| DocumentProto integer_document = |
| DocumentBuilder() |
| .SetKey("icing", "fake_type/1") |
| .SetSchema(std::string(kFakeType)) |
| .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4, |
| 5) |
| .Build(); |
| ICING_ASSERT_OK_AND_ASSIGN( |
| TokenizedDocument tokenized, |
| TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), |
| integer_document)); |
| // Exactly one integer section is expected. |
| EXPECT_THAT(tokenized.integer_sections(), SizeIs(1)); |
| |
| EXPECT_THAT(index_processor_->IndexDocument(tokenized, kDocumentId0), |
| IsOk()); |
| |
| // Range query [-1, 0], which contains none of the indexed values. |
| ICING_ASSERT_OK_AND_ASSIGN( |
| std::unique_ptr<DocHitInfoIterator> range_itr, |
| integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1, |
| /*key_upper=*/0, *doc_store_, *schema_store_, |
| fake_clock_.GetSystemTimeMilliseconds())); |
| std::vector<DocHitInfo> range_hits = GetHits(std::move(range_itr)); |
| |
| EXPECT_THAT(range_hits, IsEmpty()); |
| } |
| |
| } // namespace |
| |
| } // namespace lib |
| } // namespace icing |