blob: db9dbe2b039b0b938852769afc7a82bff0e39aa0 [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/main/main-index.h"
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include "icing/index/main/main-index-merger.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
namespace {
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::NiceMock;
using ::testing::Return;
using ::testing::SizeIs;
std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
std::vector<DocHitInfo> infos;
while (iterator->Advance().ok()) {
infos.push_back(iterator->doc_hit_info());
}
return infos;
}
std::vector<DocHitInfo> GetExactHits(
MainIndex* main_index, int term_start_index, int unnormalized_term_length,
const std::string& term, SectionIdMask section_mask = kSectionIdMaskAll) {
auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
main_index, term, term_start_index, unnormalized_term_length,
section_mask, /*need_hit_term_frequency=*/true);
return GetHits(std::move(iterator));
}
std::vector<DocHitInfo> GetPrefixHits(
MainIndex* main_index, int term_start_index, int unnormalized_term_length,
const std::string& term, SectionIdMask section_mask = kSectionIdMaskAll) {
auto iterator = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
main_index, term, term_start_index, unnormalized_term_length,
section_mask, /*need_hit_term_frequency=*/true);
return GetHits(std::move(iterator));
}
libtextclassifier3::Status Merge(const LiteIndex& lite_index,
const TermIdCodec& term_id_codec,
MainIndex* main_index) {
ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
main_index->MergeLexicon(lite_index.lexicon()));
ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs,
MainIndexMerger::TranslateAndExpandLiteHits(
lite_index, term_id_codec, outputs));
return main_index->AddHits(term_id_codec, std::move(outputs.backfill_map),
std::move(term_id_hit_pairs),
lite_index.last_added_document_id());
}
class MainIndexTest : public testing::Test {
protected:
void SetUp() override {
index_dir_ = GetTestTempDir() + "/test_dir";
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
LiteIndex::Options options(lite_index_file_name,
/*hit_buffer_want_merge_bytes=*/1024 * 1024,
/*hit_buffer_sort_at_indexing=*/true,
/*hit_buffer_sort_threshold_bytes=*/1024 * 8);
ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
term_id_codec_,
TermIdCodec::Create(
IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
IcingDynamicTrie::max_value_index(options.lexicon_options)));
}
void TearDown() override {
term_id_codec_.reset();
lite_index_.reset();
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
}
std::string index_dir_;
Filesystem filesystem_;
IcingFilesystem icing_filesystem_;
std::unique_ptr<LiteIndex> lite_index_;
std::unique_ptr<TermIdCodec> term_id_codec_;
};
constexpr NamespaceId kNamespace0 = 0;
TEST_F(MainIndexTest, MainIndexCreateIOFailure) {
// Create the index with mock filesystem. By default, Mock will return false,
// so the first attempted file operation will fail.
NiceMock<IcingMockFilesystem> mock_icing_filesystem;
ON_CALL(mock_icing_filesystem, CreateDirectoryRecursively)
.WillByDefault(Return(false));
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
EXPECT_THAT(MainIndex::Create(main_index_file_name, &filesystem_,
&mock_icing_filesystem),
StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixTermNotFound) {
// Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
// 1. Index one doc in the Lite Index:
// - Doc0 {"foot" is_in_prefix_section=true}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the index. The main index should contain "foo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// GetAccessorForPrefixTerm should return a valid accessor for "foo".
EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk());
}
TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
// 1. Index one doc in the Lite Index:
// - Doc0 {"foot" is_in_prefix_section=false}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the index. The main index should return not found when we search
// prefix contain "foo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// GetAccessorForPrefixTerm should return a valid accessor for "foo".
EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
// Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
EXPECT_THAT(main_index->GetAccessorForExactTerm("foo"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) {
// 1. Index one doc in the Lite Index:
// - Doc0 {"foo" is_in_prefix_section=false}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the index. The main index should contain "foo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// GetAccessorForPrefixTerm should return a valid accessor for "foo".
EXPECT_THAT(main_index->GetAccessorForExactTerm("foo"), IsOk());
}
TEST_F(MainIndexTest, MergeIndexToEmpty) {
// 1. Index three docs in the Lite Index:
// - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
// - Doc1 {"foot", "fool" is_in_prefix_section=true}
// - Doc2 {"fool", "far" is_in_prefix_section=false}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi,
lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
std::vector<DocHitInfo> hits =
GetExactHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foot");
EXPECT_THAT(hits, IsEmpty());
hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "fo");
EXPECT_THAT(hits, IsEmpty());
// 3. Merge the index. The main index should contain "fool", "foot"
// and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
// should not be present because it is not a branch point.
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// Get hits from an exact posting list.
hits = GetExactHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foot");
// We should get hits for "foot" in doc1 and doc0
EXPECT_THAT(
hits,
ElementsAre(
EqualsDocHitInfo(doc1_hit.document_id(),
std::vector<SectionId>{doc1_hit.section_id()}),
EqualsDocHitInfo(doc0_hit.document_id(),
std::vector<SectionId>{doc0_hit.section_id()})));
// Get hits from a branching point posting list. "fo" should redirect to "foo"
hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "fo");
// We should get hits for "foot" in doc1 and "fool" in doc1. We shouldn't get
// the hits for "foot" in doc0 and "fool" in doc0 and doc2 because they
// weren't hits in prefix sections.
EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
doc1_hit.document_id(),
std::vector<SectionId>{doc1_hit.section_id()})));
}
TEST_F(MainIndexTest, MergeIndexToPreexisting) {
// 1. Index three docs in the Lite Index:
// - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
// - Doc1 {"foot", "fool" is_in_prefix_section=true}
// - Doc2 {"fool", "far" is_in_prefix_section=false}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi,
lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the index. The main index should contain "fool", "foot"
// and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
// should not be present because it is not a branch point.
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// 4. Index two docs in a new Lite Index:
// - Doc3 {"foot", "four", "foul", "fall" is_in_prefix_section=false}
// - Doc4 {"four", "foul" is_in_prefix_section=true}
std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
LiteIndex::Options options(lite_index_file_name2,
/*hit_buffer_want_merge_bytes=*/1024 * 1024,
/*hit_buffer_sort_at_indexing=*/true,
/*hit_buffer_sort_threshold_bytes=*/1024 * 8);
ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
tvi,
lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(
tvi, lite_index_->InsertTerm("four", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t four_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi, lite_index_->InsertTerm("foul", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foul_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi,
lite_index_->InsertTerm("fall", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t fall_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit));
ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit));
ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit));
ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit));
Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit));
ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit));
// 3. Merge the index. The main index should now contain "foul", "four"
// and "fall", a branch points for "fou" and backfill points for "fo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// Get hits from an exact posting list the existed before the merge.
std::vector<DocHitInfo> hits =
GetExactHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foot");
// We should get hits for "foot" in doc3, doc1 and doc0
EXPECT_THAT(
hits,
ElementsAre(
EqualsDocHitInfo(doc3_hit.document_id(),
std::vector<SectionId>{doc3_hit.section_id()}),
EqualsDocHitInfo(doc1_hit.document_id(),
std::vector<SectionId>{doc1_hit.section_id()}),
EqualsDocHitInfo(doc0_hit.document_id(),
std::vector<SectionId>{doc0_hit.section_id()})));
// Get hits from backfill posting list.
hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "fo");
// We should get hits for "four" and "foul" in doc4 and hits for "foot" and
// "fool" in doc1. We shouldn't get the hits for "foot" in doc0 and doc3,
// "fool" in doc0 and doc2 or the hits for "four" and "foul" in doc4 because
// they weren't hits in prefix sections.
EXPECT_THAT(
hits,
ElementsAre(
EqualsDocHitInfo(doc4_hit.document_id(),
std::vector<SectionId>{doc4_hit.section_id()}),
EqualsDocHitInfo(doc1_hit.document_id(),
std::vector<SectionId>{doc1_hit.section_id()})));
}
TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) {
// 1. Index two docs in the Lite Index:
// - Doc0 {"foot" is_in_prefix_section=true}
// - Doc1 {"foo" is_in_prefix_section=false}
// - Doc2 {"foot" is_in_prefix_section=false}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi,
lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the lite lexicon. The main lexicon should contain "foot" and
// "foo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
std::vector<DocHitInfo> hits =
GetPrefixHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foo");
// We should get hits for "foo" in doc1 and doc0, but not in doc2 because it
// is not a prefix hit.
EXPECT_THAT(
hits,
ElementsAre(
EqualsDocHitInfo(doc1_hit.document_id(),
std::vector<SectionId>{doc1_hit.section_id()}),
EqualsDocHitInfo(doc0_hit.document_id(),
std::vector<SectionId>{doc0_hit.section_id()})));
}
TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) {
// 1. Index two docs in the Lite Index:
// - Doc0 {"foot" is_in_prefix_section=true}
// - Doc1 {"foo" is_in_prefix_section=false}
// - Doc1 {"foo" is_in_prefix_section=true}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
ICING_ASSERT_OK_AND_ASSIGN(
tvi, lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the lite lexicon. The main lexicon should contain "foot" and
// "foo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
std::vector<DocHitInfo> hits =
GetExactHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foo");
// We should get hits for "foo" in doc2 and doc1, but not in doc0 because it
// is not an exact hit.
EXPECT_THAT(
hits,
ElementsAre(
EqualsDocHitInfo(doc2_hit.document_id(),
std::vector<SectionId>{doc2_hit.section_id()}),
EqualsDocHitInfo(doc1_hit.document_id(),
std::vector<SectionId>{doc1_hit.section_id()})));
}
TEST_F(MainIndexTest,
SearchChainedPostingListsShouldMergeSectionsAndTermFrequency) {
// Index 2048 document with 3 hits in each document. When merged into the main
// index, this will 1) lead to a chained posting list and 2) split at least
// one document's hits across multiple posting lists.
const std::string term = "foot";
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm(term, TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
for (DocumentId document_id = 0; document_id < 2048; ++document_id) {
Hit::TermFrequency term_frequency = static_cast<Hit::TermFrequency>(
document_id % Hit::kMaxTermFrequency + 1);
Hit doc_hit0(
/*section_id=*/0, /*document_id=*/document_id, term_frequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0));
Hit doc_hit1(
/*section_id=*/1, /*document_id=*/document_id, term_frequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1));
Hit doc_hit2(
/*section_id=*/2, /*document_id=*/document_id, term_frequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2));
}
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the lite index.
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// Get hits for all documents containing "foot" - which should be all of them.
auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
main_index.get(), term, /*term_start_index=*/0,
/*unnormalized_term_length=*/0, kSectionIdMaskAll,
/*need_hit_term_frequency=*/true);
DocumentId expected_document_id = 2047;
while (iterator->Advance().ok()) {
EXPECT_THAT(iterator->doc_hit_info(),
EqualsDocHitInfo(expected_document_id,
std::vector<SectionId>{0, 1, 2}));
std::vector<TermMatchInfo> matched_terms_stats;
iterator->PopulateMatchedTermsStats(&matched_terms_stats);
Hit::TermFrequency expected_term_frequency =
static_cast<Hit::TermFrequency>(
expected_document_id % Hit::kMaxTermFrequency + 1);
ASSERT_THAT(matched_terms_stats, SizeIs(1));
EXPECT_THAT(matched_terms_stats[0].term, Eq(term));
EXPECT_THAT(matched_terms_stats[0].term_frequencies[0],
Eq(expected_term_frequency));
EXPECT_THAT(matched_terms_stats[0].term_frequencies[1],
Eq(expected_term_frequency));
EXPECT_THAT(matched_terms_stats[0].term_frequencies[2],
Eq(expected_term_frequency));
--expected_document_id;
}
EXPECT_THAT(expected_document_id, Eq(-1));
}
TEST_F(MainIndexTest, MergeIndexBackfilling) {
// 1. Index one doc in the Lite Index:
// - Doc0 {"fool" is_in_prefix_section=true}
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
// 2. Create the main index. It should have no entries in its lexicon.
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
// 3. Merge the index. The main index should contain "fool".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// 4. Index two docs in a new Lite Index:
// - Doc1 {"foot" is_in_prefix_section=false}
std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
LiteIndex::Options options(lite_index_file_name2,
/*hit_buffer_want_merge_bytes=*/1024 * 1024,
/*hit_buffer_sort_at_indexing=*/true,
/*hit_buffer_sort_threshold_bytes=*/1024 * 8);
ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
LiteIndex::Create(options, &icing_filesystem_));
ICING_ASSERT_OK_AND_ASSIGN(
tvi,
lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
// 5. Merge the index. The main index should now contain "fool", "foot"
// and a backfill point for "foo".
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
// Get hits from an exact posting list the existed before the merge.
std::vector<DocHitInfo> hits =
GetExactHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foo");
EXPECT_THAT(hits, IsEmpty());
// Get hits from backfill posting list.
hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foo");
// We should get a hit for "fool" in doc0.
EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
doc0_hit.document_id(),
std::vector<SectionId>{doc0_hit.section_id()})));
}
TEST_F(MainIndexTest, OneHitInTheFirstPageForTwoPagesMainIndex) {
ICING_ASSERT_OK_AND_ASSIGN(
uint32_t tvi,
lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
term_id_codec_->EncodeTvi(tvi, TviType::LITE));
SectionId section_id = 0;
// Based on debugging logs, 2038 documents in the following setting will
// result in two pages in the posting list chain, and the first page only
// contains one hit.
uint32_t num_docs = 2038;
for (DocumentId document_id = 0; document_id < num_docs; ++document_id) {
Hit doc_hit(section_id, document_id, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit));
}
std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<MainIndex> main_index,
MainIndex::Create(main_index_file_name, &filesystem_,
&icing_filesystem_));
ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
std::vector<DocHitInfo> hits =
GetExactHits(main_index.get(), /*term_start_index=*/0,
/*unnormalized_term_length=*/0, "foo");
ASSERT_THAT(hits, SizeIs(num_docs));
for (DocumentId document_id = num_docs - 1; document_id >= 0; --document_id) {
ASSERT_THAT(
hits[num_docs - 1 - document_id],
EqualsDocHitInfo(document_id, std::vector<SectionId>{section_id}));
}
}
} // namespace
} // namespace lib
} // namespace icing