blob: 9af8efadc069e34805fcb8d94de6adf33d4ad067 [file] [log] [blame]
// Copyright (C) 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/result/snippet-retriever.h"
#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/random-string.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/util/clock.h"
#include "icing/util/logging.h"
#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
// //icing/result:snippet-retriever_benchmark
//
// $ blaze-bin/icing/result/snippet-retriever_benchmark
// --benchmark_filter=all
//
// Run on an Android device:
// Make target //icing/tokenization:language-segmenter depend on
// //third_party/icu
//
// Make target //icing/transform:normalizer depend on
// //third_party/icu
//
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
// //icing/result:snippet-retriever_benchmark
//
// $ adb push blaze-bin/icing/result/snippet-retriever_benchmark
// /data/local/tmp/
//
// $ adb shell /data/local/tmp/snippet-retriever_benchmark
// --benchmark_filter=all --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb,
// the benchmark will set up data files accordingly.
ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
namespace icing {
namespace lib {
namespace {
using ::testing::SizeIs;
void BM_SnippetOneProperty(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
}
const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string schema_dir = base_dir + "/schema";
Filesystem filesystem;
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
ICING_LOG(ERROR) << "Failed to create test directories";
}
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer =
normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max())
.ValueOrDie();
SchemaProto schema =
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
PropertyConfigBuilder()
.SetName("prop1")
.SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
Clock clock;
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
SchemaStore::Create(&filesystem, schema_dir, &clock));
ICING_ASSERT_OK(schema_store->SetSchema(schema));
auto snippet_retriever =
SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
normalizer.get())
.ValueOrDie();
int num_matches = state.range(0);
int total_terms = state.range(1);
std::default_random_engine random;
std::vector<std::string> language =
CreateLanguages(/*language_size=*/1000, &random);
std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
std::string text;
int num_actual_matches = 0;
double match_chance;
while (total_terms-- > 0) {
std::string term;
match_chance = static_cast<double>(num_matches) / total_terms;
if (uniform_double(random) <= match_chance) {
--num_matches;
++num_actual_matches;
term = "foo";
} else {
term = language.at(uniform(random));
}
absl_ports::StrAppend(&text, " ", term);
}
DocumentProto document = DocumentBuilder()
.SetKey("icing", "uri1")
.SetSchema("type1")
.AddStringProperty("prop1", text)
.Build();
SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
ResultSpecProto::SnippetSpecProto snippet_spec;
snippet_spec.set_num_to_snippet(100000);
snippet_spec.set_num_matches_per_property(100000);
snippet_spec.set_max_window_utf32_length(64);
SectionIdMask section_id_mask = 0x01;
SnippetProto snippet_proto;
for (auto _ : state) {
snippet_proto = snippet_retriever->RetrieveSnippet(
query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
section_id_mask);
ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
SizeIs(num_actual_matches));
}
// Destroy the schema store before the whole directory is removed because they
// persist data in destructor.
schema_store.reset();
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
BENCHMARK(BM_SnippetOneProperty)
// Arguments: num_matches, total_terms
->ArgPair(1, 1)
->ArgPair(1, 16) // single match
->ArgPair(2, 16) // ~10% matches
->ArgPair(3, 16) // ~20% matches
->ArgPair(8, 16) // 50% matches
->ArgPair(16, 16) // 100% matches
->ArgPair(1, 128) // single match
->ArgPair(13, 128) // ~10% matches
->ArgPair(26, 128) // ~20% matches
->ArgPair(64, 128) // 50% matches
->ArgPair(128, 128) // 100% matches
->ArgPair(1, 512) // single match
->ArgPair(51, 512) // ~10% matches
->ArgPair(102, 512) // ~20% matches
->ArgPair(256, 512) // 50% matches
->ArgPair(512, 512) // 100% matches
->ArgPair(1, 1024) // single match
->ArgPair(102, 1024) // ~10% matches
->ArgPair(205, 1024) // ~20% matches
->ArgPair(512, 1024) // 50% matches
->ArgPair(1024, 1024) // 100% matches
->ArgPair(1, 4096) // single match
->ArgPair(410, 4096) // ~10% matches
->ArgPair(819, 4096) // ~20% matches
->ArgPair(2048, 4096) // 50% matches
->ArgPair(4096, 4096) // 100% matches
->ArgPair(1, 16384) // single match
->ArgPair(1638, 16384) // ~10% matches
->ArgPair(3277, 16384) // ~20% matches
->ArgPair(8192, 16384) // 50% matches
->ArgPair(16384, 16384); // 100% matches
void BM_SnippetRfcOneProperty(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
GetTestFilePath("icing/icu.dat")));
}
const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string schema_dir = base_dir + "/schema";
Filesystem filesystem;
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
ICING_LOG(ERROR) << "Failed to create test directories";
}
language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer =
normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max())
.ValueOrDie();
SchemaProto schema =
SchemaBuilder()
.AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
PropertyConfigBuilder()
.SetName("prop1")
.SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
.SetCardinality(CARDINALITY_OPTIONAL)))
.Build();
Clock clock;
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
SchemaStore::Create(&filesystem, schema_dir, &clock));
ICING_ASSERT_OK(schema_store->SetSchema(schema));
auto snippet_retriever =
SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
normalizer.get())
.ValueOrDie();
int num_matches = state.range(0);
int total_terms = state.range(1);
std::default_random_engine random;
std::vector<std::string> language =
CreateLanguages(/*language_size=*/1000, &random);
std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
std::string text;
int num_actual_matches = 0;
double match_chance;
while (total_terms-- > 0) {
std::string term;
match_chance = static_cast<double>(num_matches) / total_terms;
if (uniform_double(random) <= match_chance) {
--num_matches;
++num_actual_matches;
term = "foo@google.com";
} else {
term = absl_ports::StrCat(language.at(uniform(random)), "@google.com");
}
absl_ports::StrAppend(&text, ",", term);
}
DocumentProto document = DocumentBuilder()
.SetKey("icing", "uri1")
.SetSchema("type1")
.AddStringProperty("prop1", text)
.Build();
SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
ResultSpecProto::SnippetSpecProto snippet_spec;
snippet_spec.set_num_to_snippet(100000);
snippet_spec.set_num_matches_per_property(100000);
snippet_spec.set_max_window_utf32_length(64);
SectionIdMask section_id_mask = 0x01;
SnippetProto snippet_proto;
for (auto _ : state) {
snippet_proto = snippet_retriever->RetrieveSnippet(
query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
section_id_mask);
ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
SizeIs(num_actual_matches));
}
// Destroy the schema store before the whole directory is removed because they
// persist data in destructor.
schema_store.reset();
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
BENCHMARK(BM_SnippetRfcOneProperty)
// Arguments: num_matches, total_terms
->ArgPair(1, 1)
->ArgPair(1, 16) // single match
->ArgPair(2, 16) // ~10% matches
->ArgPair(3, 16) // ~20% matches
->ArgPair(8, 16) // 50% matches
->ArgPair(16, 16) // 100% matches
->ArgPair(1, 128) // single match
->ArgPair(13, 128) // ~10% matches
->ArgPair(26, 128) // ~20% matches
->ArgPair(64, 128) // 50% matches
->ArgPair(128, 128) // 100% matches
->ArgPair(1, 512) // single match
->ArgPair(51, 512) // ~10% matches
->ArgPair(102, 512) // ~20% matches
->ArgPair(256, 512) // 50% matches
->ArgPair(512, 512) // 100% matches
->ArgPair(1, 1024) // single match
->ArgPair(102, 1024) // ~10% matches
->ArgPair(205, 1024) // ~20% matches
->ArgPair(512, 1024) // 50% matches
->ArgPair(1024, 1024) // 100% matches
->ArgPair(1, 4096) // single match
->ArgPair(410, 4096) // ~10% matches
->ArgPair(819, 4096) // ~20% matches
->ArgPair(2048, 4096) // 50% matches
->ArgPair(4096, 4096) // 100% matches
->ArgPair(1, 16384) // single match
->ArgPair(1638, 16384) // ~10% matches
->ArgPair(3277, 16384) // ~20% matches
->ArgPair(8192, 16384) // 50% matches
->ArgPair(16384, 16384); // 100% matches
} // namespace
} // namespace lib
} // namespace icing