Refactor deps log splitting, add tests
Factor out a MustBeDepsRecord function from SplitDepsLog, and add tests
for MustBeDepsRecord.
Allow a split candidate to appear as the first 1-3 words of a chunk (and
in principle, the start of a file). If we can't look backwards to rule out
an mtime (because we're too close to the start of the file), then we know
we can't be looking at an mtime.
A deps record won't actually appear at the start of a file, because the
path records it references will come first. An invalid file could start
with a deps record, though, so adjust the
"assert(chunk_start < candidate)" check to allow and discard an empty
word span.
Test: ninja_test
Test: replace SplitByThreads(log.size) with
SplitByChunkSize<size_t>(log.size, 1) and rerun ninja_test
Test: Short-circuit SplitDepsLog with "return {{ 0, log.size }}" and
rerun ninja_test
Change-Id: Ibab39b6f429152274e911daaa52fd0e96990cd29
diff --git a/src/deps_log.cc b/src/deps_log.cc
index d209b4f..718464c 100644
--- a/src/deps_log.cc
+++ b/src/deps_log.cc
@@ -189,9 +189,10 @@
return (header & 0x80000000) == 0x80000000;
}
-static inline bool IsValidDepsRecordHeader(size_t header) {
- return IsDepsRecordHeader(header) && RecordSizeInWords(header);
-}
+struct DepsLogWordSpan {
+ size_t begin; // starting index into a deps log buffer of uint32 words
+ size_t end; // stopping index
+};
/// Split the v4 deps log into independently-parseable chunks using a heuristic.
/// If the heuristic fails, we'll still load the file correctly, but it could be
@@ -208,7 +209,8 @@
/// deps:
/// - uint32 size -- high bit is set
/// - int32 output_path_id
-/// - uint64 output_path_mtime
+/// - uint32 output_path_mtime_lo
+/// - uint32 output_path_mtime_hi
/// - int32 input_path_id[...] -- every remaining word is an input ID
///
/// To split the deps log into chunks, look for uint32 words with the value
@@ -230,50 +232,67 @@
/// I've seen in the Android build have a few hundred. Only a few have ~10K.)
/// - All deps records with <16K of dependencies are preceded by something that
/// the mtime check rejects:
-/// - A deps record with an mtime within a 524us span every 4s, with one
-/// dependency.
+/// - A deps record with an mtime within a 524us span every 4s, with zero or
+/// one dependency.
/// - A deps record with an mtime from 2262, with one or two dependencies.
-/// - A path record containing "xx xx 0[0-7] 80" (little-endian) or "80 0[0-7] xx xx"
-/// (big-endian) as the last or second-to-last word. The "0[0-7]" is a
-/// control character, and "0[0-7] 80" is invalid UTF-8.
+/// - A path record containing "xx xx 0[0-7] 80" (little-endian) or
+/// "80 0[0-7] xx xx" (big-endian) as the last or second-to-last word. The
+/// "0[0-7]" is a control character, and "0[0-7] 80" is invalid UTF-8.
///
-/// Maybe we can add a delimiter to the log format and replace this code.
-static std::vector<std::pair<size_t, size_t>>
-SplitDepsLog(const uint32_t* table, size_t size, ThreadPool* thread_pool) {
- if (size == 0) return {};
+/// Maybe we can add a delimiter to the log format and replace this heuristic.
+size_t MustBeDepsRecordHeader(DepsLogData log, size_t index) {
+ assert(index < log.size);
- std::vector<std::pair<size_t, size_t>> blind_splits = SplitByThreads(size);
- std::vector<std::pair<size_t, size_t>> chunks;
+ size_t this_header = log.words[index];
+ if ((this_header & 0xffff0000) != 0x80000000) return false;
+ if ((this_header & 0x0000ffff) == 0) return false;
+
+ // We've either found a deps record or an mtime. If it's an mtime, the
+ // word two or three spaces back will be a valid deps size (0x800xxxxx).
+ auto might_be_deps_record_header = [](size_t header) {
+ return (header & 0x80000000) == 0x80000000 &&
+ (header & 0x7fffffff) <= kMaxRecordSize;
+ };
+ if (index >= 3 && might_be_deps_record_header(log.words[index - 3])) {
+ return false;
+ }
+ if (index >= 2 && might_be_deps_record_header(log.words[index - 2])) {
+ return false;
+ }
+
+ // Success: As long as the deps log is valid up to this point, this index
+ // must start a deps record.
+ return true;
+}
+
+static std::vector<DepsLogWordSpan>
+SplitDepsLog(DepsLogData log, ThreadPool* thread_pool) {
+ if (log.size == 0) return {};
+
+ std::vector<std::pair<size_t, size_t>> blind_splits =
+ SplitByThreads(log.size);
+ std::vector<DepsLogWordSpan> chunks;
size_t chunk_start = 0;
auto split_candidates = ParallelMap(thread_pool, blind_splits,
- [table](std::pair<size_t, size_t> chunk) {
- // Skip the first two words to allow for the mtime check later on.
- for (size_t index = chunk.first + 3; index < chunk.second; ++index) {
- size_t this_header = table[index];
- if (!IsValidDepsRecordHeader(this_header)) continue;
- if ((this_header & 0xFFFF0000) != 0x80000000) continue;
-
- // We've either found a deps record or a mtime. If it's an mtime, the
- // word two or three spaces back will be a valid deps size (0x800xxxxx).
- if (IsValidDepsRecordHeader(table[index - 3])) continue;
- if (IsValidDepsRecordHeader(table[index - 2])) continue;
-
- // Success: In a valid deps log, this index must start a deps record.
- return index;
+ [log](std::pair<size_t, size_t> chunk) {
+ for (size_t i = chunk.first; i < chunk.second; ++i) {
+ if (MustBeDepsRecordHeader(log, i)) {
+ return i;
+ }
}
return SIZE_MAX;
});
for (size_t candidate : split_candidates) {
- if (candidate != SIZE_MAX) {
- assert(chunk_start < candidate);
+ assert(chunk_start <= candidate);
+ if (candidate != SIZE_MAX && chunk_start < candidate) {
chunks.push_back({ chunk_start, candidate });
chunk_start = candidate;
}
}
- assert(chunk_start < size);
- chunks.push_back({ chunk_start, size });
+ assert(chunk_start < log.size);
+ chunks.push_back({ chunk_start, log.size });
return chunks;
}
@@ -358,11 +377,10 @@
std::unique_ptr<ThreadPool> thread_pool = CreateThreadPool();
std::vector<Chunk> chunks;
- for (std::pair<size_t, size_t> span :
- SplitDepsLog(log.words, log.size, thread_pool.get())) {
+ for (DepsLogWordSpan span : SplitDepsLog(log, thread_pool.get())) {
Chunk chunk {};
- chunk.start = span.first;
- chunk.stop = span.second;
+ chunk.start = span.begin;
+ chunk.stop = span.end;
chunks.push_back(chunk);
}
diff --git a/src/deps_log.h b/src/deps_log.h
index b981132..c4a60e2 100644
--- a/src/deps_log.h
+++ b/src/deps_log.h
@@ -34,6 +34,8 @@
size_t size = 0;
};
+size_t MustBeDepsRecordHeader(DepsLogData log, size_t index);
+
/// As build commands run they can output extra dependency information
/// (e.g. header dependencies for C source) dynamically. DepsLog collects
/// that information at build time and uses it for subsequent builds.
diff --git a/src/deps_log_test.cc b/src/deps_log_test.cc
index 64f01ec..f39f7da 100644
--- a/src/deps_log_test.cc
+++ b/src/deps_log_test.cc
@@ -498,4 +498,95 @@
}
}
+TEST_F(DepsLogTest, MustBeDepsRecordHeader) {
+ // Mark a word as a candidate.
+ static constexpr uint64_t kCandidate = 0x100000000;
+
+ // Verifies that MustBeDepsRecordHeader returns the expected value. Returns
+ // true on success.
+ auto do_test = [](std::vector<uint64_t> words) -> bool {
+ std::vector<uint32_t> data;
+ for (uint64_t word : words) {
+ // Coerce from uint64_t to uint32_t to mask off the kCandidate flag.
+ data.push_back(word);
+ }
+ DepsLogData log;
+ log.words = data.data();
+ log.size = data.size();
+ for (size_t i = 0; i < words.size(); ++i) {
+ const bool expected = (words[i] & kCandidate) == kCandidate;
+ if (expected != MustBeDepsRecordHeader(log, i)) {
+ printf("\n%s,%d: bad index: %zu\n", __FILE__, __LINE__, i);
+ return false;
+ }
+ }
+ return true;
+ };
+
+ // Two valid deps records with no dependencies. Each record's header is
+ // recognized as the start of a deps record. The first record has an mtime_hi
+ // from 2262.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x8000000c|kCandidate, 1, 2, 0x80000100,
+ 0x8000000c|kCandidate, 3, 4, 5,
+ }));
+
+ // The first record's mtime_lo is within a 524us window. The second record's
+ // header looks like a potential mtime_lo for 0x8007fffc.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x8000000c|kCandidate, 1, 0x8007fffc, 2,
+ 0x8000000c, 3, 4, 5,
+ }));
+
+ // 0x80080000 is above the maximum record size, so it is rejected as a
+ // possible header.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x8000000c|kCandidate, 1, 0x80080000, 2,
+ 0x8000000c|kCandidate, 3, 4, 5,
+ }));
+
+ // Two deps records with >16K inputs each. The header could be confused with a
+ // path string containing control characters, so it's not a candidate.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x80010101, 1, 2, 3, // input IDs elided...
+ 0x80010101, 4, 5, 6, // input IDs elided...
+ }));
+
+ // The first record has a single dependency and an mtime_hi from 2262. The
+ // second deps record's header looks like a potential mtime_lo for 0x80000100.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x80000010|kCandidate, 1, 2, 0x80000100, 3,
+ 0x8000000c, 4, 5, 6,
+ }));
+
+ // The first deps record's mtime_lo is within a 524us window, and the second
+ // record's header looks like a potential mtime_hi for 0x80000100.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x80000010|kCandidate, 1, 0x80000100, 2, 3,
+ 0x8000000c, 4, 5, 6,
+ }));
+
+ // The first record has two dependencies, so its mtime_lo doesn't disqualify
+ // the next record's header.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x80000014|kCandidate, 1, 0x80000100, 2, 3, 4,
+ 0x8000000c|kCandidate, 5, 6, 7,
+ }));
+
+ // The first deps record's mtime_hi is from 2262, and the second record's
+ // header looks like a potential mtime_hi for 0x80000100.
+ EXPECT_TRUE(do_test({
+ // header output_id mtime_lo mtime_hi
+ 0x80000014|kCandidate, 1, 2, 0x80000100, 3, 4,
+ 0x8000000c, 5, 6, 7,
+ }));
+}
+
} // anonymous namespace