Refactor deps log splitting, add tests

Factor out a MustBeDepsRecord function from SplitDepsLog, and add tests
for MustBeDepsRecord.

Allow a split candidate to appear as the first 1-3 words of a chunk (and
in principle, the start of a file). If we can't look backwards to rule out
an mtime (because we're too close to the start of the file), then we know
we can't be looking at an mtime.

A deps record won't actually appear at the start of a file, because the
path records it references will come first. An invalid file could start
with a deps record, though, so adjust the
"assert(chunk_start < candidate)" check to allow and discard an empty
word span.

Test: ninja_test
Test: replace SplitByThreads(log.size) with
  SplitByChunkSize<size_t>(log.size, 1) and rerun ninja_test
Test: Short-circuit SplitDepsLog with "return {{ 0, log.size }}" and
  rerun ninja_test
Change-Id: Ibab39b6f429152274e911daaa52fd0e96990cd29
diff --git a/src/deps_log.cc b/src/deps_log.cc
index d209b4f..718464c 100644
--- a/src/deps_log.cc
+++ b/src/deps_log.cc
@@ -189,9 +189,10 @@
   return (header & 0x80000000) == 0x80000000;
 }
 
-static inline bool IsValidDepsRecordHeader(size_t header) {
-  return IsDepsRecordHeader(header) && RecordSizeInWords(header);
-}
+struct DepsLogWordSpan {
+  size_t begin; // starting index into a deps log buffer of uint32 words
+  size_t end; // stopping index
+};
 
 /// Split the v4 deps log into independently-parseable chunks using a heuristic.
 /// If the heuristic fails, we'll still load the file correctly, but it could be
@@ -208,7 +209,8 @@
 ///    deps:
 ///     - uint32 size -- high bit is set
 ///     - int32 output_path_id
-///     - uint64 output_path_mtime
+///     - uint32 output_path_mtime_lo
+///     - uint32 output_path_mtime_hi
 ///     - int32 input_path_id[...] -- every remaining word is an input ID
 ///
 /// To split the deps log into chunks, look for uint32 words with the value
@@ -230,50 +232,67 @@
 ///    I've seen in the Android build have a few hundred. Only a few have ~10K.)
 ///  - All deps records with <16K of dependencies are preceded by something that
 ///    the mtime check rejects:
-///     - A deps record with an mtime within a 524us span every 4s, with one
-///       dependency.
+///     - A deps record with an mtime within a 524us span every 4s, with zero or
+///       one dependency.
 ///     - A deps record with an mtime from 2262, with one or two dependencies.
-///     - A path record containing "xx xx 0[0-7] 80" (little-endian) or "80 0[0-7] xx xx"
-///       (big-endian) as the last or second-to-last word. The "0[0-7]" is a
-///       control character, and "0[0-7] 80" is invalid UTF-8.
+///     - A path record containing "xx xx 0[0-7] 80" (little-endian) or
+///       "80 0[0-7] xx xx" (big-endian) as the last or second-to-last word. The
+///       "0[0-7]" is a control character, and "0[0-7] 80" is invalid UTF-8.
 ///
-/// Maybe we can add a delimiter to the log format and replace this code.
-static std::vector<std::pair<size_t, size_t>>
-SplitDepsLog(const uint32_t* table, size_t size, ThreadPool* thread_pool) {
-  if (size == 0) return {};
+/// Maybe we can add a delimiter to the log format and replace this heuristic.
+size_t MustBeDepsRecordHeader(DepsLogData log, size_t index) {
+  assert(index < log.size);
 
-  std::vector<std::pair<size_t, size_t>> blind_splits = SplitByThreads(size);
-  std::vector<std::pair<size_t, size_t>> chunks;
+  size_t this_header = log.words[index];
+  if ((this_header & 0xffff0000) != 0x80000000) return false;
+  if ((this_header & 0x0000ffff) == 0) return false;
+
+  // We've either found a deps record or an mtime. If it's an mtime, the
+  // word two or three spaces back will be a valid deps size (0x800xxxxx).
+  auto might_be_deps_record_header = [](size_t header) {
+    return (header & 0x80000000) == 0x80000000 &&
+           (header & 0x7fffffff) <= kMaxRecordSize;
+  };
+  if (index >= 3 && might_be_deps_record_header(log.words[index - 3])) {
+    return false;
+  }
+  if (index >= 2 && might_be_deps_record_header(log.words[index - 2])) {
+    return false;
+  }
+
+  // Success: As long as the deps log is valid up to this point, this index
+  // must start a deps record.
+  return true;
+}
+
+static std::vector<DepsLogWordSpan>
+SplitDepsLog(DepsLogData log, ThreadPool* thread_pool) {
+  if (log.size == 0) return {};
+
+  std::vector<std::pair<size_t, size_t>> blind_splits =
+      SplitByThreads(log.size);
+  std::vector<DepsLogWordSpan> chunks;
   size_t chunk_start = 0;
 
   auto split_candidates = ParallelMap(thread_pool, blind_splits,
-      [table](std::pair<size_t, size_t> chunk) {
-    // Skip the first two words to allow for the mtime check later on.
-    for (size_t index = chunk.first + 3; index < chunk.second; ++index) {
-      size_t this_header = table[index];
-      if (!IsValidDepsRecordHeader(this_header)) continue;
-      if ((this_header & 0xFFFF0000) != 0x80000000) continue;
-
-      // We've either found a deps record or a mtime. If it's an mtime, the
-      // word two or three spaces back will be a valid deps size (0x800xxxxx).
-      if (IsValidDepsRecordHeader(table[index - 3])) continue;
-      if (IsValidDepsRecordHeader(table[index - 2])) continue;
-
-      // Success: In a valid deps log, this index must start a deps record.
-      return index;
+      [log](std::pair<size_t, size_t> chunk) {
+    for (size_t i = chunk.first; i < chunk.second; ++i) {
+      if (MustBeDepsRecordHeader(log, i)) {
+        return i;
+      }
     }
     return SIZE_MAX;
   });
   for (size_t candidate : split_candidates) {
-    if (candidate != SIZE_MAX) {
-      assert(chunk_start < candidate);
+    assert(chunk_start <= candidate);
+    if (candidate != SIZE_MAX && chunk_start < candidate) {
       chunks.push_back({ chunk_start, candidate });
       chunk_start = candidate;
     }
   }
 
-  assert(chunk_start < size);
-  chunks.push_back({ chunk_start, size });
+  assert(chunk_start < log.size);
+  chunks.push_back({ chunk_start, log.size });
   return chunks;
 }
 
@@ -358,11 +377,10 @@
   std::unique_ptr<ThreadPool> thread_pool = CreateThreadPool();
 
   std::vector<Chunk> chunks;
-  for (std::pair<size_t, size_t> span :
-      SplitDepsLog(log.words, log.size, thread_pool.get())) {
+  for (DepsLogWordSpan span : SplitDepsLog(log, thread_pool.get())) {
     Chunk chunk {};
-    chunk.start = span.first;
-    chunk.stop = span.second;
+    chunk.start = span.begin;
+    chunk.stop = span.end;
     chunks.push_back(chunk);
   }
 
diff --git a/src/deps_log.h b/src/deps_log.h
index b981132..c4a60e2 100644
--- a/src/deps_log.h
+++ b/src/deps_log.h
@@ -34,6 +34,8 @@
   size_t size = 0;
 };
 
+size_t MustBeDepsRecordHeader(DepsLogData log, size_t index);
+
 /// As build commands run they can output extra dependency information
 /// (e.g. header dependencies for C source) dynamically.  DepsLog collects
 /// that information at build time and uses it for subsequent builds.
diff --git a/src/deps_log_test.cc b/src/deps_log_test.cc
index 64f01ec..f39f7da 100644
--- a/src/deps_log_test.cc
+++ b/src/deps_log_test.cc
@@ -498,4 +498,95 @@
   }
 }
 
+TEST_F(DepsLogTest, MustBeDepsRecordHeader) {
+  // Mark a word as a candidate.
+  static constexpr uint64_t kCandidate = 0x100000000;
+
+  // Verifies that MustBeDepsRecordHeader returns the expected value. Returns
+  // true on success.
+  auto do_test = [](std::vector<uint64_t> words) -> bool {
+    std::vector<uint32_t> data;
+    for (uint64_t word : words) {
+      // Coerce from uint64_t to uint32_t to mask off the kCandidate flag.
+      data.push_back(word);
+    }
+    DepsLogData log;
+    log.words = data.data();
+    log.size = data.size();
+    for (size_t i = 0; i < words.size(); ++i) {
+      const bool expected = (words[i] & kCandidate) == kCandidate;
+      if (expected != MustBeDepsRecordHeader(log, i)) {
+        printf("\n%s,%d: bad index: %zu\n", __FILE__, __LINE__, i);
+        return false;
+      }
+    }
+    return true;
+  };
+
+  // Two valid deps records with no dependencies. Each record's header is
+  // recognized as the start of a deps record. The first record has an mtime_hi
+  // from 2262.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x8000000c|kCandidate,  1,          2,          0x80000100,
+    0x8000000c|kCandidate,  3,          4,          5,
+  }));
+
+  // The first record's mtime_lo is within a 524us window. The second record's
+  // header looks like a potential mtime_lo for 0x8007fffc.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x8000000c|kCandidate,  1,          0x8007fffc, 2,
+    0x8000000c,             3,          4,          5,
+  }));
+
+  // 0x80080000 is above the maximum record size, so it is rejected as a
+  // possible header.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x8000000c|kCandidate,  1,          0x80080000, 2,
+    0x8000000c|kCandidate,  3,          4,          5,
+  }));
+
+  // Two deps records with >16K inputs each. The header could be confused with a
+  // path string containing control characters, so it's not a candidate.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x80010101,             1,          2,          3,    // input IDs elided...
+    0x80010101,             4,          5,          6,    // input IDs elided...
+  }));
+
+  // The first record has a single dependency and an mtime_hi from 2262. The
+  // second deps record's header looks like a potential mtime_lo for 0x80000100.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x80000010|kCandidate,  1,          2,          0x80000100, 3,
+    0x8000000c,             4,          5,          6,
+  }));
+
+  // The first deps record's mtime_lo is within a 524us window, and the second
+  // record's header looks like a potential mtime_hi for 0x80000100.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x80000010|kCandidate,  1,          0x80000100, 2,          3,
+    0x8000000c,             4,          5,          6,
+  }));
+
+  // The first record has two dependencies, so its mtime_lo doesn't disqualify
+  // the next record's header.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x80000014|kCandidate,  1,          0x80000100, 2,          3,          4,
+    0x8000000c|kCandidate,  5,          6,          7,
+  }));
+
+  // The first deps record's mtime_hi is from 2262, and the second record's
+  // header looks like a potential mtime_hi for 0x80000100.
+  EXPECT_TRUE(do_test({
+    // header               output_id   mtime_lo    mtime_hi
+    0x80000014|kCandidate,  1,          2,          0x80000100, 3,          4,
+    0x8000000c,             5,          6,          7,
+  }));
+}
+
 }  // anonymous namespace