// blob: 4371c34c6fb5160da17386e77df665f52b3d3613 [file] [log] [blame]
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "deps_log.h"
#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <numeric>
#include "disk_interface.h"
#include "graph.h"
#include "metrics.h"
#include "parallel_map.h"
#include "state.h"
#include "util.h"
// On-disk layout of the file header: a 12-byte text signature followed by a
// 4-byte version number.
//
// The version is stored as 4 bytes after the signature and also serves as a
// byte order mark. Signature and version combined are 16 bytes long.
static constexpr StringPiece kFileSignature { "# ninjadeps\n", 12 };
// Records are read as an array of 32-bit words that starts right after the
// header, so the signature must not break 4-byte alignment.
static_assert(kFileSignature.size() % 4 == 0,
"file signature size is not a multiple of 4");
// Total header size in bytes (signature plus the 4-byte version).
static constexpr size_t kFileHeaderSize = kFileSignature.size() + 4;
// Format version written into new logs; logs carrying any other version are
// deleted when opened for reading.
const int kCurrentVersion = 3;
// Record size is currently limited to less than the full 32 bit, due to
// internal buffers having to have this size.
// The limit is in bytes and excludes the 4-byte size word itself; the stdio
// buffer of the output file is sized to kMaxRecordSize + 1.
const unsigned kMaxRecordSize = (1 << 19) - 1;
// Closes the output file if one is still open.
DepsLog::~DepsLog() {
  this->Close();
}
// Opens the deps log at |path| for appending, recompacting it first if the
// preceding load asked for that. A file that is empty after opening gets the
// signature and version header written to it.
//
// Returns false and stores the strerror() text of the failing call in |err|
// when the file cannot be opened, the header cannot be written, or the flush
// fails. A failed recompaction also returns false, with |err| as set by
// Recompact().
bool DepsLog::OpenForWrite(const string& path, const DiskInterface& disk, string* err) {
  if (needs_recompaction_ && !Recompact(path, disk, err))
    return false;

  // Reports the errno left behind by the call that just failed.
  auto fail = [err]() {
    *err = strerror(errno);
    return false;
  };

  file_ = fopen(path.c_str(), "ab");
  if (file_ == NULL)
    return fail();

  // Set the buffer size to this and flush the file buffer after every record
  // to make sure records aren't written partially.
  setvbuf(file_, NULL, _IOFBF, kMaxRecordSize + 1);
  SetCloseOnExec(fileno(file_));

  // Opening a file in append mode doesn't set the file pointer to the file's
  // end on Windows. Do that explicitly.
  fseek(file_, 0, SEEK_END);

  const bool is_empty_log = ftell(file_) == 0;
  if (is_empty_log) {
    if (fwrite(kFileSignature.data(), kFileSignature.size(), 1, file_) < 1)
      return fail();
    if (fwrite(&kCurrentVersion, 4, 1, file_) < 1)
      return fail();
  }

  if (fflush(file_) != 0)
    return fail();
  return true;
}
// Convenience overload: forwards the inputs held in |nodes| to the
// pointer-and-count overload. An empty vector is passed on as a null array.
bool DepsLog::RecordDeps(Node* node, TimeStamp mtime,
                         const vector<Node*>& nodes) {
  Node** inputs = NULL;
  if (!nodes.empty())
    inputs = const_cast<Node**>(&nodes.front());
  return RecordDeps(node, mtime, nodes.size(), inputs);
}
// Records that |node|, last built at |mtime|, depends on the |node_count|
// entries of |nodes|. Nodes without an id get one (which appends a path
// record for each). A deps record is appended only if something differs from
// what is already held in memory for |node|.
//
// Returns false if a write or flush fails, or with errno set to ERANGE if the
// record would exceed kMaxRecordSize. Returns true otherwise, including when
// nothing needed to be written.
bool DepsLog::RecordDeps(Node* node, TimeStamp mtime,
                         int node_count, Node** nodes) {
  // Becomes true as soon as anything has to be added to the log.
  bool dirty = false;

  // Make sure the output and all of its inputs have ids.
  if (node->id() < 0) {
    if (!RecordId(node))
      return false;
    dirty = true;
  }
  for (int i = 0; i < node_count; ++i) {
    Node* input = nodes[i];
    if (input->id() >= 0)
      continue;
    if (!RecordId(input))
      return false;
    dirty = true;
  }

  // With no new ids handed out, compare against the deps already known.
  if (!dirty) {
    Deps* known = GetDeps(node);
    dirty = !known || known->mtime != mtime || known->node_count != node_count;
    for (int i = 0; !dirty && i < node_count; ++i)
      dirty = known->nodes[i] != nodes[i];
  }

  // Identical to what is recorded already: nothing to write.
  if (!dirty)
    return true;

  // On-disk record: size word, output id, mtime, then one id per input.
  unsigned size = 4 * (1 + 1 + node_count);
  if (size > kMaxRecordSize) {
    errno = ERANGE;
    return false;
  }
  size |= 0x80000000;  // The high bit marks a deps record.

  auto write_word = [this](const void* word) {
    return fwrite(word, 4, 1, file_) >= 1;
  };
  const int output_id = node->id();
  const int timestamp = mtime;
  if (!write_word(&size) || !write_word(&output_id) || !write_word(&timestamp))
    return false;
  for (int i = 0; i < node_count; ++i) {
    const int input_id = nodes[i]->id();
    if (!write_word(&input_id))
      return false;
  }
  if (fflush(file_) != 0)
    return false;

  // Mirror the new record in memory.
  Deps* updated = new Deps(mtime, node_count);
  for (int i = 0; i < node_count; ++i)
    updated->nodes[i] = nodes[i];
  UpdateDeps(output_id, updated);
  return true;
}
// Closes the output file, if any, and forgets the handle. Calling this again
// afterwards does nothing.
void DepsLog::Close() {
  if (file_ != NULL) {
    fclose(file_);
    file_ = NULL;
  }
}
// Given a record's size word, return how many 32-bit words the whole record
// occupies (size word included), or 0 if the size word is not acceptable.
//
// The high bit (the deps-record flag) is ignored. The remaining byte count
// must be a multiple of 4, cover at least two words -- either (node ID and
// mtime) or (data and checksum) -- and not exceed kMaxRecordSize.
static inline size_t RecordSizeInWords(size_t header) {
  const size_t kWordBytes = sizeof(uint32_t);
  const size_t payload_bytes = header & 0x7FFFFFFF;
  const bool acceptable = payload_bytes % kWordBytes == 0 &&
                          payload_bytes >= kWordBytes * 2 &&
                          payload_bytes <= kMaxRecordSize;
  return acceptable ? payload_bytes / kWordBytes + 1 : 0;
}
// True if bit 31 of the size word is set, which marks a deps record; path
// records leave that bit clear.
static inline bool IsDepsRecordHeader(size_t header) {
  return (header & 0x80000000) != 0;
}
// True if the word carries the deps-record flag and also encodes an
// acceptable record size.
static inline bool IsValidDepsRecordHeader(size_t header) {
  if (!IsDepsRecordHeader(header)) return false;
  return RecordSizeInWords(header) != 0;
}
/// Heuristically cut the v3 deps log into chunks that can be parsed
/// independently of one another. A bad guess does not make loading incorrect,
/// only potentially slower.
///
/// The log holds two kinds of records:
///
/// path:
///  - uint32 size -- high bit is clear
///  - String content, padded with trailing NULs to a multiple of 4 bytes.
///  - uint32 checksum (ones complement of the path's index / node ID)
///
/// deps:
///  - uint32 size -- high bit is set
///  - int32 output_path_id
///  - uint32 output_path_mtime
///  - int32 input_path_id[...] -- every remaining word is an input ID
///
/// Chunk boundaries are placed at uint32 words of the form 0x8000xxxx with
/// xxxx nonzero. Such a word is almost certainly the size field of a deps
/// record (one with fewer than ~16K dependencies), because:
///  - Paths can't contain embedded NULs, so it can't be inside a string.
///  - Node IDs are "int"s, and more than 2 billion nodes is implausible (an
///    Android build typically has about 1 million), so it is (probably) not a
///    node ID.
///  - For the same reason it is unlikely to be a path checksum.
///  - It could be an mtime from 1901; that case is excluded by checking
///    whether the word two positions earlier is itself a deps size field.
///
/// Situations in which no split point is found in a scanned area:
///  - The area contains only path records.
///  - Every deps record there has >16K dependencies. (Almost all deps records
///    seen in the Android build have a few hundred; only a few have ~10K.)
///  - The area contains only deps entries with an mtime from 1901 and one
///    dependency.
///
/// A delimiter in the log format could replace this code. The heuristic
/// should also be adaptable to the v4 format, which widens the mtime to
/// 64 bits.
///
/// Returns half-open [start, stop) word ranges into |table| that together
/// cover [0, size), or an empty vector when |size| is 0.
static std::vector<std::pair<size_t, size_t>>
SplitDepsLog(const uint32_t* table, size_t size, ThreadPool* thread_pool) {
  using Span = std::pair<size_t, size_t>;

  std::vector<Span> chunks;
  if (size == 0) return chunks;

  std::vector<Span> blind_splits = SplitByThreads(size);

  // For every blind split, find the first word that must start a deps record,
  // or SIZE_MAX if the split contains no such word.
  auto split_candidates = ParallelMap(thread_pool, blind_splits,
      [table](Span span) -> size_t {
    // Start two words in so the 1901-mtime check can look backwards.
    for (size_t index = span.first + 2; index < span.second; ++index) {
      const size_t word = table[index];
      const bool starts_deps_record =
          IsValidDepsRecordHeader(word) &&
          (word & 0xFFFF0000) == 0x80000000 &&
          // If this word were an mtime, the word two back would be the size
          // field of its deps record (0x800xxxxx).
          !IsValidDepsRecordHeader(table[index - 2]);
      // In a valid deps log, such an index must begin a deps record.
      if (starts_deps_record) return index;
    }
    return SIZE_MAX;
  });

  // Stitch the split points found into consecutive ranges.
  size_t chunk_start = 0;
  for (size_t candidate : split_candidates) {
    if (candidate == SIZE_MAX) continue;
    assert(chunk_start < candidate);
    chunks.emplace_back(chunk_start, candidate);
    chunk_start = candidate;
  }
  assert(chunk_start < size);
  chunks.emplace_back(chunk_start, size);
  return chunks;
}
/// A deps log that has been loaded into memory for parsing.
struct DepsLogInput {
  /// The loaded file; `table` points into its content.
  std::unique_ptr<LoadedFile> file;
  /// First 32-bit word after the file header, or null when no table is set.
  const uint32_t* table { nullptr };
  /// Number of complete 32-bit words starting at `table`.
  size_t table_size { 0 };
};
/// Load the deps log at |path| into |log| and locate its record table.
///
/// Outcomes:
///  - The file does not exist: returns true with |log| left empty.
///  - The file cannot be loaded: returns false with the loader's message in
///    |err|.
///  - The header is missing, has the wrong signature, or carries a version
///    other than kCurrentVersion: the file is deleted, |err| receives an
///    explanatory message, |log->file| is reset, and true is returned.
///  - Otherwise: returns true with |log->table| / |log->table_size| describing
///    the 32-bit words that follow the header.
///
/// Callers tell "nothing to parse" apart from "loaded" by checking whether
/// |log->file| is null.
static bool OpenDepsLogForReading(const std::string& path,
                                  DepsLogInput* log,
                                  std::string* err) {
  *log = {};

  RealDiskInterface file_reader;
  std::string load_err;
  switch (file_reader.LoadFile(path, &log->file, &load_err)) {
  case FileReader::Okay:
    break;
  case FileReader::NotFound:
    return true;
  default:
    *err = load_err;
    return false;
  }

  bool valid_header = false;
  int version = 0;
  // Both conditions must hold: the file has to be long enough to contain the
  // whole header (otherwise reading the version below would run past the end
  // of the buffer), and it has to start with the signature.
  if (log->file->content().size() >= kFileHeaderSize &&
      log->file->content().substr(0, kFileSignature.size()) == kFileSignature) {
    valid_header = true;
    memcpy(&version,
           log->file->content().data() + kFileSignature.size(),
           sizeof(version));
  }

  // Note: For version differences, this should migrate to the new format.
  // But the v1 format could sometimes (rarely) end up with invalid data, so
  // don't migrate v1 to v3 to force a rebuild. (v2 only existed for a few days,
  // and there was no release with it, so pretend that it never happened.)
  if (!valid_header || version != kCurrentVersion) {
    if (version == 1)
      *err = "deps log version change; rebuilding";
    else
      *err = "bad deps log signature or version; starting over";
    log->file.reset();
    unlink(path.c_str());
    // Don't report this as a failure. An empty deps log will cause
    // us to rebuild the outputs anyway.
    return true;
  }

  // The record table starts right after the header. Trailing bytes that do
  // not form a whole word are not counted.
  log->table =
      reinterpret_cast<const uint32_t*>(
          log->file->content().data() + kFileHeaderSize);
  log->table_size =
      (log->file->content().size() - kFileHeaderSize) / sizeof(uint32_t);

  return true;
}
bool DepsLog::Load(const string& path, State* state, string* err) {
METRIC_RECORD(".ninja_deps load");
assert(nodes_.empty());
DepsLogInput log;
if (!OpenDepsLogForReading(path, &log, err)) return false;
if (log.file.get() == nullptr) return true;
struct NINJA_ALIGNAS_CACHE_LINE Chunk {
size_t start = 0;
size_t stop = 0;
int first_node_id = 0;
int initial_node_count = 0;
int final_node_count = 0;
size_t deps_count = 0;
bool parse_error = false;
};
std::unique_ptr<ThreadPool> thread_pool = CreateThreadPool();
std::vector<Chunk> chunks;
for (std::pair<size_t, size_t> span :
SplitDepsLog(log.table, log.table_size, thread_pool.get())) {
Chunk chunk {};
chunk.start = span.first;
chunk.stop = span.second;
chunks.push_back(chunk);
}
// Compute the starting node ID for each chunk. The result is correct as long as
// preceding chunks are parsed successfully. If there is a parsing error in a
// chunk, then following chunks are discarded after the validation pass.
ParallelMap(thread_pool.get(), chunks, [&log](Chunk& chunk) {
size_t index = chunk.start;
while (index < chunk.stop) {
size_t header = log.table[index];
size_t size = RecordSizeInWords(header);
if (!size) return; // invalid header
if (!IsDepsRecordHeader(header)) {
++chunk.initial_node_count;
}
index += size;
}
});
int initial_node_count = 0;
for (size_t i = 0; i < chunks.size(); ++i) {
Chunk& chunk = chunks[i];
chunk.first_node_id = initial_node_count;
initial_node_count += chunk.initial_node_count;
}
// A map from node ID to the final file table index of the dep record
// outputting the given node ID. The index is biased by 1 because 0 indicates
// that no dep record outputs this ID.
std::vector<std::atomic<size_t>> dep_index(initial_node_count);
// A map from node ID to file index of that node, with no bias.
std::vector<size_t> node_index(initial_node_count);
// The main parsing pass. Validate each chunk's entries and, for each node ID,
// record the location of its node and deps records. If there is parser error,
// truncate the log just before the problem record.
ParallelMap(thread_pool.get(), chunks,
[&log, &dep_index, &node_index](Chunk& chunk) {
size_t index = chunk.start;
int next_node_id = chunk.first_node_id;
while (index < chunk.stop) {
size_t header = log.table[index];
size_t size = RecordSizeInWords(header);
if (!size || (index + size > chunk.stop)) break;
if (IsDepsRecordHeader(header)) {
// Verify that input/output node IDs are valid.
int output_id = log.table[index + 1];
if (output_id < 0 || output_id >= next_node_id) break;
for (size_t i = 3; i < size; ++i) {
int input_id = log.table[index + i];
if (input_id < 0 || input_id >= next_node_id) break;
}
AtomicUpdateMaximum(&dep_index[output_id], index + 1);
++chunk.deps_count;
} else {
// Validate the path's checksum.
int checksum = log.table[index + size - 1];
if (checksum != ~next_node_id) break;
node_index[next_node_id] = index;
++next_node_id;
++chunk.final_node_count;
}
index += size;
}
// We'll exit early on a parser error.
if (index < chunk.stop) {
chunk.stop = index;
chunk.parse_error = true;
}
});
int node_count = 0;
size_t total_dep_record_count = 0;
for (size_t i = 0; i < chunks.size(); ++i) {
Chunk& chunk = chunks[i];
assert(chunk.first_node_id == node_count);
total_dep_record_count += chunk.deps_count;
node_count += chunk.final_node_count;
if (chunk.parse_error) {
// Part of this chunk may have been parsed successfully, so keep it, but
// discard all later chunks.
chunks.resize(i + 1);
break;
}
}
// The final node count could be smaller than the initial count if there was a
// parser error.
assert(node_count <= initial_node_count);
// The log is valid. Commit the nodes into the state graph. First make sure
// that the hash table has at least one bucket for each node in this deps log.
state->paths_.reserve(node_count);
nodes_.resize(node_count);
ParallelMap(thread_pool.get(), IntegralRange<int>(0, node_count),
[this, state, &log, &node_index](int node_id) {
size_t index = node_index[node_id];
size_t header = log.table[index];
size_t size = RecordSizeInWords(header);
const char* path = reinterpret_cast<const char*>(&log.table[index + 1]);
size_t path_size = (size - 2) * sizeof(uint32_t);
if (path[path_size - 1] == '\0') --path_size;
if (path[path_size - 1] == '\0') --path_size;
if (path[path_size - 1] == '\0') --path_size;
// It is not necessary to pass in a correct slash_bits here. It will
// either be a Node that's in the manifest (in which case it will
// already have a correct slash_bits that GetNode will look up), or it
// is an implicit dependency from a .d which does not affect the build
// command (and so need not have its slashes maintained).
Node* node = state->GetNode(StringPiece(path, path_size), 0);
assert(node->id() < 0);
node->set_id(node_id);
nodes_[node_id] = node;
});
// Add the deps records.
deps_.resize(node_count);
std::vector<size_t> unique_counts = ParallelMap(thread_pool.get(),
SplitByThreads(node_count),
[this, &log, &dep_index](std::pair<int, int> node_chunk) {
size_t unique_count = 0;
for (int node_id = node_chunk.first; node_id < node_chunk.second; ++node_id) {
size_t index = dep_index[node_id];
if (index == 0) continue;
--index;
++unique_count;
size_t header = log.table[index];
size_t size = RecordSizeInWords(header);
assert(size != 0 && IsDepsRecordHeader(header));
int output_id = log.table[index + 1];
int mtime = log.table[index + 2];
int deps_count = size - 3;
Deps* deps = new Deps(mtime, deps_count);
for (int i = 0; i < deps_count; ++i) {
int input_id = log.table[index + 3 + i];
Node* node = nodes_[input_id];
assert(node != nullptr);
deps->nodes[i] = node;
}
deps_[output_id] = deps;
}
return unique_count;
});
size_t unique_dep_record_count = std::accumulate(unique_counts.begin(),
unique_counts.end(), 0);
const size_t actual_file_size = log.file->content().size();
const size_t parsed_file_size = kFileHeaderSize +
(chunks.empty() ? 0 : chunks.back().stop) * sizeof(uint32_t);
assert(parsed_file_size <= actual_file_size);
if (parsed_file_size < actual_file_size) {
// An error occurred while loading; try to recover by truncating the file to
// the last fully-read record.
*err = "premature end of file";
log.file.reset();
if (!Truncate(path, parsed_file_size, err))
return false;
// The truncate succeeded; we'll just report the load error as a
// warning because the build can proceed.
*err += "; recovering";
return true;
}
// Rebuild the log if there are too many dead records.
const unsigned kMinCompactionEntryCount = 1000;
const unsigned kCompactionRatio = 3;
if (total_dep_record_count > kMinCompactionEntryCount &&
total_dep_record_count > unique_dep_record_count * kCompactionRatio) {
needs_recompaction_ = true;
}
return true;
}
// Returns the deps held in memory for |node|, or null if the node has no id
// (it was never referenced in the deps log), its id lies beyond the deps
// table, or no deps are stored for it.
DepsLog::Deps* DepsLog::GetDeps(Node* node) {
  const int id = node->id();
  const bool in_table = id >= 0 && id < (int)deps_.size();
  return in_table ? deps_[id] : nullptr;
}
bool DepsLog::Recompact(const string& path, const DiskInterface& disk, string* err) {
METRIC_RECORD(".ninja_deps recompact");
Close();
string temp_path = path + ".recompact";
// OpenForWrite() opens for append. Make sure it's not appending to a
// left-over file from a previous recompaction attempt that crashed somehow.
unlink(temp_path.c_str());
DepsLog new_log;
if (!new_log.OpenForWrite(temp_path, disk, err))
return false;
// Clear all known ids so that new ones can be reassigned. The new indices
// will refer to the ordering in new_log, not in the current log.
for (vector<Node*>::iterator i = nodes_.begin(); i != nodes_.end(); ++i)
(*i)->set_id(-1);
// Write out all deps again.
for (int old_id = 0; old_id < (int)deps_.size(); ++old_id) {
Deps* deps = deps_[old_id];
if (!deps) continue; // If nodes_[old_id] is a leaf, it has no deps.
Node* node = nodes_[old_id];
if (node->in_edge()) {
// If the current manifest defines this edge, skip if it's not dep
// producing.
if (node->in_edge()->GetBinding("deps").empty()) continue;
} else {
// If the current manifest does not define this edge, skip if it's missing
// from the disk.
string err;
TimeStamp mtime = disk.Stat(node->path(), &err);
if (mtime == -1)
Error("%s", err.c_str()); // log and ignore Stat() errors
if (mtime == 0)
continue;
}
if (!new_log.RecordDeps(nodes_[old_id], deps->mtime,
deps->node_count, deps->nodes)) {
new_log.Close();
return false;
}
}
new_log.Close();
// All nodes now have ids that refer to new_log, so steal its data.
deps_.swap(new_log.deps_);
nodes_.swap(new_log.nodes_);
if (unlink(path.c_str()) < 0) {
*err = strerror(errno);
return false;
}
if (rename(temp_path.c_str(), path.c_str()) < 0) {
*err = strerror(errno);
return false;
}
return true;
}
// A deps entry is live only if |node| has an in-edge and that edge has a
// non-empty "deps" binding. Anything else is left over from earlier builds
// whose files have since been removed from the build.
bool DepsLog::IsDepsEntryLiveFor(Node* node) {
  if (!node->in_edge())
    return false;
  return !node->in_edge()->GetBinding("deps").empty();
}
bool DepsLog::UpdateDeps(int out_id, Deps* deps) {
if (out_id >= (int)deps_.size())
deps_.resize(out_id + 1);
bool delete_old = deps_[out_id] != NULL;
if (delete_old)
delete deps_[out_id];
deps_[out_id] = deps;
return delete_old;
}
// Appends a path record for |node| and gives the node the next free id.
//
// Record layout: size word, the path padded with NULs to a multiple of four
// bytes, then a checksum word holding the ones complement of the new id.
// Returns false if a write or flush fails, or with errno set to ERANGE if the
// record would exceed kMaxRecordSize; the node keeps its old id in that case.
bool DepsLog::RecordId(Node* node) {
  const int path_bytes = node->path().size();
  const int pad_bytes = (4 - path_bytes % 4) % 4;  // Pad to 4 byte boundary.

  const unsigned record_bytes = path_bytes + pad_bytes + 4;
  if (record_bytes > kMaxRecordSize) {
    errno = ERANGE;
    return false;
  }

  const int new_id = nodes_.size();
  const unsigned checksum = ~(unsigned)new_id;

  if (fwrite(&record_bytes, 4, 1, file_) < 1)
    return false;
  if (fwrite(node->path().data(), path_bytes, 1, file_) < 1) {
    // Writing zero bytes also reports zero items; paths must not be empty.
    assert(node->path().size() > 0);
    return false;
  }
  if (pad_bytes != 0 && fwrite("\0\0", pad_bytes, 1, file_) < 1)
    return false;
  if (fwrite(&checksum, 4, 1, file_) < 1 || fflush(file_) != 0)
    return false;

  node->set_id(new_id);
  nodes_.push_back(node);
  return true;
}