// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"
namespace {
// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;
// Version history:
// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;
// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;
// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements. Newly written files will
// be sharded to need less than this amount of memory during update. Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;
// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance. Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this
// limit.
const uint32 kMinShardStride = 1 << 24;
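// For example, the minimum stride of 1 << 24 splits the 2^32 prefix space
// into 256 shards.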
// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;
// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
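// Overall file layout, as implied by the read and write helpers below:
//   FileHeader | add chunk ids | sub chunk ids | MD5 header digest |
//   one or more of (ShardHeader | add prefixes | sub prefixes |
//                   add full hashes | sub full hashes) |
//   MD5 file digest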
// Header at the front of the main database file.
struct FileHeader {
int32 magic, version;
uint32 add_chunk_count, sub_chunk_count;
uint32 shard_stride;
// TODO(shess): Is this where 64-bit will bite me? Perhaps write a
// specialized read/write?
};
// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
uint32 add_prefix_count, sub_prefix_count;
uint32 add_hash_count, sub_hash_count;
};
// Header for each shard of data in the main database file.
struct ShardHeader {
uint32 add_prefix_count, sub_prefix_count;
uint32 add_hash_count, sub_hash_count;
};
// Enumerate different format-change events for histogramming
// purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
// Corruption detected, broken down by file format.
FORMAT_EVENT_FILE_CORRUPT,
FORMAT_EVENT_SQLITE_CORRUPT, // Obsolete
// The type of format found in the file. The expected case (new
// file format) is intentionally not covered.
FORMAT_EVENT_FOUND_SQLITE, // Obsolete
FORMAT_EVENT_FOUND_UNKNOWN, // magic does not match.
// The number of SQLite-format files deleted should be the same as
// FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
// or if a failure prevents the update from succeeding.
FORMAT_EVENT_SQLITE_DELETED, // Obsolete
FORMAT_EVENT_SQLITE_DELETE_FAILED, // Obsolete
// Found and deleted (or failed to delete) the ancient "Safe
// Browsing" file.
FORMAT_EVENT_DELETED_ORIGINAL, // Obsolete
FORMAT_EVENT_DELETED_ORIGINAL_FAILED, // Obsolete
// The checksum did not check out in CheckValidity() or in
// FinishUpdate(). This most likely indicates that the machine
// crashed before the file was fully sync'ed to disk.
FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
// The header checksum was incorrect in ReadAndVerifyHeader(). Likely
// indicates that the system crashed while writing an update.
FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
FORMAT_EVENT_FOUND_DEPRECATED, // version too old.
// Memory space for histograms is determined by the max. ALWAYS
// ADD NEW VALUES BEFORE THIS ONE.
FORMAT_EVENT_MAX
};
void RecordFormatEvent(FormatEventType event_type) {
UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}
// Rewind the file. Uses fseek(2) because rewind(3) returns void and provides
// no way to detect errors.
bool FileRewind(FILE* fp) {
int rv = fseek(fp, 0, SEEK_SET);
DCHECK_EQ(rv, 0);
return rv == 0;
}
// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL. Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
const size_t ret = fread(item, sizeof(T), 1, fp);
if (ret != 1)
return false;
if (context) {
base::MD5Update(context,
base::StringPiece(reinterpret_cast<char*>(item),
sizeof(T)));
}
return true;
}
// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL. Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
const size_t ret = fwrite(&item, sizeof(T), 1, fp);
if (ret != 1)
return false;
if (context) {
base::MD5Update(context,
base::StringPiece(reinterpret_cast<const char*>(&item),
sizeof(T)));
}
return true;
}
// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|, if non-NULL. Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
base::MD5Context* context) {
if (!count)
return true;
for (size_t i = 0; i < count; ++i) {
typename CT::value_type value;
if (!ReadItem(&value, fp, context))
return false;
    // push_back() is more obvious, but this form also allows reading into a
    // std::set.
values->insert(values->end(), value);
}
return true;
}
// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL. Returns true if all items were written
// successfully.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
FILE* fp, base::MD5Context* context) {
for (CTI iter = beg; iter != end; ++iter) {
if (!WriteItem(*iter, fp, context))
return false;
}
return true;
}
// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL. Returns true if all items were written
// successfully.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
base::MD5Context* context) {
return WriteRange(values.begin(), values.end(), fp, context);
}
// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
std::set<int32>* chunks) {
for (std::set<int32>::iterator iter = chunks->begin();
iter != chunks->end();) {
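    // Post-increment before any erase(); std::set::erase() invalidates only
    // the erased iterator, so |iter| stays valid.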
std::set<int32>::iterator prev = iter++;
if (deleted.count(*prev) > 0)
chunks->erase(prev);
}
}
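// Read the MD5 digest stored at the current position of |fp| and compare it
// against the digest of the data folded into |context| so far. The stored
// digest is itself folded into |context|, so any later checksum covers it.
// Returns true if the digests match.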
bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
base::MD5Digest calculated_digest;
base::MD5IntermediateFinal(&calculated_digest, context);
base::MD5Digest file_digest;
if (!ReadItem(&file_digest, fp, context))
return false;
return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}
// Helper function to read the file header and chunk TOC. Rewinds |fp| and
// initializes |context|. The header is left in |header|, with the version
// returned. kInvalidVersion is returned on sanity-check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
FileHeader* header,
std::set<int32>* add_chunks,
std::set<int32>* sub_chunks,
FILE* fp,
base::MD5Context* context) {
DCHECK(header);
DCHECK(add_chunks);
DCHECK(sub_chunks);
DCHECK(fp);
DCHECK(context);
base::MD5Init(context);
if (!FileRewind(fp))
return kInvalidVersion;
if (!ReadItem(header, fp, context))
return kInvalidVersion;
if (header->magic != kFileMagic)
return kInvalidVersion;
// Track version read to inform removal of support for older versions.
UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
if (header->version != kFileVersion)
return kInvalidVersion;
if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
!ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
return kInvalidVersion;
}
// Verify that the data read thus far is valid.
if (!ReadAndVerifyChecksum(fp, context)) {
RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
return kInvalidVersion;
}
return kFileVersion;
}
// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
const std::set<int32>& add_chunks,
const std::set<int32>& sub_chunks,
FILE* fp,
base::MD5Context* context) {
if (!FileRewind(fp))
return false;
base::MD5Init(context);
FileHeader header;
header.magic = kFileMagic;
header.version = kFileVersion;
header.add_chunk_count = add_chunks.size();
header.sub_chunk_count = sub_chunks.size();
header.shard_stride = out_stride;
if (!WriteItem(header, fp, context))
return false;
if (!WriteContainer(add_chunks, fp, context) ||
!WriteContainer(sub_chunks, fp, context))
return false;
  // Write out the intermediate header digest, folding it into |context| so
  // that the final file checksum also covers it (mirroring
  // ReadAndVerifyChecksum()).
base::MD5Digest header_digest;
base::MD5IntermediateFinal(&header_digest, context);
if (!WriteItem(header_digest, fp, context))
return false;
return true;
}
// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 1) {
CTI n = beg++;
DCHECK(!less(*beg, *n));
if (less(*beg, *n))
return false;
}
return true;
}
// Merge |beg|..|end| into |container|. Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
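// For example (hypothetical usage): merging the range {2, 5} into a vector
// holding {1, 4} scans both sequences from the back, producing {1, 2, 4, 5}
// without an auxiliary buffer.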
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
DCHECK(sorted(container->begin(), container->end(), less));
DCHECK(sorted(beg, end, less));
// Size the container to fit the results.
const size_t c_size = container->size();
container->resize(c_size + (end - beg));
  // |c_end| marks the end of the original data, while |c_out| is the write
  // cursor, scanning from the end toward the beginning while merging.
typename CT::iterator c_end = container->begin() + c_size;
typename CT::iterator c_out = container->end();
// While both inputs have data, move the greater to |c_out|.
while (c_end != container->begin() && end != beg) {
if (less(*(c_end - 1), *(end - 1))) {
*(--c_out) = *(--end);
} else {
*(--c_out) = *(--c_end);
}
}
// Copy any data remaining in the new range.
if (end != beg) {
// The original container data has been fully shifted.
DCHECK(c_end == container->begin());
// There is exactly the correct amount of space left.
DCHECK_EQ(c_out - c_end, end - beg);
std::copy(beg, end, container->begin());
}
DCHECK(sorted(container->begin(), container->end(), less));
}
// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
public:
StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
SBSubPrefixes::iterator sub_prefixes_iter,
std::vector<SBAddFullHash>::iterator add_hashes_iter,
std::vector<SBSubFullHash>::iterator sub_hashes_iter)
: add_prefixes_iter_(add_prefixes_iter),
sub_prefixes_iter_(sub_prefixes_iter),
add_hashes_iter_(add_hashes_iter),
sub_hashes_iter_(sub_hashes_iter) {
}
SBAddPrefixes::iterator add_prefixes_iter_;
SBSubPrefixes::iterator sub_prefixes_iter_;
std::vector<SBAddFullHash>::iterator add_hashes_iter_;
std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};
// Helper to find the next shard boundary. Written in the (value, element)
// comparator form required by std::upper_bound().
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
return val < elt.GetAddPrefix();
}
// Container for partial database state. Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
public:
  // Append the indicated amounts of data from |fp|.
bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
size_t add_hash_count, size_t sub_hash_count,
FILE* fp, base::MD5Context* context) {
return
ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
}
void ClearData() {
add_prefixes_.clear();
sub_prefixes_.clear();
add_full_hashes_.clear();
sub_full_hashes_.clear();
}
// Merge data from |beg|..|end| into receiver's state, then process the state.
  // The current state and the given range should correspond to the same
  // sorted shard of data from different sources. |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs).
void MergeDataAndProcess(const StateInternalPos& beg,
const StateInternalPos& end,
const base::hash_set<int32>& add_del_cache,
const base::hash_set<int32>& sub_del_cache) {
container_merge(&add_prefixes_,
beg.add_prefixes_iter_,
end.add_prefixes_iter_,
SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
container_merge(&sub_prefixes_,
beg.sub_prefixes_iter_,
end.sub_prefixes_iter_,
SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
container_merge(&add_full_hashes_,
beg.add_hashes_iter_,
end.add_hashes_iter_,
SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
container_merge(&sub_full_hashes_,
beg.sub_hashes_iter_,
end.sub_hashes_iter_,
SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
SBProcessSubs(&add_prefixes_, &sub_prefixes_,
&add_full_hashes_, &sub_full_hashes_,
add_del_cache, sub_del_cache);
}
// Sort the data appropriately for the sharding, merging, and processing
// operations.
void SortData() {
std::sort(add_prefixes_.begin(), add_prefixes_.end(),
SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
}
// Iterator from the beginning of the state's data.
StateInternalPos StateBegin() {
return StateInternalPos(add_prefixes_.begin(),
sub_prefixes_.begin(),
add_full_hashes_.begin(),
sub_full_hashes_.begin());
}
// An iterator pointing just after the last possible element of the shard
// indicated by |shard_max|. Used to step through the state by shard.
// TODO(shess): Verify whether binary search really improves over linear.
// Merging or writing will immediately touch all of these elements.
StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
return StateInternalPos(
std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
shard_max, prefix_bounder<SBAddPrefix>),
std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
shard_max, prefix_bounder<SBSubPrefix>),
std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
shard_max, prefix_bounder<SBAddFullHash>),
std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
shard_max, prefix_bounder<SBSubFullHash>));
}
// Write a shard header and data for the shard starting at |beg| and ending at
// the element before |end|.
bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
FILE* fp, base::MD5Context* context) {
ShardHeader shard_header;
shard_header.add_prefix_count =
end.add_prefixes_iter_ - beg.add_prefixes_iter_;
shard_header.sub_prefix_count =
end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
shard_header.add_hash_count =
end.add_hashes_iter_ - beg.add_hashes_iter_;
shard_header.sub_hash_count =
end.sub_hashes_iter_ - beg.sub_hashes_iter_;
return
WriteItem(shard_header, fp, context) &&
WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
fp, context) &&
WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
fp, context) &&
WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
fp, context) &&
WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
fp, context);
}
SBAddPrefixes add_prefixes_;
SBSubPrefixes sub_prefixes_;
std::vector<SBAddFullHash> add_full_hashes_;
std::vector<SBSubFullHash> sub_full_hashes_;
};
// True if |val| is a (non-zero) power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
return val && (val & (val - 1)) == 0;
}
// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes(). Those functions are generally used only for smaller
// files. Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
StateInternal* db_state) {
base::ScopedFILE file(base::OpenFile(filename, "rb"));
if (file.get() == NULL)
return false;
std::set<int32> add_chunks;
std::set<int32> sub_chunks;
base::MD5Context context;
FileHeader header;
const int version =
ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
file.get(), &context);
if (version == kInvalidVersion)
return false;
uint64 in_min = 0;
uint64 in_stride = header.shard_stride;
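  // A stored stride of 0 encodes the full 2^32 stride, which does not fit in
  // the uint32 header field.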
if (!in_stride)
in_stride = kMaxShardStride;
if (!IsPowerOfTwo(in_stride))
return false;
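  // Read every shard; the loop executes kMaxShardStride / |in_stride| times.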
do {
ShardHeader shard_header;
if (!ReadItem(&shard_header, file.get(), &context))
return false;
if (!db_state->AppendData(shard_header.add_prefix_count,
shard_header.sub_prefix_count,
shard_header.add_hash_count,
shard_header.sub_hash_count,
file.get(), &context)) {
return false;
}
in_min += in_stride;
} while (in_min <= kMaxSBPrefix);
if (!ReadAndVerifyChecksum(file.get(), &context))
return false;
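  // The file should end exactly at the checksum; trailing data indicates
  // corruption.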
int64 size = 0;
if (!base::GetFileSize(filename, &size))
return false;
return static_cast<int64>(ftell(file.get())) == size;
}
} // namespace
SafeBrowsingStoreFile::SafeBrowsingStoreFile()
: chunks_written_(0), empty_(false), corruption_seen_(false) {}
SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
Close();
}
bool SafeBrowsingStoreFile::Delete() {
// The database should not be open at this point. But, just in
// case, close everything before deleting.
if (!Close()) {
NOTREACHED();
return false;
}
return DeleteStore(filename_);
}
bool SafeBrowsingStoreFile::CheckValidity() {
// The file was either empty or never opened. The empty case is
// presumed not to be invalid. The never-opened case can happen if
// BeginUpdate() fails for any databases, and should already have
// caused the corruption callback to fire.
if (!file_.get())
return true;
if (!FileRewind(file_.get()))
return OnCorruptDatabase();
int64 size = 0;
if (!base::GetFileSize(filename_, &size))
return OnCorruptDatabase();
base::MD5Context context;
base::MD5Init(&context);
// Read everything except the final digest.
size_t bytes_left = static_cast<size_t>(size);
CHECK(size == static_cast<int64>(bytes_left));
if (bytes_left < sizeof(base::MD5Digest))
return OnCorruptDatabase();
bytes_left -= sizeof(base::MD5Digest);
// Fold the contents of the file into the checksum.
while (bytes_left > 0) {
char buf[4096];
const size_t c = std::min(sizeof(buf), bytes_left);
const size_t ret = fread(buf, 1, c, file_.get());
    // If the file's size changed while reading, give up.
if (ret != c)
return OnCorruptDatabase();
base::MD5Update(&context, base::StringPiece(buf, c));
bytes_left -= c;
}
if (!ReadAndVerifyChecksum(file_.get(), &context)) {
RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
return OnCorruptDatabase();
}
return true;
}
void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
const base::Closure& corruption_callback) {
filename_ = filename;
corruption_callback_ = corruption_callback;
}
bool SafeBrowsingStoreFile::BeginChunk() {
return ClearChunkBuffers();
}
bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
return true;
}
bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
add_prefixes->clear();
if (!base::PathExists(filename_))
return true;
StateInternal db_state;
if (!ReadDbStateHelper(filename_, &db_state))
return OnCorruptDatabase();
add_prefixes->swap(db_state.add_prefixes_);
return true;
}
bool SafeBrowsingStoreFile::GetAddFullHashes(
std::vector<SBAddFullHash>* add_full_hashes) {
add_full_hashes->clear();
if (!base::PathExists(filename_))
return true;
StateInternal db_state;
if (!ReadDbStateHelper(filename_, &db_state))
return OnCorruptDatabase();
add_full_hashes->swap(db_state.add_full_hashes_);
return true;
}
bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
const SBFullHash& full_hash) {
add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
return true;
}
bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
int32 add_chunk_id,
SBPrefix prefix) {
sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
return true;
}
bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
const SBFullHash& full_hash) {
sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
return true;
}
bool SafeBrowsingStoreFile::OnCorruptDatabase() {
if (!corruption_seen_)
RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
corruption_seen_ = true;
corruption_callback_.Run();
// Return false as a convenience to callers.
return false;
}
bool SafeBrowsingStoreFile::Close() {
ClearUpdateBuffers();
// Make sure the files are closed.
file_.reset();
new_file_.reset();
return true;
}
bool SafeBrowsingStoreFile::BeginUpdate() {
DCHECK(!file_.get() && !new_file_.get());
// Structures should all be clear unless something bad happened.
DCHECK(add_chunks_cache_.empty());
DCHECK(sub_chunks_cache_.empty());
DCHECK(add_del_cache_.empty());
DCHECK(sub_del_cache_.empty());
DCHECK(add_prefixes_.empty());
DCHECK(sub_prefixes_.empty());
DCHECK(add_hashes_.empty());
DCHECK(sub_hashes_.empty());
DCHECK_EQ(chunks_written_, 0);
corruption_seen_ = false;
const base::FilePath new_filename = TemporaryFileForFilename(filename_);
base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
if (new_file.get() == NULL)
return false;
base::ScopedFILE file(base::OpenFile(filename_, "rb"));
empty_ = (file.get() == NULL);
if (empty_) {
// If the file exists but cannot be opened, try to delete it (not
// deleting directly, the bloom filter needs to be deleted, too).
if (base::PathExists(filename_))
return OnCorruptDatabase();
new_file_.swap(new_file);
return true;
}
base::MD5Context context;
FileHeader header;
const int version =
ReadAndVerifyHeader(filename_, &header,
&add_chunks_cache_, &sub_chunks_cache_,
file.get(), &context);
if (version == kInvalidVersion) {
FileHeader retry_header;
if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
if (retry_header.magic == kFileMagic &&
retry_header.version < kFileVersion) {
RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
} else {
RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
}
}
// Close the file so that it can be deleted.
file.reset();
return OnCorruptDatabase();
}
file_.swap(file);
new_file_.swap(new_file);
return true;
}
bool SafeBrowsingStoreFile::FinishChunk() {
if (!add_prefixes_.size() && !sub_prefixes_.size() &&
!add_hashes_.size() && !sub_hashes_.size())
return true;
ChunkHeader header;
header.add_prefix_count = add_prefixes_.size();
header.sub_prefix_count = sub_prefixes_.size();
header.add_hash_count = add_hashes_.size();
header.sub_hash_count = sub_hashes_.size();
if (!WriteItem(header, new_file_.get(), NULL))
return false;
if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
!WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
!WriteContainer(add_hashes_, new_file_.get(), NULL) ||
!WriteContainer(sub_hashes_, new_file_.get(), NULL))
return false;
++chunks_written_;
// Clear everything to save memory.
return ClearChunkBuffers();
}
bool SafeBrowsingStoreFile::DoUpdate(
safe_browsing::PrefixSetBuilder* builder,
std::vector<SBAddFullHash>* add_full_hashes_result) {
DCHECK(file_.get() || empty_);
DCHECK(new_file_.get());
CHECK(builder);
CHECK(add_full_hashes_result);
// Rewind the temporary storage.
if (!FileRewind(new_file_.get()))
return false;
// Get chunk file's size for validating counts.
int64 update_size = 0;
if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
return OnCorruptDatabase();
// Track update size to answer questions at http://crbug.com/72216 .
// Log small updates as 1k so that the 0 (underflow) bucket can be
// used for "empty" in SafeBrowsingDatabase.
UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
std::max(static_cast<int>(update_size / 1024), 1));
// Chunk updates to integrate.
StateInternal new_state;
// Read update chunks.
for (int i = 0; i < chunks_written_; ++i) {
ChunkHeader header;
int64 ofs = ftell(new_file_.get());
if (ofs == -1)
return false;
if (!ReadItem(&header, new_file_.get(), NULL))
return false;
// As a safety measure, make sure that the header describes a sane
// chunk, given the remaining file size.
int64 expected_size = ofs + sizeof(ChunkHeader);
expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
expected_size += header.add_hash_count * sizeof(SBAddFullHash);
expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
if (expected_size > update_size)
return false;
if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
header.add_hash_count, header.sub_hash_count,
new_file_.get(), NULL)) {
return false;
}
}
  // The state was accumulated by chunk; sort it by prefix.
new_state.SortData();
  // These strides control how much data is loaded into memory per pass.
  // Strides must be powers of two. |in_stride| will be derived from the
  // input file. |out_stride| will be derived from an estimate of the
  // resulting file's size. |process_stride| will be the max of both.
uint64 in_stride = kMaxShardStride;
uint64 out_stride = kMaxShardStride;
uint64 process_stride = 0;
// Used to verify the input's checksum if |!empty_|.
base::MD5Context in_context;
if (!empty_) {
DCHECK(file_.get());
FileHeader header = {0};
int version = ReadAndVerifyHeader(filename_, &header,
&add_chunks_cache_, &sub_chunks_cache_,
file_.get(), &in_context);
if (version == kInvalidVersion)
return OnCorruptDatabase();
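    // As in ReadDbStateHelper(), a stored stride of 0 encodes the full 2^32
    // stride.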
if (header.shard_stride)
in_stride = header.shard_stride;
    // The header checksum should have prevented this case, but the code
    // below breaks if the stride is not a power of two.
if (!IsPowerOfTwo(in_stride))
return OnCorruptDatabase();
}
// We no longer need to track deleted chunks.
DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
// Calculate |out_stride| to break the file down into reasonable shards.
{
int64 original_size = 0;
if (!empty_ && !base::GetFileSize(filename_, &original_size))
return OnCorruptDatabase();
// Approximate the final size as everything. Subs and deletes will reduce
// the size, but modest over-sharding won't hurt much.
int64 shard_size = original_size + update_size;
// Keep splitting until a single stride of data fits the target.
size_t shifts = 0;
while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
out_stride >>= 1;
shard_size >>= 1;
++shifts;
}
UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
DCHECK(IsPowerOfTwo(out_stride));
}
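  // For example, roughly 4 MB of input against the 100 KB target above halves
  // the stride six times, giving |out_stride| == 1 << 26 (64 shards of about
  // 64 KB each).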
// Outer loop strides by the max of the input stride (to read integral shards)
// and the output stride (to write integral shards).
process_stride = std::max(in_stride, out_stride);
DCHECK(IsPowerOfTwo(process_stride));
DCHECK_EQ(0u, process_stride % in_stride);
DCHECK_EQ(0u, process_stride % out_stride);
// Start writing the new data to |new_file_|.
base::MD5Context out_context;
if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
new_file_.get(), &out_context)) {
return false;
}
// Start at the beginning of the SBPrefix space.
uint64 in_min = 0;
uint64 out_min = 0;
uint64 process_min = 0;
// Start at the beginning of the updates.
StateInternalPos new_pos = new_state.StateBegin();
// Re-usable container for shard processing.
StateInternal db_state;
// Track aggregate counts for histograms.
size_t add_prefix_count = 0;
size_t sub_prefix_count = 0;
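  // Step through the prefix space one |process_stride|-sized shard at a time:
  // read the matching input shards, merge in the sorted update data, apply
  // sub and delete processing, then write the result as output shards.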
do {
// Maximum element in the current shard.
SBPrefix process_max =
static_cast<SBPrefix>(process_min + process_stride - 1);
DCHECK_GT(process_max, process_min);
    // Drop the data from the previous pass.
db_state.ClearData();
// Fill the processing shard with one or more input shards.
if (!empty_) {
do {
ShardHeader shard_header;
if (!ReadItem(&shard_header, file_.get(), &in_context))
return OnCorruptDatabase();
if (!db_state.AppendData(shard_header.add_prefix_count,
shard_header.sub_prefix_count,
shard_header.add_hash_count,
shard_header.sub_hash_count,
file_.get(), &in_context))
return OnCorruptDatabase();
in_min += in_stride;
} while (in_min <= kMaxSBPrefix && in_min < process_max);
}
// Shard the update data to match the database data, then merge the update
// data and process the results.
{
StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
db_state.MergeDataAndProcess(new_pos, new_end,
add_del_cache_, sub_del_cache_);
new_pos = new_end;
}
// Collect the processed data for return to caller.
for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
builder->AddPrefix(db_state.add_prefixes_[i].prefix);
}
add_full_hashes_result->insert(add_full_hashes_result->end(),
db_state.add_full_hashes_.begin(),
db_state.add_full_hashes_.end());
add_prefix_count += db_state.add_prefixes_.size();
sub_prefix_count += db_state.sub_prefixes_.size();
// Write one or more shards of processed output.
StateInternalPos out_pos = db_state.StateBegin();
do {
SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
DCHECK_GT(out_max, out_min);
StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
return false;
out_pos = out_end;
out_min += out_stride;
    } while (out_min <= kMaxSBPrefix && out_min < process_max);
process_min += process_stride;
} while (process_min <= kMaxSBPrefix);
// Verify the overall checksum.
if (!empty_) {
if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
return OnCorruptDatabase();
}
// TODO(shess): Verify EOF?
// Close the input file so the new file can be renamed over it.
file_.reset();
}
DCHECK(!file_.get());
// Write the overall checksum.
base::MD5Digest out_digest;
base::MD5Final(&out_digest, &out_context);
if (!WriteItem(out_digest, new_file_.get(), NULL))
return false;
// Trim any excess left over from the temporary chunk data.
if (!base::TruncateFile(new_file_.get()))
return false;
// Close the file handle and swizzle the file into place.
new_file_.reset();
if (!base::DeleteFile(filename_, false) &&
base::PathExists(filename_))
return false;
const base::FilePath new_filename = TemporaryFileForFilename(filename_);
if (!base::Move(new_filename, filename_))
return false;
  // Record aggregate counts for histograms.
UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
return true;
}
bool SafeBrowsingStoreFile::FinishUpdate(
safe_browsing::PrefixSetBuilder* builder,
std::vector<SBAddFullHash>* add_full_hashes_result) {
DCHECK(builder);
DCHECK(add_full_hashes_result);
if (!DoUpdate(builder, add_full_hashes_result)) {
CancelUpdate();
return false;
}
DCHECK(!new_file_.get());
DCHECK(!file_.get());
return Close();
}
bool SafeBrowsingStoreFile::CancelUpdate() {
bool ret = Close();
// Delete stale staging file.
const base::FilePath new_filename = TemporaryFileForFilename(filename_);
base::DeleteFile(new_filename, false);
return ret;
}
void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
add_chunks_cache_.insert(chunk_id);
}
bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
return add_chunks_cache_.count(chunk_id) > 0;
}
void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
out->clear();
out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}
void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
sub_chunks_cache_.insert(chunk_id);
}
bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
return sub_chunks_cache_.count(chunk_id) > 0;
}
void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
out->clear();
out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}
void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
add_del_cache_.insert(chunk_id);
}
void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
sub_del_cache_.insert(chunk_id);
}
// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
if (!base::DeleteFile(basename, false) &&
base::PathExists(basename)) {
NOTREACHED();
return false;
}
const base::FilePath new_filename = TemporaryFileForFilename(basename);
if (!base::DeleteFile(new_filename, false) &&
base::PathExists(new_filename)) {
NOTREACHED();
return false;
}
// With SQLite support gone, one way to get to this code is if the
// existing file is a SQLite file. Make sure the journal file is
// also removed.
const base::FilePath journal_filename(
basename.value() + FILE_PATH_LITERAL("-journal"));
if (base::PathExists(journal_filename))
base::DeleteFile(journal_filename, false);
return true;
}