| // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chrome/browser/safe_browsing/safe_browsing_store_file.h" |
| |
| #include "base/file_util.h" |
| #include "base/files/scoped_file.h" |
| #include "base/md5.h" |
| #include "base/metrics/histogram.h" |
| #include "base/metrics/sparse_histogram.h" |
| |
| namespace { |
| |
| // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so |
| // that byte-order changes force corruption. |
| const int32 kFileMagic = 0x600D71FE; |
| |
| // Version history: |
| // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite) |
| // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28 |
| // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27 |
| const int32 kFileVersion = 8; |
| |
| // ReadAndVerifyHeader() returns this in case of error. |
| const int32 kInvalidVersion = -1; |
| |
| // Header at the front of the main database file. |
| struct FileHeaderV7 { |
| int32 magic, version; |
| uint32 add_chunk_count, sub_chunk_count; |
| uint32 add_prefix_count, sub_prefix_count; |
| uint32 add_hash_count, sub_hash_count; |
| }; |
| |
| // Starting with version 8, the storage is sorted and can be sharded to allow |
| // updates to be done with lower memory requirements. Newly written files will |
| // be sharded to need less than this amount of memory during update. Larger |
| // values are preferred to minimize looping overhead during processing. |
| const int64 kUpdateStorageBytes = 100 * 1024; |
| |
| // Prevent excessive sharding by setting a lower limit on the shard stride. |
| // Smaller values should work fine, but very small values will probably lead |
| // to poor performance. Shard stride is indirectly related to |
| // |kUpdateStorageBytes|; setting that very small will bump against this. |
| const uint32 kMinShardStride = 1 << 24; |
| |
| // A stride that covers the entire SBPrefix space. |
| const uint64 kMaxShardStride = 1ULL << 32; |
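| // For example, a file written with shard_stride == kMinShardStride contains |
| // (1ULL << 32) / (1 << 24) == 256 shards, with shard N covering prefixes |
| // [N << 24, ((N + 1) << 24) - 1]. |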
| |
| // Maximum SBPrefix value. |
| const SBPrefix kMaxSBPrefix = ~0; |
| |
| // Header at the front of the main database file. |
| struct FileHeaderV8 { |
| int32 magic, version; |
| uint32 add_chunk_count, sub_chunk_count; |
| uint32 shard_stride; |
| // TODO(shess): Is this where 64-bit will bite me? Perhaps write a |
| // specialized read/write? |
| }; |
| |
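| // Reading starts by assuming the V8 layout; if the version field indicates |
| // 7, the file is rewound and re-read using the V7 layout (see |
| // ReadAndVerifyHeader()). |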
| union FileHeader { |
| struct FileHeaderV7 v7; |
| struct FileHeaderV8 v8; |
| }; |
| |
| // Header for each chunk in the chunk-accumulation file. |
| struct ChunkHeader { |
| uint32 add_prefix_count, sub_prefix_count; |
| uint32 add_hash_count, sub_hash_count; |
| }; |
| |
| // Header for each shard of data in the main database file. |
| struct ShardHeader { |
| uint32 add_prefix_count, sub_prefix_count; |
| uint32 add_hash_count, sub_hash_count; |
| }; |
| |
| // Enumerate different format-change events for histogramming |
| // purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES. |
| enum FormatEventType { |
| // Corruption detected, broken down by file format. |
| FORMAT_EVENT_FILE_CORRUPT, |
| FORMAT_EVENT_SQLITE_CORRUPT, // Obsolete |
| |
| // The type of format found in the file. The expected case (new |
| // file format) is intentionally not covered. |
| FORMAT_EVENT_FOUND_SQLITE, |
| FORMAT_EVENT_FOUND_UNKNOWN, |
| |
| // The number of SQLite-format files deleted should be the same as |
| // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails, |
| // or if a failure prevents the update from succeeding. |
| FORMAT_EVENT_SQLITE_DELETED, // Obsolete |
| FORMAT_EVENT_SQLITE_DELETE_FAILED, // Obsolete |
| |
| // Found and deleted (or failed to delete) the ancient "Safe |
| // Browsing" file. |
| FORMAT_EVENT_DELETED_ORIGINAL, |
| FORMAT_EVENT_DELETED_ORIGINAL_FAILED, |
| |
| // The checksum did not check out in CheckValidity() or in |
| // FinishUpdate(). This most likely indicates that the machine |
| // crashed before the file was fully sync'ed to disk. |
| FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE, |
| FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE, |
| |
| // The header checksum was incorrect in ReadAndVerifyHeader(). Likely |
| // indicates that the system crashed while writing an update. |
| FORMAT_EVENT_HEADER_CHECKSUM_FAILURE, |
| |
| // Memory space for histograms is determined by the max. ALWAYS |
| // ADD NEW VALUES BEFORE THIS ONE. |
| FORMAT_EVENT_MAX |
| }; |
| |
| void RecordFormatEvent(FormatEventType event_type) { |
| UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX); |
| } |
| |
| // Rewind the file. Using fseek(3) because rewind(3) returns void, so its |
| // errors cannot be detected. |
| bool FileRewind(FILE* fp) { |
| int rv = fseek(fp, 0, SEEK_SET); |
| DCHECK_EQ(rv, 0); |
| return rv == 0; |
| } |
| |
| // Read from |fp| into |item|, and fold the input data into the |
| // checksum in |context|, if non-NULL. Return true on success. |
| template <class T> |
| bool ReadItem(T* item, FILE* fp, base::MD5Context* context) { |
| const size_t ret = fread(item, sizeof(T), 1, fp); |
| if (ret != 1) |
| return false; |
| |
| if (context) { |
| base::MD5Update(context, |
| base::StringPiece(reinterpret_cast<char*>(item), |
| sizeof(T))); |
| } |
| return true; |
| } |
| |
| // Write |item| to |fp|, and fold the output data into the checksum in |
| // |context|, if non-NULL. Return true on success. |
| template <class T> |
| bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) { |
| const size_t ret = fwrite(&item, sizeof(T), 1, fp); |
| if (ret != 1) |
| return false; |
| |
| if (context) { |
| base::MD5Update(context, |
| base::StringPiece(reinterpret_cast<const char*>(&item), |
| sizeof(T))); |
| } |
| |
| return true; |
| } |
| |
| // Read |count| items into |values| from |fp|, and fold them into the |
| // checksum in |context|, if non-NULL. Returns true on success. |
| template <typename CT> |
| bool ReadToContainer(CT* values, size_t count, FILE* fp, |
| base::MD5Context* context) { |
| if (!count) |
| return true; |
| |
| for (size_t i = 0; i < count; ++i) { |
| typename CT::value_type value; |
| if (!ReadItem(&value, fp, context)) |
| return false; |
| |
| // push_back() is more obvious, but coded this way std::set can |
| // also be read. |
| values->insert(values->end(), value); |
| } |
| |
| return true; |
| } |
| |
| // Write values between |beg| and |end| to |fp|, and fold the data into the |
| // checksum in |context|, if non-NULL. Returns true if all items were |
| // written successfully. |
| template <typename CTI> |
| bool WriteRange(const CTI& beg, const CTI& end, |
| FILE* fp, base::MD5Context* context) { |
| for (CTI iter = beg; iter != end; ++iter) { |
| if (!WriteItem(*iter, fp, context)) |
| return false; |
| } |
| return true; |
| } |
| |
| // Write all of |values| to |fp|, and fold the data into the checksum |
| // in |context|, if non-NULL. Returns true if all items were written. |
| template <typename CT> |
| bool WriteContainer(const CT& values, FILE* fp, |
| base::MD5Context* context) { |
| return WriteRange(values.begin(), values.end(), fp, context); |
| } |
| |
| // Delete the chunks in |deleted| from |chunks|. |
| void DeleteChunksFromSet(const base::hash_set<int32>& deleted, |
| std::set<int32>* chunks) { |
| for (std::set<int32>::iterator iter = chunks->begin(); |
| iter != chunks->end();) { |
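|     // Advance |iter| before a possible erase(); std::set::erase() only |
|     // invalidates the erased iterator, so |iter| stays valid. |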
| std::set<int32>::iterator prev = iter++; |
| if (deleted.count(*prev) > 0) |
| chunks->erase(prev); |
| } |
| } |
| |
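| // Read the digest stored at the current position of |fp| and compare it to |
| // the running checksum in |context|. base::MD5IntermediateFinal() snapshots |
| // the digest without finalizing |context|; the stored digest is then folded |
| // into |context| so that any enclosing checksum covers it as well. |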
| bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) { |
| base::MD5Digest calculated_digest; |
| base::MD5IntermediateFinal(&calculated_digest, context); |
| |
| base::MD5Digest file_digest; |
| if (!ReadItem(&file_digest, fp, context)) |
| return false; |
| |
| return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0; |
| } |
| |
| // Sanity-check the header against the file's size to make sure our |
| // vectors aren't gigantic. This doubles as a cheap way to detect |
| // corruption without having to checksum the entire file. |
| bool FileHeaderV7SanityCheck(const base::FilePath& filename, |
| const FileHeaderV7& header) { |
| DCHECK_EQ(header.version, 7); |
| |
| int64 size = 0; |
| if (!base::GetFileSize(filename, &size)) |
| return false; |
| |
| int64 expected_size = sizeof(FileHeaderV7); |
| expected_size += header.add_chunk_count * sizeof(int32); |
| expected_size += header.sub_chunk_count * sizeof(int32); |
| expected_size += header.add_prefix_count * sizeof(SBAddPrefix); |
| expected_size += header.sub_prefix_count * sizeof(SBSubPrefix); |
| expected_size += header.add_hash_count * sizeof(SBAddFullHash); |
| expected_size += header.sub_hash_count * sizeof(SBSubFullHash); |
| expected_size += sizeof(base::MD5Digest); |
| if (size != expected_size) |
| return false; |
| |
| return true; |
| } |
| |
| // Helper function to read the file header and chunk TOC. Rewinds |fp| and |
| // initializes |context|. The header is left in |header|. Returns the file's |
| // version, or kInvalidVersion on sanity-check or checksum failure. |
| int ReadAndVerifyHeader(const base::FilePath& filename, |
| FileHeader* header, |
| std::set<int32>* add_chunks, |
| std::set<int32>* sub_chunks, |
| FILE* fp, |
| base::MD5Context* context) { |
| DCHECK(header); |
| DCHECK(add_chunks); |
| DCHECK(sub_chunks); |
| DCHECK(fp); |
| DCHECK(context); |
| |
| int version = kInvalidVersion; |
| |
| base::MD5Init(context); |
| if (!FileRewind(fp)) |
| return kInvalidVersion; |
| if (!ReadItem(&header->v8, fp, context)) |
| return kInvalidVersion; |
| if (header->v8.magic != kFileMagic) |
| return kInvalidVersion; |
| |
| size_t add_chunks_count = 0; |
| size_t sub_chunks_count = 0; |
| |
| // Track version read to inform removal of support for older versions. |
| UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->v8.version); |
| |
| if (header->v8.version == 7) { |
| version = 7; |
| |
| // Reset the context and re-read the v7 header. |
| base::MD5Init(context); |
| if (!FileRewind(fp)) |
| return kInvalidVersion; |
| if (!ReadItem(&header->v7, fp, context)) |
| return kInvalidVersion; |
| if (header->v7.magic != kFileMagic || header->v7.version != 7) |
| return kInvalidVersion; |
| if (!FileHeaderV7SanityCheck(filename, header->v7)) |
| return kInvalidVersion; |
| |
| add_chunks_count = header->v7.add_chunk_count; |
| sub_chunks_count = header->v7.sub_chunk_count; |
| } else if (header->v8.version == kFileVersion) { |
| version = 8; |
| add_chunks_count = header->v8.add_chunk_count; |
| sub_chunks_count = header->v8.sub_chunk_count; |
| } else { |
| return kInvalidVersion; |
| } |
| |
| if (!ReadToContainer(add_chunks, add_chunks_count, fp, context) || |
| !ReadToContainer(sub_chunks, sub_chunks_count, fp, context)) { |
| return kInvalidVersion; |
| } |
| |
| // v8 includes a checksum to validate the header. |
| if (version > 7 && !ReadAndVerifyChecksum(fp, context)) { |
| RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE); |
| return kInvalidVersion; |
| } |
| |
| return version; |
| } |
| |
| // Helper function to write out the initial header and chunks-contained data. |
| // Rewinds |fp|, initializes |context|, then writes a file header, |
| // |add_chunks| and |sub_chunks|, and a checksum of the header data. |
| bool WriteHeader(uint32 out_stride, |
| const std::set<int32>& add_chunks, |
| const std::set<int32>& sub_chunks, |
| FILE* fp, |
| base::MD5Context* context) { |
| if (!FileRewind(fp)) |
| return false; |
| |
| base::MD5Init(context); |
| FileHeaderV8 header; |
| header.magic = kFileMagic; |
| header.version = kFileVersion; |
| header.add_chunk_count = add_chunks.size(); |
| header.sub_chunk_count = sub_chunks.size(); |
| header.shard_stride = out_stride; |
| if (!WriteItem(header, fp, context)) |
| return false; |
| |
| if (!WriteContainer(add_chunks, fp, context) || |
| !WriteContainer(sub_chunks, fp, context)) |
| return false; |
| |
| // Write out the header digest. |
| base::MD5Digest header_digest; |
| base::MD5IntermediateFinal(&header_digest, context); |
| if (!WriteItem(header_digest, fp, context)) |
| return false; |
| |
| return true; |
| } |
| |
| // Return |true| if the range is sorted by the given comparator. |
| template <typename CTI, typename LESS> |
| bool sorted(CTI beg, CTI end, LESS less) { |
|   while ((end - beg) > 1) { |
| CTI n = beg++; |
| DCHECK(!less(*beg, *n)); |
| if (less(*beg, *n)) |
| return false; |
| } |
| return true; |
| } |
| |
| // Merge |beg|..|end| into |container|. Both should be sorted by the given |
| // comparator, and the range iterators should not be derived from |container|. |
| // Differs from std::inplace_merge() in that additional memory is not required |
| // for linear performance. |
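| // For example, merging the range {2, 8} into the container {1, 4, 9} yields |
| // {1, 2, 4, 8, 9}, filling in from the back of the resized container. |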
| template <typename CT, typename CTI, typename COMP> |
| void container_merge(CT* container, CTI beg, CTI end, const COMP& less) { |
| DCHECK(sorted(container->begin(), container->end(), less)); |
| DCHECK(sorted(beg, end, less)); |
| |
| // Size the container to fit the results. |
| const size_t c_size = container->size(); |
| container->resize(c_size + (end - beg)); |
| |
| // |c_end| points to the original endpoint, while |c_out| points to the |
| // endpoint that will scan from end to beginning while merging. |
| typename CT::iterator c_end = container->begin() + c_size; |
| typename CT::iterator c_out = container->end(); |
| |
| // While both inputs have data, move the greater to |c_out|. |
| while (c_end != container->begin() && end != beg) { |
| if (less(*(c_end - 1), *(end - 1))) { |
| *(--c_out) = *(--end); |
| } else { |
| *(--c_out) = *(--c_end); |
| } |
| } |
| |
| // Copy any data remaining in the new range. |
| if (end != beg) { |
| // The original container data has been fully shifted. |
| DCHECK(c_end == container->begin()); |
| |
| // There is exactly the correct amount of space left. |
| DCHECK_EQ(c_out - c_end, end - beg); |
| |
| std::copy(beg, end, container->begin()); |
| } |
| |
| DCHECK(sorted(container->begin(), container->end(), less)); |
| } |
| |
| // Collection of iterators used while stepping through StateInternal (see |
| // below). |
| class StateInternalPos { |
| public: |
| StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter, |
| SBSubPrefixes::iterator sub_prefixes_iter, |
| std::vector<SBAddFullHash>::iterator add_hashes_iter, |
| std::vector<SBSubFullHash>::iterator sub_hashes_iter) |
| : add_prefixes_iter_(add_prefixes_iter), |
| sub_prefixes_iter_(sub_prefixes_iter), |
| add_hashes_iter_(add_hashes_iter), |
| sub_hashes_iter_(sub_hashes_iter) { |
| } |
| |
| SBAddPrefixes::iterator add_prefixes_iter_; |
| SBSubPrefixes::iterator sub_prefixes_iter_; |
| std::vector<SBAddFullHash>::iterator add_hashes_iter_; |
| std::vector<SBSubFullHash>::iterator sub_hashes_iter_; |
| }; |
| |
| // Comparator for std::upper_bound() used to locate shard boundaries: true |
| // if |val| sorts strictly before |elt|'s add prefix. |
| template <class T> |
| bool prefix_bounder(SBPrefix val, const T& elt) { |
| return val < elt.GetAddPrefix(); |
| } |
| |
| // Container for partial database state. Includes add/sub prefixes/hashes, plus |
| // aggregate operations on same. |
| class StateInternal { |
| public: |
|   // Append the indicated amounts of data from |fp|. |
| bool AppendData(size_t add_prefix_count, size_t sub_prefix_count, |
| size_t add_hash_count, size_t sub_hash_count, |
| FILE* fp, base::MD5Context* context) { |
| return |
| ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) && |
| ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) && |
| ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) && |
| ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context); |
| } |
| |
| void ClearData() { |
| add_prefixes_.clear(); |
| sub_prefixes_.clear(); |
| add_full_hashes_.clear(); |
| sub_full_hashes_.clear(); |
| } |
| |
| // Merge data from |beg|..|end| into receiver's state, then process the state. |
|   // The current state and the range given should correspond to the same sorted |
| // shard of data from different sources. |add_del_cache| and |sub_del_cache| |
| // indicate the chunk ids which should be deleted during processing (see |
| // SBProcessSubs). |
| void MergeDataAndProcess(const StateInternalPos& beg, |
| const StateInternalPos& end, |
| const base::hash_set<int32>& add_del_cache, |
| const base::hash_set<int32>& sub_del_cache) { |
| container_merge(&add_prefixes_, |
| beg.add_prefixes_iter_, |
| end.add_prefixes_iter_, |
| SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); |
| |
| container_merge(&sub_prefixes_, |
| beg.sub_prefixes_iter_, |
| end.sub_prefixes_iter_, |
| SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); |
| |
| container_merge(&add_full_hashes_, |
| beg.add_hashes_iter_, |
| end.add_hashes_iter_, |
| SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); |
| |
| container_merge(&sub_full_hashes_, |
| beg.sub_hashes_iter_, |
| end.sub_hashes_iter_, |
| SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>); |
| |
| SBProcessSubs(&add_prefixes_, &sub_prefixes_, |
| &add_full_hashes_, &sub_full_hashes_, |
| add_del_cache, sub_del_cache); |
| } |
| |
| // Sort the data appropriately for the sharding, merging, and processing |
| // operations. |
| void SortData() { |
| std::sort(add_prefixes_.begin(), add_prefixes_.end(), |
| SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); |
| std::sort(sub_prefixes_.begin(), sub_prefixes_.end(), |
| SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); |
| std::sort(add_full_hashes_.begin(), add_full_hashes_.end(), |
| SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); |
| std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(), |
| SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); |
| } |
| |
| // Iterator from the beginning of the state's data. |
| StateInternalPos StateBegin() { |
| return StateInternalPos(add_prefixes_.begin(), |
| sub_prefixes_.begin(), |
| add_full_hashes_.begin(), |
| sub_full_hashes_.begin()); |
| } |
| |
| // An iterator pointing just after the last possible element of the shard |
| // indicated by |shard_max|. Used to step through the state by shard. |
| // TODO(shess): Verify whether binary search really improves over linear. |
| // Merging or writing will immediately touch all of these elements. |
| StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) { |
| return StateInternalPos( |
| std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(), |
| shard_max, prefix_bounder<SBAddPrefix>), |
| std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(), |
| shard_max, prefix_bounder<SBSubPrefix>), |
| std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(), |
| shard_max, prefix_bounder<SBAddFullHash>), |
| std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(), |
| shard_max, prefix_bounder<SBSubFullHash>)); |
| } |
| |
| // Write a shard header and data for the shard starting at |beg| and ending at |
| // the element before |end|. |
| bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end, |
| FILE* fp, base::MD5Context* context) { |
| ShardHeader shard_header; |
| shard_header.add_prefix_count = |
| end.add_prefixes_iter_ - beg.add_prefixes_iter_; |
| shard_header.sub_prefix_count = |
| end.sub_prefixes_iter_ - beg.sub_prefixes_iter_; |
| shard_header.add_hash_count = |
| end.add_hashes_iter_ - beg.add_hashes_iter_; |
| shard_header.sub_hash_count = |
| end.sub_hashes_iter_ - beg.sub_hashes_iter_; |
| |
| return |
| WriteItem(shard_header, fp, context) && |
| WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_, |
| fp, context) && |
| WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_, |
| fp, context) && |
| WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_, |
| fp, context) && |
| WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_, |
| fp, context); |
| } |
| |
| SBAddPrefixes add_prefixes_; |
| SBSubPrefixes sub_prefixes_; |
| std::vector<SBAddFullHash> add_full_hashes_; |
| std::vector<SBSubFullHash> sub_full_hashes_; |
| }; |
| |
| // True if |val| is a power of two. |
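| // For example, IsPowerOfTwo(1u << 24) is true, while IsPowerOfTwo(0u) and |
| // IsPowerOfTwo(6u) are false. |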
| template <typename T> |
| bool IsPowerOfTwo(const T& val) { |
| return val && (val & (val - 1)) == 0; |
| } |
| |
| // Helper to read the entire database state, used by GetAddPrefixes() and |
| // GetAddFullHashes(). Those functions are generally used only for smaller |
| // files. Returns false in case of errors reading the data. |
| bool ReadDbStateHelper(const base::FilePath& filename, |
| StateInternal* db_state) { |
|   base::ScopedFILE file(base::OpenFile(filename, "rb")); |
| if (file.get() == NULL) |
| return false; |
| |
| std::set<int32> add_chunks; |
| std::set<int32> sub_chunks; |
| |
| base::MD5Context context; |
| FileHeader header; |
| const int version = |
| ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks, |
| file.get(), &context); |
| if (version == kInvalidVersion) |
| return false; |
| |
| if (version == 7) { |
| if (!db_state->AppendData(header.v7.add_prefix_count, |
| header.v7.sub_prefix_count, |
| header.v7.add_hash_count, |
| header.v7.sub_hash_count, |
| file.get(), &context)) { |
| return false; |
| } |
| |
| // v7 data was not stored sorted. |
| db_state->SortData(); |
| } else { |
| // Read until the shard start overflows, always at least one pass. |
| uint64 in_min = 0; |
| uint64 in_stride = header.v8.shard_stride; |
| if (!in_stride) |
| in_stride = kMaxShardStride; |
| if (!IsPowerOfTwo(in_stride)) |
| return false; |
| |
| do { |
| ShardHeader shard_header; |
| if (!ReadItem(&shard_header, file.get(), &context)) |
| return false; |
| |
| if (!db_state->AppendData(shard_header.add_prefix_count, |
| shard_header.sub_prefix_count, |
| shard_header.add_hash_count, |
| shard_header.sub_hash_count, |
| file.get(), &context)) { |
| return false; |
| } |
| |
| in_min += in_stride; |
| } while (in_min <= kMaxSBPrefix); |
| } |
| |
| if (!ReadAndVerifyChecksum(file.get(), &context)) |
| return false; |
| |
| int64 size = 0; |
| if (!base::GetFileSize(filename, &size)) |
| return false; |
| |
| return static_cast<int64>(ftell(file.get())) == size; |
| } |
| |
| } // namespace |
| |
| // static |
| void SafeBrowsingStoreFile::CheckForOriginalAndDelete( |
| const base::FilePath& current_filename) { |
| const base::FilePath original_filename( |
| current_filename.DirName().AppendASCII("Safe Browsing")); |
| if (base::PathExists(original_filename)) { |
| int64 size = 0; |
| if (base::GetFileSize(original_filename, &size)) { |
| UMA_HISTOGRAM_COUNTS("SB2.OldDatabaseKilobytes", |
| static_cast<int>(size / 1024)); |
| } |
| |
| if (base::DeleteFile(original_filename, false)) { |
| RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL); |
| } else { |
| RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL_FAILED); |
| } |
| |
| // Just best-effort on the journal file, don't want to get lost in |
| // the weeds. |
| const base::FilePath journal_filename( |
| current_filename.DirName().AppendASCII("Safe Browsing-journal")); |
| base::DeleteFile(journal_filename, false); |
| } |
| } |
| |
| SafeBrowsingStoreFile::SafeBrowsingStoreFile() |
| : chunks_written_(0), empty_(false), corruption_seen_(false) {} |
| |
| SafeBrowsingStoreFile::~SafeBrowsingStoreFile() { |
| Close(); |
| } |
| |
| bool SafeBrowsingStoreFile::Delete() { |
| // The database should not be open at this point. But, just in |
| // case, close everything before deleting. |
| if (!Close()) { |
| NOTREACHED(); |
| return false; |
| } |
| |
| return DeleteStore(filename_); |
| } |
| |
| bool SafeBrowsingStoreFile::CheckValidity() { |
| // The file was either empty or never opened. The empty case is |
| // presumed not to be invalid. The never-opened case can happen if |
| // BeginUpdate() fails for any databases, and should already have |
| // caused the corruption callback to fire. |
| if (!file_.get()) |
| return true; |
| |
| if (!FileRewind(file_.get())) |
| return OnCorruptDatabase(); |
| |
| int64 size = 0; |
| if (!base::GetFileSize(filename_, &size)) |
| return OnCorruptDatabase(); |
| |
| base::MD5Context context; |
| base::MD5Init(&context); |
| |
| // Read everything except the final digest. |
| size_t bytes_left = static_cast<size_t>(size); |
| CHECK(size == static_cast<int64>(bytes_left)); |
| if (bytes_left < sizeof(base::MD5Digest)) |
| return OnCorruptDatabase(); |
| bytes_left -= sizeof(base::MD5Digest); |
| |
| // Fold the contents of the file into the checksum. |
| while (bytes_left > 0) { |
| char buf[4096]; |
| const size_t c = std::min(sizeof(buf), bytes_left); |
| const size_t ret = fread(buf, 1, c, file_.get()); |
| |
| // The file's size changed while reading, give up. |
| if (ret != c) |
| return OnCorruptDatabase(); |
| base::MD5Update(&context, base::StringPiece(buf, c)); |
| bytes_left -= c; |
| } |
| |
| if (!ReadAndVerifyChecksum(file_.get(), &context)) { |
| RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE); |
| return OnCorruptDatabase(); |
| } |
| |
| return true; |
| } |
| |
| void SafeBrowsingStoreFile::Init(const base::FilePath& filename, |
|                                  const base::Closure& corruption_callback) { |
| filename_ = filename; |
| corruption_callback_ = corruption_callback; |
| } |
| |
| bool SafeBrowsingStoreFile::BeginChunk() { |
| return ClearChunkBuffers(); |
| } |
| |
| bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) { |
| add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix)); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) { |
| add_prefixes->clear(); |
| if (!base::PathExists(filename_)) |
| return true; |
| |
| StateInternal db_state; |
| if (!ReadDbStateHelper(filename_, &db_state)) |
| return OnCorruptDatabase(); |
| |
| add_prefixes->swap(db_state.add_prefixes_); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::GetAddFullHashes( |
| std::vector<SBAddFullHash>* add_full_hashes) { |
| add_full_hashes->clear(); |
| if (!base::PathExists(filename_)) |
| return true; |
| |
| StateInternal db_state; |
| if (!ReadDbStateHelper(filename_, &db_state)) |
| return OnCorruptDatabase(); |
| |
| add_full_hashes->swap(db_state.add_full_hashes_); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id, |
| const SBFullHash& full_hash) { |
| add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash)); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id, |
| int32 add_chunk_id, |
| SBPrefix prefix) { |
| sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix)); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id, |
| const SBFullHash& full_hash) { |
| sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash)); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::OnCorruptDatabase() { |
| if (!corruption_seen_) |
| RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT); |
| corruption_seen_ = true; |
| |
| corruption_callback_.Run(); |
| |
| // Return false as a convenience to callers. |
| return false; |
| } |
| |
| bool SafeBrowsingStoreFile::Close() { |
| ClearUpdateBuffers(); |
| |
| // Make sure the files are closed. |
| file_.reset(); |
| new_file_.reset(); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::BeginUpdate() { |
| DCHECK(!file_.get() && !new_file_.get()); |
| |
| // Structures should all be clear unless something bad happened. |
| DCHECK(add_chunks_cache_.empty()); |
| DCHECK(sub_chunks_cache_.empty()); |
| DCHECK(add_del_cache_.empty()); |
| DCHECK(sub_del_cache_.empty()); |
| DCHECK(add_prefixes_.empty()); |
| DCHECK(sub_prefixes_.empty()); |
| DCHECK(add_hashes_.empty()); |
| DCHECK(sub_hashes_.empty()); |
| DCHECK_EQ(chunks_written_, 0); |
| |
|   // Since the following code will already hit the profile looking for |
|   // database files, this is a reasonable time to delete any old files. |
| CheckForOriginalAndDelete(filename_); |
| |
| corruption_seen_ = false; |
| |
| const base::FilePath new_filename = TemporaryFileForFilename(filename_); |
| base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+")); |
| if (new_file.get() == NULL) |
| return false; |
| |
| base::ScopedFILE file(base::OpenFile(filename_, "rb")); |
| empty_ = (file.get() == NULL); |
| if (empty_) { |
|     // If the file exists but cannot be opened, treat it as corrupt (not |
|     // deleting it directly here, because the bloom filter needs to be |
|     // deleted, too). |
| if (base::PathExists(filename_)) |
| return OnCorruptDatabase(); |
| |
| new_file_.swap(new_file); |
| return true; |
| } |
| |
| base::MD5Context context; |
| FileHeader header; |
| const int version = |
| ReadAndVerifyHeader(filename_, &header, |
| &add_chunks_cache_, &sub_chunks_cache_, |
| file.get(), &context); |
| if (version == kInvalidVersion) { |
| FileHeaderV8 retry_header; |
| if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL) && |
| (retry_header.magic != kFileMagic || |
| (retry_header.version != 8 && retry_header.version != 7))) { |
| // TODO(shess): Think on whether these histograms are generating any |
| // actionable data. I kid you not, SQLITE happens many thousands of times |
| // per day, UNKNOWN about 3x higher than that. |
| if (!strcmp(reinterpret_cast<char*>(&retry_header.magic), |
| "SQLite format 3")) { |
| RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE); |
| } else { |
| RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN); |
| } |
| } |
| |
| // Close the file so that it can be deleted. |
| file.reset(); |
| |
| return OnCorruptDatabase(); |
| } |
| |
| file_.swap(file); |
| new_file_.swap(new_file); |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::FinishChunk() { |
| if (!add_prefixes_.size() && !sub_prefixes_.size() && |
| !add_hashes_.size() && !sub_hashes_.size()) |
| return true; |
| |
| ChunkHeader header; |
| header.add_prefix_count = add_prefixes_.size(); |
| header.sub_prefix_count = sub_prefixes_.size(); |
| header.add_hash_count = add_hashes_.size(); |
| header.sub_hash_count = sub_hashes_.size(); |
| if (!WriteItem(header, new_file_.get(), NULL)) |
| return false; |
| |
| if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) || |
| !WriteContainer(sub_prefixes_, new_file_.get(), NULL) || |
| !WriteContainer(add_hashes_, new_file_.get(), NULL) || |
| !WriteContainer(sub_hashes_, new_file_.get(), NULL)) |
| return false; |
| |
| ++chunks_written_; |
| |
| // Clear everything to save memory. |
| return ClearChunkBuffers(); |
| } |
| |
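| // Overview: read the buffered chunk data from |new_file_|, sort it, then |
| // walk the old store and the new data in shard order, merging and |
| // processing each shard in memory before writing it out. Peak memory use |
| // is bounded by the shard size rather than the store size. |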
| bool SafeBrowsingStoreFile::DoUpdate( |
| safe_browsing::PrefixSetBuilder* builder, |
| std::vector<SBAddFullHash>* add_full_hashes_result) { |
| DCHECK(file_.get() || empty_); |
| DCHECK(new_file_.get()); |
| CHECK(builder); |
| CHECK(add_full_hashes_result); |
| |
| // Rewind the temporary storage. |
| if (!FileRewind(new_file_.get())) |
| return false; |
| |
| // Get chunk file's size for validating counts. |
| int64 update_size = 0; |
| if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size)) |
| return OnCorruptDatabase(); |
| |
| // Track update size to answer questions at http://crbug.com/72216 . |
| // Log small updates as 1k so that the 0 (underflow) bucket can be |
| // used for "empty" in SafeBrowsingDatabase. |
| UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes", |
| std::max(static_cast<int>(update_size / 1024), 1)); |
| |
| // Chunk updates to integrate. |
| StateInternal new_state; |
| |
| // Read update chunks. |
| for (int i = 0; i < chunks_written_; ++i) { |
| ChunkHeader header; |
| |
| int64 ofs = ftell(new_file_.get()); |
| if (ofs == -1) |
| return false; |
| |
| if (!ReadItem(&header, new_file_.get(), NULL)) |
| return false; |
| |
| // As a safety measure, make sure that the header describes a sane |
| // chunk, given the remaining file size. |
| int64 expected_size = ofs + sizeof(ChunkHeader); |
| expected_size += header.add_prefix_count * sizeof(SBAddPrefix); |
| expected_size += header.sub_prefix_count * sizeof(SBSubPrefix); |
| expected_size += header.add_hash_count * sizeof(SBAddFullHash); |
| expected_size += header.sub_hash_count * sizeof(SBSubFullHash); |
| if (expected_size > update_size) |
| return false; |
| |
| if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count, |
| header.add_hash_count, header.sub_hash_count, |
| new_file_.get(), NULL)) { |
| return false; |
| } |
| } |
| |
|   // The state was accumulated by chunk; sort it by prefix. |
| new_state.SortData(); |
| |
| // These strides control how much data is loaded into memory per pass. |
|   // Strides must be a power of two. |in_stride| will be derived from the |
| // input file. |out_stride| will be derived from an estimate of the resulting |
| // file's size. |process_stride| will be the max of both. |
| uint64 in_stride = kMaxShardStride; |
| uint64 out_stride = kMaxShardStride; |
| uint64 process_stride = 0; |
| |
| // The header info is only used later if |!empty_|. The v8 read loop only |
| // needs |in_stride|, while v7 needs to refer to header information. |
| base::MD5Context in_context; |
| int version = kInvalidVersion; |
| FileHeader header; |
| |
| if (!empty_) { |
| DCHECK(file_.get()); |
| |
| version = ReadAndVerifyHeader(filename_, &header, |
| &add_chunks_cache_, &sub_chunks_cache_, |
| file_.get(), &in_context); |
| if (version == kInvalidVersion) |
| return OnCorruptDatabase(); |
| |
| if (version == 8 && header.v8.shard_stride) |
| in_stride = header.v8.shard_stride; |
| |
|     // The header checksum should already have caught a bad stride, but the |
|     // sharding loops below step by |in_stride|, so verify it here as well. |
| if (!IsPowerOfTwo(in_stride)) |
| return OnCorruptDatabase(); |
| } |
| |
| // We no longer need to track deleted chunks. |
| DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_); |
| DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_); |
| |
| // Calculate |out_stride| to break the file down into reasonable shards. |
| { |
| int64 original_size = 0; |
| if (!empty_ && !base::GetFileSize(filename_, &original_size)) |
| return OnCorruptDatabase(); |
| |
| // Approximate the final size as everything. Subs and deletes will reduce |
| // the size, but modest over-sharding won't hurt much. |
| int64 shard_size = original_size + update_size; |
| |
| // Keep splitting until a single stride of data fits the target. |
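|     // For example, with a 3 MB total, five halvings leave |out_stride| == |
|     // (1 << 27), splitting the prefix space into 32 shards holding roughly |
|     // 96 kB of data each. |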
| size_t shifts = 0; |
| while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) { |
| out_stride >>= 1; |
| shard_size >>= 1; |
| ++shifts; |
| } |
| UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts); |
| |
| DCHECK(IsPowerOfTwo(out_stride)); |
| } |
| |
| // Outer loop strides by the max of the input stride (to read integral shards) |
| // and the output stride (to write integral shards). |
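| // For example, reading a store written with in_stride == (1 << 26) while |
| // writing with out_stride == (1 << 27) processes (1 << 27)-wide pieces, |
| // each merging two input shards and emitting one output shard. |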
| process_stride = std::max(in_stride, out_stride); |
| DCHECK(IsPowerOfTwo(process_stride)); |
| DCHECK_EQ(0u, process_stride % in_stride); |
| DCHECK_EQ(0u, process_stride % out_stride); |
| |
| // Start writing the new data to |new_file_|. |
| base::MD5Context out_context; |
| if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_, |
| new_file_.get(), &out_context)) { |
| return false; |
| } |
| |
| // Start at the beginning of the SBPrefix space. |
| uint64 in_min = 0; |
| uint64 out_min = 0; |
| uint64 process_min = 0; |
| |
| // Start at the beginning of the updates. |
| StateInternalPos new_pos = new_state.StateBegin(); |
| |
| // Re-usable container for shard processing. |
| StateInternal db_state; |
| |
| // Track aggregate counts for histograms. |
| size_t add_prefix_count = 0; |
| size_t sub_prefix_count = 0; |
| |
| do { |
| // Maximum element in the current shard. |
| SBPrefix process_max = |
| static_cast<SBPrefix>(process_min + process_stride - 1); |
| DCHECK_GT(process_max, process_min); |
| |
|     // Drop the data from the previous pass. |
| db_state.ClearData(); |
| |
| // Fill the processing shard with one or more input shards. |
| if (!empty_) { |
| if (version == 7) { |
| // Treat v7 as a single-shard file. |
| DCHECK_EQ(in_min, 0u); |
| DCHECK_EQ(in_stride, kMaxShardStride); |
| DCHECK_EQ(process_stride, kMaxShardStride); |
| if (!db_state.AppendData(header.v7.add_prefix_count, |
| header.v7.sub_prefix_count, |
| header.v7.add_hash_count, |
| header.v7.sub_hash_count, |
| file_.get(), &in_context)) |
| return OnCorruptDatabase(); |
| |
| // v7 data is not sorted correctly. |
| db_state.SortData(); |
| } else { |
| do { |
| ShardHeader shard_header; |
| if (!ReadItem(&shard_header, file_.get(), &in_context)) |
| return OnCorruptDatabase(); |
| |
| if (!db_state.AppendData(shard_header.add_prefix_count, |
| shard_header.sub_prefix_count, |
| shard_header.add_hash_count, |
| shard_header.sub_hash_count, |
| file_.get(), &in_context)) |
| return OnCorruptDatabase(); |
| |
| in_min += in_stride; |
| } while (in_min <= kMaxSBPrefix && in_min < process_max); |
| } |
| } |
| |
| // Shard the update data to match the database data, then merge the update |
| // data and process the results. |
| { |
| StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max); |
| db_state.MergeDataAndProcess(new_pos, new_end, |
| add_del_cache_, sub_del_cache_); |
| new_pos = new_end; |
| } |
| |
| // Collect the processed data for return to caller. |
| for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) { |
| builder->AddPrefix(db_state.add_prefixes_[i].prefix); |
| } |
| add_full_hashes_result->insert(add_full_hashes_result->end(), |
| db_state.add_full_hashes_.begin(), |
| db_state.add_full_hashes_.end()); |
| add_prefix_count += db_state.add_prefixes_.size(); |
| sub_prefix_count += db_state.sub_prefixes_.size(); |
| |
| // Write one or more shards of processed output. |
| StateInternalPos out_pos = db_state.StateBegin(); |
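|     // Write until |out_min| overflows the 32-bit prefix space (it no longer |
|     // round-trips through SBPrefix) or steps past the processing shard. |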
| do { |
| SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1); |
| DCHECK_GT(out_max, out_min); |
| |
| StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max); |
| if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context)) |
| return false; |
| out_pos = out_end; |
| |
| out_min += out_stride; |
| } while (out_min == static_cast<SBPrefix>(out_min) && |
| out_min < process_max); |
| |
| process_min += process_stride; |
| } while (process_min <= kMaxSBPrefix); |
| |
| // Verify the overall checksum. |
| if (!empty_) { |
| if (!ReadAndVerifyChecksum(file_.get(), &in_context)) { |
| RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE); |
| return OnCorruptDatabase(); |
| } |
| |
| // TODO(shess): Verify EOF? |
| |
| // Close the input file so the new file can be renamed over it. |
| file_.reset(); |
| } |
| DCHECK(!file_.get()); |
| |
| // Write the overall checksum. |
| base::MD5Digest out_digest; |
| base::MD5Final(&out_digest, &out_context); |
| if (!WriteItem(out_digest, new_file_.get(), NULL)) |
| return false; |
| |
| // Trim any excess left over from the temporary chunk data. |
| if (!base::TruncateFile(new_file_.get())) |
| return false; |
| |
| // Close the file handle and swizzle the file into place. |
| new_file_.reset(); |
| if (!base::DeleteFile(filename_, false) && |
| base::PathExists(filename_)) |
| return false; |
| |
| const base::FilePath new_filename = TemporaryFileForFilename(filename_); |
| if (!base::Move(new_filename, filename_)) |
| return false; |
| |
| // Record counts before swapping to caller. |
| UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count); |
| UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count); |
| |
| return true; |
| } |
| |
| bool SafeBrowsingStoreFile::FinishUpdate( |
| safe_browsing::PrefixSetBuilder* builder, |
| std::vector<SBAddFullHash>* add_full_hashes_result) { |
| DCHECK(builder); |
| DCHECK(add_full_hashes_result); |
| |
| if (!DoUpdate(builder, add_full_hashes_result)) { |
| CancelUpdate(); |
| return false; |
| } |
| |
| DCHECK(!new_file_.get()); |
| DCHECK(!file_.get()); |
| |
| return Close(); |
| } |
| |
| bool SafeBrowsingStoreFile::CancelUpdate() { |
| return Close(); |
| } |
| |
| void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) { |
| add_chunks_cache_.insert(chunk_id); |
| } |
| |
| bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) { |
| return add_chunks_cache_.count(chunk_id) > 0; |
| } |
| |
| void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) { |
| out->clear(); |
| out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end()); |
| } |
| |
| void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) { |
| sub_chunks_cache_.insert(chunk_id); |
| } |
| |
| bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) { |
| return sub_chunks_cache_.count(chunk_id) > 0; |
| } |
| |
| void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) { |
| out->clear(); |
| out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end()); |
| } |
| |
| void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) { |
| add_del_cache_.insert(chunk_id); |
| } |
| |
| void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) { |
| sub_del_cache_.insert(chunk_id); |
| } |
| |
| // static |
| bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) { |
| if (!base::DeleteFile(basename, false) && |
| base::PathExists(basename)) { |
| NOTREACHED(); |
| return false; |
| } |
| |
| const base::FilePath new_filename = TemporaryFileForFilename(basename); |
| if (!base::DeleteFile(new_filename, false) && |
| base::PathExists(new_filename)) { |
| NOTREACHED(); |
| return false; |
| } |
| |
| // With SQLite support gone, one way to get to this code is if the |
| // existing file is a SQLite file. Make sure the journal file is |
| // also removed. |
| const base::FilePath journal_filename( |
| basename.value() + FILE_PATH_LITERAL("-journal")); |
| if (base::PathExists(journal_filename)) |
| base::DeleteFile(journal_filename, false); |
| |
| return true; |
| } |