| // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. See the AUTHORS file for names of contributors. |
| // |
| // We recover the contents of the descriptor from the other files we find. |
| // (1) Any log files are first converted to tables |
| // (2) We scan every table to compute |
| // (a) smallest/largest for the table |
| // (b) largest sequence number in the table |
| // (3) We generate descriptor contents: |
| // - log number is set to zero |
| // - next-file-number is set to 1 + largest file number we found |
| // - last-sequence-number is set to largest sequence# found across |
| // all tables (see 2b) |
| // - compaction pointers are cleared |
| // - every table file is added at level 0 |
| // |
| // Possible optimization 1: |
| // (a) Compute total size and use to pick appropriate max-level M |
| // (b) Sort tables by largest sequence# in the table |
| // (c) For each table: if it overlaps earlier table, place in level-0, |
| // else place in level-M. |
| // Possible optimization 2: |
| // Store per-table metadata (smallest, largest, largest-seq#, ...) |
| // in the table's meta section to speed up ScanTable. |
| |
| #include "db/builder.h" |
| #include "db/db_impl.h" |
| #include "db/dbformat.h" |
| #include "db/filename.h" |
| #include "db/log_reader.h" |
| #include "db/log_writer.h" |
| #include "db/memtable.h" |
| #include "db/table_cache.h" |
| #include "db/version_edit.h" |
| #include "db/write_batch_internal.h" |
| #include "leveldb/comparator.h" |
| #include "leveldb/db.h" |
| #include "leveldb/env.h" |
| |
| namespace leveldb { |
| |
| namespace { |
| |
| class Repairer { |
| public: |
| Repairer(const std::string& dbname, const Options& options) |
| : dbname_(dbname), |
| env_(options.env), |
| icmp_(options.comparator), |
| ipolicy_(options.filter_policy), |
| options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), |
| owns_info_log_(options_.info_log != options.info_log), |
| owns_cache_(options_.block_cache != options.block_cache), |
| next_file_number_(1) { |
| // TableCache can be small since we expect each table to be opened once. |
| table_cache_ = new TableCache(dbname_, &options_, 10); |
| } |
| |
| ~Repairer() { |
| delete table_cache_; |
| if (owns_info_log_) { |
| delete options_.info_log; |
| } |
| if (owns_cache_) { |
| delete options_.block_cache; |
| } |
| } |
| |
| Status Run() { |
| Status status = FindFiles(); |
| if (status.ok()) { |
| ConvertLogFilesToTables(); |
| ExtractMetaData(); |
| status = WriteDescriptor(); |
| } |
| if (status.ok()) { |
| unsigned long long bytes = 0; |
| for (size_t i = 0; i < tables_.size(); i++) { |
| bytes += tables_[i].meta.file_size; |
| } |
| Log(options_.info_log, |
| "**** Repaired leveldb %s; " |
| "recovered %d files; %llu bytes. " |
| "Some data may have been lost. " |
| "****", |
| dbname_.c_str(), |
| static_cast<int>(tables_.size()), |
| bytes); |
| } |
| return status; |
| } |
| |
| private: |
| struct TableInfo { |
| FileMetaData meta; |
| SequenceNumber max_sequence; |
| }; |
| |
| std::string const dbname_; |
| Env* const env_; |
| InternalKeyComparator const icmp_; |
| InternalFilterPolicy const ipolicy_; |
| Options const options_; |
| bool owns_info_log_; |
| bool owns_cache_; |
| TableCache* table_cache_; |
| VersionEdit edit_; |
| |
| std::vector<std::string> manifests_; |
| std::vector<uint64_t> table_numbers_; |
| std::vector<uint64_t> logs_; |
| std::vector<TableInfo> tables_; |
| uint64_t next_file_number_; |
| |
| Status FindFiles() { |
| std::vector<std::string> filenames; |
| Status status = env_->GetChildren(dbname_, &filenames); |
| if (!status.ok()) { |
| return status; |
| } |
| if (filenames.empty()) { |
| return Status::IOError(dbname_, "repair found no files"); |
| } |
| |
| uint64_t number; |
| FileType type; |
| for (size_t i = 0; i < filenames.size(); i++) { |
| if (ParseFileName(filenames[i], &number, &type)) { |
| if (type == kDescriptorFile) { |
| manifests_.push_back(filenames[i]); |
| } else { |
| if (number + 1 > next_file_number_) { |
| next_file_number_ = number + 1; |
| } |
| if (type == kLogFile) { |
| logs_.push_back(number); |
| } else if (type == kTableFile) { |
| table_numbers_.push_back(number); |
| } else { |
| // Ignore other files |
| } |
| } |
| } |
| } |
| return status; |
| } |
| |
| void ConvertLogFilesToTables() { |
| for (size_t i = 0; i < logs_.size(); i++) { |
| std::string logname = LogFileName(dbname_, logs_[i]); |
| Status status = ConvertLogToTable(logs_[i]); |
| if (!status.ok()) { |
| Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", |
| (unsigned long long) logs_[i], |
| status.ToString().c_str()); |
| } |
| ArchiveFile(logname); |
| } |
| } |
| |
| Status ConvertLogToTable(uint64_t log) { |
| struct LogReporter : public log::Reader::Reporter { |
| Env* env; |
| Logger* info_log; |
| uint64_t lognum; |
| virtual void Corruption(size_t bytes, const Status& s) { |
| // We print error messages for corruption, but continue repairing. |
| Log(info_log, "Log #%llu: dropping %d bytes; %s", |
| (unsigned long long) lognum, |
| static_cast<int>(bytes), |
| s.ToString().c_str()); |
| } |
| }; |
| |
| // Open the log file |
| std::string logname = LogFileName(dbname_, log); |
| SequentialFile* lfile; |
| Status status = env_->NewSequentialFile(logname, &lfile); |
| if (!status.ok()) { |
| return status; |
| } |
| |
| // Create the log reader. |
| LogReporter reporter; |
| reporter.env = env_; |
| reporter.info_log = options_.info_log; |
| reporter.lognum = log; |
| // We intentially make log::Reader do checksumming so that |
| // corruptions cause entire commits to be skipped instead of |
| // propagating bad information (like overly large sequence |
| // numbers). |
| log::Reader reader(lfile, &reporter, false/*do not checksum*/, |
| 0/*initial_offset*/); |
| |
| // Read all the records and add to a memtable |
| std::string scratch; |
| Slice record; |
| WriteBatch batch; |
| MemTable* mem = new MemTable(icmp_); |
| mem->Ref(); |
| int counter = 0; |
| while (reader.ReadRecord(&record, &scratch)) { |
| if (record.size() < 12) { |
| reporter.Corruption( |
| record.size(), Status::Corruption("log record too small")); |
| continue; |
| } |
| WriteBatchInternal::SetContents(&batch, record); |
| status = WriteBatchInternal::InsertInto(&batch, mem); |
| if (status.ok()) { |
| counter += WriteBatchInternal::Count(&batch); |
| } else { |
| Log(options_.info_log, "Log #%llu: ignoring %s", |
| (unsigned long long) log, |
| status.ToString().c_str()); |
| status = Status::OK(); // Keep going with rest of file |
| } |
| } |
| delete lfile; |
| |
| // Do not record a version edit for this conversion to a Table |
| // since ExtractMetaData() will also generate edits. |
| FileMetaData meta; |
| meta.number = next_file_number_++; |
| Iterator* iter = mem->NewIterator(); |
| status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); |
| delete iter; |
| mem->Unref(); |
| mem = NULL; |
| if (status.ok()) { |
| if (meta.file_size > 0) { |
| table_numbers_.push_back(meta.number); |
| } |
| } |
| Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", |
| (unsigned long long) log, |
| counter, |
| (unsigned long long) meta.number, |
| status.ToString().c_str()); |
| return status; |
| } |
| |
| void ExtractMetaData() { |
| std::vector<TableInfo> kept; |
| for (size_t i = 0; i < table_numbers_.size(); i++) { |
| TableInfo t; |
| t.meta.number = table_numbers_[i]; |
| Status status = ScanTable(&t); |
| if (!status.ok()) { |
| std::string fname = TableFileName(dbname_, table_numbers_[i]); |
| Log(options_.info_log, "Table #%llu: ignoring %s", |
| (unsigned long long) table_numbers_[i], |
| status.ToString().c_str()); |
| ArchiveFile(fname); |
| } else { |
| tables_.push_back(t); |
| } |
| } |
| } |
| |
| Status ScanTable(TableInfo* t) { |
| std::string fname = TableFileName(dbname_, t->meta.number); |
| int counter = 0; |
| Status status = env_->GetFileSize(fname, &t->meta.file_size); |
| if (!status.ok()) { |
| fname = SSTTableFileName(dbname_, t->meta.number); |
| Status s2 = env_->GetFileSize(fname, &t->meta.file_size); |
| if (s2.ok()) |
| status = Status::OK(); |
| } |
| if (status.ok()) { |
| Iterator* iter = table_cache_->NewIterator( |
| ReadOptions(), t->meta.number, t->meta.file_size); |
| bool empty = true; |
| ParsedInternalKey parsed; |
| t->max_sequence = 0; |
| for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { |
| Slice key = iter->key(); |
| if (!ParseInternalKey(key, &parsed)) { |
| Log(options_.info_log, "Table #%llu: unparsable key %s", |
| (unsigned long long) t->meta.number, |
| EscapeString(key).c_str()); |
| continue; |
| } |
| |
| counter++; |
| if (empty) { |
| empty = false; |
| t->meta.smallest.DecodeFrom(key); |
| } |
| t->meta.largest.DecodeFrom(key); |
| if (parsed.sequence > t->max_sequence) { |
| t->max_sequence = parsed.sequence; |
| } |
| } |
| if (!iter->status().ok()) { |
| status = iter->status(); |
| } |
| delete iter; |
| } |
| // If there was trouble opening an .sst file this will report that the .ldb |
| // file was not found, which is kind of lame but shouldn't happen often. |
| Log(options_.info_log, "Table #%llu: %d entries %s", |
| (unsigned long long) t->meta.number, |
| counter, |
| status.ToString().c_str()); |
| return status; |
| } |
| |
| Status WriteDescriptor() { |
| std::string tmp = TempFileName(dbname_, 1); |
| WritableFile* file; |
| Status status = env_->NewWritableFile(tmp, &file); |
| if (!status.ok()) { |
| return status; |
| } |
| |
| SequenceNumber max_sequence = 0; |
| for (size_t i = 0; i < tables_.size(); i++) { |
| if (max_sequence < tables_[i].max_sequence) { |
| max_sequence = tables_[i].max_sequence; |
| } |
| } |
| |
| edit_.SetComparatorName(icmp_.user_comparator()->Name()); |
| edit_.SetLogNumber(0); |
| edit_.SetNextFile(next_file_number_); |
| edit_.SetLastSequence(max_sequence); |
| |
| for (size_t i = 0; i < tables_.size(); i++) { |
| // TODO(opt): separate out into multiple levels |
| const TableInfo& t = tables_[i]; |
| edit_.AddFile(0, t.meta.number, t.meta.file_size, |
| t.meta.smallest, t.meta.largest); |
| } |
| |
| //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); |
| { |
| log::Writer log(file); |
| std::string record; |
| edit_.EncodeTo(&record); |
| status = log.AddRecord(record); |
| } |
| if (status.ok()) { |
| status = file->Close(); |
| } |
| delete file; |
| file = NULL; |
| |
| if (!status.ok()) { |
| env_->DeleteFile(tmp); |
| } else { |
| // Discard older manifests |
| for (size_t i = 0; i < manifests_.size(); i++) { |
| ArchiveFile(dbname_ + "/" + manifests_[i]); |
| } |
| |
| // Install new manifest |
| status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); |
| if (status.ok()) { |
| status = SetCurrentFile(env_, dbname_, 1); |
| } else { |
| env_->DeleteFile(tmp); |
| } |
| } |
| return status; |
| } |
| |
| void ArchiveFile(const std::string& fname) { |
| // Move into another directory. E.g., for |
| // dir/foo |
| // rename to |
| // dir/lost/foo |
| const char* slash = strrchr(fname.c_str(), '/'); |
| std::string new_dir; |
| if (slash != NULL) { |
| new_dir.assign(fname.data(), slash - fname.data()); |
| } |
| new_dir.append("/lost"); |
| env_->CreateDir(new_dir); // Ignore error |
| std::string new_file = new_dir; |
| new_file.append("/"); |
| new_file.append((slash == NULL) ? fname.c_str() : slash + 1); |
| Status s = env_->RenameFile(fname, new_file); |
| Log(options_.info_log, "Archiving %s: %s\n", |
| fname.c_str(), s.ToString().c_str()); |
| } |
| }; |
| } // namespace |
| |
| Status RepairDB(const std::string& dbname, const Options& options) { |
| Repairer repairer(dbname, options); |
| return repairer.Run(); |
| } |
| |
| } // namespace leveldb |