blob: 3a031331f56262b6cb0c835d5040120cd5e47f6c [file] [log] [blame]
// sttable.h
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: allauzen@google.com (Cyril Allauzen)
//
// \file
// A generic string-to-type table file format
//
// This is not meant as a generalization of SSTable. This is more of
// a simple replacement for SSTable in order to provide an open-source
// implementation of the FAR format for the external version of the
// FST Library.
#ifndef FST_EXTENSIONS_FAR_STTABLE_H_
#define FST_EXTENSIONS_FAR_STTABLE_H_
#include <algorithm>
#include <iostream>
#include <fstream>
#include <fst/util.h>
namespace fst {
static const int32 kSTTableMagicNumber = 2125656924;
static const int32 kSTTableFileVersion = 1;
// String-to-type table writing class for object of type 'T' using functor 'W'
// to write an object of type 'T' from a stream. 'W' must conform to the
// following interface:
//
// struct Writer {
// void operator()(ostream &, const T &) const;
// };
//
template <class T, class W>
class STTableWriter {
public:
typedef T EntryType;
typedef W EntryWriter;
explicit STTableWriter(const string &filename)
: stream_(filename.c_str(), ofstream::out | ofstream::binary),
error_(false) {
WriteType(stream_, kSTTableMagicNumber);
WriteType(stream_, kSTTableFileVersion);
if (!stream_) {
FSTERROR() << "STTableWriter::STTableWriter: error writing to file: "
<< filename;
error_=true;
}
}
static STTableWriter<T, W> *Create(const string &filename) {
if (filename.empty()) {
LOG(ERROR) << "STTableWriter: writing to standard out unsupported.";
return 0;
}
return new STTableWriter<T, W>(filename);
}
void Add(const string &key, const T &t) {
if (key == "") {
FSTERROR() << "STTableWriter::Add: key empty: " << key;
error_ = true;
} else if (key < last_key_) {
FSTERROR() << "STTableWriter::Add: key disorder: " << key;
error_ = true;
}
if (error_) return;
last_key_ = key;
positions_.push_back(stream_.tellp());
WriteType(stream_, key);
entry_writer_(stream_, t);
}
bool Error() const { return error_; }
~STTableWriter() {
WriteType(stream_, positions_);
WriteType(stream_, static_cast<int64>(positions_.size()));
}
private:
EntryWriter entry_writer_; // Write functor for 'EntryType'
ofstream stream_; // Output stream
vector<int64> positions_; // Position in file of each key-entry pair
string last_key_; // Last key
bool error_;
DISALLOW_COPY_AND_ASSIGN(STTableWriter);
};
// String-to-type table reading class for object of type 'T' using functor 'R'
// to read an object of type 'T' form a stream. 'R' must conform to the
// following interface:
//
// struct Reader {
// T *operator()(istream &) const;
// };
//
template <class T, class R>
class STTableReader {
public:
typedef T EntryType;
typedef R EntryReader;
explicit STTableReader(const vector<string> &filenames)
: sources_(filenames), entry_(0), error_(false) {
compare_ = new Compare(&keys_);
keys_.resize(filenames.size());
streams_.resize(filenames.size(), 0);
positions_.resize(filenames.size());
for (size_t i = 0; i < filenames.size(); ++i) {
streams_[i] = new ifstream(
filenames[i].c_str(), ifstream::in | ifstream::binary);
int32 magic_number = 0, file_version = 0;
ReadType(*streams_[i], &magic_number);
ReadType(*streams_[i], &file_version);
if (magic_number != kSTTableMagicNumber) {
FSTERROR() << "STTableReader::STTableReader: wrong file type: "
<< filenames[i];
error_ = true;
return;
}
if (file_version != kSTTableFileVersion) {
FSTERROR() << "STTableReader::STTableReader: wrong file version: "
<< filenames[i];
error_ = true;
return;
}
int64 num_entries;
streams_[i]->seekg(-static_cast<int>(sizeof(int64)), ios_base::end);
ReadType(*streams_[i], &num_entries);
streams_[i]->seekg(-static_cast<int>(sizeof(int64)) *
(num_entries + 1), ios_base::end);
positions_[i].resize(num_entries);
for (size_t j = 0; (j < num_entries) && (*streams_[i]); ++j)
ReadType(*streams_[i], &(positions_[i][j]));
streams_[i]->seekg(positions_[i][0]);
if (!*streams_[i]) {
FSTERROR() << "STTableReader::STTableReader: error reading file: "
<< filenames[i];
error_ = true;
return;
}
}
MakeHeap();
}
~STTableReader() {
for (size_t i = 0; i < streams_.size(); ++i)
delete streams_[i];
delete compare_;
if (entry_)
delete entry_;
}
static STTableReader<T, R> *Open(const string &filename) {
if (filename.empty()) {
LOG(ERROR) << "STTableReader: reading from standard in not supported";
return 0;
}
vector<string> filenames;
filenames.push_back(filename);
return new STTableReader<T, R>(filenames);
}
static STTableReader<T, R> *Open(const vector<string> &filenames) {
return new STTableReader<T, R>(filenames);
}
void Reset() {
if (error_) return;
for (size_t i = 0; i < streams_.size(); ++i)
streams_[i]->seekg(positions_[i].front());
MakeHeap();
}
bool Find(const string &key) {
if (error_) return false;
for (size_t i = 0; i < streams_.size(); ++i)
LowerBound(i, key);
MakeHeap();
return keys_[current_] == key;
}
bool Done() const { return error_ || heap_.empty(); }
void Next() {
if (error_) return;
if (streams_[current_]->tellg() <= positions_[current_].back()) {
ReadType(*(streams_[current_]), &(keys_[current_]));
if (!*streams_[current_]) {
FSTERROR() << "STTableReader: error reading file: "
<< sources_[current_];
error_ = true;
return;
}
push_heap(heap_.begin(), heap_.end(), *compare_);
} else {
heap_.pop_back();
}
if (!heap_.empty())
PopHeap();
}
const string &GetKey() const {
return keys_[current_];
}
const EntryType &GetEntry() const {
return *entry_;
}
bool Error() const { return error_; }
private:
// Comparison functor used to compare stream IDs in the heap
struct Compare {
Compare(const vector<string> *keys) : keys_(keys) {}
bool operator()(size_t i, size_t j) const {
return (*keys_)[i] > (*keys_)[j];
};
private:
const vector<string> *keys_;
};
// Position the stream with ID 'id' at the position corresponding
// to the lower bound for key 'find_key'
void LowerBound(size_t id, const string &find_key) {
ifstream *strm = streams_[id];
const vector<int64> &positions = positions_[id];
size_t low = 0, high = positions.size() - 1;
while (low < high) {
size_t mid = (low + high)/2;
strm->seekg(positions[mid]);
string key;
ReadType(*strm, &key);
if (key > find_key) {
high = mid;
} else if (key < find_key) {
low = mid + 1;
} else {
for (size_t i = mid; i > low; --i) {
strm->seekg(positions[i - 1]);
ReadType(*strm, &key);
if (key != find_key) {
strm->seekg(positions[i]);
return;
}
}
strm->seekg(positions[low]);
return;
}
}
strm->seekg(positions[low]);
}
// Add all streams to the heap
void MakeHeap() {
heap_.clear();
for (size_t i = 0; i < streams_.size(); ++i) {
ReadType(*streams_[i], &(keys_[i]));
if (!*streams_[i]) {
FSTERROR() << "STTableReader: error reading file: " << sources_[i];
error_ = true;
return;
}
heap_.push_back(i);
}
make_heap(heap_.begin(), heap_.end(), *compare_);
PopHeap();
}
// Position the stream with the lowest key at the top
// of the heap, set 'current_' to the ID of that stream
// and read the current entry from that stream
void PopHeap() {
pop_heap(heap_.begin(), heap_.end(), *compare_);
current_ = heap_.back();
if (entry_)
delete entry_;
entry_ = entry_reader_(*streams_[current_]);
if (!entry_)
error_ = true;
if (!*streams_[current_]) {
FSTERROR() << "STTableReader: error reading entry for key: "
<< keys_[current_] << ", file: " << sources_[current_];
error_ = true;
}
}
EntryReader entry_reader_; // Read functor for 'EntryType'
vector<ifstream*> streams_; // Input streams
vector<string> sources_; // and corresponding file names
vector<vector<int64> > positions_; // Index of positions for each stream
vector<string> keys_; // Lowest unread key for each stream
vector<int64> heap_; // Heap containing ID of streams with unread keys
int64 current_; // Id of current stream to be read
Compare *compare_; // Functor comparing stream IDs for the heap
mutable EntryType *entry_; // Pointer to the currently read entry
bool error_;
DISALLOW_COPY_AND_ASSIGN(STTableReader);
};
// String-to-type table header reading function template on the entry header
// type 'H' having a member function:
// Read(istream &strm, const string &filename);
// Checks that 'filename' is an STTable and call the H::Read() on the last
// entry in the STTable.
template <class H>
bool ReadSTTableHeader(const string &filename, H *header) {
ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
int32 magic_number = 0, file_version = 0;
ReadType(strm, &magic_number);
ReadType(strm, &file_version);
if (magic_number != kSTTableMagicNumber) {
LOG(ERROR) << "ReadSTTableHeader: wrong file type: " << filename;
return false;
}
if (file_version != kSTTableFileVersion) {
LOG(ERROR) << "ReadSTTableHeader: wrong file version: " << filename;
return false;
}
int64 i = -1;
strm.seekg(-static_cast<int>(sizeof(int64)), ios_base::end);
ReadType(strm, &i); // Read number of entries
if (!strm) {
LOG(ERROR) << "ReadSTTableHeader: error reading file: " << filename;
return false;
}
if (i == 0) return true; // No entry header to read
strm.seekg(-2 * static_cast<int>(sizeof(int64)), ios_base::end);
ReadType(strm, &i); // Read position for last entry in file
strm.seekg(i);
string key;
ReadType(strm, &key);
header->Read(strm, filename + ":" + key);
if (!strm) {
LOG(ERROR) << "ReadSTTableHeader: error reading file: " << filename;
return false;
}
return true;
}
bool IsSTTable(const string &filename);
} // namespace fst
#endif // FST_EXTENSIONS_FAR_STTABLE_H_