blob: 8f9aca86f1735bdf218ba482a939666cd4b00bb5 [file] [log] [blame]
// label_reachable.h
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: riley@google.com (Michael Riley)
//
// \file
// Class to determine if a non-epsilon label can be read as the
// first non-epsilon symbol along some path from a given state.
#ifndef FST_LIB_LABEL_REACHABLE_H__
#define FST_LIB_LABEL_REACHABLE_H__
#include <unordered_map>
using std::tr1::unordered_map;
using std::tr1::unordered_multimap;
#include <vector>
using std::vector;
#include <fst/accumulator.h>
#include <fst/arcsort.h>
#include <fst/interval-set.h>
#include <fst/state-reachable.h>
#include <fst/vector-fst.h>
namespace fst {
// Stores shareable data for label reachable class copies.
template <typename L>
class LabelReachableData {
public:
typedef L Label;
typedef typename IntervalSet<L>::Interval Interval;
explicit LabelReachableData(bool reach_input, bool keep_relabel_data = true)
: reach_input_(reach_input),
keep_relabel_data_(keep_relabel_data),
have_relabel_data_(true),
final_label_(kNoLabel) {}
~LabelReachableData() {}
bool ReachInput() const { return reach_input_; }
vector< IntervalSet<L> > *IntervalSets() { return &isets_; }
unordered_map<L, L> *Label2Index() {
if (!have_relabel_data_)
FSTERROR() << "LabelReachableData: no relabeling data";
return &label2index_;
}
Label FinalLabel() {
if (final_label_ == kNoLabel)
final_label_ = label2index_[kNoLabel];
return final_label_;
}
static LabelReachableData<L> *Read(istream &istrm) {
LabelReachableData<L> *data = new LabelReachableData<L>();
ReadType(istrm, &data->reach_input_);
ReadType(istrm, &data->keep_relabel_data_);
data->have_relabel_data_ = data->keep_relabel_data_;
if (data->keep_relabel_data_)
ReadType(istrm, &data->label2index_);
ReadType(istrm, &data->final_label_);
ReadType(istrm, &data->isets_);
return data;
}
bool Write(ostream &ostrm) {
WriteType(ostrm, reach_input_);
WriteType(ostrm, keep_relabel_data_);
if (keep_relabel_data_)
WriteType(ostrm, label2index_);
WriteType(ostrm, FinalLabel());
WriteType(ostrm, isets_);
return true;
}
int RefCount() const { return ref_count_.count(); }
int IncrRefCount() { return ref_count_.Incr(); }
int DecrRefCount() { return ref_count_.Decr(); }
private:
LabelReachableData() {}
bool reach_input_; // Input or output labels considered?
bool keep_relabel_data_; // Save label2index_ to file?
bool have_relabel_data_; // Using label2index_?
Label final_label_; // Final label
RefCounter ref_count_; // Reference count.
unordered_map<L, L> label2index_; // Finds index for a label.
vector<IntervalSet <L> > isets_; // Interval sets per state.
DISALLOW_COPY_AND_ASSIGN(LabelReachableData);
};
// Tests reachability of labels from a given state. If reach_input =
// true, then input labels are considered, o.w. output labels are
// considered. To test for reachability from a state s, first do
// SetState(s). Then a label l can be reached from state s of FST f
// iff Reach(r) is true where r = Relabel(l). The relabeling is
// required to ensure a compact representation of the reachable
// labels.
// The whole FST can be relabeled instead with Relabel(&f,
// reach_input) so that the test Reach(r) applies directly to the
// labels of the transformed FST f. The relabeled FST will also be
// sorted appropriately for composition.
//
// Reachablity of a final state from state s (via an epsilon path)
// can be tested with ReachFinal();
//
// Reachability can also be tested on the set of labels specified by
// an arc iterator, useful for FST composition. In particular,
// Reach(aiter, ...) is true if labels on the input (output) side of
// the transitions of the arc iterator, when iter_input is true
// (false), can be reached from the state s. The iterator labels must
// have already been relabeled.
//
// With the arc iterator test of reachability, the begin position, end
// position and accumulated arc weight of the matches can be
// returned. The optional template argument controls how reachable arc
// weights are accumulated. The default uses the semiring
// Plus(). Alternative ones can be used to distribute the weights in
// composition in various ways.
template <class A, class S = DefaultAccumulator<A> >
class LabelReachable {
public:
typedef A Arc;
typedef typename A::StateId StateId;
typedef typename A::Label Label;
typedef typename A::Weight Weight;
typedef typename IntervalSet<Label>::Interval Interval;
LabelReachable(const Fst<A> &fst, bool reach_input, S *s = 0,
bool keep_relabel_data = true)
: fst_(new VectorFst<Arc>(fst)),
s_(kNoStateId),
data_(new LabelReachableData<Label>(reach_input, keep_relabel_data)),
accumulator_(s ? s : new S()),
ncalls_(0),
nintervals_(0),
error_(false) {
StateId ins = fst_->NumStates();
TransformFst();
FindIntervals(ins);
delete fst_;
}
explicit LabelReachable(LabelReachableData<Label> *data, S *s = 0)
: fst_(0),
s_(kNoStateId),
data_(data),
accumulator_(s ? s : new S()),
ncalls_(0),
nintervals_(0),
error_(false) {
data_->IncrRefCount();
}
LabelReachable(const LabelReachable<A, S> &reachable) :
fst_(0),
s_(kNoStateId),
data_(reachable.data_),
accumulator_(new S(*reachable.accumulator_)),
ncalls_(0),
nintervals_(0),
error_(reachable.error_) {
data_->IncrRefCount();
}
~LabelReachable() {
if (!data_->DecrRefCount())
delete data_;
delete accumulator_;
if (ncalls_ > 0) {
VLOG(2) << "# of calls: " << ncalls_;
VLOG(2) << "# of intervals/call: " << (nintervals_ / ncalls_);
}
}
// Relabels w.r.t labels that give compact label sets.
Label Relabel(Label label) {
if (label == 0 || error_)
return label;
unordered_map<Label, Label> &label2index = *data_->Label2Index();
Label &relabel = label2index[label];
if (!relabel) // Add new label
relabel = label2index.size() + 1;
return relabel;
}
// Relabels Fst w.r.t to labels that give compact label sets.
void Relabel(MutableFst<Arc> *fst, bool relabel_input) {
for (StateIterator< MutableFst<Arc> > siter(*fst);
!siter.Done(); siter.Next()) {
StateId s = siter.Value();
for (MutableArcIterator< MutableFst<Arc> > aiter(fst, s);
!aiter.Done();
aiter.Next()) {
Arc arc = aiter.Value();
if (relabel_input)
arc.ilabel = Relabel(arc.ilabel);
else
arc.olabel = Relabel(arc.olabel);
aiter.SetValue(arc);
}
}
if (relabel_input) {
ArcSort(fst, ILabelCompare<Arc>());
fst->SetInputSymbols(0);
} else {
ArcSort(fst, OLabelCompare<Arc>());
fst->SetOutputSymbols(0);
}
}
// Returns relabeling pairs (cf. relabel.h::Relabel()).
// If 'avoid_collisions' is true, extra pairs are added to
// ensure no collisions when relabeling automata that have
// labels unseen here.
void RelabelPairs(vector<pair<Label, Label> > *pairs,
bool avoid_collisions = false) {
pairs->clear();
unordered_map<Label, Label> &label2index = *data_->Label2Index();
// Maps labels to their new values in [1, label2index().size()]
for (typename unordered_map<Label, Label>::const_iterator
it = label2index.begin(); it != label2index.end(); ++it)
if (it->second != data_->FinalLabel())
pairs->push_back(pair<Label, Label>(it->first, it->second));
if (avoid_collisions) {
// Ensures any label in [1, label2index().size()] is mapped either
// by the above step or to label2index() + 1 (to avoid collisions).
for (int i = 1; i <= label2index.size(); ++i) {
typename unordered_map<Label, Label>::const_iterator
it = label2index.find(i);
if (it == label2index.end() || it->second == data_->FinalLabel())
pairs->push_back(pair<Label, Label>(i, label2index.size() + 1));
}
}
}
// Set current state. Optionally set state associated
// with arc iterator to be passed to Reach.
void SetState(StateId s, StateId aiter_s = kNoStateId) {
s_ = s;
if (aiter_s != kNoStateId) {
accumulator_->SetState(aiter_s);
if (accumulator_->Error()) error_ = true;
}
}
// Can reach this label from current state?
// Original labels must be transformed by the Relabel methods above.
bool Reach(Label label) {
if (label == 0 || error_)
return false;
vector< IntervalSet<Label> > &isets = *data_->IntervalSets();
return isets[s_].Member(label);
}
// Can reach final state (via epsilon transitions) from this state?
bool ReachFinal() {
if (error_) return false;
vector< IntervalSet<Label> > &isets = *data_->IntervalSets();
return isets[s_].Member(data_->FinalLabel());
}
// Initialize with secondary FST to be used with Reach(Iterator,...).
// If copy is true, then 'fst' is a copy of the FST used in the
// previous call to this method (useful to avoid unnecessary updates).
template <class F>
void ReachInit(const F &fst, bool copy = false) {
accumulator_->Init(fst, copy);
if (accumulator_->Error()) error_ = true;
}
// Can reach any arc iterator label between iterator positions
// aiter_begin and aiter_end? If aiter_input = true, then iterator
// input labels are considered, o.w. output labels are considered.
// Arc iterator labels must be transformed by the Relabel methods
// above. If compute_weight is true, user may call ReachWeight().
template <class Iterator>
bool Reach(Iterator *aiter, ssize_t aiter_begin,
ssize_t aiter_end, bool aiter_input, bool compute_weight) {
if (error_) return false;
vector< IntervalSet<Label> > &isets = *data_->IntervalSets();
const vector<Interval> *intervals = isets[s_].Intervals();
++ncalls_;
nintervals_ += intervals->size();
reach_begin_ = -1;
reach_end_ = -1;
reach_weight_ = Weight::Zero();
uint32 flags = aiter->Flags(); // save flags to restore them on exit
aiter->SetFlags(kArcNoCache, kArcNoCache); // make caching optional
aiter->Seek(aiter_begin);
if (2 * (aiter_end - aiter_begin) < intervals->size()) {
// Check each arc against intervals.
// Set arc iterator flags to only compute the ilabel or olabel values,
// since they are the only values required for most of the arcs processed.
aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue,
kArcValueFlags);
Label reach_label = kNoLabel;
for (ssize_t aiter_pos = aiter_begin;
aiter_pos < aiter_end; aiter->Next(), ++aiter_pos) {
const A &arc = aiter->Value();
Label label = aiter_input ? arc.ilabel : arc.olabel;
if (label == reach_label || Reach(label)) {
reach_label = label;
if (reach_begin_ < 0)
reach_begin_ = aiter_pos;
reach_end_ = aiter_pos + 1;
if (compute_weight) {
if (!(aiter->Flags() & kArcWeightValue)) {
// If the 'arc.weight' wasn't computed by the call
// to 'aiter->Value()' above, we need to call
// 'aiter->Value()' again after having set the arc iterator
// flags to compute the arc weight value.
aiter->SetFlags(kArcWeightValue, kArcValueFlags);
const A &arcb = aiter->Value();
// Call the accumulator.
reach_weight_ = accumulator_->Sum(reach_weight_, arcb.weight);
// Only ilabel or olabel required to process the following
// arcs.
aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue,
kArcValueFlags);
} else {
// Call the accumulator.
reach_weight_ = accumulator_->Sum(reach_weight_, arc.weight);
}
}
}
}
} else {
// Check each interval against arcs
ssize_t begin_low, end_low = aiter_begin;
for (typename vector<Interval>::const_iterator
iiter = intervals->begin();
iiter != intervals->end(); ++iiter) {
begin_low = LowerBound(aiter, end_low, aiter_end,
aiter_input, iiter->begin);
end_low = LowerBound(aiter, begin_low, aiter_end,
aiter_input, iiter->end);
if (end_low - begin_low > 0) {
if (reach_begin_ < 0)
reach_begin_ = begin_low;
reach_end_ = end_low;
if (compute_weight) {
aiter->SetFlags(kArcWeightValue, kArcValueFlags);
reach_weight_ = accumulator_->Sum(reach_weight_, aiter,
begin_low, end_low);
}
}
}
}
aiter->SetFlags(flags, kArcFlags); // restore original flag values
return reach_begin_ >= 0;
}
// Returns iterator position of first matching arc.
ssize_t ReachBegin() const { return reach_begin_; }
// Returns iterator position one past last matching arc.
ssize_t ReachEnd() const { return reach_end_; }
// Return the sum of the weights for matching arcs.
// Valid only if compute_weight was true in Reach() call.
Weight ReachWeight() const { return reach_weight_; }
// Access to the relabeling map. Excludes epsilon (0) label but
// includes kNoLabel that is used internally for super-final
// transitons.
const unordered_map<Label, Label>& Label2Index() const {
return *data_->Label2Index();
}
LabelReachableData<Label> *GetData() const { return data_; }
bool Error() const { return error_ || accumulator_->Error(); }
private:
// Redirects labeled arcs (input or output labels determined by
// ReachInput()) to new label-specific final states. Each original
// final state is redirected via a transition labeled with kNoLabel
// to a new kNoLabel-specific final state. Creates super-initial
// state for all states with zero in-degree.
void TransformFst() {
StateId ins = fst_->NumStates();
StateId ons = ins;
vector<ssize_t> indeg(ins, 0);
// Redirects labeled arcs to new final states.
for (StateId s = 0; s < ins; ++s) {
for (MutableArcIterator< VectorFst<Arc> > aiter(fst_, s);
!aiter.Done();
aiter.Next()) {
Arc arc = aiter.Value();
Label label = data_->ReachInput() ? arc.ilabel : arc.olabel;
if (label) {
if (label2state_.find(label) == label2state_.end()) {
label2state_[label] = ons;
indeg.push_back(0);
++ons;
}
arc.nextstate = label2state_[label];
aiter.SetValue(arc);
}
++indeg[arc.nextstate]; // Finds in-degrees for next step.
}
// Redirects final weights to new final state.
Weight final = fst_->Final(s);
if (final != Weight::Zero()) {
if (label2state_.find(kNoLabel) == label2state_.end()) {
label2state_[kNoLabel] = ons;
indeg.push_back(0);
++ons;
}
Arc arc(kNoLabel, kNoLabel, final, label2state_[kNoLabel]);
fst_->AddArc(s, arc);
++indeg[arc.nextstate]; // Finds in-degrees for next step.
fst_->SetFinal(s, Weight::Zero());
}
}
// Add new final states to Fst.
while (fst_->NumStates() < ons) {
StateId s = fst_->AddState();
fst_->SetFinal(s, Weight::One());
}
// Creates a super-initial state for all states with zero in-degree.
StateId start = fst_->AddState();
fst_->SetStart(start);
for (StateId s = 0; s < start; ++s) {
if (indeg[s] == 0) {
Arc arc(0, 0, Weight::One(), s);
fst_->AddArc(start, arc);
}
}
}
void FindIntervals(StateId ins) {
StateReachable<A, Label> state_reachable(*fst_);
if (state_reachable.Error()) {
error_ = true;
return;
}
vector<Label> &state2index = state_reachable.State2Index();
vector< IntervalSet<Label> > &isets = *data_->IntervalSets();
isets = state_reachable.IntervalSets();
isets.resize(ins);
unordered_map<Label, Label> &label2index = *data_->Label2Index();
for (typename unordered_map<Label, StateId>::const_iterator
it = label2state_.begin();
it != label2state_.end();
++it) {
Label l = it->first;
StateId s = it->second;
Label i = state2index[s];
label2index[l] = i;
}
label2state_.clear();
double nintervals = 0;
ssize_t non_intervals = 0;
for (ssize_t s = 0; s < ins; ++s) {
nintervals += isets[s].Size();
if (isets[s].Size() > 1) {
++non_intervals;
VLOG(3) << "state: " << s << " # of intervals: " << isets[s].Size();
}
}
VLOG(2) << "# of states: " << ins;
VLOG(2) << "# of intervals: " << nintervals;
VLOG(2) << "# of intervals/state: " << nintervals/ins;
VLOG(2) << "# of non-interval states: " << non_intervals;
}
template <class Iterator>
ssize_t LowerBound(Iterator *aiter, ssize_t aiter_begin,
ssize_t aiter_end, bool aiter_input,
Label match_label) const {
// Only need to compute the ilabel or olabel of arcs when
// performing the binary search.
aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue,
kArcValueFlags);
ssize_t low = aiter_begin;
ssize_t high = aiter_end;
while (low < high) {
ssize_t mid = (low + high) / 2;
aiter->Seek(mid);
Label label = aiter_input ?
aiter->Value().ilabel : aiter->Value().olabel;
if (label > match_label) {
high = mid;
} else if (label < match_label) {
low = mid + 1;
} else {
// Find first matching label (when non-deterministic)
for (ssize_t i = mid; i > low; --i) {
aiter->Seek(i - 1);
label = aiter_input ? aiter->Value().ilabel : aiter->Value().olabel;
if (label != match_label) {
aiter->Seek(i);
aiter->SetFlags(kArcValueFlags, kArcValueFlags);
return i;
}
}
aiter->SetFlags(kArcValueFlags, kArcValueFlags);
return low;
}
}
aiter->Seek(low);
aiter->SetFlags(kArcValueFlags, kArcValueFlags);
return low;
}
VectorFst<Arc> *fst_;
StateId s_; // Current state
unordered_map<Label, StateId> label2state_; // Finds final state for a label
ssize_t reach_begin_; // Iterator pos of first match
ssize_t reach_end_; // Iterator pos after last match
Weight reach_weight_; // Gives weight sum of arc iterator
// arcs with reachable labels.
LabelReachableData<Label> *data_; // Shareable data between copies
S *accumulator_; // Sums arc weights
double ncalls_;
double nintervals_;
bool error_;
void operator=(const LabelReachable<A, S> &); // Disallow
};
} // namespace fst
#endif // FST_LIB_LABEL_REACHABLE_H__