| // label_reachable.h |
| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Copyright 2005-2010 Google, Inc. |
| // Author: riley@google.com (Michael Riley) |
| // |
| // \file |
| // Class to determine if a non-epsilon label can be read as the |
| // first non-epsilon symbol along some path from a given state. |
| |
| |
| #ifndef FST_LIB_LABEL_REACHABLE_H__ |
| #define FST_LIB_LABEL_REACHABLE_H__ |
| |
| #include <unordered_map> |
| using std::tr1::unordered_map; |
| using std::tr1::unordered_multimap; |
| #include <vector> |
| using std::vector; |
| |
| #include <fst/accumulator.h> |
| #include <fst/arcsort.h> |
| #include <fst/interval-set.h> |
| #include <fst/state-reachable.h> |
| #include <fst/vector-fst.h> |
| |
| |
| namespace fst { |
| |
| // Stores shareable data for label reachable class copies. |
| template <typename L> |
| class LabelReachableData { |
| public: |
| typedef L Label; |
| typedef typename IntervalSet<L>::Interval Interval; |
| |
| explicit LabelReachableData(bool reach_input, bool keep_relabel_data = true) |
| : reach_input_(reach_input), |
| keep_relabel_data_(keep_relabel_data), |
| have_relabel_data_(true), |
| final_label_(kNoLabel) {} |
| |
| ~LabelReachableData() {} |
| |
| bool ReachInput() const { return reach_input_; } |
| |
| vector< IntervalSet<L> > *IntervalSets() { return &isets_; } |
| |
| unordered_map<L, L> *Label2Index() { |
| if (!have_relabel_data_) |
| FSTERROR() << "LabelReachableData: no relabeling data"; |
| return &label2index_; |
| } |
| |
| Label FinalLabel() { |
| if (final_label_ == kNoLabel) |
| final_label_ = label2index_[kNoLabel]; |
| return final_label_; |
| } |
| |
| static LabelReachableData<L> *Read(istream &istrm) { |
| LabelReachableData<L> *data = new LabelReachableData<L>(); |
| |
| ReadType(istrm, &data->reach_input_); |
| ReadType(istrm, &data->keep_relabel_data_); |
| data->have_relabel_data_ = data->keep_relabel_data_; |
| if (data->keep_relabel_data_) |
| ReadType(istrm, &data->label2index_); |
| ReadType(istrm, &data->final_label_); |
| ReadType(istrm, &data->isets_); |
| return data; |
| } |
| |
| bool Write(ostream &ostrm) { |
| WriteType(ostrm, reach_input_); |
| WriteType(ostrm, keep_relabel_data_); |
| if (keep_relabel_data_) |
| WriteType(ostrm, label2index_); |
| WriteType(ostrm, FinalLabel()); |
| WriteType(ostrm, isets_); |
| return true; |
| } |
| |
| int RefCount() const { return ref_count_.count(); } |
| int IncrRefCount() { return ref_count_.Incr(); } |
| int DecrRefCount() { return ref_count_.Decr(); } |
| |
| private: |
| LabelReachableData() {} |
| |
| bool reach_input_; // Input or output labels considered? |
| bool keep_relabel_data_; // Save label2index_ to file? |
| bool have_relabel_data_; // Using label2index_? |
| Label final_label_; // Final label |
| RefCounter ref_count_; // Reference count. |
| unordered_map<L, L> label2index_; // Finds index for a label. |
| vector<IntervalSet <L> > isets_; // Interval sets per state. |
| |
| DISALLOW_COPY_AND_ASSIGN(LabelReachableData); |
| }; |
| |
| |
| // Tests reachability of labels from a given state. If reach_input = |
| // true, then input labels are considered, o.w. output labels are |
| // considered. To test for reachability from a state s, first do |
| // SetState(s). Then a label l can be reached from state s of FST f |
| // iff Reach(r) is true where r = Relabel(l). The relabeling is |
| // required to ensure a compact representation of the reachable |
| // labels. |
| |
| // The whole FST can be relabeled instead with Relabel(&f, |
| // reach_input) so that the test Reach(r) applies directly to the |
| // labels of the transformed FST f. The relabeled FST will also be |
| // sorted appropriately for composition. |
| // |
| // Reachability of a final state from state s (via an epsilon path) |
| // can be tested with ReachFinal(); |
| // |
| // Reachability can also be tested on the set of labels specified by |
| // an arc iterator, useful for FST composition. In particular, |
| // Reach(aiter, ...) is true if labels on the input (output) side of |
| // the transitions of the arc iterator, when iter_input is true |
| // (false), can be reached from the state s. The iterator labels must |
| // have already been relabeled. |
| // |
| // With the arc iterator test of reachability, the begin position, end |
| // position and accumulated arc weight of the matches can be |
| // returned. The optional template argument controls how reachable arc |
| // weights are accumulated. The default uses the semiring |
| // Plus(). Alternative ones can be used to distribute the weights in |
| // composition in various ways. |
| template <class A, class S = DefaultAccumulator<A> > |
| class LabelReachable { |
| public: |
| typedef A Arc; |
| typedef typename A::StateId StateId; |
| typedef typename A::Label Label; |
| typedef typename A::Weight Weight; |
| typedef typename IntervalSet<Label>::Interval Interval; |
| |
| LabelReachable(const Fst<A> &fst, bool reach_input, S *s = 0, |
| bool keep_relabel_data = true) |
| : fst_(new VectorFst<Arc>(fst)), |
| s_(kNoStateId), |
| data_(new LabelReachableData<Label>(reach_input, keep_relabel_data)), |
| accumulator_(s ? s : new S()), |
| ncalls_(0), |
| nintervals_(0), |
| error_(false) { |
| StateId ins = fst_->NumStates(); |
| TransformFst(); |
| FindIntervals(ins); |
| delete fst_; |
| } |
| |
| explicit LabelReachable(LabelReachableData<Label> *data, S *s = 0) |
| : fst_(0), |
| s_(kNoStateId), |
| data_(data), |
| accumulator_(s ? s : new S()), |
| ncalls_(0), |
| nintervals_(0), |
| error_(false) { |
| data_->IncrRefCount(); |
| } |
| |
| LabelReachable(const LabelReachable<A, S> &reachable) : |
| fst_(0), |
| s_(kNoStateId), |
| data_(reachable.data_), |
| accumulator_(new S(*reachable.accumulator_)), |
| ncalls_(0), |
| nintervals_(0), |
| error_(reachable.error_) { |
| data_->IncrRefCount(); |
| } |
| |
| ~LabelReachable() { |
| if (!data_->DecrRefCount()) |
| delete data_; |
| delete accumulator_; |
| if (ncalls_ > 0) { |
| VLOG(2) << "# of calls: " << ncalls_; |
| VLOG(2) << "# of intervals/call: " << (nintervals_ / ncalls_); |
| } |
| } |
| |
| // Relabels w.r.t labels that give compact label sets. |
| Label Relabel(Label label) { |
| if (label == 0 || error_) |
| return label; |
| unordered_map<Label, Label> &label2index = *data_->Label2Index(); |
| Label &relabel = label2index[label]; |
| if (!relabel) // Add new label |
| relabel = label2index.size() + 1; |
| return relabel; |
| } |
| |
| // Relabels Fst w.r.t to labels that give compact label sets. |
| void Relabel(MutableFst<Arc> *fst, bool relabel_input) { |
| for (StateIterator< MutableFst<Arc> > siter(*fst); |
| !siter.Done(); siter.Next()) { |
| StateId s = siter.Value(); |
| for (MutableArcIterator< MutableFst<Arc> > aiter(fst, s); |
| !aiter.Done(); |
| aiter.Next()) { |
| Arc arc = aiter.Value(); |
| if (relabel_input) |
| arc.ilabel = Relabel(arc.ilabel); |
| else |
| arc.olabel = Relabel(arc.olabel); |
| aiter.SetValue(arc); |
| } |
| } |
| if (relabel_input) { |
| ArcSort(fst, ILabelCompare<Arc>()); |
| fst->SetInputSymbols(0); |
| } else { |
| ArcSort(fst, OLabelCompare<Arc>()); |
| fst->SetOutputSymbols(0); |
| } |
| } |
| |
| // Returns relabeling pairs (cf. relabel.h::Relabel()). |
| // If 'avoid_collisions' is true, extra pairs are added to |
| // ensure no collisions when relabeling automata that have |
| // labels unseen here. |
| void RelabelPairs(vector<pair<Label, Label> > *pairs, |
| bool avoid_collisions = false) { |
| pairs->clear(); |
| unordered_map<Label, Label> &label2index = *data_->Label2Index(); |
| // Maps labels to their new values in [1, label2index().size()] |
| for (typename unordered_map<Label, Label>::const_iterator |
| it = label2index.begin(); it != label2index.end(); ++it) |
| if (it->second != data_->FinalLabel()) |
| pairs->push_back(pair<Label, Label>(it->first, it->second)); |
| if (avoid_collisions) { |
| // Ensures any label in [1, label2index().size()] is mapped either |
| // by the above step or to label2index() + 1 (to avoid collisions). |
| for (int i = 1; i <= label2index.size(); ++i) { |
| typename unordered_map<Label, Label>::const_iterator |
| it = label2index.find(i); |
| if (it == label2index.end() || it->second == data_->FinalLabel()) |
| pairs->push_back(pair<Label, Label>(i, label2index.size() + 1)); |
| } |
| } |
| } |
| |
| // Set current state. Optionally set state associated |
| // with arc iterator to be passed to Reach. |
| void SetState(StateId s, StateId aiter_s = kNoStateId) { |
| s_ = s; |
| if (aiter_s != kNoStateId) { |
| accumulator_->SetState(aiter_s); |
| if (accumulator_->Error()) error_ = true; |
| } |
| } |
| |
| // Can reach this label from current state? |
| // Original labels must be transformed by the Relabel methods above. |
| bool Reach(Label label) { |
| if (label == 0 || error_) |
| return false; |
| vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); |
| return isets[s_].Member(label); |
| |
| } |
| |
| // Can reach final state (via epsilon transitions) from this state? |
| bool ReachFinal() { |
| if (error_) return false; |
| vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); |
| return isets[s_].Member(data_->FinalLabel()); |
| } |
| |
| // Initialize with secondary FST to be used with Reach(Iterator,...). |
| // If copy is true, then 'fst' is a copy of the FST used in the |
| // previous call to this method (useful to avoid unnecessary updates). |
| template <class F> |
| void ReachInit(const F &fst, bool copy = false) { |
| accumulator_->Init(fst, copy); |
| if (accumulator_->Error()) error_ = true; |
| } |
| |
| // Can reach any arc iterator label between iterator positions |
| // aiter_begin and aiter_end? If aiter_input = true, then iterator |
| // input labels are considered, o.w. output labels are considered. |
| // Arc iterator labels must be transformed by the Relabel methods |
| // above. If compute_weight is true, user may call ReachWeight(). |
| template <class Iterator> |
| bool Reach(Iterator *aiter, ssize_t aiter_begin, |
| ssize_t aiter_end, bool aiter_input, bool compute_weight) { |
| if (error_) return false; |
| vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); |
| const vector<Interval> *intervals = isets[s_].Intervals(); |
| ++ncalls_; |
| nintervals_ += intervals->size(); |
| |
| reach_begin_ = -1; |
| reach_end_ = -1; |
| reach_weight_ = Weight::Zero(); |
| |
| uint32 flags = aiter->Flags(); // save flags to restore them on exit |
| aiter->SetFlags(kArcNoCache, kArcNoCache); // make caching optional |
| aiter->Seek(aiter_begin); |
| |
| if (2 * (aiter_end - aiter_begin) < intervals->size()) { |
| // Check each arc against intervals. |
| // Set arc iterator flags to only compute the ilabel or olabel values, |
| // since they are the only values required for most of the arcs processed. |
| aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue, |
| kArcValueFlags); |
| Label reach_label = kNoLabel; |
| for (ssize_t aiter_pos = aiter_begin; |
| aiter_pos < aiter_end; aiter->Next(), ++aiter_pos) { |
| const A &arc = aiter->Value(); |
| Label label = aiter_input ? arc.ilabel : arc.olabel; |
| if (label == reach_label || Reach(label)) { |
| reach_label = label; |
| if (reach_begin_ < 0) |
| reach_begin_ = aiter_pos; |
| reach_end_ = aiter_pos + 1; |
| if (compute_weight) { |
| if (!(aiter->Flags() & kArcWeightValue)) { |
| // If the 'arc.weight' wasn't computed by the call |
| // to 'aiter->Value()' above, we need to call |
| // 'aiter->Value()' again after having set the arc iterator |
| // flags to compute the arc weight value. |
| aiter->SetFlags(kArcWeightValue, kArcValueFlags); |
| const A &arcb = aiter->Value(); |
| // Call the accumulator. |
| reach_weight_ = accumulator_->Sum(reach_weight_, arcb.weight); |
| // Only ilabel or olabel required to process the following |
| // arcs. |
| aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue, |
| kArcValueFlags); |
| } else { |
| // Call the accumulator. |
| reach_weight_ = accumulator_->Sum(reach_weight_, arc.weight); |
| } |
| } |
| } |
| } |
| } else { |
| // Check each interval against arcs |
| ssize_t begin_low, end_low = aiter_begin; |
| for (typename vector<Interval>::const_iterator |
| iiter = intervals->begin(); |
| iiter != intervals->end(); ++iiter) { |
| begin_low = LowerBound(aiter, end_low, aiter_end, |
| aiter_input, iiter->begin); |
| end_low = LowerBound(aiter, begin_low, aiter_end, |
| aiter_input, iiter->end); |
| if (end_low - begin_low > 0) { |
| if (reach_begin_ < 0) |
| reach_begin_ = begin_low; |
| reach_end_ = end_low; |
| if (compute_weight) { |
| aiter->SetFlags(kArcWeightValue, kArcValueFlags); |
| reach_weight_ = accumulator_->Sum(reach_weight_, aiter, |
| begin_low, end_low); |
| } |
| } |
| } |
| } |
| |
| aiter->SetFlags(flags, kArcFlags); // restore original flag values |
| return reach_begin_ >= 0; |
| } |
| |
| // Returns iterator position of first matching arc. |
| ssize_t ReachBegin() const { return reach_begin_; } |
| |
| // Returns iterator position one past last matching arc. |
| ssize_t ReachEnd() const { return reach_end_; } |
| |
| // Return the sum of the weights for matching arcs. |
| // Valid only if compute_weight was true in Reach() call. |
| Weight ReachWeight() const { return reach_weight_; } |
| |
| // Access to the relabeling map. Excludes epsilon (0) label but |
| // includes kNoLabel that is used internally for super-final |
| // transitons. |
| const unordered_map<Label, Label>& Label2Index() const { |
| return *data_->Label2Index(); |
| } |
| |
| LabelReachableData<Label> *GetData() const { return data_; } |
| |
| bool Error() const { return error_ || accumulator_->Error(); } |
| |
| private: |
| // Redirects labeled arcs (input or output labels determined by |
| // ReachInput()) to new label-specific final states. Each original |
| // final state is redirected via a transition labeled with kNoLabel |
| // to a new kNoLabel-specific final state. Creates super-initial |
| // state for all states with zero in-degree. |
| void TransformFst() { |
| StateId ins = fst_->NumStates(); |
| StateId ons = ins; |
| |
| vector<ssize_t> indeg(ins, 0); |
| |
| // Redirects labeled arcs to new final states. |
| for (StateId s = 0; s < ins; ++s) { |
| for (MutableArcIterator< VectorFst<Arc> > aiter(fst_, s); |
| !aiter.Done(); |
| aiter.Next()) { |
| Arc arc = aiter.Value(); |
| Label label = data_->ReachInput() ? arc.ilabel : arc.olabel; |
| if (label) { |
| if (label2state_.find(label) == label2state_.end()) { |
| label2state_[label] = ons; |
| indeg.push_back(0); |
| ++ons; |
| } |
| arc.nextstate = label2state_[label]; |
| aiter.SetValue(arc); |
| } |
| ++indeg[arc.nextstate]; // Finds in-degrees for next step. |
| } |
| |
| // Redirects final weights to new final state. |
| Weight final = fst_->Final(s); |
| if (final != Weight::Zero()) { |
| if (label2state_.find(kNoLabel) == label2state_.end()) { |
| label2state_[kNoLabel] = ons; |
| indeg.push_back(0); |
| ++ons; |
| } |
| Arc arc(kNoLabel, kNoLabel, final, label2state_[kNoLabel]); |
| fst_->AddArc(s, arc); |
| ++indeg[arc.nextstate]; // Finds in-degrees for next step. |
| |
| fst_->SetFinal(s, Weight::Zero()); |
| } |
| } |
| |
| // Add new final states to Fst. |
| while (fst_->NumStates() < ons) { |
| StateId s = fst_->AddState(); |
| fst_->SetFinal(s, Weight::One()); |
| } |
| |
| // Creates a super-initial state for all states with zero in-degree. |
| StateId start = fst_->AddState(); |
| fst_->SetStart(start); |
| for (StateId s = 0; s < start; ++s) { |
| if (indeg[s] == 0) { |
| Arc arc(0, 0, Weight::One(), s); |
| fst_->AddArc(start, arc); |
| } |
| } |
| } |
| |
| void FindIntervals(StateId ins) { |
| StateReachable<A, Label> state_reachable(*fst_); |
| if (state_reachable.Error()) { |
| error_ = true; |
| return; |
| } |
| |
| vector<Label> &state2index = state_reachable.State2Index(); |
| vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); |
| isets = state_reachable.IntervalSets(); |
| isets.resize(ins); |
| |
| unordered_map<Label, Label> &label2index = *data_->Label2Index(); |
| for (typename unordered_map<Label, StateId>::const_iterator |
| it = label2state_.begin(); |
| it != label2state_.end(); |
| ++it) { |
| Label l = it->first; |
| StateId s = it->second; |
| Label i = state2index[s]; |
| label2index[l] = i; |
| } |
| label2state_.clear(); |
| |
| double nintervals = 0; |
| ssize_t non_intervals = 0; |
| for (ssize_t s = 0; s < ins; ++s) { |
| nintervals += isets[s].Size(); |
| if (isets[s].Size() > 1) { |
| ++non_intervals; |
| VLOG(3) << "state: " << s << " # of intervals: " << isets[s].Size(); |
| } |
| } |
| VLOG(2) << "# of states: " << ins; |
| VLOG(2) << "# of intervals: " << nintervals; |
| VLOG(2) << "# of intervals/state: " << nintervals/ins; |
| VLOG(2) << "# of non-interval states: " << non_intervals; |
| } |
| |
| template <class Iterator> |
| ssize_t LowerBound(Iterator *aiter, ssize_t aiter_begin, |
| ssize_t aiter_end, bool aiter_input, |
| Label match_label) const { |
| // Only need to compute the ilabel or olabel of arcs when |
| // performing the binary search. |
| aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue, |
| kArcValueFlags); |
| ssize_t low = aiter_begin; |
| ssize_t high = aiter_end; |
| while (low < high) { |
| ssize_t mid = (low + high) / 2; |
| aiter->Seek(mid); |
| Label label = aiter_input ? |
| aiter->Value().ilabel : aiter->Value().olabel; |
| if (label > match_label) { |
| high = mid; |
| } else if (label < match_label) { |
| low = mid + 1; |
| } else { |
| // Find first matching label (when non-deterministic) |
| for (ssize_t i = mid; i > low; --i) { |
| aiter->Seek(i - 1); |
| label = aiter_input ? aiter->Value().ilabel : aiter->Value().olabel; |
| if (label != match_label) { |
| aiter->Seek(i); |
| aiter->SetFlags(kArcValueFlags, kArcValueFlags); |
| return i; |
| } |
| } |
| aiter->SetFlags(kArcValueFlags, kArcValueFlags); |
| return low; |
| } |
| } |
| aiter->Seek(low); |
| aiter->SetFlags(kArcValueFlags, kArcValueFlags); |
| return low; |
| } |
| |
| VectorFst<Arc> *fst_; |
| StateId s_; // Current state |
| unordered_map<Label, StateId> label2state_; // Finds final state for a label |
| |
| ssize_t reach_begin_; // Iterator pos of first match |
| ssize_t reach_end_; // Iterator pos after last match |
| Weight reach_weight_; // Gives weight sum of arc iterator |
| // arcs with reachable labels. |
| LabelReachableData<Label> *data_; // Shareable data between copies |
| S *accumulator_; // Sums arc weights |
| |
| double ncalls_; |
| double nintervals_; |
| bool error_; |
| |
| void operator=(const LabelReachable<A, S> &); // Disallow |
| }; |
| |
| } // namespace fst |
| |
| #endif // FST_LIB_LABEL_REACHABLE_H__ |