| // determinize.h |
| |
| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Copyright 2005-2010 Google, Inc. |
| // Author: riley@google.com (Michael Riley) |
| // |
| // \file |
| // Functions and classes to determinize an FST. |
| |
| #ifndef FST_LIB_DETERMINIZE_H__ |
| #define FST_LIB_DETERMINIZE_H__ |
| |
| #include <algorithm> |
| #include <climits> |
| #include <tr1/unordered_map> |
| using std::tr1::unordered_map; |
| using std::tr1::unordered_multimap; |
| #include <map> |
| #include <fst/slist.h> |
| #include <string> |
| #include <vector> |
| using std::vector; |
| |
| #include <fst/arc-map.h> |
| #include <fst/cache.h> |
| #include <fst/bi-table.h> |
| #include <fst/factor-weight.h> |
| #include <fst/prune.h> |
| #include <fst/test-properties.h> |
| |
| |
| namespace fst { |
| |
| // |
| // COMMON DIVISORS - these are used in determinization to compute |
| // the transition weights. In the simplest case, it is just the same |
| // as the semiring Plus(). However, other choices permit more efficient |
| // determinization when the output contains strings. |
| // |
| |
| // The default common divisor uses the semiring Plus. |
| template <class W> |
| class DefaultCommonDivisor { |
| public: |
| typedef W Weight; |
| |
| W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); } |
| }; |
| |
| |
| // The label common divisor for a (left) string semiring selects a |
| // single letter common prefix or the empty string. This is used in |
| // the determinization of output strings so that at most a single |
| // letter will appear in the output of a transtion. |
| template <typename L, StringType S> |
| class LabelCommonDivisor { |
| public: |
| typedef StringWeight<L, S> Weight; |
| |
| Weight operator()(const Weight &w1, const Weight &w2) const { |
| StringWeightIterator<L, S> iter1(w1); |
| StringWeightIterator<L, S> iter2(w2); |
| |
| if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) { |
| FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring"; |
| return Weight::NoWeight(); |
| } else if (w1.Size() == 0 || w2.Size() == 0) { |
| return Weight::One(); |
| } else if (w1 == Weight::Zero()) { |
| return Weight(iter2.Value()); |
| } else if (w2 == Weight::Zero()) { |
| return Weight(iter1.Value()); |
| } else if (iter1.Value() == iter2.Value()) { |
| return Weight(iter1.Value()); |
| } else { |
| return Weight::One(); |
| } |
| } |
| }; |
| |
| |
| // The gallic common divisor uses the label common divisor on the |
| // string component and the template argument D common divisor on the |
| // weight component, which defaults to the default common divisor. |
| template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> > |
| class GallicCommonDivisor { |
| public: |
| typedef GallicWeight<L, W, S> Weight; |
| |
| Weight operator()(const Weight &w1, const Weight &w2) const { |
| return Weight(label_common_divisor_(w1.Value1(), w2.Value1()), |
| weight_common_divisor_(w1.Value2(), w2.Value2())); |
| } |
| |
| private: |
| LabelCommonDivisor<L, S> label_common_divisor_; |
| D weight_common_divisor_; |
| }; |
| |
| |
| // Represents an element in a subset |
| template <class A> |
| struct DeterminizeElement { |
| typedef typename A::StateId StateId; |
| typedef typename A::Weight Weight; |
| |
| DeterminizeElement() {} |
| |
| DeterminizeElement(StateId s, Weight w) : state_id(s), weight(w) {} |
| |
| bool operator==(const DeterminizeElement<A> & element) const { |
| return state_id == element.state_id && weight == element.weight; |
| } |
| |
| bool operator<(const DeterminizeElement<A> & element) const { |
| return state_id < element.state_id || |
| (state_id == element.state_id && weight == element.weight); |
| } |
| |
| StateId state_id; // Input state Id |
| Weight weight; // Residual weight |
| }; |
| |
| |
| // |
| // DETERMINIZE FILTERS - these can be used in determinization to compute |
| // transformations on the subsets prior to their being added as destination |
| // states. The filter operates on a map between a label and the |
| // corresponding destination subsets. The possibly modified map is |
| // then used to construct the destination states for arcs exiting state 's'. |
| // It must define the ordered map type LabelMap and have a default |
| // and copy constructor. |
| |
| // A determinize filter that does not modify its input. |
| template <class Arc> |
| struct IdentityDeterminizeFilter { |
| typedef typename Arc::StateId StateId; |
| typedef typename Arc::Label Label; |
| typedef slist< DeterminizeElement<Arc> > Subset; |
| typedef map<Label, Subset*> LabelMap; |
| |
| static uint64 Properties(uint64 props) { return props; } |
| |
| void operator()(StateId s, LabelMap *label_map) {} |
| }; |
| |
| |
| // |
| // DETERMINIZATION STATE TABLES |
| // |
| // The determiziation state table has the form: |
| // |
| // template <class Arc> |
| // class DeterminizeStateTable { |
| // public: |
| // typedef typename Arc::StateId StateId; |
| // typedef DeterminizeElement<Arc> Element; |
| // typedef slist<Element> Subset; |
| // |
| // // Required constuctor |
| // DeterminizeStateTable(); |
| // |
| // // Required copy constructor that does not copy state |
| // DeterminizeStateTable(const DeterminizeStateTable<A,P> &table); |
| // |
| // // Lookup state ID by subset (not depending of the element order). |
| // // If it doesn't exist, then add it. FindState takes |
| // // ownership of the subset argument (so that it doesn't have to |
| // // copy it if it creates a new state). |
| // StateId FindState(Subset *subset); |
| // |
| // // Lookup subset by ID. |
| // const Subset *FindSubset(StateId id) const; |
| // }; |
| // |
| |
| // The default determinization state table based on the |
| // compact hash bi-table. |
| template <class Arc> |
| class DefaultDeterminizeStateTable { |
| public: |
| typedef typename Arc::StateId StateId; |
| typedef typename Arc::Label Label; |
| typedef typename Arc::Weight Weight; |
| typedef DeterminizeElement<Arc> Element; |
| typedef slist<Element> Subset; |
| |
| explicit DefaultDeterminizeStateTable(size_t table_size = 0) |
| : table_size_(table_size), |
| subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } |
| |
| DefaultDeterminizeStateTable(const DefaultDeterminizeStateTable<Arc> &table) |
| : table_size_(table.table_size_), |
| subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } |
| |
| ~DefaultDeterminizeStateTable() { |
| for (StateId s = 0; s < subsets_.Size(); ++s) |
| delete subsets_.FindEntry(s); |
| } |
| |
| // Finds the state corresponding to a subset. Only creates a new |
| // state if the subset is not found. FindState takes ownership of |
| // the subset argument (so that it doesn't have to copy it if it |
| // creates a new state). |
| StateId FindState(Subset *subset) { |
| StateId ns = subsets_.Size(); |
| StateId s = subsets_.FindId(subset); |
| if (s != ns) delete subset; // subset found |
| return s; |
| } |
| |
| const Subset* FindSubset(StateId s) { return subsets_.FindEntry(s); } |
| |
| private: |
| // Comparison object for hashing Subset(s). Subsets are not sorted in this |
| // implementation, so ordering must not be assumed in the equivalence |
| // test. |
| class SubsetEqual { |
| public: |
| SubsetEqual() { // needed for compilation but should never be called |
| FSTERROR() << "SubsetEqual: default constructor not implemented"; |
| } |
| |
| // Constructor takes vector needed to check equality. See immediately |
| // below for constraints on it. |
| explicit SubsetEqual(vector<Element *> *elements) |
| : elements_(elements) {} |
| |
| // At each call to operator(), the elements_ vector should contain |
| // only NULLs. When this operator returns, elements_ will still |
| // have this property. |
| bool operator()(Subset* subset1, Subset* subset2) const { |
| if (!subset1 && !subset2) |
| return true; |
| if ((subset1 && !subset2) || (!subset1 && subset2)) |
| return false; |
| |
| if (subset1->size() != subset2->size()) |
| return false; |
| |
| // Loads first subset elements in element vector. |
| for (typename Subset::iterator iter1 = subset1->begin(); |
| iter1 != subset1->end(); |
| ++iter1) { |
| Element &element1 = *iter1; |
| while (elements_->size() <= element1.state_id) |
| elements_->push_back(0); |
| (*elements_)[element1.state_id] = &element1; |
| } |
| |
| // Checks second subset matches first via element vector. |
| for (typename Subset::iterator iter2 = subset2->begin(); |
| iter2 != subset2->end(); |
| ++iter2) { |
| Element &element2 = *iter2; |
| while (elements_->size() <= element2.state_id) |
| elements_->push_back(0); |
| Element *element1 = (*elements_)[element2.state_id]; |
| if (!element1 || element1->weight != element2.weight) { |
| // Mismatch found. Resets element vector before returning false. |
| for (typename Subset::iterator iter1 = subset1->begin(); |
| iter1 != subset1->end(); |
| ++iter1) |
| (*elements_)[iter1->state_id] = 0; |
| return false; |
| } else { |
| (*elements_)[element2.state_id] = 0; // Clears entry |
| } |
| } |
| return true; |
| } |
| private: |
| vector<Element *> *elements_; |
| }; |
| |
| // Hash function for Subset to Fst states. Subset elements are not |
| // sorted in this implementation, so the hash must be invariant |
| // under subset reordering. |
| class SubsetKey { |
| public: |
| size_t operator()(const Subset* subset) const { |
| size_t hash = 0; |
| if (subset) { |
| for (typename Subset::const_iterator iter = subset->begin(); |
| iter != subset->end(); |
| ++iter) { |
| const Element &element = *iter; |
| int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1; |
| int rshift = CHAR_BIT * sizeof(size_t) - lshift; |
| size_t n = element.state_id; |
| hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash(); |
| } |
| } |
| return hash; |
| } |
| }; |
| |
| size_t table_size_; |
| |
| typedef CompactHashBiTable<StateId, Subset *, |
| SubsetKey, SubsetEqual, HS_STL> SubsetTable; |
| |
| SubsetTable subsets_; |
| vector<Element *> elements_; |
| |
| void operator=(const DefaultDeterminizeStateTable<Arc> &); // disallow |
| }; |
| |
| // Options for finite-state transducer determinization templated on |
| // the arc type, common divisor, the determinization filter and the |
| // state table. DeterminizeFst takes ownership of the determinization |
| // filter and state table if provided. |
| template <class Arc, |
| class D = DefaultCommonDivisor<typename Arc::Weight>, |
| class F = IdentityDeterminizeFilter<Arc>, |
| class T = DefaultDeterminizeStateTable<Arc> > |
| struct DeterminizeFstOptions : CacheOptions { |
| typedef typename Arc::Label Label; |
| float delta; // Quantization delta for subset weights |
| Label subsequential_label; // Label used for residual final output |
| // when producing subsequential transducers. |
| F *filter; // Determinization filter |
| T *state_table; // Determinization state table |
| |
| explicit DeterminizeFstOptions(const CacheOptions &opts, |
| float del = kDelta, Label lab = 0, |
| F *filt = 0, |
| T *table = 0) |
| : CacheOptions(opts), delta(del), subsequential_label(lab), |
| filter(filt), state_table(table) {} |
| |
| explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0, |
| F *filt = 0, T *table = 0) |
| : delta(del), subsequential_label(lab), filter(filt), |
| state_table(table) {} |
| }; |
| |
| // Implementation of delayed DeterminizeFst. This base class is |
| // common to the variants that implement acceptor and transducer |
| // determinization. |
| template <class A> |
| class DeterminizeFstImplBase : public CacheImpl<A> { |
| public: |
| using FstImpl<A>::SetType; |
| using FstImpl<A>::SetProperties; |
| using FstImpl<A>::Properties; |
| using FstImpl<A>::SetInputSymbols; |
| using FstImpl<A>::SetOutputSymbols; |
| |
| using CacheBaseImpl< CacheState<A> >::HasStart; |
| using CacheBaseImpl< CacheState<A> >::HasFinal; |
| using CacheBaseImpl< CacheState<A> >::HasArcs; |
| using CacheBaseImpl< CacheState<A> >::SetFinal; |
| using CacheBaseImpl< CacheState<A> >::SetStart; |
| |
| typedef typename A::Label Label; |
| typedef typename A::Weight Weight; |
| typedef typename A::StateId StateId; |
| typedef CacheState<A> State; |
| |
| template <class D, class F, class T> |
| DeterminizeFstImplBase(const Fst<A> &fst, |
| const DeterminizeFstOptions<A, D, F, T> &opts) |
| : CacheImpl<A>(opts), fst_(fst.Copy()) { |
| SetType("determinize"); |
| uint64 iprops = fst.Properties(kFstProperties, false); |
| uint64 dprops = DeterminizeProperties(iprops, |
| opts.subsequential_label != 0); |
| SetProperties(F::Properties(dprops), kCopyProperties); |
| SetInputSymbols(fst.InputSymbols()); |
| SetOutputSymbols(fst.OutputSymbols()); |
| } |
| |
| DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl) |
| : CacheImpl<A>(impl), |
| fst_(impl.fst_->Copy(true)) { |
| SetType("determinize"); |
| SetProperties(impl.Properties(), kCopyProperties); |
| SetInputSymbols(impl.InputSymbols()); |
| SetOutputSymbols(impl.OutputSymbols()); |
| } |
| |
| virtual ~DeterminizeFstImplBase() { delete fst_; } |
| |
| virtual DeterminizeFstImplBase<A> *Copy() = 0; |
| |
| StateId Start() { |
| if (!HasStart()) { |
| StateId start = ComputeStart(); |
| if (start != kNoStateId) { |
| SetStart(start); |
| } |
| } |
| return CacheImpl<A>::Start(); |
| } |
| |
| Weight Final(StateId s) { |
| if (!HasFinal(s)) { |
| Weight final = ComputeFinal(s); |
| SetFinal(s, final); |
| } |
| return CacheImpl<A>::Final(s); |
| } |
| |
| virtual void Expand(StateId s) = 0; |
| |
| size_t NumArcs(StateId s) { |
| if (!HasArcs(s)) |
| Expand(s); |
| return CacheImpl<A>::NumArcs(s); |
| } |
| |
| size_t NumInputEpsilons(StateId s) { |
| if (!HasArcs(s)) |
| Expand(s); |
| return CacheImpl<A>::NumInputEpsilons(s); |
| } |
| |
| size_t NumOutputEpsilons(StateId s) { |
| if (!HasArcs(s)) |
| Expand(s); |
| return CacheImpl<A>::NumOutputEpsilons(s); |
| } |
| |
| void InitArcIterator(StateId s, ArcIteratorData<A> *data) { |
| if (!HasArcs(s)) |
| Expand(s); |
| CacheImpl<A>::InitArcIterator(s, data); |
| } |
| |
| virtual StateId ComputeStart() = 0; |
| |
| virtual Weight ComputeFinal(StateId s) = 0; |
| |
| const Fst<A> &GetFst() const { return *fst_; } |
| |
| private: |
| const Fst<A> *fst_; // Input Fst |
| |
| void operator=(const DeterminizeFstImplBase<A> &); // disallow |
| }; |
| |
| |
| // Implementation of delayed determinization for weighted acceptors. |
| // It is templated on the arc type A and the common divisor D. |
| template <class A, class D, class F, class T> |
| class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> { |
| public: |
| using FstImpl<A>::SetProperties; |
| using DeterminizeFstImplBase<A>::GetFst; |
| using DeterminizeFstImplBase<A>::SetArcs; |
| |
| typedef typename A::Label Label; |
| typedef typename A::Weight Weight; |
| typedef typename A::StateId StateId; |
| typedef DeterminizeElement<A> Element; |
| typedef slist<Element> Subset; |
| typedef typename F::LabelMap LabelMap; |
| |
| DeterminizeFsaImpl(const Fst<A> &fst, |
| const vector<Weight> *in_dist, vector<Weight> *out_dist, |
| const DeterminizeFstOptions<A, D, F, T> &opts) |
| : DeterminizeFstImplBase<A>(fst, opts), |
| delta_(opts.delta), |
| in_dist_(in_dist), |
| out_dist_(out_dist), |
| filter_(opts.filter ? opts.filter : new F()), |
| state_table_(opts.state_table ? opts.state_table : new T()) { |
| if (!fst.Properties(kAcceptor, true)) { |
| FSTERROR() << "DeterminizeFst: argument not an acceptor"; |
| SetProperties(kError, kError); |
| } |
| if (!(Weight::Properties() & kLeftSemiring)) { |
| FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: " |
| << Weight::Type(); |
| SetProperties(kError, kError); |
| } |
| if (out_dist_) |
| out_dist_->clear(); |
| } |
| |
| DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D, F, T> &impl) |
| : DeterminizeFstImplBase<A>(impl), |
| delta_(impl.delta_), |
| in_dist_(0), |
| out_dist_(0), |
| filter_(new F(*impl.filter_)), |
| state_table_(new T(*impl.state_table_)) { |
| if (impl.out_dist_) { |
| FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector"; |
| SetProperties(kError, kError); |
| } |
| } |
| |
| virtual ~DeterminizeFsaImpl() { |
| delete filter_; |
| delete state_table_; |
| } |
| |
| virtual DeterminizeFsaImpl<A, D, F, T> *Copy() { |
| return new DeterminizeFsaImpl<A, D, F, T>(*this); |
| } |
| |
| uint64 Properties() const { return Properties(kFstProperties); } |
| |
| // Set error if found; return FST impl properties. |
| uint64 Properties(uint64 mask) const { |
| if ((mask & kError) && (GetFst().Properties(kError, false))) |
| SetProperties(kError, kError); |
| return FstImpl<A>::Properties(mask); |
| } |
| |
| virtual StateId ComputeStart() { |
| StateId s = GetFst().Start(); |
| if (s == kNoStateId) |
| return kNoStateId; |
| Element element(s, Weight::One()); |
| Subset *subset = new Subset; |
| subset->push_front(element); |
| return FindState(subset); |
| } |
| |
| virtual Weight ComputeFinal(StateId s) { |
| const Subset *subset = state_table_->FindSubset(s); |
| Weight final = Weight::Zero(); |
| for (typename Subset::const_iterator siter = subset->begin(); |
| siter != subset->end(); |
| ++siter) { |
| const Element &element = *siter; |
| final = Plus(final, Times(element.weight, |
| GetFst().Final(element.state_id))); |
| if (!final.Member()) |
| SetProperties(kError, kError); |
| } |
| return final; |
| } |
| |
| StateId FindState(Subset *subset) { |
| StateId s = state_table_->FindState(subset); |
| if (in_dist_ && out_dist_->size() <= s) |
| out_dist_->push_back(ComputeDistance(subset)); |
| return s; |
| } |
| |
| // Compute distance from a state to the final states in the DFA |
| // given the distances in the NFA. |
| Weight ComputeDistance(const Subset *subset) { |
| Weight outd = Weight::Zero(); |
| for (typename Subset::const_iterator siter = subset->begin(); |
| siter != subset->end(); ++siter) { |
| const Element &element = *siter; |
| Weight ind = element.state_id < in_dist_->size() ? |
| (*in_dist_)[element.state_id] : Weight::Zero(); |
| outd = Plus(outd, Times(element.weight, ind)); |
| } |
| return outd; |
| } |
| |
| // Computes the outgoing transitions from a state, creating new destination |
| // states as needed. |
| virtual void Expand(StateId s) { |
| |
| LabelMap label_map; |
| LabelSubsets(s, &label_map); |
| |
| for (typename LabelMap::iterator liter = label_map.begin(); |
| liter != label_map.end(); |
| ++liter) |
| AddArc(s, liter->first, liter->second); |
| SetArcs(s); |
| } |
| |
| private: |
| // Constructs destination subsets per label. At return, subset |
| // element weights include the input automaton label weights and the |
| // subsets may contain duplicate states. |
| void LabelSubsets(StateId s, LabelMap *label_map) { |
| const Subset *src_subset = state_table_->FindSubset(s); |
| |
| for (typename Subset::const_iterator siter = src_subset->begin(); |
| siter != src_subset->end(); |
| ++siter) { |
| const Element &src_element = *siter; |
| for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id); |
| !aiter.Done(); |
| aiter.Next()) { |
| const A &arc = aiter.Value(); |
| Element dest_element(arc.nextstate, |
| Times(src_element.weight, arc.weight)); |
| |
| // The LabelMap may be a e.g. multimap with more complex |
| // determinization filters, so we insert efficiently w/o using []. |
| typename LabelMap::iterator liter = label_map->lower_bound(arc.ilabel); |
| Subset* dest_subset; |
| if (liter == label_map->end() || liter->first != arc.ilabel) { |
| dest_subset = new Subset; |
| label_map->insert(liter, make_pair(arc.ilabel, dest_subset)); |
| } else { |
| dest_subset = liter->second; |
| } |
| |
| dest_subset->push_front(dest_element); |
| } |
| } |
| // Applies the determinization filter |
| (*filter_)(s, label_map); |
| } |
| |
| // Adds an arc from state S to the destination state associated |
| // with subset DEST_SUBSET (as created by LabelSubsets). |
| void AddArc(StateId s, Label label, Subset *dest_subset) { |
| A arc; |
| arc.ilabel = label; |
| arc.olabel = label; |
| arc.weight = Weight::Zero(); |
| |
| typename Subset::iterator oiter; |
| for (typename Subset::iterator diter = dest_subset->begin(); |
| diter != dest_subset->end();) { |
| Element &dest_element = *diter; |
| // Computes label weight. |
| arc.weight = common_divisor_(arc.weight, dest_element.weight); |
| |
| while (elements_.size() <= dest_element.state_id) |
| elements_.push_back(0); |
| Element *matching_element = elements_[dest_element.state_id]; |
| if (matching_element) { |
| // Found duplicate state: sums state weight and deletes dup. |
| matching_element->weight = Plus(matching_element->weight, |
| dest_element.weight); |
| if (!matching_element->weight.Member()) |
| SetProperties(kError, kError); |
| ++diter; |
| dest_subset->erase_after(oiter); |
| } else { |
| // Saves element so we can check for duplicate for this state. |
| elements_[dest_element.state_id] = &dest_element; |
| oiter = diter; |
| ++diter; |
| } |
| } |
| |
| // Divides out label weight from destination subset elements. |
| // Quantizes to ensure comparisons are effective. |
| // Clears element vector. |
| for (typename Subset::iterator diter = dest_subset->begin(); |
| diter != dest_subset->end(); |
| ++diter) { |
| Element &dest_element = *diter; |
| dest_element.weight = Divide(dest_element.weight, arc.weight, |
| DIVIDE_LEFT); |
| dest_element.weight = dest_element.weight.Quantize(delta_); |
| elements_[dest_element.state_id] = 0; |
| } |
| |
| arc.nextstate = FindState(dest_subset); |
| CacheImpl<A>::PushArc(s, arc); |
| } |
| |
| float delta_; // Quantization delta for subset weights |
| const vector<Weight> *in_dist_; // Distance to final NFA states |
| vector<Weight> *out_dist_; // Distance to final DFA states |
| |
| D common_divisor_; |
| F *filter_; |
| T *state_table_; |
| |
| vector<Element *> elements_; |
| |
| void operator=(const DeterminizeFsaImpl<A, D, F, T> &); // disallow |
| }; |
| |
| |
| // Implementation of delayed determinization for transducers. |
| // Transducer determinization is implemented by mapping the input to |
| // the Gallic semiring as an acceptor whose weights contain the output |
| // strings and using acceptor determinization above to determinize |
| // that acceptor. |
| template <class A, StringType S, class D, class F, class T> |
| class DeterminizeFstImpl : public DeterminizeFstImplBase<A> { |
| public: |
| using FstImpl<A>::SetProperties; |
| using DeterminizeFstImplBase<A>::GetFst; |
| using CacheBaseImpl< CacheState<A> >::GetCacheGc; |
| using CacheBaseImpl< CacheState<A> >::GetCacheLimit; |
| |
| typedef typename A::Label Label; |
| typedef typename A::Weight Weight; |
| typedef typename A::StateId StateId; |
| |
| typedef ToGallicMapper<A, S> ToMapper; |
| typedef FromGallicMapper<A, S> FromMapper; |
| |
| typedef typename ToMapper::ToArc ToArc; |
| typedef ArcMapFst<A, ToArc, ToMapper> ToFst; |
| typedef ArcMapFst<ToArc, A, FromMapper> FromFst; |
| |
| typedef GallicCommonDivisor<Label, Weight, S, D> CommonDivisor; |
| typedef GallicFactor<Label, Weight, S> FactorIterator; |
| |
| DeterminizeFstImpl(const Fst<A> &fst, |
| const DeterminizeFstOptions<A, D, F, T> &opts) |
| : DeterminizeFstImplBase<A>(fst, opts), |
| delta_(opts.delta), |
| subsequential_label_(opts.subsequential_label) { |
| Init(GetFst()); |
| } |
| |
| DeterminizeFstImpl(const DeterminizeFstImpl<A, S, D, F, T> &impl) |
| : DeterminizeFstImplBase<A>(impl), |
| delta_(impl.delta_), |
| subsequential_label_(impl.subsequential_label_) { |
| Init(GetFst()); |
| } |
| |
| ~DeterminizeFstImpl() { delete from_fst_; } |
| |
| virtual DeterminizeFstImpl<A, S, D, F, T> *Copy() { |
| return new DeterminizeFstImpl<A, S, D, F, T>(*this); |
| } |
| |
| uint64 Properties() const { return Properties(kFstProperties); } |
| |
| // Set error if found; return FST impl properties. |
| uint64 Properties(uint64 mask) const { |
| if ((mask & kError) && (GetFst().Properties(kError, false) || |
| from_fst_->Properties(kError, false))) |
| SetProperties(kError, kError); |
| return FstImpl<A>::Properties(mask); |
| } |
| |
| virtual StateId ComputeStart() { return from_fst_->Start(); } |
| |
| virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); } |
| |
| virtual void Expand(StateId s) { |
| for (ArcIterator<FromFst> aiter(*from_fst_, s); |
| !aiter.Done(); |
| aiter.Next()) |
| CacheImpl<A>::PushArc(s, aiter.Value()); |
| CacheImpl<A>::SetArcs(s); |
| } |
| |
| private: |
| // Initialization of transducer determinization implementation, which |
| // is defined after DeterminizeFst since it calls it. |
| void Init(const Fst<A> &fst); |
| |
| float delta_; |
| Label subsequential_label_; |
| FromFst *from_fst_; |
| |
| void operator=(const DeterminizeFstImpl<A, S, D, F, T> &); // disallow |
| }; |
| |
| |
| // Determinizes a weighted transducer. This version is a delayed |
| // Fst. The result will be an equivalent FST that has the property |
| // that no state has two transitions with the same input label. |
| // For this algorithm, epsilon transitions are treated as regular |
| // symbols (cf. RmEpsilon). |
| // |
| // The transducer must be functional. The weights must be (weakly) |
| // left divisible (valid for TropicalWeight and LogWeight for instance) |
| // and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0. |
| // |
| // Complexity: |
| // - Determinizable: exponential (polynomial in the size of the output) |
| // - Non-determinizable) does not terminate |
| // |
| // The determinizable automata include all unweighted and all acyclic input. |
| // |
| // References: |
| // - Mehryar Mohri, "Finite-State Transducers in Language and Speech |
| // Processing". Computational Linguistics, 23:2, 1997. |
| // |
| // This class attaches interface to implementation and handles |
| // reference counting, delegating most methods to ImplToFst. |
| template <class A> |
| class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> > { |
| public: |
| friend class ArcIterator< DeterminizeFst<A> >; |
| friend class StateIterator< DeterminizeFst<A> >; |
| template <class B, StringType S, class D, class F, class T> |
| friend class DeterminizeFstImpl; |
| |
| typedef A Arc; |
| typedef typename A::Weight Weight; |
| typedef typename A::StateId StateId; |
| typedef typename A::Label Label; |
| typedef CacheState<A> State; |
| typedef DeterminizeFstImplBase<A> Impl; |
| |
| using ImplToFst<Impl>::SetImpl; |
| |
| explicit DeterminizeFst(const Fst<A> &fst) { |
| typedef DefaultCommonDivisor<Weight> D; |
| typedef IdentityDeterminizeFilter<A> F; |
| typedef DefaultDeterminizeStateTable<A> T; |
| DeterminizeFstOptions<A, D, F, T> opts; |
| if (fst.Properties(kAcceptor, true)) { |
| // Calls implementation for acceptors. |
| SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts)); |
| } else { |
| // Calls implementation for transducers. |
| SetImpl(new |
| DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts)); |
| } |
| } |
| |
| template <class D, class F, class T> |
| DeterminizeFst(const Fst<A> &fst, |
| const DeterminizeFstOptions<A, D, F, T> &opts) { |
| if (fst.Properties(kAcceptor, true)) { |
| // Calls implementation for acceptors. |
| SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts)); |
| } else { |
| // Calls implementation for transducers. |
| SetImpl(new |
| DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts)); |
| } |
| } |
| |
| // This acceptor-only version additionally computes the distance to |
| // final states in the output if provided with those distances for the |
| // input. Useful for e.g. unique N-shortest paths. |
| template <class D, class F, class T> |
| DeterminizeFst(const Fst<A> &fst, |
| const vector<Weight> *in_dist, vector<Weight> *out_dist, |
| const DeterminizeFstOptions<A, D, F, T> &opts) { |
| if (!fst.Properties(kAcceptor, true)) { |
| FSTERROR() << "DeterminizeFst:" |
| << " distance to final states computed for acceptors only"; |
| GetImpl()->SetProperties(kError, kError); |
| } |
| SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, in_dist, out_dist, opts)); |
| } |
| |
| // See Fst<>::Copy() for doc. |
| DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) { |
| if (safe) |
| SetImpl(fst.GetImpl()->Copy()); |
| else |
| SetImpl(fst.GetImpl(), false); |
| } |
| |
| // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc. |
| virtual DeterminizeFst<A> *Copy(bool safe = false) const { |
| return new DeterminizeFst<A>(*this, safe); |
| } |
| |
| virtual inline void InitStateIterator(StateIteratorData<A> *data) const; |
| |
| virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { |
| GetImpl()->InitArcIterator(s, data); |
| } |
| |
| private: |
| // Makes visible to friends. |
| Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); } |
| |
| void operator=(const DeterminizeFst<A> &fst); // Disallow |
| }; |
| |
| |
| // Initialization of transducer determinization implementation. which |
| // is defined after DeterminizeFst since it calls it. |
| template <class A, StringType S, class D, class F, class T> |
| void DeterminizeFstImpl<A, S, D, F, T>::Init(const Fst<A> &fst) { |
| // Mapper to an acceptor. |
| ToFst to_fst(fst, ToMapper()); |
| |
| // Determinizes acceptor. |
| // This recursive call terminates since it passes the common divisor |
| // to a private constructor. |
| CacheOptions copts(GetCacheGc(), GetCacheLimit()); |
| DeterminizeFstOptions<ToArc, CommonDivisor> dopts(copts, delta_); |
| // Uses acceptor-only constructor to avoid template recursion |
| DeterminizeFst<ToArc> det_fsa(to_fst, 0, 0, dopts); |
| |
| // Mapper back to transducer. |
| FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_, |
| kFactorFinalWeights, |
| subsequential_label_, |
| subsequential_label_); |
| FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts); |
| from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_)); |
| } |
| |
| |
| // Specialization for DeterminizeFst. |
| template <class A> |
| class StateIterator< DeterminizeFst<A> > |
| : public CacheStateIterator< DeterminizeFst<A> > { |
| public: |
| explicit StateIterator(const DeterminizeFst<A> &fst) |
| : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {} |
| }; |
| |
| |
| // Specialization for DeterminizeFst. |
| template <class A> |
| class ArcIterator< DeterminizeFst<A> > |
| : public CacheArcIterator< DeterminizeFst<A> > { |
| public: |
| typedef typename A::StateId StateId; |
| |
| ArcIterator(const DeterminizeFst<A> &fst, StateId s) |
| : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) { |
| if (!fst.GetImpl()->HasArcs(s)) |
| fst.GetImpl()->Expand(s); |
| } |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(ArcIterator); |
| }; |
| |
| |
| template <class A> inline |
| void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const |
| { |
| data->base = new StateIterator< DeterminizeFst<A> >(*this); |
| } |
| |
| |
| // Useful aliases when using StdArc. |
| typedef DeterminizeFst<StdArc> StdDeterminizeFst; |
| |
| |
| template <class Arc> |
| struct DeterminizeOptions { |
| typedef typename Arc::StateId StateId; |
| typedef typename Arc::Weight Weight; |
| typedef typename Arc::Label Label; |
| |
| float delta; // Quantization delta for subset weights. |
| Weight weight_threshold; // Pruning weight threshold. |
| StateId state_threshold; // Pruning state threshold. |
| Label subsequential_label; // Label used for residual final output |
| // when producing subsequential transducers. |
| |
| explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(), |
| StateId n = kNoStateId, Label l = 0) |
| : delta(d), weight_threshold(w), state_threshold(n), |
| subsequential_label(l) {} |
| }; |
| |
| |
| // Determinizes a weighted transducer. This version writes the |
| // determinized Fst to an output MutableFst. The result will be an |
| // equivalent FST that has the property that no state has two |
| // transitions with the same input label. For this algorithm, epsilon |
| // transitions are treated as regular symbols (cf. RmEpsilon). |
| // |
| // The transducer must be functional. The weights must be (weakly) |
| // left divisible (valid for TropicalWeight and LogWeight). |
| // |
| // Complexity: |
| // - Determinizable: exponential (polynomial in the size of the output) |
| // - Non-determinizable: does not terminate |
| // |
| // The determinizable automata include all unweighted and all acyclic input. |
| // |
| // References: |
| // - Mehryar Mohri, "Finite-State Transducers in Language and Speech |
| // Processing". Computational Linguistics, 23:2, 1997. |
| template <class Arc> |
| void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst, |
| const DeterminizeOptions<Arc> &opts |
| = DeterminizeOptions<Arc>()) { |
| typedef typename Arc::StateId StateId; |
| typedef typename Arc::Weight Weight; |
| |
| DeterminizeFstOptions<Arc> nopts; |
| nopts.delta = opts.delta; |
| nopts.subsequential_label = opts.subsequential_label; |
| |
| nopts.gc_limit = 0; // Cache only the last state for fastest copy. |
| |
| if (opts.weight_threshold != Weight::Zero() || |
| opts.state_threshold != kNoStateId) { |
| if (ifst.Properties(kAcceptor, false)) { |
| vector<Weight> idistance, odistance; |
| ShortestDistance(ifst, &idistance, true); |
| DeterminizeFst<Arc> dfst(ifst, &idistance, &odistance, nopts); |
| PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold, |
| opts.state_threshold, |
| AnyArcFilter<Arc>(), |
| &odistance); |
| Prune(dfst, ofst, popts); |
| } else { |
| *ofst = DeterminizeFst<Arc>(ifst, nopts); |
| Prune(ofst, opts.weight_threshold, opts.state_threshold); |
| } |
| } else { |
| *ofst = DeterminizeFst<Arc>(ifst, nopts); |
| } |
| } |
| |
| |
| } // namespace fst |
| |
| #endif // FST_LIB_DETERMINIZE_H__ |