BaumWelch  baumwelch-0.3.8
OpenGrm-BaumWelch library
train.h
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2017 and onwards Google, Inc.

#ifndef NLP_GRM2_BAUMWELCH_TRAIN_H_
#define NLP_GRM2_BAUMWELCH_TRAIN_H_

#include <cmath>
#include <cstdint>
#include <vector>

#include <fst/log.h>
#include <fst/extensions/far/far.h>
#include <fst/arcfilter.h>
#include <fst/compose.h>
#include <fst/fst.h>
#include <fst/mutable-fst.h>
#include <fst/queue.h>
#include <fst/shortest-distance.h>
#include <fst/weight.h>
#include <baumwelch/cascade.h>
// Declares the expectation tables used below (header name assumed).
#include <baumwelch/expectation-table.h>
#include <baumwelch/log-adder.h>
#include <baumwelch/util.h>

namespace fst {

// Some defaults.
constexpr float kAlpha = 1.;
constexpr int kMaxIters = 50;

// Helper for training options. If batch_size is 0, or larger than the data,
// full-batch training is performed.
struct TrainOptions {
  explicit TrainOptions(int max_iters = kMaxIters, float alpha = kAlpha,
                        int batch_size = 0, float delta = kDelta,
                        const CascadeOptions &copts = CascadeOptions())
      : max_iters(max_iters),
        alpha(alpha),
        batch_size(batch_size),
        delta(delta),
        copts(copts) {
    if (alpha == 0.0) {
      // When alpha is 0, full-batch training is required. The parameter
      // shadows the member here, so we assign through this.
      this->batch_size = 0;
    }
  }

  // Maximum number of iterations to perform.
  int max_iters;
  // Step size reduction power. When non-zero, the step size is
  // (k + 2)^{-alpha}, where k is the step.
  float alpha;
  // Maximum size of a batch. If set to 0, full-batch training occurs.
  int batch_size;
  // Comparison/quantization delta used to determine convergence.
  float delta;
  // Options passed to the trainer.
  CascadeOptions copts;
};
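
// For illustration, options for 25 iterations of stepwise training with
// batches of 32 and the default convergence delta (values hypothetical):
//
//   const TrainOptions opts(/*max_iters=*/25, /*alpha=*/0.7,
//                           /*batch_size=*/32);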

namespace internal {

// Class storing forward and backward weights.
//
// For idempotent semirings, this uses A* search during the alpha computation.
// If a state is not visited during the search, its estimate is taken to be
// semiring Zero. This estimate of alpha has the true value as an upper bound:
// because the search terminates once the shortest path is found (due to
// first_path=true), some states it never visits may in fact have non-Zero
// true values.
template <class Arc>
class ForwardBackward {
 public:
  using StateId = typename Arc::StateId;
  using Weight = typename Arc::Weight;

  explicit ForwardBackward(const ComposeFst<Arc> &ico) {
    ShortestDistance(ico, &beta_, /*reverse=*/true);
    if constexpr (IsIdempotent<typename Arc::Weight>::value) {
      // Computes alpha using an A* approximation.
      using StateId = typename Arc::StateId;
      using Weight = typename Arc::Weight;
      using MyEstimate = NaturalAStarEstimate<StateId, Weight>;
      using MyQueue = NaturalAStarQueue<StateId, Weight, MyEstimate>;
      using MyArcFilter = AnyArcFilter<Arc>;
      using MyShortestDistanceOptions =
          ShortestDistanceOptions<Arc, MyQueue, MyArcFilter>;
      const MyEstimate estimate(beta_);
      MyQueue queue(alpha_, estimate);
      static constexpr MyArcFilter arc_filter{};
      const MyShortestDistanceOptions opts(
          &queue, arc_filter,
          /*source=*/kNoStateId,     // Default.
          /*delta=*/kShortestDelta,  // Default.
          /*first_path=*/true);      // Heuristic is admissible.
      ShortestDistance(ico, &alpha_, opts);
      VLOG(1) << ExploredStates<Weight>(alpha_) << " alpha states explored";
    } else {
      ShortestDistance(ico, &alpha_, /*reverse=*/false);
    }
  }

  const Weight &Alpha(StateId s) const {
    return ForwardBackward::WeightOrZero(s, alpha_);
  }

  const Weight &Beta(StateId s) const {
    return ForwardBackward::WeightOrZero(s, beta_);
  }

 private:
  static constexpr Weight kZero = Weight::Zero();

  // Returns the shortest distance weight, or semiring Zero if the state was
  // not visited during the respective shortest distance computation.
  static const Weight &WeightOrZero(StateId s,
                                    const std::vector<Weight> &weights) {
    return (s < weights.size()) ? weights[s] : kZero;
  }

  std::vector<Weight> alpha_;
  std::vector<Weight> beta_;
};
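
// A minimal usage sketch, as in the trainer below; assumes ico is the
// composed input-channel-output lattice over the log semiring and s is one
// of its states:
//
//   const ForwardBackward<LogArc> fb(ico);
//   const auto &likelihood = fb.Beta(ico.Start());  // Observation likelihood.
//   const auto &alpha = fb.Alpha(s);                // Forward weight of s.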

// Object which holds all necessary information for stepwise or minibatch
// training. It stores the (initial) learning rate and the step counter. For
// more information, see the "sEM" pseudocode (p. 613) in:
//
// Liang, P., and Klein, D. 2009. Online EM for unsupervised models. In
// Proceedings of Human Language Technologies: The 2009 Annual Conference of
// the North American Chapter of the Association for Computational Linguistics,
// pages 611-619.
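//
// Concretely, each batch k gathers expected counts s_k under the current
// model and interpolates them into the parameters mu with step size
// nu_k = (k + 2)^{-alpha}:
//
//   mu <- (1 - nu_k) * mu + nu_k * s_k
//
// (see Interpolate() and Backward() below). Setting alpha = 0 forces
// nu_k = 1, discarding the old parameters and recovering full-batch EM.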
template <class Arc, class ExpectationTable>
class StepwiseBaumWelchTrainer {
 public:
  using Weight = typename Arc::Weight;
  // Stable accumulator over weights (cf. <baumwelch/log-adder.h>); the alias
  // target is assumed, but it must provide the Add() and Sum() used below.
  using Sum = Adder<Weight>;

  // Valid values of alpha usually lie in [0.5, 1.0].
  explicit StepwiseBaumWelchTrainer(
      float alpha = kAlpha, int batch_size = 0,
      const CascadeOptions &opts = CascadeOptions())
      : alpha_(alpha), batch_size_(batch_size), opts_(opts), step_(0) {}

  // Performs a batch of training, returning the likelihood. Semiring Zero is
  // returned in the case of composition failure.
  Weight Batch(FarReader<Arc> &input, FarReader<Arc> &output,
               MutableFst<Arc> *model) {
    ExpectationTable table(*model);
    Sum likelihood;     // Tracks batch likelihood.
    int batch_idx = 0;  // Tracks actual batch size.
    for (; !input.Done() && !output.Done() &&
           (!batch_size_ || batch_idx < batch_size_);
         ++batch_idx) {
      likelihood.Add(
          Forward(*input.GetFst(), *output.GetFst(), *model, &table));
      if (input.Type() != FarType::FST) input.Next();
      output.Next();
    }
    Backward(table, model);
    ++step_;
    const auto batch_likelihood = likelihood.Sum();
    LOG(INFO) << "Step " << step_ << " (batch size " << batch_idx
              << ") average likelihood: "
              << batch_likelihood.Value() / batch_idx;
    return batch_likelihood;
  }

  // Repeatedly performs the stepwise computation until the data is exhausted.
  Weight Train(FarReader<Arc> &input, FarReader<Arc> &output,
               MutableFst<Arc> *model) {
    Sum likelihood;  // Tracks iteration likelihood.
    while (!input.Done() && !output.Done()) {
      likelihood.Add(Batch(input, output, model));
    }
    Normalize(model);
    return likelihood.Sum();
  }

  // Normalizes the model.
  void Normalize(MutableFst<Arc> *model) {
    ExpectationTable table(*model);
    StateIterator<MutableFst<Arc>> siter(*model);
    for (; !siter.Done(); siter.Next()) {
      const auto state = siter.Value();
      for (ArcIterator<MutableFst<Arc>> aiter(*model, state); !aiter.Done();
           aiter.Next()) {
        const auto &arc = aiter.Value();
        table.Forward(state, arc.ilabel, arc.olabel, arc.weight, arc.nextstate);
      }
      const auto weight = model->Final(state);
      if (weight == Weight::Zero()) continue;
      table.Forward(state, weight);
    }
    for (siter.Reset(); !siter.Done(); siter.Next()) {
      const auto state = siter.Value();
      for (MutableArcIterator<MutableFst<Arc>> aiter(model, state);
           !aiter.Done(); aiter.Next()) {
        auto arc = aiter.Value();
        arc.weight = table.Backward(state, arc);
        aiter.SetValue(arc);
      }
      model->SetFinal(state, table.Backward(state));
    }
  }

 private:
  Weight Forward(const Fst<Arc> &input, const Fst<Arc> &output,
                 const Fst<Arc> &model, ExpectationTable *table) {
    const ChannelStateCascade<Arc> cascade(input, output, model, opts_);
    const auto &ico = cascade.GetFst();
    const auto start = ico.Start();
    if (start == kNoStateId) {
      VLOG(1) << "Empty lattice";
      return Weight::Zero();
    }
    const ForwardBackward<Arc> fb(ico);
    const auto &likelihood = fb.Beta(start);
    if (likelihood == Weight::Zero()) {
      VLOG(1) << "Start state not coaccessible";
      return Weight::Zero();
    }
    for (StateIterator<ComposeFst<Arc>> siter(ico); !siter.Done();
         siter.Next()) {
      const auto state = siter.Value();
      // Non-coaccessible source state.
      if (fb.Beta(state) == Weight::Zero()) continue;
      const auto ch_state = cascade.ChannelState(state);
      const auto &alpha = fb.Alpha(state);
      for (ArcIterator<ComposeFst<Arc>> aiter(ico, state); !aiter.Done();
           aiter.Next()) {
        const auto &arc = aiter.Value();
        const auto &beta = fb.Beta(arc.nextstate);
        // Non-coaccessible destination state.
        if (beta == Weight::Zero()) continue;
        // The arc expectation is the product of the current weight, alpha,
        // and beta, divided by the overall observation likelihood.
        table->Forward(
            ch_state, arc.ilabel, arc.olabel,
            Divide(Times(Times(alpha, arc.weight), beta), likelihood),
            cascade.ChannelState(arc.nextstate));
      }
      const auto weight = ico.Final(state);
      if (weight == Weight::Zero()) continue;
      // The final state expectation is the product of the current weight and
      // alpha, divided by the overall observation likelihood.
      table->Forward(ch_state, Divide(Times(alpha, weight), likelihood));
    }
    return likelihood;
  }

  // TODO(kbg): Add a way to disable interpolation.
  static Weight Interpolate(const Weight &old_weight, const Weight &new_weight,
                            double nu_k) {
    if (nu_k == 1.0) {
      // The contribution of old_weight is 0, so just returns new_weight.
      // This corresponds to standard full-batch EM.
      return new_weight;
    }
    const auto old_term = Times(1 - nu_k, old_weight);
    const auto new_term = Times(nu_k, new_weight);
    Sum plus(old_term);
    plus.Add(new_term);
    return plus.Sum();
  }

  void Backward(const ExpectationTable &table, MutableFst<Arc> *model) {
    const double nu_k = alpha_ == 0.0 ? 1.0 : std::pow(step_ + 2, -alpha_);
    for (StateIterator<MutableFst<Arc>> siter(*model); !siter.Done();
         siter.Next()) {
      const auto state = siter.Value();
      // Sets new arc weights.
      for (MutableArcIterator<MutableFst<Arc>> aiter(model, state);
           !aiter.Done(); aiter.Next()) {
        auto arc = aiter.Value();
        arc.weight = Interpolate(arc.weight, table.Backward(state, arc), nu_k);
        aiter.SetValue(arc);
      }
      // Sets new final weights.
      model->SetFinal(
          state, Interpolate(model->Final(state), table.Backward(state), nu_k));
    }
  }

  const float alpha_;          // Step size reduction power.
  const int batch_size_;       // Batch size hyperparameter.
  const CascadeOptions opts_;  // Cascade construction options.
  uint64_t step_;              // Iteration/step number.
};

// Full training setup, templated on the expectation table.
template <class Arc, class ExpectationTable>
typename Arc::Weight Train(FarReader<Arc> &input, FarReader<Arc> &output,
                           MutableFst<Arc> *model,
                           const TrainOptions &opts = TrainOptions()) {
  using Weight = typename Arc::Weight;
  auto last_likelihood = Weight::Zero();
  StepwiseBaumWelchTrainer<Arc, ExpectationTable> trainer(
      opts.alpha, opts.batch_size, opts.copts);
  trainer.Normalize(model);
  for (int iteration = 0; iteration < opts.max_iters; ++iteration) {
    input.Reset();
    output.Reset();
    const auto total_likelihood = trainer.Train(input, output, model);
    LOG(INFO) << "Iteration " << iteration + 1
              << " total likelihood: " << total_likelihood;
    if (ApproxEqual(last_likelihood, total_likelihood, opts.delta)) {
      return total_likelihood;
    }
    last_likelihood = total_likelihood;
  }
  return last_likelihood;
}

}  // namespace internal

// Full training setup.
template <class Arc>
typename Arc::Weight Train(FarReader<Arc> &input, FarReader<Arc> &output,
                           MutableFst<Arc> *model, bool normalize_ilabel = true,
                           const TrainOptions &opts = TrainOptions()) {
  if (normalize_ilabel) {
    return internal::Train<Arc, StateILabelExpectationTable<Arc>>(input, output,
                                                                  model, opts);
  } else {
    return internal::Train<Arc, StateExpectationTable<Arc>>(input, output,
                                                            model, opts);
  }
}
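
// A minimal end-to-end sketch over the log semiring, with hypothetical file
// names and error checking omitted; FarReader::Open and MutableFst::Read are
// standard OpenFst:
//
//   std::unique_ptr<FarReader<LogArc>> input(
//       FarReader<LogArc>::Open("input.far"));
//   std::unique_ptr<FarReader<LogArc>> output(
//       FarReader<LogArc>::Open("output.far"));
//   std::unique_ptr<MutableFst<LogArc>> model(
//       MutableFst<LogArc>::Read("model.fst"));
//   const auto likelihood = Train(*input, *output, model.get());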

}  // namespace fst

#endif  // NLP_GRM2_BAUMWELCH_TRAIN_H_