Added multiply_linear_by_lr documentation for the FTRL optimizer used by TPU embeddings.
PiperOrigin-RevId: 323664047
Change-Id: I20b141e1d1eab9a3b5f6d72528d38ada350bf4ec
diff --git a/tensorflow/core/protobuf/tpu/optimization_parameters.proto b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
index 1699a26..f29beb3 100644
--- a/tensorflow/core/protobuf/tpu/optimization_parameters.proto
+++ b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
@@ -81,7 +81,19 @@
message StochasticGradientDescentParameters {}
// https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Ftrl
+// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41159.pdf
// https://github.com/tensorflow/tensorflow/blob/6b6471f3ffb7f1fefe42d814aa5fb9ab7a535b58/tensorflow/core/kernels/training_ops.cc#L2646
+//
+// The hyperparameters for FTRL are the same as for the Keras implementation,
+// with some additions. When the multiply_linear_by_lr field is set to true, a
+// modified formula is used for FTRL that treats the "linear" accumulator as
+// being pre-multiplied by the learning rate (i.e., the accumulator named
+// "linear" actually stores "linear * learning_rate"). Other than checkpoint
+// compatibility, this is mathematically equivalent for a static learning rate;
+// for a dynamic learning rate, it is nearly the same as long as the learning
+// rate does not change quickly. The benefit of setting multiply_linear_by_lr to
+// true is that the modified formula handles zero and near-zero learning rates
+// without producing NaNs, improving flexibility for learning rate ramp-up.
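+//
+// As an illustrative sketch only (assuming the update form of the CPU kernel
+// linked above, with lr_power = -0.5 and l2_shrinkage omitted; the TPU kernel
+// remains the authoritative reference), the standard update is:
+//   accum_new = accum + grad^2
+//   linear   += grad - (sqrt(accum_new) - sqrt(accum)) / lr * var
+//   var       = (sign(linear) * l1 - linear) / (sqrt(accum_new) / lr + 2 * l2)
+//               if |linear| > l1, else 0
+// Multiplying the linear update by lr, and the numerator and denominator of
+// the var update by lr, gives the multiply_linear_by_lr form in terms of the
+// stored quantity linear_lr = linear * lr:
+//   accum_new  = accum + grad^2
+//   linear_lr += lr * grad - (sqrt(accum_new) - sqrt(accum)) * var
+//   var        = (sign(linear_lr) * l1 * lr - linear_lr)
+//                / (sqrt(accum_new) + 2 * l2 * lr)
+//                if |linear_lr| > l1 * lr, else 0
+// No term in the modified form divides by lr, which is why zero and near-zero
+// learning rates do not produce NaNs.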
message FtrlParameters {
float l1 = 1;
float l2 = 2;
@@ -93,9 +105,9 @@
reserved 4, 5;
}
-// The Adam optimizer does not implement hyper-parameter update; use the dynamic
-// learning rate feature instead, setting the learning rate to:
-// user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+// The Adam optimizer does not implement hyper-parameter update due to hardware
+// limitations; use the dynamic learning rate feature instead, setting the
+// learning rate to: user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
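+// For example, with beta1 = 0.9 and beta2 = 0.999 (the Keras defaults), the
+// correction factor sqrt(1 - beta2^t) / (1 - beta1^t) is
+// sqrt(0.001) / 0.1 ~= 0.316 at t = 1 and tends to 1 for large t, so the
+// dynamic learning rate approaches the user learning_rate over time.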
//
// https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam