Update TPU Embedding V1 API to properly check/set gradient accumulation status when gradient clipping is enabled.
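
Sketch of the intended behavior after this change (assumes the
tf.compat.v1.tpu.experimental optimizer classes at this revision; values are
illustrative only):

    import tensorflow.compat.v1 as tf

    # Gradient clipping limits without gradient accumulation are rejected by
    # the shared _OptimizationParameters check added below.
    try:
      tf.tpu.experimental.AdagradParameters(
          learning_rate=0.1,
          use_gradient_accumulation=False,
          clip_gradient_min=-1.0,
          clip_gradient_max=1.0)
    except ValueError:
      pass  # clipping limits require use_gradient_accumulation=True

    # SGD exposes no use_gradient_accumulation argument, so setting clipping
    # limits now enables gradient accumulation automatically.
    sgd = tf.tpu.experimental.StochasticGradientDescentParameters(
        learning_rate=0.1,
        clip_gradient_min=-1.0,
        clip_gradient_max=1.0)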
PiperOrigin-RevId: 342912651
Change-Id: I33a13a177bf6747aae47c7a59280e5d6b9b066e8
diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py
index 4cd828e..4539ca9 100644
--- a/tensorflow/python/tpu/tpu_embedding.py
+++ b/tensorflow/python/tpu/tpu_embedding.py
@@ -382,6 +382,11 @@
self.clip_gradient_min = clip_gradient_min
self.clip_gradient_max = clip_gradient_max
+ if not use_gradient_accumulation and (clip_gradient_min is not None or
+ clip_gradient_max is not None):
+ raise ValueError('When using gradient clipping limits, gradient accumulation '
+ 'must be enabled.')
+
@tf_export(v1=['tpu.experimental.AdagradParameters'])
class AdagradParameters(_OptimizationParameters):
@@ -430,7 +435,9 @@
multiply_weight_decay_factor_by_learning_rate: if true,
`weight_decay_factor` is multiplied by the current learning rate.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(AdagradParameters, self).__init__(
learning_rate=learning_rate,
@@ -490,7 +497,9 @@
multiply_weight_decay_factor_by_learning_rate: if true,
`weight_decay_factor` is multiplied by the current learning rate.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(ProximalAdagradParameters, self).__init__(
learning_rate=learning_rate,
@@ -577,7 +586,9 @@
multiply_weight_decay_factor_by_learning_rate: if true,
`weight_decay_factor` is multiplied by the current learning rate.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(AdamParameters, self).__init__(
learning_rate=learning_rate,
@@ -680,7 +691,9 @@
allow for the case of initial_accumulator_value being zero. This will
cause a slight performance drop.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(FtrlParameters, self).__init__(
learning_rate=learning_rate,
@@ -777,7 +790,9 @@
multiply_weight_decay_factor_by_learning_rate: if true,
`weight_decay_factor` is multiplied by the current learning rate.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(ProximalYogiParameters, self).__init__(
learning_rate=learning_rate,
@@ -867,7 +882,9 @@
multiply_weight_decay_factor_by_learning_rate: if true,
`weight_decay_factor` is multiplied by the current learning rate.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(MomentumParameters, self).__init__(
learning_rate=learning_rate,
@@ -934,7 +951,9 @@
multiply_weight_decay_factor_by_learning_rate: if true,
`weight_decay_factor` is multiplied by the current learning rate.
clip_gradient_min: the minimum value to clip by; None means -infinity.
+ Gradient accumulation must be set to true if this is set.
clip_gradient_max: the maximum value to clip by; None means +infinity.
+ Gradient accumulation must be set to true if this is set.
"""
super(RMSPropParameters, self).__init__(
learning_rate=learning_rate,
@@ -995,9 +1014,16 @@
clip_gradient_min: the minimum value to clip by; None means -infinity.
clip_gradient_max: the maximum value to clip by; None means +infinity.
"""
+ # Gradient accumulation is generally a no-op for SGD, but if gradient
+ # clipping is enabled, then we must also enable gradient accumulation.
+ # In the other optimizers this is up to the user, but we don't give the user
+ # the option to turn gradient accumulation on or off for SGD.
+ use_gradient_accumulation = False
+ if (clip_gradient_min is not None or clip_gradient_max is not None):
+ use_gradient_accumulation = True
super(StochasticGradientDescentParameters, self).__init__(
learning_rate=learning_rate,
- use_gradient_accumulation=False,
+ use_gradient_accumulation=use_gradient_accumulation,
clip_weight_min=clip_weight_min,
clip_weight_max=clip_weight_max,
weight_decay_factor=weight_decay_factor,