| syntax = "proto3"; |
| |
| package tensorflow.tpu; |
| |
| import "google/protobuf/wrappers.proto"; |
| |
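// Hard limits to which a value is clipped. Each bound uses a FloatValue
// wrapper so that an unset bound is distinguishable from 0. A minimal
// textproto sketch (illustrative values) clipping to [-1.0, 1.0]:
//   lower { value: -1.0 } upper { value: 1.0 }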
| message ClippingLimits { |
| google.protobuf.FloatValue lower = 1; // -inf if not set |
| google.protobuf.FloatValue upper = 2; // +inf if not set |
| } |
| |
// Dynamic learning rate specification in the TPUEmbeddingConfiguration. The
// actual learning rates are provided as a list of scalar inputs to the
// SendTPUEmbeddingGradients Op, indexed by the tags specified in this proto.
| message DynamicLearningRate { |
| // For tables where learning rates are dynamically computed and communicated |
| // to the TPU embedding program, a tag must be specified for the learning |
| // rate. |
| // |
| // The tag must be a non-negative integer. The total number of unique tags |
| // must be less than or equal to the number of tables in the TPU embedding |
| // configuration (a table does not specify any tag if it uses a constant |
| // learning rate, and specifies exactly one tag if it uses dynamic learning |
| // rates). |
| // |
// All tags in the range [0, number_of_unique_tags) must be present in the TPU
// embedding configuration, i.e., a tag cannot be skipped if a numerically
// greater tag is used in the configuration.
//
// If multiple tables specify the same tag, they *MUST* use the same dynamic
// learning rate; for example, their learning rates could be computed by the
// same TensorFlow sub-graph. The partitioning of the embedding layer is more
// efficient when the number of unique tags is as *LOW* as possible, i.e.,
// when many tables share the same tag.
| // |
| // The learning_rate input of the SendTPUEmbeddingGradients op is used to |
| // communicate dynamic learning rates to the TPU embedding program. |
| // The learning_rate input is a list of scalars where the size of the list is |
| // equal to the number of unique tags. The learning rate associated with a |
| // particular tag is specified by populating its corresponding index in the |
| // list of learning_rate scalars. |
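//
// For example (illustrative): if tables A and B both specify tag 0 and
// table C specifies tag 1, the learning_rate input is a list of two
// scalars, where element 0 supplies the rate for tables A and B and
// element 1 supplies the rate for table C.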
| int32 tag = 1; |
| } |
| |
| // Source of learning rate to use. |
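//
// A textproto sketch (values are illustrative): a constant rate is written
// as
//   constant: 0.1
// and a dynamically computed rate, fed at runtime under tag 0, as
//   dynamic { tag: 0 }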
| message LearningRate { |
| oneof learning_rate { |
| float constant = 1; |
| DynamicLearningRate dynamic = 2; |
| } |
| } |
| |
| // Each optimizer's parameter proto has a link to its documentation and CPU |
| // implementation (if available) for user reference. |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151 |
| message AdagradParameters { |
| float initial_accumulator = 1; |
| } |
| |
| // Algorithm in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. |
| message BoundedAdagradParameters { |
// Whether to use the updated or the old value of the accumulator when
// computing the effective learning rate. When update_accumulator_first is
// set to true, the updated value of the accumulator is used.
| bool update_accumulator_first = 1; |
// The maximum value to allow for a variable update; used to clip the
// gradient. Set to 0 (default) to disable this clipping.
float max_var_update = 2;
// The maximum value to allow for the accumulator. Set to 0 (default) to
// disable clipping the accumulator by max_accumulator.
float max_accumulator = 3;
| } |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L423 |
| message StochasticGradientDescentParameters {} |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L192 |
| message FtrlParameters { |
| float l1 = 1; |
| float l2 = 2; |
| float lr_power = 3; |
| float initial_accum = 4; |
| float initial_linear = 5; |
| } |
| |
| // The Adam optimizer does not implement hyper-parameter update; use the dynamic |
| // learning rate feature instead, setting the learning rate to: |
| // user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) |
| // Here, t is the current timestep. |
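//
// For example (illustrative values): with beta1 = 0.9, beta2 = 0.999, and
// t = 1, the scalar fed through the dynamic learning rate input would be
//   user learning_rate * sqrt(1 - 0.999) / (1 - 0.9)
//   = user learning_rate * 0.3162...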
| // |
| // https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54 |
| // |
| // Note that the code by default implements the lazy version of Adam |
| // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/LazyAdamOptimizer) |
| // unless the use_non_lazy_adam parameter is set, in which case it implements |
| // the normal version of Adam that updates all parameters in the embedding |
| // table, even for entries that are not used in the current minibatch |
| // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If |
| // use_non_lazy_adam is enabled, gradient accumulation is also required to be |
| // enabled in order to get correct results; a warning will be printed otherwise |
| // (which may change to an error in the future). If use_sum_inside_sqrt is set, |
| // the Adam variable update formula will be changed from m / (sqrt(v) + epsilon) |
| // to m / sqrt(v + epsilon**2); this option improves the performance of TPU |
| // training and is not expected to harm model quality. |
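//
// A minimal textproto sketch with common (illustrative) hyper-parameter
// values:
//   beta1: 0.9 beta2: 0.999 epsilon: 1e-08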
| message AdamParameters { |
| float beta1 = 3; |
| float beta2 = 4; |
| float epsilon = 5; |
| float initial_m = 6; |
| float initial_v = 7; |
| bool use_non_lazy_adam = 8; |
| bool use_sum_inside_sqrt = 10; |
| } |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L271 |
| message MomentumParameters { |
| float momentum = 1; |
| bool use_nesterov = 2; |
| float initial_accum = 3; |
| } |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L356 |
| message RmsPropParameters { |
| float rho = 1; |
| float momentum = 2; |
| float epsilon = 3; |
| float initial_ms = 4; |
| float initial_mom = 5; |
| } |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L372 |
| message CenteredRmsPropParameters { |
| float rho = 1; |
| float momentum = 2; |
| float epsilon = 3; |
| float initial_ms = 4; |
| float initial_mom = 5; |
| float initial_mg = 6; |
| } |
| |
| // Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf |
| message MdlAdagradLightParameters { |
| float l2 = 1; |
| float lr_power = 2; |
| float min_servable_mdl_benefit = 3; |
| float mdl_mix_in_margin = 4; |
| float mdl_benefit_rampup_coeff = 5; |
| float mdl_min_weight = 6; |
| float benefit_revisit_scale = 7; |
| float max_event_benefit = 8; |
| float max_total_benefit = 9; |
| float mdl_hard_limit = 10; |
| bool hard_limit_min_benefit = 11; |
| bool mdl_regularize = 12; |
| float initial_accumulator = 13; |
| float initial_weight = 14; |
| float initial_benefit = 15; |
| } |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/AdadeltaOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L68 |
| message AdadeltaParameters { |
| float rho = 1; |
| float epsilon = 2; |
| float initial_accumulator = 3; |
| float initial_update = 4; |
| } |
| |
| // https://www.tensorflow.org/api_docs/python/tf/train/ProximalAdagradOptimizer |
| // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L164 |
| message ProximalAdagradParameters { |
| float l1 = 1; |
| float l2 = 2; |
| float initial_accumulator = 3; |
| } |
| |
| // The online Yogi optimizer does not implement hyper-parameter update; use the |
| // dynamic learning rate feature instead, setting the learning rate to: |
| // user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) |
| // Here, t is the current timestep. |
| // |
| // https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf |
| // plus some extensions based on FTRL. |
| // |
| // Note that the code by default implements the lazy version of online Yogi. |
| message OnlineYogiParameters { |
| // The L1 regularization parameter (used analogously to the one in FTRL). |
| float l1 = 1; |
| |
| // The L2 regularization parameter (used analogously to the one in FTRL). |
| float l2 = 2; |
| |
| // \beta_2 from Algorithm 2 in the paper. |
| float beta2 = 3; |
| |
| // x -> copysign(1, x) (i.e., return 1 for an input of +0 rather than 0). |
| message SignActivation {} |
| |
| // x -> tanh(x * 10) |
| message TanhActivation {} |
| |
// Activation used in place of the sign function in the v_t update in
// Algorithm 2 of the paper.
| oneof activation { |
| SignActivation sign = 6; |
| TanhActivation tanh = 7; |
| } |
| } |
| |
// The proximal Yogi optimizer does not implement hyper-parameter update; use the
| // dynamic learning rate feature instead, setting the learning rate to: |
| // user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) |
| // Here, t is the current timestep. |
| // |
| // https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf |
| // plus some extensions based on FTRL. |
| // |
| // Note that the code by default implements the lazy version of proximal Yogi. |
| message ProximalYogiParameters { |
| // The L1 regularization parameter. |
| float l1 = 1; |
| |
| // The L2 regularization parameter. |
| float l2 = 2; |
| |
| // The exponential decay rate for the 1st moment estimates. |
| float beta1 = 3; |
| |
| // The exponential decay rate for the 2nd moment estimates. |
| float beta2 = 4; |
| |
| // A constant trading off adaptivity and noise. |
| float epsilon = 5; |
| |
| // x -> copysign(1, x) (i.e., return 1 for an input of +0 rather than 0). |
| message SignActivation {} |
| |
| // x -> tanh(x * 10) |
| message TanhActivation {} |
| |
// Activation used in place of the sign function in the v_t update in
// Algorithm 2 of the paper.
| oneof activation { |
| SignActivation sign = 8; |
| TanhActivation tanh = 9; |
| } |
| } |
| |
| // Status of using gradient accumulation (doing two passes over the input |
| // gradients: one to accumulate them into a temporary array and another to apply |
| // them using the actual optimization algorithm). The extra message is to wrap |
| // the enum for scoping. |
| message GradientAccumulationStatus { |
// If UNSPECIFIED (default), gradient accumulation is ENABLED.
| enum Status { |
| UNSPECIFIED = 0; |
| ENABLED = 1; |
| DISABLED = 2; |
| } |
| } |
| |
| // Configuration proto for hot ID optimization. This is an experimental feature |
| // that is currently disabled (by default). |
| message HotIdReplicationConfiguration { |
| // Whether to enable or disable hot ID optimization. |
| // If UNSPECIFIED (default), hot ID optimization is DISABLED. |
| enum Status { |
| UNSPECIFIED = 0; |
| ENABLED = 1; |
| DISABLED = 2; |
| } |
| Status status = 1; |
| } |
| |
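// A minimal OptimizationParameters sketch in textproto (values are
// illustrative):
//   learning_rate { constant: 0.1 }
//   gradient_accumulation_status: ENABLED
//   adagrad { initial_accumulator: 0.1 }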
| message OptimizationParameters { |
| // Learning rate used for updating the embedding layer parameters. |
| LearningRate learning_rate = 13; |
| reserved 1; // Old learning rate tag. |
| |
| // Limits to which to clip the weight values after the backward pass; not |
| // present means no limits are applied. |
| ClippingLimits clipping_limits = 2; |
| |
| // Limits to which to clip the backward pass gradient before using it for |
| // updates; not present means no limits are applied. |
| ClippingLimits gradient_clipping_limits = 7; |
| |
// Amount of weight decay to apply; see weight_decay_optimizers.py for
// details. Almost all optimizers are supported with this option (MDL Adagrad
// Light does not work, and SGD does not behave as expected if it is enabled).
// Although there is no check, users who want weight decay will probably also
// want to enable gradient accumulation so that the decay happens once per
// minibatch.
| float weight_decay_factor = 16; |
| |
| // If true, the weight decay factor is multiplied by the current learning rate |
| // before use; this is to match the note in DecoupledWeightDecayExtension in |
| // weight_decay_optimizers.py. |
| bool multiply_weight_decay_factor_by_learning_rate = 22; |
| |
| // Status of using gradient accumulation (doing two passes over the input |
| // gradients: one to accumulate them into a temporary array and another to |
| // apply them using the actual optimization algorithm). |
| GradientAccumulationStatus.Status gradient_accumulation_status = 17; |
| |
| // Configuration proto for hot ID replication. This is an experimental |
| // feature that is currently disabled (by default). |
| HotIdReplicationConfiguration hot_id_replication_configuration = 18; |
| |
| // Optimization algorithm parameters; which field is selected determines which |
| // algorithm to use. |
| oneof parameters { |
| AdagradParameters adagrad = 3; |
| BoundedAdagradParameters bounded_adagrad = 19; |
| StochasticGradientDescentParameters stochastic_gradient_descent = 4; |
| FtrlParameters ftrl = 5; |
| AdamParameters adam = 6; |
| MomentumParameters momentum = 8; |
| RmsPropParameters rms_prop = 9; |
| CenteredRmsPropParameters centered_rms_prop = 10; |
| MdlAdagradLightParameters mdl_adagrad_light = 11; |
| AdadeltaParameters adadelta = 12; |
| ProximalAdagradParameters proximal_adagrad = 14; |
| OnlineYogiParameters online_yogi = 20; |
| ProximalYogiParameters proximal_yogi = 21; |
| } |
| |
| reserved 15; // Old use_gradient_accumulation. |
| } |
| |
| // Specification of an optimization algorithm's state variables (both the main |
| // value vector and any extra accumulators, etc.). This proto is only used |
| // internally by the TPU software and is not exposed directly to the TF model. |
| message StateVariableSpecification { |
| // Parameter name for the state variable. |
| string name = 1; |
| |
| // A normal state variable that should be saved and restored in checkpoints |
| // and used as an input or output to non-debug TensorFlow ops. |
| message UserDefined { |
| // For padding embedding rows, this field specifies the initial value to be |
| // used. Separate initial values need to be specified for the embeddings and |
| // any extra accumulators. The initial values should be specified so as to |
| // maintain two invariants during model training: |
| // (1) The embedding vector multiplied by zero returns a vector containing |
| // all zeros. To maintain this invariant, the embedding values should |
| // never be NaNs or +-infinity. |
| // (2) Repeatedly applying the optimizer using a gradient vector of all |
| // zeros does not cause the embeddings or slot variables to become NaNs |
| // or +-infinity. |
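// For example, an all-zero padding row satisfies invariant (1); under
// plain SGD it also satisfies invariant (2), since a gradient of all
// zeros leaves the row unchanged.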
| // The padding row is looked up when no embedding IDs are present for a |
| // feature. The semantics of embedding lookup dictate that the output must |
| // be zero under this scenario. |
| double padding_initial_value = 1; |
| } |
| |
| // A state variable that should be filled with a constant and normally hidden |
| // from users (used for intermediate gradients being accumulated, for |
| // example). |
| message FillWithConstant { |
| double initial_value = 1; |
| } |
| |
| // Usage type of this state variable. |
| oneof usage { |
| UserDefined user_defined = 2; |
| FillWithConstant fill_with_constant = 3; |
| } |
| } |