tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/contrib/tpu/proto/tpu_embedding_config.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/core/status.h"

 namespace tensorflow {

 // TPUs use a specialized mechanism for performing embedding lookups,
 // necessitating differences in TF Graphs that use embeddings on TPUs relative
 // to CPUs. Embedding lookups on TPU systems are achieved by including the
 // following in the TF Graph.
 //
 // 0. Construct a TPUEmbeddingConfiguration, specifying the embedding tables
 //    in the model, the size of the TPU system to be used, and the optimizer to
 //    be used for each table. Some of this information is redundant with other
 //    pieces of the TF Graph.
 // 1. Pass this TPUEmbeddingConfiguration to tpu.initialize_system() as the
 //    tpu_embedding_config parameter.
 // 2. Use the TPUEmbeddingLoad Op to initialize the embedding tables in TPU
 //    memories, sharded across the memories attached to each Host.
 // 3. Use TPUEmbeddingEnqueueSparseBatch to provide the TPU with embedding
 //    indices and aggregation weights.
 // 4. TPUEmbeddingReceiveActivations returns a list of Tensors, containing the
 //    activations from each table specified in the configuration.
 // 5. TPUEmbeddingActivations, when used with appropriate Python libraries,
 //    enables the automatic differentiation of models that use embeddings.
 // 6. TPUEmbeddingSendGradients takes a list of Tensors (of the same shapes
 //    as those returned by TPUEmbeddingReceiveActivations) containing gradients
 //    to use in updating the embedding tables.
 // 7. Before saving a checkpoint, use the TPUEmbeddingRetrieve Op to update
 //    the Graph's embedding table Variables from the updated tables in the
 //    TPU memories.
 //
 // TPU Embeddings use dedicated ops to enforce Host/TPU consistency in the
 // state of embedding table variables. Before beginning training or inference,
 // the model must Load the optimizer parameters into the TPU memories. Before
 // saving a checkpoint, the model must Retrieve the parameters back into the
 // host CPU memory.

 // Registers the op that loads one host's shard of an embedding table for use
 // with gradient descent. The op takes a single float32 input and declares no
 // outputs; it is marked stateful and uses shape_inference::UnknownShape as its
 // shape function.
 REGISTER_OP("TPUEmbeddingLoadGradientDescentParameters")
     .Input("parameters: float32")
     .Attr("tpu_embedding_config: string")
     .Attr("table_id: int >= 0")
     .Attr("num_hosts: int >= 1")
     .Attr("host_id: int >= 0")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Load an embedding table shard into TPU memory for use with GradientDescent.

 TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
 trainable variables and optimizer state from TPU memory. This op enables
 functionality equivalent to GradientDescentOptimizer.

 parameters: The shard of the embedding table resident on the host executing this
     op. For single-TPU models, this is the entire embedding table.
 tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
 table_id: The id of the table specified in the tpu_embedding_config.
 num_hosts: The number of CPU hosts in the distributed training job.
 host_id: Which CPU host in the distributed training job will execute this op.
 )doc");

 namespace tpu_embedding_config_util {

 // Shape function for ops whose single "parameters" output is one embedding
 // table.
 //
 // Parses the serialized TPUEmbeddingConfiguration from the
 // "tpu_embedding_config" attr, selects the table named by the "table_id" attr,
 // and sets the "parameters" output to a [num_rows, width] matrix.
 //
 // Returns InvalidArgument if the config cannot be parsed or if table_id does
 // not index a table in the config; propagates any error from GetAttr or
 // set_output.
 Status GradientDescentShapes(shape_inference::InferenceContext *c) {
   string config_string;
   TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
   tpu::TPUEmbeddingConfiguration config;
   if (!config.ParseFromString(config_string)) {
     return errors::InvalidArgument("Malformed tpu_embedding_config.");
   }

   int table_id;
   TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
   // Reject negative ids here as well as too-large ones: the id is used to
   // index the repeated table_config field, where an out-of-range index is not
   // reported as a Status.
   if (table_id < 0) {
     return errors::InvalidArgument("Table id < 0");
   }
   const int64 num_tables = config.table_config_size();
   if (table_id >= num_tables) {
     return errors::InvalidArgument("Table id >= num_tables");
   }
   const auto &table = config.table_config(table_id);
   const int64 width = table.width();
   const int64 num_rows = table.num_rows();

   TF_RETURN_IF_ERROR(c->set_output("parameters", {c->Matrix(num_rows, width)}));
   return Status::OK();
 }

 }  // namespace tpu_embedding_config_util

 // Registers the op that retrieves one host's shard of a gradient-descent
 // embedding table. The attr constraints match those on the corresponding Load
 // op, so out-of-range table/host ids are rejected by attr validation instead
 // of reaching the shape function.
 REGISTER_OP("TPUEmbeddingRetrieveGradientDescentParameters")
     .Output("parameters: float32")
     .Attr("tpu_embedding_config: string")
     .Attr("table_id: int >= 0")
     .Attr("num_hosts: int >= 1")
     .Attr("host_id: int >= 0")
     .SetIsStateful()
     .SetShapeFn(tpu_embedding_config_util::GradientDescentShapes)
     .Doc(R"doc(
 Retrieve an embedding table shard from TPU memory.

 TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
 trainable variables and optimizer state from TPU memory. This op enables
 functionality equivalent to GradientDescentOptimizer.

 tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
 table_id: The id of the table specified in tpu_embedding_config.
 num_hosts: The number of CPU hosts in the distributed training job.
 host_id: Which CPU host in the distributed training job will execute this op.
 )doc");

 // Registers the op that loads one host's shard of an embedding table and its
 // Adagrad accumulators. The op takes two float32 inputs and declares no
 // outputs; it is marked stateful and uses shape_inference::UnknownShape as its
 // shape function.
 REGISTER_OP("TPUEmbeddingLoadAdagradParameters")
     .Input("parameters: float32")
     .Input("accumulators: float32")
     .Attr("tpu_embedding_config: string")
     .Attr("table_id: int >= 0")
     .Attr("num_hosts: int >= 1")
     .Attr("host_id: int >= 0")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Load an embedding table shard into TPU memory for use with Adagrad.

 TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
 trainable variables and optimizer state from TPU memory. This op enables
 functionality equivalent to AdagradOptimizer.

 parameters: The shard of the embedding table resident on the host executing this
     op. For single-TPU models, this is the entire embedding table.
 accumulators: Shard of the Adagrad accumulators resident on the host executing
     this op.
 tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
 table_id: The id of the table specified in the tpu_embedding_config.
 num_hosts: The number of CPU hosts in the distributed training job.
 host_id: Which CPU host in the distributed training job will execute this op.
 )doc");

 namespace tpu_embedding_config_util {

 // Shape function for ops whose "parameters" and "accumulators" outputs are
 // each shaped like one embedding table.
 //
 // Parses the serialized TPUEmbeddingConfiguration from the
 // "tpu_embedding_config" attr, selects the table named by the "table_id" attr,
 // and sets both outputs to a [num_rows, width] matrix.
 //
 // Returns InvalidArgument if the config cannot be parsed or if table_id does
 // not index a table in the config; propagates any error from GetAttr or
 // set_output.
 Status AdagradShapes(shape_inference::InferenceContext *c) {
   string config_string;
   TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
   tpu::TPUEmbeddingConfiguration config;
   if (!config.ParseFromString(config_string)) {
     return errors::InvalidArgument("Malformed tpu_embedding_config.");
   }

   int table_id;
   TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
   // Check the lower bound here too, so this function does not depend on the
   // registering op's attr constraints before indexing the repeated
   // table_config field.
   if (table_id < 0) {
     return errors::InvalidArgument("Table id < 0");
   }
   const int64 num_tables = config.table_config_size();
   if (table_id >= num_tables) {
     return errors::InvalidArgument("Table id >= num_tables");
   }
   const auto &table = config.table_config(table_id);
   const int64 width = table.width();
   const int64 num_rows = table.num_rows();

   TF_RETURN_IF_ERROR(c->set_output("parameters", {c->Matrix(num_rows, width)}));
   TF_RETURN_IF_ERROR(
       c->set_output("accumulators", {c->Matrix(num_rows, width)}));
   return Status::OK();
 }

 }  // namespace tpu_embedding_config_util

 // Registers the op that retrieves one host's shard of an embedding table and
 // its Adagrad accumulators. Output shapes are computed by
 // tpu_embedding_config_util::AdagradShapes from the tpu_embedding_config and
 // table_id attrs.
 REGISTER_OP("TPUEmbeddingRetrieveAdagradParameters")
     .Output("parameters: float32")
     .Output("accumulators: float32")
     .Attr("tpu_embedding_config: string")
     .Attr("table_id: int >= 0")
     .Attr("num_hosts: int >= 1")
     .Attr("host_id: int >= 0")
     .SetIsStateful()
     .SetShapeFn(tpu_embedding_config_util::AdagradShapes)
     .Doc(R"doc(
 Retrieve an embedding table shard from TPU memory.

 TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
 trainable variables and optimizer state from TPU memory. This op enables
 functionality equivalent to AdagradOptimizer.

 tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
 table_id: The id of the table specified in the tpu_embedding_config.
 num_hosts: The number of CPU hosts in the distributed training job.
 host_id: Which CPU host in the distributed training job will execute this op.
 )doc");

 // Registers the op that enqueues a batch of embedding lookups. Each of the
 // three inputs is a list of num_tables tensors. num_tables is a list-length
 // attr, so it carries an explicit lower bound like the other ops in this file
 // that use it.
 REGISTER_OP("TPUEmbeddingEnqueueSparseBatch")
     .Input("sample_indices: num_tables * int32")
     .Input("embedding_indices: num_tables * int32")
     .Input("aggregation_weights: num_tables * float32")
     .Attr("num_tables: int >= 1")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 An op that feeds a batch of embedding indices and weights to the TPU.

 Embedding lookups are equivalent to sparse-dense matrix multiplications: the
 sparse matrix contains nonzeros in column j in order to retrieve row j from the
 embedding table.

 The three Tensor list arguments (sample_indices, embedding_indices, and
 aggregation_weights) represent these sparse matrices in COO format. The Tensor
 lists each have one entry for each embedding table specified in the model.
 For the kth embedding table, the three Tensors at position k in the list
 specify a COO-format sparse matrix. For the kth table, the row indices,
 column indices, and nonzero values of the COO sparse matrix are specified by
 sample_indices[k], embedding_indices[k], and aggregation_weights[k],
 respectively. Entries must be sorted by row index, then by column index.

 There should be at most one TPUEmbeddingEnqueueSparseBatch op in a single
 training step per TPU shard.

 sample_indices: A list of rank 1 Tensors specifying row indices of the COO
     sparse matrix representing the embedding lookups for each table.
 embedding_indices: A list of rank 1 Tensors specifying column indices of the
     COO sparse matrix representing the embedding lookups for each table.
 aggregation_weights: A list of rank 1 Tensors specifying the nonzero values
     of the COO sparse matrix representing the embedding lookups for each table.
 device_ordinal: The TPU device to use. This should be -1 when the Op
     is running on a TPU device, and >= 0 when the Op is running on the CPU
     device.
 )doc");

 namespace tpu_embedding_config_util {

 // Shape function for ops that emit one activation tensor per embedding table.
 //
 // Parses the serialized TPUEmbeddingConfiguration from the
 // "tpu_embedding_config" attr and, for each table in it, sets the output at
 // the table's index to a [batch_size * num_features, width] matrix.
 //
 // Returns InvalidArgument if the config cannot be parsed or if the number of
 // tables in the config differs from the number of outputs of the op;
 // propagates any error from GetAttr.
 Status ActivationShapes(shape_inference::InferenceContext *c) {
   string config_string;
   TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
   tpu::TPUEmbeddingConfiguration config;
   if (!config.ParseFromString(config_string)) {
     return errors::InvalidArgument("Malformed tpu_embedding_config.");
   }

   // Outputs are indexed by table id below, so the table count must agree with
   // the op's output count. Otherwise set_output would be called with an index
   // the op does not have, or some outputs would be left without a shape.
   const int64 num_tables = config.table_config_size();
   if (num_tables != c->num_outputs()) {
     return errors::InvalidArgument(
         "tpu_embedding_config specifies ", num_tables,
         " tables, but the op has ", c->num_outputs(), " outputs.");
   }

   const int64 batch_size = config.batch_size();
   for (int table_id = 0; table_id < num_tables; ++table_id) {
     const auto &table = config.table_config(table_id);
     const int64 width = table.width();
     const int64 num_features = table.num_features();
     c->set_output(table_id, c->Matrix(batch_size * num_features, width));
   }
   return Status::OK();
 }

 }  // namespace tpu_embedding_config_util

 // Registers the op that returns a list of num_tables float activation
 // tensors. Output shapes are computed by
 // tpu_embedding_config_util::ActivationShapes from the tpu_embedding_config
 // attr.
 REGISTER_OP("TPUEmbeddingReceiveActivations")
     .Output("outputs: num_tables * float")
     .Attr("num_tables: int >= 1")
     .Attr("tpu_embedding_config: string")
     .SetIsStateful()
     .SetShapeFn(tpu_embedding_config_util::ActivationShapes)
     .Doc(R"doc(
 An op that receives embedding activations on the TPU.

 The TPU system performs the embedding lookups and aggregations specified by
 the arguments to TPUEmbeddingEnqueueSparseBatch. The results of these
 aggregations are visible to the Tensorflow Graph as the outputs of a
 TPUEmbeddingReceiveActivations Op. This op returns a list containing one
 Tensor of activations per table specified in the model. There can be at most
 one TPUEmbeddingReceiveActivations op in the TPU graph.

 outputs: A TensorList of embedding activations containing one Tensor per
     embedding table in the model.
 num_tables: The number of output activation tensors, equal to the number of
     embedding tables in the model.
 tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
 )doc");

 // Registers the pass-through op that ties activations to an embedding
 // variable. The shape function forwards the shape of input 1
 // (sliced_activations) to the single output; input 0 (embedding_variable)
 // does not contribute to the output shape.
 REGISTER_OP("TPUEmbeddingActivations")
     .Input("embedding_variable: float32")
     .Input("sliced_activations: float32")
     .Output("output: float32")
     .Attr("table_id: int >= 0")
     .Attr("lookup_id: int >= 0")
     .SetShapeFn([](shape_inference::InferenceContext *c) {
       c->set_output(0, c->input(1));
       return Status::OK();
     })
     .Doc(R"doc(
 An op enabling differentiation of TPU Embeddings.

 This op simply returns its second input, which is assumed to have been sliced
 from the Tensors returned by TPUEmbeddingReceiveActivations. The presence of this
 op, and its first argument being a trainable Variable, enables automatic
 differentiation of graphs containing embeddings via the TPU Embedding Python
 libraries.

 embedding_variable: A trainable variable, enabling optimizers to find this op.
 sliced_activations: The embedding activations Tensor to return.
 table_id: The id of the table in the embedding layer configuration from which
     these activations were computed.
 lookup_id: Identifier of the set of embedding indices which produced these
     activations.
 )doc");

 // Registers the op that consumes a list of num_tables float32 gradient
 // tensors. The op declares no outputs; it is marked stateful and uses
 // shape_inference::UnknownShape as its shape function.
 // NOTE(review): the Doc string below has no entry for the num_tables attr.
 REGISTER_OP("TPUEmbeddingSendGradients")
     .Input("gradients: num_tables * float32")
     .Attr("num_tables: int >= 1")
     .Attr("tpu_embedding_config: string")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 An op that performs gradient updates of embedding tables.

 The TensorList argument has the same length and shapes as the return value of
 TPUEmbeddingReceiveActivations, but contains gradients of the model's loss
 with respect to the embedding activations. The embedding tables are updated
 from these gradients via the optimizer specified in the configuration given
 to tpu.initialize_system.

 gradients: A TensorList of gradients with which to update embedding tables.
 tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
 )doc");

 }  // namespace tensorflow
	/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#include "tensorflow/contrib/tpu/proto/tpu_embedding_config.pb.h"
	#include "tensorflow/core/framework/common_shape_fns.h"
	#include "tensorflow/core/framework/op.h"
	#include "tensorflow/core/framework/shape_inference.h"
	#include "tensorflow/core/lib/core/status.h"

	namespace tensorflow {

	// TPUs use a specialized mechanism for performing embedding lookups,
	// necessitating differences in TF Graphs that use embeddings on TPUs relative
	// to CPUs. Embedding lookups on TPU systems are achieved by including the
	// following in the TF Graph.
	//
	// 0. Construct a TPUEmbeddingConfiguration, specifying the embedding tables
	// in the model, the size of the TPU system to be used, and the optimizer to
	// be used for each table. Some of this information is redundant with other
	// pieces of the TF Graph.
	// 1. Pass this TPUEmbeddingConfiguration to tpu.initialize_system() as the
	// tpu_embedding_config parameter.
	// 2. Use the TPUEmbeddingLoad Op to initialize the embedding tables in TPU
	// memories, sharded across the memories attached to each Host.
	// 3. Use TPUEmbeddingEnqueueSparseBatch to provide the TPU with embedding
	// indices and aggregation weights.
	// 4. TPUEmbeddingReceiveActivations returns a list of Tensors, containing the
	// activations from each table specified in the configuration.
	// 5. TPUEmbeddingActivations, when used with appropriate Python libraries,
	// enables the automatic differentiation of models that use embeddings.
	// 6. TPUEmbeddingSendGradients takes a list of Tensors (of the same shapes
	// as those returned by TPUEmbeddingReceiveActivations) containing gradients
	// to use in updating the embedding tables.
	// 7. Before saving a checkpoint, use the TPUEmbeddingRetrieve Op to update
	// the Graph's embedding table Variables from the updated tables in the
	// TPU memories.
	//
	// TPU Embeddings use dedicated ops to enforce Host/TPU consistency in the
	// state of embedding table variables. Before beginning training or inference,
	// the model must Load the optimizer parameters into the TPU memories. Before
	// saving a checkpoint, the model must Retrieve the parameters back into the
	// host CPU memory.

	// Registers the op that loads one host's shard of an embedding table for use
	// with gradient descent. The op takes a single float32 input and declares no
	// outputs; it is marked stateful and uses shape_inference::UnknownShape as its
	// shape function.
	REGISTER_OP("TPUEmbeddingLoadGradientDescentParameters")
	.Input("parameters: float32")
	.Attr("tpu_embedding_config: string")
	.Attr("table_id: int >= 0")
	.Attr("num_hosts: int >= 1")
	.Attr("host_id: int >= 0")
	.SetIsStateful()
	.SetShapeFn(shape_inference::UnknownShape)
	.Doc(R"doc(
	Load an embedding table shard into TPU memory for use with GradientDescent.

	TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
	trainable variables and optimizer state from TPU memory. This op enables
	functionality equivalent to GradientDescentOptimizer.

	parameters: The shard of the embedding table resident on the host executing this
	op. For single-TPU models, this is the entire embedding table.
	tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
	table_id: The id of the table specified in the tpu_embedding_config.
	num_hosts: The number of CPU hosts in the distributed training job.
	host_id: Which CPU host in the distributed training job will execute this op.
	)doc");

	namespace tpu_embedding_config_util {

	// Shape function for ops whose single "parameters" output is one embedding
	// table.
	//
	// Parses the serialized TPUEmbeddingConfiguration from the
	// "tpu_embedding_config" attr, selects the table named by the "table_id" attr,
	// and sets the "parameters" output to a [num_rows, width] matrix.
	//
	// Returns InvalidArgument if the config cannot be parsed or if table_id does
	// not index a table in the config; propagates any error from GetAttr or
	// set_output.
	Status GradientDescentShapes(shape_inference::InferenceContext *c) {
	  string config_string;
	  TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
	  tpu::TPUEmbeddingConfiguration config;
	  if (!config.ParseFromString(config_string)) {
	    return errors::InvalidArgument("Malformed tpu_embedding_config.");
	  }

	  int table_id;
	  TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
	  // Reject negative ids here as well as too-large ones: the id is used to
	  // index the repeated table_config field, where an out-of-range index is not
	  // reported as a Status.
	  if (table_id < 0) {
	    return errors::InvalidArgument("Table id < 0");
	  }
	  const int64 num_tables = config.table_config_size();
	  if (table_id >= num_tables) {
	    return errors::InvalidArgument("Table id >= num_tables");
	  }
	  const auto &table = config.table_config(table_id);
	  const int64 width = table.width();
	  const int64 num_rows = table.num_rows();

	  TF_RETURN_IF_ERROR(c->set_output("parameters", {c->Matrix(num_rows, width)}));
	  return Status::OK();
	}

	} // namespace tpu_embedding_config_util

	// Registers the op that retrieves one host's shard of a gradient-descent
	// embedding table. The attr constraints match those on the corresponding Load
	// op, so out-of-range table/host ids are rejected by attr validation instead
	// of reaching the shape function.
	REGISTER_OP("TPUEmbeddingRetrieveGradientDescentParameters")
	    .Output("parameters: float32")
	    .Attr("tpu_embedding_config: string")
	    .Attr("table_id: int >= 0")
	    .Attr("num_hosts: int >= 1")
	    .Attr("host_id: int >= 0")
	    .SetIsStateful()
	    .SetShapeFn(tpu_embedding_config_util::GradientDescentShapes)
	    .Doc(R"doc(
	Retrieve an embedding table shard from TPU memory.

	TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
	trainable variables and optimizer state from TPU memory. This op enables
	functionality equivalent to GradientDescentOptimizer.

	tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
	table_id: The id of the table specified in tpu_embedding_config.
	num_hosts: The number of CPU hosts in the distributed training job.
	host_id: Which CPU host in the distributed training job will execute this op.
	)doc");

	// Registers the op that loads one host's shard of an embedding table and its
	// Adagrad accumulators. The op takes two float32 inputs and declares no
	// outputs; it is marked stateful and uses shape_inference::UnknownShape as its
	// shape function.
	REGISTER_OP("TPUEmbeddingLoadAdagradParameters")
	    .Input("parameters: float32")
	    .Input("accumulators: float32")
	    .Attr("tpu_embedding_config: string")
	    .Attr("table_id: int >= 0")
	    .Attr("num_hosts: int >= 1")
	    .Attr("host_id: int >= 0")
	    .SetIsStateful()
	    .SetShapeFn(shape_inference::UnknownShape)
	    .Doc(R"doc(
	Load an embedding table shard into TPU memory for use with Adagrad.

	TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
	trainable variables and optimizer state from TPU memory. This op enables
	functionality equivalent to AdagradOptimizer.

	parameters: The shard of the embedding table resident on the host executing this
	op. For single-TPU models, this is the entire embedding table.
	accumulators: Shard of the Adagrad accumulators resident on the host executing
	this op.
	tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
	table_id: The id of the table specified in the tpu_embedding_config.
	num_hosts: The number of CPU hosts in the distributed training job.
	host_id: Which CPU host in the distributed training job will execute this op.
	)doc");

	namespace tpu_embedding_config_util {

	// Shape function for ops whose "parameters" and "accumulators" outputs are
	// each shaped like one embedding table.
	//
	// Parses the serialized TPUEmbeddingConfiguration from the
	// "tpu_embedding_config" attr, selects the table named by the "table_id" attr,
	// and sets both outputs to a [num_rows, width] matrix.
	//
	// Returns InvalidArgument if the config cannot be parsed or if table_id does
	// not index a table in the config; propagates any error from GetAttr or
	// set_output.
	Status AdagradShapes(shape_inference::InferenceContext *c) {
	  string config_string;
	  TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
	  tpu::TPUEmbeddingConfiguration config;
	  if (!config.ParseFromString(config_string)) {
	    return errors::InvalidArgument("Malformed tpu_embedding_config.");
	  }

	  int table_id;
	  TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
	  // Check the lower bound here too, so this function does not depend on the
	  // registering op's attr constraints before indexing the repeated
	  // table_config field.
	  if (table_id < 0) {
	    return errors::InvalidArgument("Table id < 0");
	  }
	  const int64 num_tables = config.table_config_size();
	  if (table_id >= num_tables) {
	    return errors::InvalidArgument("Table id >= num_tables");
	  }
	  const auto &table = config.table_config(table_id);
	  const int64 width = table.width();
	  const int64 num_rows = table.num_rows();

	  TF_RETURN_IF_ERROR(c->set_output("parameters", {c->Matrix(num_rows, width)}));
	  TF_RETURN_IF_ERROR(
	      c->set_output("accumulators", {c->Matrix(num_rows, width)}));
	  return Status::OK();
	}

	} // namespace tpu_embedding_config_util

	// Registers the op that retrieves one host's shard of an embedding table and
	// its Adagrad accumulators. Output shapes are computed by
	// tpu_embedding_config_util::AdagradShapes from the tpu_embedding_config and
	// table_id attrs.
	REGISTER_OP("TPUEmbeddingRetrieveAdagradParameters")
	    .Output("parameters: float32")
	    .Output("accumulators: float32")
	    .Attr("tpu_embedding_config: string")
	    .Attr("table_id: int >= 0")
	    .Attr("num_hosts: int >= 1")
	    .Attr("host_id: int >= 0")
	    .SetIsStateful()
	    .SetShapeFn(tpu_embedding_config_util::AdagradShapes)
	    .Doc(R"doc(
	Retrieve an embedding table shard from TPU memory.

	TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
	trainable variables and optimizer state from TPU memory. This op enables
	functionality equivalent to AdagradOptimizer.

	tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
	table_id: The id of the table specified in the tpu_embedding_config.
	num_hosts: The number of CPU hosts in the distributed training job.
	host_id: Which CPU host in the distributed training job will execute this op.
	)doc");

	// Registers the op that enqueues a batch of embedding lookups. Each of the
	// three inputs is a list of num_tables tensors. num_tables is a list-length
	// attr, so it carries an explicit lower bound like the other ops in this file
	// that use it.
	REGISTER_OP("TPUEmbeddingEnqueueSparseBatch")
	    .Input("sample_indices: num_tables * int32")
	    .Input("embedding_indices: num_tables * int32")
	    .Input("aggregation_weights: num_tables * float32")
	    .Attr("num_tables: int >= 1")
	    .Attr("device_ordinal: int = -1")
	    .SetIsStateful()
	    .SetShapeFn(shape_inference::UnknownShape)
	    .Doc(R"doc(
	An op that feeds a batch of embedding indices and weights to the TPU.

	Embedding lookups are equivalent to sparse-dense matrix multiplications: the
	sparse matrix contains nonzeros in column j in order to retrieve row j from the
	embedding table.

	The three Tensor list arguments (sample_indices, embedding_indices, and
	aggregation_weights) represent these sparse matrices in COO format. The Tensor
	lists each have one entry for each embedding table specified in the model.
	For the kth embedding table, the three Tensors at position k in the list
	specify a COO-format sparse matrix. For the kth table, the row indices,
	column indices, and nonzero values of the COO sparse matrix are specified by
	sample_indices[k], embedding_indices[k], and aggregation_weights[k],
	respectively. Entries must be sorted by row index, then by column index.

	There should be at most one TPUEmbeddingEnqueueSparseBatch op in a single
	training step per TPU shard.

	sample_indices: A list of rank 1 Tensors specifying row indices of the COO
	sparse matrix representing the embedding lookups for each table.
	embedding_indices: A list of rank 1 Tensors specifying column indices of the
	COO sparse matrix representing the embedding lookups for each table.
	aggregation_weights: A list of rank 1 Tensors specifying the nonzero values
	of the COO sparse matrix representing the embedding lookups for each table.
	device_ordinal: The TPU device to use. This should be -1 when the Op
	is running on a TPU device, and >= 0 when the Op is running on the CPU
	device.
	)doc");

	namespace tpu_embedding_config_util {

	// Shape function for ops that emit one activation tensor per embedding table.
	//
	// Parses the serialized TPUEmbeddingConfiguration from the
	// "tpu_embedding_config" attr and, for each table in it, sets the output at
	// the table's index to a [batch_size * num_features, width] matrix.
	//
	// Returns InvalidArgument if the config cannot be parsed or if the number of
	// tables in the config differs from the number of outputs of the op;
	// propagates any error from GetAttr.
	Status ActivationShapes(shape_inference::InferenceContext *c) {
	  string config_string;
	  TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
	  tpu::TPUEmbeddingConfiguration config;
	  if (!config.ParseFromString(config_string)) {
	    return errors::InvalidArgument("Malformed tpu_embedding_config.");
	  }

	  // Outputs are indexed by table id below, so the table count must agree with
	  // the op's output count. Otherwise set_output would be called with an index
	  // the op does not have, or some outputs would be left without a shape.
	  const int64 num_tables = config.table_config_size();
	  if (num_tables != c->num_outputs()) {
	    return errors::InvalidArgument(
	        "tpu_embedding_config specifies ", num_tables,
	        " tables, but the op has ", c->num_outputs(), " outputs.");
	  }

	  const int64 batch_size = config.batch_size();
	  for (int table_id = 0; table_id < num_tables; ++table_id) {
	    const auto &table = config.table_config(table_id);
	    const int64 width = table.width();
	    const int64 num_features = table.num_features();
	    c->set_output(table_id, c->Matrix(batch_size * num_features, width));
	  }
	  return Status::OK();
	}

	} // namespace tpu_embedding_config_util

	// Registers the op that returns a list of num_tables float activation
	// tensors. Output shapes are computed by
	// tpu_embedding_config_util::ActivationShapes from the tpu_embedding_config
	// attr.
	REGISTER_OP("TPUEmbeddingReceiveActivations")
	    .Output("outputs: num_tables * float")
	    .Attr("num_tables: int >= 1")
	    .Attr("tpu_embedding_config: string")
	    .SetIsStateful()
	    .SetShapeFn(tpu_embedding_config_util::ActivationShapes)
	    .Doc(R"doc(
	An op that receives embedding activations on the TPU.

	The TPU system performs the embedding lookups and aggregations specified by
	the arguments to TPUEmbeddingEnqueueSparseBatch. The results of these
	aggregations are visible to the Tensorflow Graph as the outputs of a
	TPUEmbeddingReceiveActivations Op. This op returns a list containing one
	Tensor of activations per table specified in the model. There can be at most
	one TPUEmbeddingReceiveActivations op in the TPU graph.

	outputs: A TensorList of embedding activations containing one Tensor per
	embedding table in the model.
	num_tables: The number of output activation tensors, equal to the number of
	embedding tables in the model.
	tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
	)doc");

	// Registers the pass-through op that ties activations to an embedding
	// variable. The shape function forwards the shape of input 1
	// (sliced_activations) to the single output; input 0 (embedding_variable)
	// does not contribute to the output shape.
	REGISTER_OP("TPUEmbeddingActivations")
	    .Input("embedding_variable: float32")
	    .Input("sliced_activations: float32")
	    .Output("output: float32")
	    .Attr("table_id: int >= 0")
	    .Attr("lookup_id: int >= 0")
	    .SetShapeFn([](shape_inference::InferenceContext *c) {
	      c->set_output(0, c->input(1));
	      return Status::OK();
	    })
	    .Doc(R"doc(
	An op enabling differentiation of TPU Embeddings.

	This op simply returns its second input, which is assumed to have been sliced
	from the Tensors returned by TPUEmbeddingReceiveActivations. The presence of this
	op, and its first argument being a trainable Variable, enables automatic
	differentiation of graphs containing embeddings via the TPU Embedding Python
	libraries.

	embedding_variable: A trainable variable, enabling optimizers to find this op.
	sliced_activations: The embedding activations Tensor to return.
	table_id: The id of the table in the embedding layer configuration from which
	these activations were computed.
	lookup_id: Identifier of the set of embedding indices which produced these
	activations.
	)doc");

	// Registers the op that consumes a list of num_tables float32 gradient
	// tensors. The op declares no outputs; it is marked stateful and uses
	// shape_inference::UnknownShape as its shape function.
	// NOTE(review): the Doc string below has no entry for the num_tables attr.
	REGISTER_OP("TPUEmbeddingSendGradients")
	.Input("gradients: num_tables * float32")
	.Attr("num_tables: int >= 1")
	.Attr("tpu_embedding_config: string")
	.SetIsStateful()
	.SetShapeFn(shape_inference::UnknownShape)
	.Doc(R"doc(
	An op that performs gradient updates of embedding tables.

	The TensorList argument has the same length and shapes as the return value of
	TPUEmbeddingReceiveActivations, but contains gradients of the model's loss
	with respect to the embedding activations. The embedding tables are updated
	from these gradients via the optimizer specified in the configuration given
	to tpu.initialize_system.

	gradients: A TensorList of gradients with which to update embedding tables.
	tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
	)doc");

	} // namespace tensorflow