tensorflow/contrib/tpu/python/tpu/device_assignment.py - platform/external/tensorflow - Git at Google

 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ======================================
 """Library of TPU helper functions."""

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import math
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin

 from tensorflow.contrib.tpu.python.tpu.topology import Topology


 def _tpu_device_name(job, task, device):
   """Returns the device name for the TPU `device` on `task` of `job`."""
   if job is None:
     return "/task:%d/device:TPU:%d" % (task, device)
   else:
     return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)


 def _tpu_host_device_name(job, task):
   """Returns the device name for the CPU device on `task` of `job`."""
   if job is None:
     return "/task:%d/device:CPU:0" % task
   else:
     return "/job:%s/task:%d/device:CPU:0" % (job, task)


 class DeviceAssignment(object):
   """Mapping from logical cores in a computation to the physical TPU topology.

   Prefer to use the `device_assignment()` helper to construct a
   `DeviceAssignment`; it is easier if less flexible than constructing a
   `DeviceAssignment` directly.
   """

   def __init__(self, topology, core_assignment):
     """Constructs a `DeviceAssignment` object.

     Args:
       topology: A `Topology` object that describes the physical TPU topology.
       core_assignment: A logical to physical core mapping, represented as a
         rank 3 numpy array. See the description of the `core_assignment`
         property for more details.

     Raises:
       ValueError: If `topology` is not `Topology` object.
       ValueError: If `core_assignment` is not a rank 3 numpy array.
     """
     if not isinstance(topology, Topology):
       raise ValueError("topology must be a Topology object, got {}".format(
           type(topology)))
     core_assignment = np.asarray(core_assignment, dtype=np.int32)

     self._topology = topology
     self._topology_tasks, self._topology_devices = (
         self._invert_topology(topology))

     topology_rank = self._topology_tasks.ndim
     if core_assignment.ndim != topology_rank + 2:
       raise ValueError("core_assignment must be a rank {} numpy array".format(
           topology_rank + 2))

     self._num_replicas = core_assignment.shape[0]
     self._computation_shape = np.array(
         core_assignment.shape[1:-1], dtype=np.int32)

     if core_assignment.shape[-1] != topology_rank:
       raise ValueError(
           "minor dimension of core_assignment must have size equal to topology "
           "rank ({}), got shape {}".format(topology_rank,
                                            core_assignment.shape))

     self._core_assignment = core_assignment
     self._task_and_cores_to_replicas = self._compute_task_and_cores_to_replicas(
         self._core_assignment, self._topology_tasks)

   def _invert_topology(self, topology):
     """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
     mesh_shape = topology.mesh_shape
     tasks = np.full(list(mesh_shape), -1, dtype=np.int32)
     devices = np.full(list(mesh_shape), -1, dtype=np.int32)
     for task in xrange(topology.device_coordinates.shape[0]):
       for device in xrange(topology.device_coordinates.shape[1]):
         x, y, z = topology.device_coordinates[task, device, :]
         tasks[x, y, z] = task
         devices[x, y, z] = device
     return tasks, devices

   def _compute_task_and_cores_to_replicas(self, core_assignment,
                                           topology_tasks):
     """Computes a nested dict which maps task and logical core to replicas."""
     task_and_cores_to_replicas = {}
     for replica in xrange(core_assignment.shape[0]):
       for dx in xrange(core_assignment.shape[1]):
         for dy in xrange(core_assignment.shape[2]):
           for dz in xrange(core_assignment.shape[3]):
             x, y, z = core_assignment[replica, dx, dy, dz, :]
             task_id = topology_tasks[x, y, z]
             if task_id not in task_and_cores_to_replicas:
               task_and_cores_to_replicas[task_id] = {}
             logical_core = (dx, dy, dz)
             if logical_core not in task_and_cores_to_replicas[task_id]:
               task_and_cores_to_replicas[task_id][logical_core] = set()

             task_and_cores_to_replicas[task_id][logical_core].add(replica)

     task_to_sorted_replica_id = {}

     for task, core_to_replicas in task_and_cores_to_replicas.items():
       core_to_sorted_replicas = {}
       for core, replicas in core_to_replicas.items():
         core_to_sorted_replicas[core] = sorted(replicas)

       task_to_sorted_replica_id[task] = core_to_sorted_replicas
     return task_to_sorted_replica_id

   @property
   def topology(self):
     """A `Topology` that describes the TPU topology."""
     return self._topology

   @property
   def computation_shape(self):
     """The computation shape.

     Returns:
       A rank-1 int32 numpy array with size equal to the TPU topology rank.
       Describes the logical shape in numbers of core of each replica of the
       computation in the TPU topology.

     Returns:
       The computation shape.
     """
     return self._computation_shape

   @property
   def num_cores_per_replica(self):
     """The number of cores per replica."""
     return np.prod(self.computation_shape)

   @property
   def num_replicas(self):
     """The number of replicas of the computation."""
     return self._num_replicas

   @property
   def core_assignment(self):
     """The logical to physical core mapping.

     Returns:
       A numpy array of rank `topology_rank + 2`, with shape
       `[num_replicas] + computation_shape + [topology_rank]`. Maps
       (replica, logical core coordinates) pairs to physical topology
       coordinates.
     """
     return self._core_assignment

   def _coordinates(self, replica, logical_core):
     """Returns the physical topology coordinates of a logical core."""
     if logical_core is None:
       logical_core = np.array([0, 0, 0], np.int32)
     else:
       logical_core = np.asarray(logical_core)

     if any(logical_core < 0) or any(logical_core >= self.computation_shape):
       raise ValueError("Invalid core {}; computation shape is {}".format(
           logical_core, self.computation_shape))

     logical_offset = tuple([replica] + logical_core.tolist() + [slice(3)])
     return tuple(self.core_assignment[logical_offset])

   def lookup_replicas(self, task_id, logical_core):
     """Lookup replica ids by task number and logical core.

     Args:
       task_id: TensorFlow task number.
       logical_core: A tuple of three integers which represents a logical core.
     Returns:
       A sorted list of the replicas that are attached to that task and
       logical_core.
     Raises:
       ValueError: If no replica exists in the task which contains the logical
       core.
     """
     try:
       return self._task_and_cores_to_replicas[task_id][logical_core]
     except KeyError:
       raise ValueError(
           "Can not find any replica in task: {} contains logical_core: {} ".
           format(task_id, logical_core))

   def tpu_ordinal(self, replica=0, logical_core=None):
     """Returns the ordinal of the TPU device assigned to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
     return self._topology_devices[coordinates]

   def host_device(self, replica=0, logical_core=None, job=None):
     """Returns the CPU device attached to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
     return _tpu_host_device_name(job, self._topology_tasks[coordinates])

   def tpu_device(self, replica=0, logical_core=None, job=None):
     """Returns the name of the TPU device assigned to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
     return _tpu_device_name(job, self._topology_tasks[coordinates],
                             self._topology_devices[coordinates])


 def device_assignment(topology,
                       computation_shape=None,
                       computation_stride=None,
                       num_replicas=1):
   """Computes a device_assignment of a computation across a TPU topology.

   Returns a `DeviceAssignment` that describes the cores in the topology assigned
   to each core of each replica.

   `computation_shape` and `computation_stride` values should be powers of 2 for
   optimal packing.

   Args:
     topology: A `Topology` object that describes the TPU cluster topology.
       To obtain a TPU topology, evaluate the `Tensor` returned by
       `initialize_system` using `Session.run`. Either a serialized
       `TopologyProto` or a `Topology` object may be passed. Note: you must
       evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
     computation_shape: A rank 1 int32 numpy array of size 3, describing the
       shape of the computation's block of cores. If None, the
       `computation_shape` is `[1, 1, 1]`.
     computation_stride: A rank 1 int32 numpy array of size 3, describing the
       inter-core spacing of the `computation_shape` cores in the TPU topology.
       If None, the `computation_stride` is `[1, 1, 1]`.
     num_replicas: The number of computation replicas to run. The replicas will
       be packed into the free spaces of the topology.

   Returns:
     A DeviceAssignment object, which describes the mapping between the logical
     cores in each computation replica and the physical cores in the TPU
     topology.

   Raises:
     ValueError: If `topology` is not a valid `Topology` object.
     ValueError: If `computation_shape` or `computation_stride` are not 1D int32
       numpy arrays with shape [3] where all values are positive.
     ValueError: If computation's replicas cannot fit into the TPU topology.
   """
   # Deserialize the Topology proto, if it is a string.
   if isinstance(topology, bytes):
     topology = Topology(serialized=topology)

   if not isinstance(topology, Topology):
     raise ValueError("`topology` is not a Topology object; got {}".format(
         type(topology)))

   topology_rank = len(topology.mesh_shape)
   mesh_shape = topology.mesh_shape
   if computation_shape is None:
     computation_shape = np.array([1, 1, 1], dtype=np.int32)
   else:
     computation_shape = np.asarray(computation_shape, dtype=np.int32)

   if computation_stride is None:
     computation_stride = np.array([1, 1, 1], dtype=np.int32)
   else:
     computation_stride = np.asarray(computation_stride, dtype=np.int32)

   if computation_shape.shape != (3,):
     raise ValueError("computation_shape must have shape [3]; got {}".format(
         computation_shape.shape))
   if computation_stride.shape != (3,):
     raise ValueError("computation_stride must have shape [3]; got {}".format(
         computation_stride.shape))

   if any(computation_shape < 1):
     raise ValueError(
         "computation_shape must be positive; got computation_shape={}".format(
             computation_shape))
   if any(computation_stride < 1):
     raise ValueError(
         "computation_stride must be positive; got computation_stride={}".format(
             computation_stride))

   # Computes the physical size of one computation instance.
   computation_footprint = computation_shape * computation_stride
   if any(computation_footprint > mesh_shape):
     raise ValueError(
         "computation footprint {} does not fit in TPU topology shape {}".format(
             computation_footprint, mesh_shape))

   # Computes how many copies of the computation footprint fit in the mesh.
   block_counts = mesh_shape // computation_footprint

   replica_counts = block_counts * computation_stride
   max_replicas = np.prod(replica_counts)
   if num_replicas > max_replicas:
     raise ValueError(
         "requested {} replicas but only {} replicas with shape {} and "
         "computation_stride {} fit in a TPU mesh of shape {}".format(
             num_replicas, max_replicas, computation_shape, computation_stride,
             mesh_shape))

   # Choose a compact layout for the cores. Choose the smaller dimension in the
   # topology to be close to the square root of the number of replicas.
   num_chips = int(math.ceil(num_replicas / replica_counts[2]))
   target_size = int(math.ceil(math.sqrt(num_chips)))

   # Prefer an even size, if possible. Odd numbered rows head back towards the
   # first column, so it's best if the last row has an odd index.
   if target_size % 2 != 0:
     target_size -= 1
   y_size = min(replica_counts[1], target_size)
   if y_size * replica_counts[0] < num_chips:
     y_size = replica_counts[1]

   # Assigns an offset to each replica such that no two replicas overlap.
   replica_offsets = np.full([num_replicas, 3], -1, dtype=np.int32)
   for replica in xrange(num_replicas):
     # Chooses a replica number in X/Y/Z axes.
     z = replica % replica_counts[2]
     t = replica // replica_counts[2]
     y = t % y_size
     x = t // y_size
     replica_pos = np.array([x, y, z], dtype=np.int32)

     # Determines where that replica starts in each axis.
     outer = replica_pos // computation_stride
     inner = replica_pos % computation_stride
     replica_offsets[replica, :] = outer * computation_footprint + inner

   # Computes a complete logical core -> physical core mapping for each replica.
   indices = [
       np.arange(0, computation_shape[i] * computation_stride[i],
                 computation_stride[i]) for i in xrange(topology_rank)
   ]
   indices = np.concatenate(
       [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
       axis=-1)
   assignment = (
       indices + replica_offsets[:, np.newaxis, np.newaxis, np.newaxis, :])
   return DeviceAssignment(topology, core_assignment=assignment)
	# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ======================================
	"""Library of TPU helper functions."""

	from __future__ import absolute_import
	from __future__ import division
	from __future__ import print_function

	import math
	import numpy as np
	from six.moves import xrange # pylint: disable=redefined-builtin

	from tensorflow.contrib.tpu.python.tpu.topology import Topology


	def _tpu_device_name(job, task, device):
	"""Returns the device name for the TPU `device` on `task` of `job`."""
	if job is None:
	return "/task:%d/device:TPU:%d" % (task, device)
	else:
	return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)


	def _tpu_host_device_name(job, task):
	"""Returns the device name for the CPU device on `task` of `job`."""
	if job is None:
	return "/task:%d/device:CPU:0" % task
	else:
	return "/job:%s/task:%d/device:CPU:0" % (job, task)


	class DeviceAssignment(object):
	"""Mapping from logical cores in a computation to the physical TPU topology.

	Prefer to use the `device_assignment()` helper to construct a
	`DeviceAssignment`; it is easier if less flexible than constructing a
	`DeviceAssignment` directly.
	"""

	def __init__(self, topology, core_assignment):
	"""Constructs a `DeviceAssignment` object.

	Args:
	topology: A `Topology` object that describes the physical TPU topology.
	core_assignment: A logical to physical core mapping, represented as a
	rank 3 numpy array. See the description of the `core_assignment`
	property for more details.

	Raises:
	ValueError: If `topology` is not `Topology` object.
	ValueError: If `core_assignment` is not a rank 3 numpy array.
	"""
	if not isinstance(topology, Topology):
	raise ValueError("topology must be a Topology object, got {}".format(
	type(topology)))
	core_assignment = np.asarray(core_assignment, dtype=np.int32)

	self._topology = topology
	self._topology_tasks, self._topology_devices = (
	self._invert_topology(topology))

	topology_rank = self._topology_tasks.ndim
	if core_assignment.ndim != topology_rank + 2:
	raise ValueError("core_assignment must be a rank {} numpy array".format(
	topology_rank + 2))

	self._num_replicas = core_assignment.shape[0]
	self._computation_shape = np.array(
	core_assignment.shape[1:-1], dtype=np.int32)

	if core_assignment.shape[-1] != topology_rank:
	raise ValueError(
	"minor dimension of core_assignment must have size equal to topology "
	"rank ({}), got shape {}".format(topology_rank,
	core_assignment.shape))

	self._core_assignment = core_assignment
	self._task_and_cores_to_replicas = self._compute_task_and_cores_to_replicas(
	self._core_assignment, self._topology_tasks)

	def _invert_topology(self, topology):
	"""Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
	mesh_shape = topology.mesh_shape
	tasks = np.full(list(mesh_shape), -1, dtype=np.int32)
	devices = np.full(list(mesh_shape), -1, dtype=np.int32)
	for task in xrange(topology.device_coordinates.shape[0]):
	for device in xrange(topology.device_coordinates.shape[1]):
	x, y, z = topology.device_coordinates[task, device, :]
	tasks[x, y, z] = task
	devices[x, y, z] = device
	return tasks, devices

	def _compute_task_and_cores_to_replicas(self, core_assignment,
	topology_tasks):
	"""Computes a nested dict which maps task and logical core to replicas."""
	task_and_cores_to_replicas = {}
	for replica in xrange(core_assignment.shape[0]):
	for dx in xrange(core_assignment.shape[1]):
	for dy in xrange(core_assignment.shape[2]):
	for dz in xrange(core_assignment.shape[3]):
	x, y, z = core_assignment[replica, dx, dy, dz, :]
	task_id = topology_tasks[x, y, z]
	if task_id not in task_and_cores_to_replicas:
	task_and_cores_to_replicas[task_id] = {}
	logical_core = (dx, dy, dz)
	if logical_core not in task_and_cores_to_replicas[task_id]:
	task_and_cores_to_replicas[task_id][logical_core] = set()

	task_and_cores_to_replicas[task_id][logical_core].add(replica)

	task_to_sorted_replica_id = {}

	for task, core_to_replicas in task_and_cores_to_replicas.items():
	core_to_sorted_replicas = {}
	for core, replicas in core_to_replicas.items():
	core_to_sorted_replicas[core] = sorted(replicas)

	task_to_sorted_replica_id[task] = core_to_sorted_replicas
	return task_to_sorted_replica_id

	@property
	def topology(self):
	"""A `Topology` that describes the TPU topology."""
	return self._topology

	@property
	def computation_shape(self):
	"""The computation shape.

	Returns:
	A rank-1 int32 numpy array with size equal to the TPU topology rank.
	Describes the logical shape in numbers of core of each replica of the
	computation in the TPU topology.

	Returns:
	The computation shape.
	"""
	return self._computation_shape

	@property
	def num_cores_per_replica(self):
	"""The number of cores per replica."""
	return np.prod(self.computation_shape)

	@property
	def num_replicas(self):
	"""The number of replicas of the computation."""
	return self._num_replicas

	@property
	def core_assignment(self):
	"""The logical to physical core mapping.

	Returns:
	A numpy array of rank `topology_rank + 2`, with shape
	`[num_replicas] + computation_shape + [topology_rank]`. Maps
	(replica, logical core coordinates) pairs to physical topology
	coordinates.
	"""
	return self._core_assignment

	def _coordinates(self, replica, logical_core):
	"""Returns the physical topology coordinates of a logical core."""
	if logical_core is None:
	logical_core = np.array([0, 0, 0], np.int32)
	else:
	logical_core = np.asarray(logical_core)

	if any(logical_core < 0) or any(logical_core >= self.computation_shape):
	raise ValueError("Invalid core {}; computation shape is {}".format(
	logical_core, self.computation_shape))

	logical_offset = tuple([replica] + logical_core.tolist() + [slice(3)])
	return tuple(self.core_assignment[logical_offset])

	def lookup_replicas(self, task_id, logical_core):
	"""Lookup replica ids by task number and logical core.

	Args:
	task_id: TensorFlow task number.
	logical_core: A tuple of three integers which represents a logical core.
	Returns:
	A sorted list of the replicas that are attached to that task and
	logical_core.
	Raises:
	ValueError: If no replica exists in the task which contains the logical
	core.
	"""
	try:
	return self._task_and_cores_to_replicas[task_id][logical_core]
	except KeyError:
	raise ValueError(
	"Can not find any replica in task: {} contains logical_core: {} ".
	format(task_id, logical_core))

	def tpu_ordinal(self, replica=0, logical_core=None):
	"""Returns the ordinal of the TPU device assigned to a logical core."""
	coordinates = self._coordinates(replica, logical_core)
	return self._topology_devices[coordinates]

	def host_device(self, replica=0, logical_core=None, job=None):
	"""Returns the CPU device attached to a logical core."""
	coordinates = self._coordinates(replica, logical_core)
	return _tpu_host_device_name(job, self._topology_tasks[coordinates])

	def tpu_device(self, replica=0, logical_core=None, job=None):
	"""Returns the name of the TPU device assigned to a logical core."""
	coordinates = self._coordinates(replica, logical_core)
	return _tpu_device_name(job, self._topology_tasks[coordinates],
	self._topology_devices[coordinates])


	def device_assignment(topology,
	computation_shape=None,
	computation_stride=None,
	num_replicas=1):
	"""Computes a device_assignment of a computation across a TPU topology.

	Returns a `DeviceAssignment` that describes the cores in the topology assigned
	to each core of each replica.

	`computation_shape` and `computation_stride` values should be powers of 2 for
	optimal packing.

	Args:
	topology: A `Topology` object that describes the TPU cluster topology.
	To obtain a TPU topology, evaluate the `Tensor` returned by
	`initialize_system` using `Session.run`. Either a serialized
	`TopologyProto` or a `Topology` object may be passed. Note: you must
	evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
	computation_shape: A rank 1 int32 numpy array of size 3, describing the
	shape of the computation's block of cores. If None, the
	`computation_shape` is `[1, 1, 1]`.
	computation_stride: A rank 1 int32 numpy array of size 3, describing the
	inter-core spacing of the `computation_shape` cores in the TPU topology.
	If None, the `computation_stride` is `[1, 1, 1]`.
	num_replicas: The number of computation replicas to run. The replicas will
	be packed into the free spaces of the topology.

	Returns:
	A DeviceAssignment object, which describes the mapping between the logical
	cores in each computation replica and the physical cores in the TPU
	topology.

	Raises:
	ValueError: If `topology` is not a valid `Topology` object.
	ValueError: If `computation_shape` or `computation_stride` are not 1D int32
	numpy arrays with shape [3] where all values are positive.
	ValueError: If computation's replicas cannot fit into the TPU topology.
	"""
	# Deserialize the Topology proto, if it is a string.
	if isinstance(topology, bytes):
	topology = Topology(serialized=topology)

	if not isinstance(topology, Topology):
	raise ValueError("`topology` is not a Topology object; got {}".format(
	type(topology)))

	topology_rank = len(topology.mesh_shape)
	mesh_shape = topology.mesh_shape
	if computation_shape is None:
	computation_shape = np.array([1, 1, 1], dtype=np.int32)
	else:
	computation_shape = np.asarray(computation_shape, dtype=np.int32)

	if computation_stride is None:
	computation_stride = np.array([1, 1, 1], dtype=np.int32)
	else:
	computation_stride = np.asarray(computation_stride, dtype=np.int32)

	if computation_shape.shape != (3,):
	raise ValueError("computation_shape must have shape [3]; got {}".format(
	computation_shape.shape))
	if computation_stride.shape != (3,):
	raise ValueError("computation_stride must have shape [3]; got {}".format(
	computation_stride.shape))

	if any(computation_shape < 1):
	raise ValueError(
	"computation_shape must be positive; got computation_shape={}".format(
	computation_shape))
	if any(computation_stride < 1):
	raise ValueError(
	"computation_stride must be positive; got computation_stride={}".format(
	computation_stride))

	# Computes the physical size of one computation instance.
	computation_footprint = computation_shape * computation_stride
	if any(computation_footprint > mesh_shape):
	raise ValueError(
	"computation footprint {} does not fit in TPU topology shape {}".format(
	computation_footprint, mesh_shape))

	# Computes how many copies of the computation footprint fit in the mesh.
	block_counts = mesh_shape // computation_footprint

	replica_counts = block_counts * computation_stride
	max_replicas = np.prod(replica_counts)
	if num_replicas > max_replicas:
	raise ValueError(
	"requested {} replicas but only {} replicas with shape {} and "
	"computation_stride {} fit in a TPU mesh of shape {}".format(
	num_replicas, max_replicas, computation_shape, computation_stride,
	mesh_shape))

	# Choose a compact layout for the cores. Choose the smaller dimension in the
	# topology to be close to the square root of the number of replicas.
	num_chips = int(math.ceil(num_replicas / replica_counts[2]))
	target_size = int(math.ceil(math.sqrt(num_chips)))

	# Prefer an even size, if possible. Odd numbered rows head back towards the
	# first column, so it's best if the last row has an odd index.
	if target_size % 2 != 0:
	target_size -= 1
	y_size = min(replica_counts[1], target_size)
	if y_size * replica_counts[0] < num_chips:
	y_size = replica_counts[1]

	# Assigns an offset to each replica such that no two replicas overlap.
	replica_offsets = np.full([num_replicas, 3], -1, dtype=np.int32)
	for replica in xrange(num_replicas):
	# Chooses a replica number in X/Y/Z axes.
	z = replica % replica_counts[2]
	t = replica // replica_counts[2]
	y = t % y_size
	x = t // y_size
	replica_pos = np.array([x, y, z], dtype=np.int32)

	# Determines where that replica starts in each axis.
	outer = replica_pos // computation_stride
	inner = replica_pos % computation_stride
	replica_offsets[replica, :] = outer * computation_footprint + inner

	# Computes a complete logical core -> physical core mapping for each replica.
	indices = [
	np.arange(0, computation_shape[i] * computation_stride[i],
	computation_stride[i]) for i in xrange(topology_rank)
	]
	indices = np.concatenate(
	[i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
	axis=-1)
	assignment = (
	indices + replica_offsets[:, np.newaxis, np.newaxis, np.newaxis, :])
	return DeviceAssignment(topology, core_assignment=assignment)