aten/src/THC/THCTensorInfo.cuh - platform/external/pytorch - Git at Google

 #ifndef THC_TENSOR_INFO_INC
 #define THC_TENSOR_INFO_INC

 #include <cuda.h>
 #include <assert.h>
 #include "THCGeneral.h"
 #include "THCIntegerDivider.cuh"
 #include "THCTensor.h"

 // Maximum number of dimensions allowed for cutorch
 #define MAX_CUTORCH_DIMS 25

 // Warning string for tensor arguments that are too large or have too
 // many dimensions
 #define CUTORCH_STR(X) #X
 #define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
   CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"

 // CUDA kernel argument that defines tensor layout
 template <typename T, typename IndexType>
 struct TensorInfo {
   TensorInfo(T* p,
              int dim,
              IndexType sz[MAX_CUTORCH_DIMS],
              IndexType st[MAX_CUTORCH_DIMS]);

   // Set the size of the given dimension to 1, as if it were a
   // reduction dim (allows you to calculate offsets of the reduction
   // slice)
   void reduceDim(int dim);

   /*
   Updates the TensorInfo's dims, sizes, and strides to reflect a "collapse" of
   the info, possibly excluding the optional excludeDim. A "collapsed" version
   of the info is the fewest dims that order the tensor's elements in the same
   way as the original info. If excludeDim is specified, the collapse is the
   fewest dims that order the tensor's elements as the original and preserve the
   excluded dimension, unless the tensor collapses to a point.

   Returns the (new) index of the preserved dimension if excludeDim is
   specified. Returns 0 if the tensor is collapsed to a point. Returns -1
   otherwise.
   */
   int collapseDims(const int excludeDim = -1);

   // Contiguous tensors of more than one dimension are collapsed down
   // to one tensor
   __host__ __device__ inline bool isContiguous() const {
     return (dims == 1 && strides[0] == 1);
   }

   T* data;
   IndexType sizes[MAX_CUTORCH_DIMS];
   IndexType strides[MAX_CUTORCH_DIMS];
   int dims;
 };

 template <typename T, typename IndexType>
 TensorInfo<T, IndexType>::TensorInfo(T* p,
                                      int dim,
                                      IndexType sz[MAX_CUTORCH_DIMS],
                                      IndexType st[MAX_CUTORCH_DIMS]) {
   data = p;
   dims = dim;
   assert(dims > 0 && dims < MAX_CUTORCH_DIMS);

   for (int i = 0; i < dim; ++i) {
     sizes[i] = sz[i];
     strides[i] = st[i];
   }
 }

 template <typename T, typename IndexType>
 void
 TensorInfo<T, IndexType>::reduceDim(int dim) {
   assert(dim < dims && dim >= 0);
   sizes[dim] = 1;
 }

 template <typename T, typename IndexType>
 int
 TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {

   assert(excludeDim >= -1 && excludeDim < dims);

   int stopDim = (excludeDim == -1) ? dims : excludeDim;
   int newIndex = -1;
   int oldIndex = 0;
   int remappedExcludedDim = -1;

   while (oldIndex < dims) {
     // Finds a dimension to collapse into
     for (; oldIndex < stopDim; ++oldIndex) {
       if (sizes[oldIndex] == 1) {
         continue;
       }
       ++newIndex;
       sizes[newIndex] = sizes[oldIndex];
       strides[newIndex] = strides[oldIndex];
       ++oldIndex;
       break;
     }

     // Collapses dims
     for (; oldIndex < stopDim; ++oldIndex) {
       if (sizes[oldIndex] == 1) {
         continue;
       }

       if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) {
         sizes[newIndex] *= sizes[oldIndex];
         strides[newIndex] = strides[oldIndex];
       } else {
         ++newIndex;
         sizes[newIndex] = sizes[oldIndex];
         strides[newIndex] = strides[oldIndex];
       }
     }

     // Handles excludeDim being set (oldIndex == excludeDim)
     if (oldIndex != dims) {

       // Preserves excluded dimension
       ++newIndex;
       sizes[newIndex] = sizes[oldIndex];
       strides[newIndex] = strides[oldIndex];
       remappedExcludedDim = newIndex;

       // Restarts iteration after excludeDim
       ++oldIndex;
       stopDim = dims;
     }
   }

   // Handles special case of all dims size 1
   if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) {
     dims = 1;
     sizes[0] = 1;
     strides[0] = 1;

     return 0;
   }

   dims = newIndex + 1;
   return remappedExcludedDim;
 }

 // Translate a linear index for the apply to a T* offset;
 // specialized on `Dims` to reduce nvcc compilation time
 template <typename T, typename IndexType, int Dims>
 struct IndexToOffset {
   static __host__ __device__ IndexType get(
     IndexType linearId,
     const TensorInfo<T, IndexType>& info) {

     IndexType offset = 0;

     // Uses static dims
     for (int i = Dims - 1; i > 0; --i) {
       IndexType curDimIndex = linearId % info.sizes[i];
       IndexType curDimOffset = curDimIndex * info.strides[i];
       offset += curDimOffset;
       linearId /= info.sizes[i];
     }

     return offset + linearId * info.strides[0];
   }
 };

 template <typename T, typename IndexType>
 struct IndexToOffset<T, IndexType, -1> {
   static inline __host__ __device__ IndexType get(
     IndexType linearId,
     const TensorInfo<T, IndexType>& info) {

     IndexType offset = 0;

     // Uses dynamic dims
     for (int i = info.dims - 1; i > 0; --i) {
       IndexType curDimIndex = linearId % info.sizes[i];
       IndexType curDimOffset = curDimIndex * info.strides[i];
       offset += curDimOffset;
       linearId /= info.sizes[i];
     }

     return offset + linearId * info.strides[0];
   }
 };

 // OffsetInfo is a faster implementation of IndexToOffset that uses faster
 // integer division: we transform each division into integer multiplication by a
 // pre-computed constant.  (See IntDivider for details.)
 template <typename T, typename IndexType, int Dims>
 struct OffsetInfo {
   explicit OffsetInfo(const TensorInfo<T, IndexType>& tinfo) {
     assert(tinfo.dims == Dims);
     data = tinfo.data;

     for (int i = 0; i < Dims; ++i) {
       sizes[i] = IntDivider<IndexType>(tinfo.sizes[i]);
       strides[i] = tinfo.strides[i];
     }
   }

   __host__ __device__ T* get(IndexType linearIndex) const {
     IndexType offset = 0;

     for (int i = Dims - 1; i > 0; --i) {
       DivMod<IndexType> divmod = sizes[i].divmod(linearIndex);
       linearIndex = divmod.div;
       offset += divmod.mod * strides[i];
     }

     return &data[offset + linearIndex * strides[0]];
   }

   T* data;
   IntDivider<IndexType> sizes[Dims];
   IndexType strides[Dims];
 };

 // For 1D tensors the offset equals linear index * stride.
 template <typename T, typename IndexType>
 struct OffsetInfo<T, IndexType, 1> {
   explicit OffsetInfo(const TensorInfo<T, IndexType>& tinfo)
     : data{tinfo.data}, stride{tinfo.strides[0]} {}

   __host__ __device__ T* get(IndexType linearIndex) const {
     return &data[linearIndex * stride];
   }

   T* data;
   const IndexType stride;
 };

 // Dims=-1 is used when the dimension is unknown at compile time.
 //
 // Unfortunately, pre-computation does not work here, because of a bug in nvcc
 // (tested on CUDA 8.0): if a kernel argument contains an array that is
 // dynamically accessed, the whole array is first copied into the local memory.
 // (That is, every kernel thread makes its own copy of the argument, even if it
 // is never updated.)  Pre-computation makes it worse because now we have more
 // data to copy.
 //
 // So let's fall back to vanilla division approach.

 template <typename T, typename IndexType>
 struct OffsetInfo<T, IndexType, -1> {
   explicit OffsetInfo(const TensorInfo<T, IndexType>& tinfo)
     : tinfo(tinfo) { }

   __host__ __device__ T* get(IndexType linearIndex) const {
     IndexType offset = IndexToOffset<T, IndexType, -1>::get(linearIndex, tinfo);
     return &tinfo.data[offset];
   }

   TensorInfo<T, IndexType> tinfo;
 };

 #endif // THC_TENSOR_INFO_INC
	#ifndef THC_TENSOR_INFO_INC
	#define THC_TENSOR_INFO_INC

	#include <cuda.h>
	#include <assert.h>
	#include "THCGeneral.h"
	#include "THCIntegerDivider.cuh"
	#include "THCTensor.h"

	// Maximum number of dimensions allowed for cutorch
	#define MAX_CUTORCH_DIMS 25

	// Warning string for tensor arguments that are too large or have too
	// many dimensions
	#define CUTORCH_STR(X) #X
	#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
	CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"

	// CUDA kernel argument that defines tensor layout
	template <typename T, typename IndexType>
	struct TensorInfo {
	TensorInfo(T* p,
	int dim,
	IndexType sz[MAX_CUTORCH_DIMS],
	IndexType st[MAX_CUTORCH_DIMS]);

	// Set the size of the given dimension to 1, as if it were a
	// reduction dim (allows you to calculate offsets of the reduction
	// slice)
	void reduceDim(int dim);

	/*
	Updates the TensorInfo's dims, sizes, and strides to reflect a "collapse" of
	the info, possibly excluding the optional excludeDim. A "collapsed" version
	of the info is the fewest dims that order the tensor's elements in the same
	way as the original info. If excludeDim is specified, the collapse is the
	fewest dims that order the tensor's elements as the original and preserve the
	excluded dimension, unless the tensor collapses to a point.

	Returns the (new) index of the preserved dimension if excludeDim is
	specified. Returns 0 if the tensor is collapsed to a point. Returns -1
	otherwise.
	*/
	int collapseDims(const int excludeDim = -1);

	// Contiguous tensors of more than one dimension are collapsed down
	// to one tensor
	__host__ __device__ inline bool isContiguous() const {
	return (dims == 1 && strides[0] == 1);
	}

	T* data;
	IndexType sizes[MAX_CUTORCH_DIMS];
	IndexType strides[MAX_CUTORCH_DIMS];
	int dims;
	};

	template <typename T, typename IndexType>
	TensorInfo<T, IndexType>::TensorInfo(T* p,
	int dim,
	IndexType sz[MAX_CUTORCH_DIMS],
	IndexType st[MAX_CUTORCH_DIMS]) {
	data = p;
	dims = dim;
	assert(dims > 0 && dims < MAX_CUTORCH_DIMS);

	for (int i = 0; i < dim; ++i) {
	sizes[i] = sz[i];
	strides[i] = st[i];
	}
	}

	template <typename T, typename IndexType>
	void
	TensorInfo<T, IndexType>::reduceDim(int dim) {
	assert(dim < dims && dim >= 0);
	sizes[dim] = 1;
	}

	template <typename T, typename IndexType>
	int
	TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {

	assert(excludeDim >= -1 && excludeDim < dims);

	int stopDim = (excludeDim == -1) ? dims : excludeDim;
	int newIndex = -1;
	int oldIndex = 0;
	int remappedExcludedDim = -1;

	while (oldIndex < dims) {
	// Finds a dimension to collapse into
	for (; oldIndex < stopDim; ++oldIndex) {
	if (sizes[oldIndex] == 1) {
	continue;
	}
	++newIndex;
	sizes[newIndex] = sizes[oldIndex];
	strides[newIndex] = strides[oldIndex];
	++oldIndex;
	break;
	}

	// Collapses dims
	for (; oldIndex < stopDim; ++oldIndex) {
	if (sizes[oldIndex] == 1) {
	continue;
	}

	if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) {
	sizes[newIndex] *= sizes[oldIndex];
	strides[newIndex] = strides[oldIndex];
	} else {
	++newIndex;
	sizes[newIndex] = sizes[oldIndex];
	strides[newIndex] = strides[oldIndex];
	}
	}

	// Handles excludeDim being set (oldIndex == excludeDim)
	if (oldIndex != dims) {

	// Preserves excluded dimension
	++newIndex;
	sizes[newIndex] = sizes[oldIndex];
	strides[newIndex] = strides[oldIndex];
	remappedExcludedDim = newIndex;

	// Restarts iteration after excludeDim
	++oldIndex;
	stopDim = dims;
	}
	}

	// Handles special case of all dims size 1
	if (newIndex == -1 \|\| (newIndex == 0 && sizes[0] == 1)) {
	dims = 1;
	sizes[0] = 1;
	strides[0] = 1;

	return 0;
	}

	dims = newIndex + 1;
	return remappedExcludedDim;
	}

	// Translate a linear index for the apply to a T* offset;
	// specialized on `Dims` to reduce nvcc compilation time
	template <typename T, typename IndexType, int Dims>
	struct IndexToOffset {
	static __host__ __device__ IndexType get(
	IndexType linearId,
	const TensorInfo<T, IndexType>& info) {

	IndexType offset = 0;

	// Uses static dims
	for (int i = Dims - 1; i > 0; --i) {
	IndexType curDimIndex = linearId % info.sizes[i];
	IndexType curDimOffset = curDimIndex * info.strides[i];
	offset += curDimOffset;
	linearId /= info.sizes[i];
	}

	return offset + linearId * info.strides[0];
	}
	};

	template <typename T, typename IndexType>
	struct IndexToOffset<T, IndexType, -1> {
	static inline __host__ __device__ IndexType get(
	IndexType linearId,
	const TensorInfo<T, IndexType>& info) {

	IndexType offset = 0;

	// Uses dynamic dims
	for (int i = info.dims - 1; i > 0; --i) {
	IndexType curDimIndex = linearId % info.sizes[i];
	IndexType curDimOffset = curDimIndex * info.strides[i];
	offset += curDimOffset;
	linearId /= info.sizes[i];
	}

	return offset + linearId * info.strides[0];
	}
	};

	// OffsetInfo is a faster implementation of IndexToOffset that uses faster
	// integer division: we transform each division into integer multiplication by a
	// pre-computed constant. (See IntDivider for details.)
	template <typename T, typename IndexType, int Dims>
	struct OffsetInfo {
	explicit OffsetInfo(const TensorInfo<T, IndexType>& tinfo) {
	assert(tinfo.dims == Dims);
	data = tinfo.data;

	for (int i = 0; i < Dims; ++i) {
	sizes[i] = IntDivider<IndexType>(tinfo.sizes[i]);
	strides[i] = tinfo.strides[i];
	}
	}

	__host__ __device__ T* get(IndexType linearIndex) const {
	IndexType offset = 0;

	for (int i = Dims - 1; i > 0; --i) {
	DivMod<IndexType> divmod = sizes[i].divmod(linearIndex);
	linearIndex = divmod.div;
	offset += divmod.mod * strides[i];
	}

	return &data[offset + linearIndex * strides[0]];
	}

	T* data;
	IntDivider<IndexType> sizes[Dims];
	IndexType strides[Dims];
	};

	// For 1D tensors the offset equals linear index * stride.
	template <typename T, typename IndexType>
	struct OffsetInfo<T, IndexType, 1> {
	explicit OffsetInfo(const TensorInfo<T, IndexType>& tinfo)
	: data{tinfo.data}, stride{tinfo.strides[0]} {}

	__host__ __device__ T* get(IndexType linearIndex) const {
	return &data[linearIndex * stride];
	}

	T* data;
	const IndexType stride;
	};

	// Dims=-1 is used when the dimension is unknown at compile time.
	//
	// Unfortunately, pre-computation does not work here, because of a bug in nvcc
	// (tested on CUDA 8.0): if a kernel argument contains an array that is
	// dynamically accessed, the whole array is first copied into the local memory.
	// (That is, every kernel thread makes its own copy of the argument, even if it
	// is never updated.) Pre-computation makes it worse because now we have more
	// data to copy.
	//
	// So let's fall back to vanilla division approach.

	template <typename T, typename IndexType>
	struct OffsetInfo<T, IndexType, -1> {
	explicit OffsetInfo(const TensorInfo<T, IndexType>& tinfo)
	: tinfo(tinfo) { }

	__host__ __device__ T* get(IndexType linearIndex) const {
	IndexType offset = IndexToOffset<T, IndexType, -1>::get(linearIndex, tinfo);
	return &tinfo.data[offset];
	}

	TensorInfo<T, IndexType> tinfo;
	};

	#endif // THC_TENSOR_INFO_INC