THCDeviceTensor.cuh - platform/external/pytorch - Git at Google

 #ifndef THC_DEVICE_TENSOR_INC
 #define THC_DEVICE_TENSOR_INC

 #include <cuda.h>
 #include <cuda_runtime.h>

 // A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
 template <bool>
 struct THCStaticAssert;

 template <>
 struct THCStaticAssert<true> {
 };

 #define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())

 /// Our tensor type
 template <typename T,
           int Dim,
           typename IndexT,
           template <typename U> class PtrTraits>
 class THCDeviceTensor;

 /// Type of a subspace of a tensor
 namespace detail {
 template <typename TensorType,
           int SubDim,
           template <typename U> class PtrTraits>
 class THCDeviceSubTensor;
 }

 template <typename T>
 struct RestrictPtrTraits {
   typedef T* __restrict__ PtrType;
 };

 template <typename T>
 struct DefaultPtrTraits {
   typedef T* PtrType;
 };

 /**
    Templated multi-dimensional array that supports strided access of
    elements. Main access is through `operator[]`; e.g.,
    `tensor[x][y][z]`.

 - `T` is the contained type (e.g., `float`)
 - `Dim` is the tensor rank
 - `IndexT` is the integer type used for size/stride arrays, and for
 - all indexing math. Default is `int`, but for large tensors, `long`
 - can be used instead.
 - `PtrTraits` are traits applied to our data pointer (T*). By default,
 - this is just T*, but RestrictPtrTraits can be used to apply T*
 - __restrict__ for alias-free analysis.
 */
 template <typename T,
           int Dim,
           typename IndexT = int,
           template <typename U> class PtrTraits = DefaultPtrTraits>
 class THCDeviceTensor {
  public:
   enum { NumDim = Dim };
   typedef T DataType;
   typedef IndexT IndexType;
   typedef typename PtrTraits<T>::PtrType DataPtrType;
   typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;

   /// Default constructor
   __host__ __device__ THCDeviceTensor();

   /// Constructor that calculates strides with no padding
   __host__ __device__ THCDeviceTensor(DataPtrType data,
                                       const IndexT sizes[Dim]);

   /// Constructor that takes arbitrary size/stride arrays
   __host__ __device__ THCDeviceTensor(DataPtrType data,
                                       const IndexT sizes[Dim],
                                       const IndexT strides[Dim]);

   /// Returns true if the two tensors are of the same dimensionality,
   /// size and stride.
   template <int OtherDim>
   __host__ __device__ bool
   isSameSizeAndStride(
     const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;

   /// Cast to a tensor of a different type of the same size and stride
   template <typename U>
   __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();

   /// Const version of `cast`
   template <typename U>
   __host__ __device__
   const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;

   /// Returns a raw pointer to the start of our data.
   __host__ __device__ __forceinline__ DataPtrType data() {
     return data_;
   }

   /// Returns a raw pointer to the start of our data (const).
   __host__ __device__ __forceinline__
   const DataPtrType data() const {
     return data_;
   }

   /// Cast to a different datatype
   template <typename U>
   __host__ __device__ __forceinline__
   typename PtrTraits<U>::PtrType dataAs() {
     return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
   }

   /// Cast to a different datatype
   template <typename U>
   __host__ __device__ __forceinline__
   const typename PtrTraits<const U>::PtrType dataAs() const {
     return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
   }

   /// Returns a read/write view of a portion of our tensor.
   __host__ __device__ __forceinline__
   detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
     operator[](IndexT);

   /// Returns a read/write view of a portion of our tensor (const).
   __host__ __device__ __forceinline__
   const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
     operator[](IndexT) const;

   /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
   /// checking.
   __host__ __device__ __forceinline__ int getSize(int i) const {
     return size_[i];
   }

   /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
   /// checking.
   __host__ __device__ __forceinline__ int getStride(int i) const {
     return stride_[i];
   }

   /// Returns the total number of elements contained within our data
   /// (product of `getSize(i)`)
   __host__ __device__ long numElements() const;

   /// Returns the size array.
   __host__ __device__ __forceinline__ const IndexT* sizes() const {
     return size_;
   }

   /// Returns the stride array.
   __host__ __device__ __forceinline__ const IndexT* strides() const {
     return stride_;
   }

   /// Returns true if there is no padding within the tensor and no
   /// re-ordering of the dimensions.
   /// ~~~
   /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0
   /// ~~~
   __host__ __device__ bool isContiguous() const;

   /// Returns whether a given dimension has only increasing stride
   /// from the previous dimension. A tensor that was permuted by
   /// exchanging size and stride only will fail this check.
   /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
   __host__ __device__ bool isConsistentlySized(int i) const;

   // Returns whether at each dimension `stride <= size`.
   // If this is not the case then iterating once over the size space will
   // touch the same memory locations multiple times.
   __host__ __device__ bool isConsistentlySized() const;

   /// Returns true if the given dimension index has no padding
   __host__ __device__ bool isContiguousDim(int i) const;

   /// Returns a tensor of the same dimension after transposing the two
   /// dimensions given. Does not actually move elements; transposition
   /// is made by permuting the size/stride arrays.
   /// If the dimensions are not valid, asserts.
   __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
   transpose(int dim1, int dim2) const;

   /// Upcast a tensor of dimension `D` to some tensor of dimension
   /// D' > D by padding the leading dimensions by 1
   /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
   template <int NewDim>
   __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
   upcastOuter();

   /// Upcast a tensor of dimension `D` to some tensor of dimension
   /// D' > D by padding the lowest/most varying dimensions by 1
   /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
   template <int NewDim>
   __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
   upcastInner();

   /// Downcast a tensor of dimension `D` to some tensor of dimension
   /// D' < D by collapsing the leading dimensions. asserts if there is
   /// padding on the leading dimensions.
   template <int NewDim>
   __host__ __device__
   THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();

   /// Downcast a tensor of dimension `D` to some tensor of dimension
   /// D' < D by collapsing the leading dimensions. asserts if there is
   /// padding on the leading dimensions.
   template <int NewDim>
   __host__ __device__
   THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();

   /// Returns a tensor that is a view of the `SubDim`-dimensional slice
   /// of this tensor, starting at `at`.
   template <int SubDim>
   __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
   view(DataPtrType at);

   /// Returns a tensor that is a view of the `SubDim`-dimensional slice
   /// of this tensor, starting where our data begins
   template <int SubDim>
   __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
   view();

   /// Zeroes out the tensor asynchronously. Asserts if the contents
   /// in question are not contiguous.
   void zero(cudaStream_t stream = 0);

  private:
   /// Raw pointer to where the tensor data begins
   DataPtrType data_;

   /// Array of strides (in sizeof(T) terms) per each dimension
   IndexT stride_[Dim];

   /// Size per each dimension
   IndexT size_[Dim];
 };

 namespace detail {

 /// Specialization for a view of a single value (0-dimensional)
 template <typename TensorType, template <typename U> class PtrTraits>
 class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
  public:
   __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
   operator=(typename TensorType::DataType val) {
     *data_ = val;
     return *this;
   }

   // operator T&
   __host__ __device__ operator typename TensorType::DataType&() {
     return *data_;
   }

   // const operator T& returning const T&
   __host__ __device__ operator const typename TensorType::DataType&() const {
     return *data_;
   }

   // operator& returning T*
   __host__ __device__ typename TensorType::DataType* operator&() {
     return data_;
   }

   // const operator& returning const T*
   __host__ __device__ const typename TensorType::DataType* operator&() const {
     return data_;
   }

   /// Returns a raw accessor to our slice.
   __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
     return data_;
   }

   /// Returns a raw accessor to our slice (const).
   __host__ __device__ __forceinline__
   const typename TensorType::DataPtrType data() const {
     return data_;
   }

   /// Cast to a different datatype.
   template <typename T>
   __host__ __device__ T& as() {
     return *dataAs<T>();
   }

   /// Cast to a different datatype (const).
   template <typename T>
   __host__ __device__ const T& as() const {
     return *dataAs<T>();
   }

   /// Cast to a different datatype
   template <typename T>
   __host__ __device__ __forceinline__
   typename PtrTraits<T>::PtrType dataAs() {
     return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
   }

   /// Cast to a different datatype (const)
   template <typename T>
   __host__ __device__ __forceinline__
   typename PtrTraits<const T>::PtrType dataAs() const {
     return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
   }

   /// Use the texture cache for reads
   __device__ __forceinline__ typename TensorType::DataType ldg() const {
 #if __CUDA_ARCH__ >= 350
     return __ldg(data_);
 #else
     return *data_;
 #endif
   }

   /// Use the texture cache for reads; cast as a particular type
   template <typename T>
   __device__ __forceinline__ T ldgAs() const {
 #if __CUDA_ARCH__ >= 350
     return __ldg(dataAs<T>());
 #else
     return as<T>();
 #endif
   }

   private:
   /// One dimension greater can create us
   friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;

   /// Our parent tensor can create us
   friend class THCDeviceTensor<typename TensorType::DataType,
                                1,
                                typename TensorType::IndexType,
                                PtrTraits>;

   __host__ __device__ __forceinline__ THCDeviceSubTensor(
     TensorType& t,
     typename TensorType::DataPtrType data)
       : tensor_(t),
         data_(data) {
   }

   /// The tensor we're referencing
   TensorType& tensor_;

   /// Where our value is located
   typename TensorType::DataPtrType const data_;
 };

 /// A `SubDim`-rank slice of a parent THCDeviceTensor
 template <typename TensorType,
           int SubDim,
           template <typename U> class PtrTraits>
 class THCDeviceSubTensor {
  public:
   /// Returns a view of the data located at our offset (the dimension
   /// `SubDim` - 1 tensor).
   __host__ __device__ __forceinline__
   THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
     operator[](typename TensorType::IndexType index) {
     return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
       tensor_,
       data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
   }

   /// Returns a view of the data located at our offset (the dimension
   /// `SubDim` - 1 tensor) (const).
   __host__ __device__ __forceinline__
   const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
     operator[](typename TensorType::IndexType index) const {
     return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
       tensor_,
       data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
   }

   // operator& returning T*
   __host__ __device__ typename TensorType::DataType* operator&() {
     return data_;
   }

   // const operator& returning const T*
   __host__ __device__ const typename TensorType::DataType* operator&() const {
     return data_;
   }

   /// Returns a raw accessor to our slice.
   __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
     return data_;
   }

   /// Returns a raw accessor to our slice (const).
   __host__ __device__ __forceinline__
   const typename TensorType::DataPtrType data() const {
     return data_;
   }

   /// Cast to a different datatype.
   template <typename T>
   __host__ __device__ T& as() {
     return *dataAs<T>();
   }

   /// Cast to a different datatype (const).
   template <typename T>
   __host__ __device__ const T& as() const {
     return *dataAs<T>();
   }

   /// Cast to a different datatype
   template <typename T>
   __host__ __device__ __forceinline__
   typename PtrTraits<T>::PtrType dataAs() {
     return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
   }

   /// Cast to a different datatype (const)
   template <typename T>
   __host__ __device__ __forceinline__
   typename PtrTraits<const T>::PtrType dataAs() const {
     return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
   }

   /// Use the texture cache for reads
   __device__ __forceinline__ typename TensorType::DataType ldg() const {
 #if __CUDA_ARCH__ >= 350
     return __ldg(data_);
 #else
     return *data_;
 #endif
   }

   /// Use the texture cache for reads; cast as a particular type
   template <typename T>
   __device__ __forceinline__ T ldgAs() const {
 #if __CUDA_ARCH__ >= 350
     return __ldg(dataAs<T>());
 #else
     return as<T>();
 #endif
   }

   /// Returns a tensor that is a view of the SubDim-dimensional slice
   /// of this tensor, starting where our data begins
   THCDeviceTensor<typename TensorType::DataType,
                SubDim,
                typename TensorType::IndexType,
                PtrTraits> view() {
     return tensor_.template view<SubDim>(data_);
   }

  private:
   /// One dimension greater can create us
   friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;

   /// Our parent tensor can create us
   friend class
   THCDeviceTensor<typename TensorType::DataType,
                TensorType::NumDim,
                typename TensorType::IndexType,
                PtrTraits>;

   __host__ __device__ __forceinline__ THCDeviceSubTensor(
     TensorType& t,
     typename TensorType::DataPtrType data)
       : tensor_(t),
         data_(data) {
   }

   /// The tensor we're referencing
   TensorType& tensor_;

   /// The start of our sub-region
   typename TensorType::DataPtrType const data_;
 };

 } // namespace detail

 template <typename T, int Dim,
           typename IndexT, template <typename U> class PtrTraits>
 __host__ __device__ __forceinline__
 detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
                         Dim - 1, PtrTraits>
 THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
   return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
     detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
       *this, data_)[index]);
 }

 template <typename T, int Dim,
           typename IndexT, template <typename U> class PtrTraits>
 __host__ __device__ __forceinline__
 const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
                               Dim - 1, PtrTraits>
 THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
   return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
     detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
       const_cast<TensorType&>(*this), data_)[index]);
 }

 #include "THCDeviceTensor-inl.cuh"

 #endif // THC_DEVICE_TENSOR_INC
	#ifndef THC_DEVICE_TENSOR_INC
	#define THC_DEVICE_TENSOR_INC

	#include <cuda.h>
	#include <cuda_runtime.h>

	// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
	template <bool>
	struct THCStaticAssert;

	template <>
	struct THCStaticAssert<true> {
	};

	#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())

	/// Our tensor type
	template <typename T,
	int Dim,
	typename IndexT,
	template <typename U> class PtrTraits>
	class THCDeviceTensor;

	/// Type of a subspace of a tensor
	namespace detail {
	template <typename TensorType,
	int SubDim,
	template <typename U> class PtrTraits>
	class THCDeviceSubTensor;
	}

	template <typename T>
	struct RestrictPtrTraits {
	typedef T* __restrict__ PtrType;
	};

	template <typename T>
	struct DefaultPtrTraits {
	typedef T* PtrType;
	};

	/**
	Templated multi-dimensional array that supports strided access of
	elements. Main access is through `operator[]`; e.g.,
	`tensor[x][y][z]`.

	- `T` is the contained type (e.g., `float`)
	- `Dim` is the tensor rank
	- `IndexT` is the integer type used for size/stride arrays, and for
	- all indexing math. Default is `int`, but for large tensors, `long`
	- can be used instead.
	- `PtrTraits` are traits applied to our data pointer (T*). By default,
	- this is just T, but RestrictPtrTraits can be used to apply T
	- __restrict__ for alias-free analysis.
	*/
	template <typename T,
	int Dim,
	typename IndexT = int,
	template <typename U> class PtrTraits = DefaultPtrTraits>
	class THCDeviceTensor {
	public:
	enum { NumDim = Dim };
	typedef T DataType;
	typedef IndexT IndexType;
	typedef typename PtrTraits<T>::PtrType DataPtrType;
	typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;

	/// Default constructor
	__host__ __device__ THCDeviceTensor();

	/// Constructor that calculates strides with no padding
	__host__ __device__ THCDeviceTensor(DataPtrType data,
	const IndexT sizes[Dim]);

	/// Constructor that takes arbitrary size/stride arrays
	__host__ __device__ THCDeviceTensor(DataPtrType data,
	const IndexT sizes[Dim],
	const IndexT strides[Dim]);

	/// Returns true if the two tensors are of the same dimensionality,
	/// size and stride.
	template <int OtherDim>
	__host__ __device__ bool
	isSameSizeAndStride(
	const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;

	/// Cast to a tensor of a different type of the same size and stride
	template <typename U>
	__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();

	/// Const version of `cast`
	template <typename U>
	__host__ __device__
	const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;

	/// Returns a raw pointer to the start of our data.
	__host__ __device__ __forceinline__ DataPtrType data() {
	return data_;
	}

	/// Returns a raw pointer to the start of our data (const).
	__host__ __device__ __forceinline__
	const DataPtrType data() const {
	return data_;
	}

	/// Cast to a different datatype
	template <typename U>
	__host__ __device__ __forceinline__
	typename PtrTraits<U>::PtrType dataAs() {
	return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
	}

	/// Cast to a different datatype
	template <typename U>
	__host__ __device__ __forceinline__
	const typename PtrTraits<const U>::PtrType dataAs() const {
	return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
	}

	/// Returns a read/write view of a portion of our tensor.
	__host__ __device__ __forceinline__
	detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
	operator[](IndexT);

	/// Returns a read/write view of a portion of our tensor (const).
	__host__ __device__ __forceinline__
	const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
	operator[](IndexT) const;

	/// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
	/// checking.
	__host__ __device__ __forceinline__ int getSize(int i) const {
	return size_[i];
	}

	/// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
	/// checking.
	__host__ __device__ __forceinline__ int getStride(int i) const {
	return stride_[i];
	}

	/// Returns the total number of elements contained within our data
	/// (product of `getSize(i)`)
	__host__ __device__ long numElements() const;

	/// Returns the size array.
	__host__ __device__ __forceinline__ const IndexT* sizes() const {
	return size_;
	}

	/// Returns the stride array.
	__host__ __device__ __forceinline__ const IndexT* strides() const {
	return stride_;
	}

	/// Returns true if there is no padding within the tensor and no
	/// re-ordering of the dimensions.
	/// ~~~
	/// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0
	/// ~~~
	__host__ __device__ bool isContiguous() const;

	/// Returns whether a given dimension has only increasing stride
	/// from the previous dimension. A tensor that was permuted by
	/// exchanging size and stride only will fail this check.
	/// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
	__host__ __device__ bool isConsistentlySized(int i) const;

	// Returns whether at each dimension `stride <= size`.
	// If this is not the case then iterating once over the size space will
	// touch the same memory locations multiple times.
	__host__ __device__ bool isConsistentlySized() const;

	/// Returns true if the given dimension index has no padding
	__host__ __device__ bool isContiguousDim(int i) const;

	/// Returns a tensor of the same dimension after transposing the two
	/// dimensions given. Does not actually move elements; transposition
	/// is made by permuting the size/stride arrays.
	/// If the dimensions are not valid, asserts.
	__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
	transpose(int dim1, int dim2) const;

	/// Upcast a tensor of dimension `D` to some tensor of dimension
	/// D' > D by padding the leading dimensions by 1
	/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
	template <int NewDim>
	__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
	upcastOuter();

	/// Upcast a tensor of dimension `D` to some tensor of dimension
	/// D' > D by padding the lowest/most varying dimensions by 1
	/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
	template <int NewDim>
	__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
	upcastInner();

	/// Downcast a tensor of dimension `D` to some tensor of dimension
	/// D' < D by collapsing the leading dimensions. asserts if there is
	/// padding on the leading dimensions.
	template <int NewDim>
	__host__ __device__
	THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();

	/// Downcast a tensor of dimension `D` to some tensor of dimension
	/// D' < D by collapsing the leading dimensions. asserts if there is
	/// padding on the leading dimensions.
	template <int NewDim>
	__host__ __device__
	THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();

	/// Returns a tensor that is a view of the `SubDim`-dimensional slice
	/// of this tensor, starting at `at`.
	template <int SubDim>
	__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
	view(DataPtrType at);

	/// Returns a tensor that is a view of the `SubDim`-dimensional slice
	/// of this tensor, starting where our data begins
	template <int SubDim>
	__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
	view();

	/// Zeroes out the tensor asynchronously. Asserts if the contents
	/// in question are not contiguous.
	void zero(cudaStream_t stream = 0);

	private:
	/// Raw pointer to where the tensor data begins
	DataPtrType data_;

	/// Array of strides (in sizeof(T) terms) per each dimension
	IndexT stride_[Dim];

	/// Size per each dimension
	IndexT size_[Dim];
	};

	namespace detail {

	/// Specialization for a view of a single value (0-dimensional)
	template <typename TensorType, template <typename U> class PtrTraits>
	class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
	public:
	__host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
	operator=(typename TensorType::DataType val) {
	*data_ = val;
	return *this;
	}

	// operator T&
	__host__ __device__ operator typename TensorType::DataType&() {
	return *data_;
	}

	// const operator T& returning const T&
	__host__ __device__ operator const typename TensorType::DataType&() const {
	return *data_;
	}

	// operator& returning T*
	__host__ __device__ typename TensorType::DataType* operator&() {
	return data_;
	}

	// const operator& returning const T*
	__host__ __device__ const typename TensorType::DataType* operator&() const {
	return data_;
	}

	/// Returns a raw accessor to our slice.
	__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
	return data_;
	}

	/// Returns a raw accessor to our slice (const).
	__host__ __device__ __forceinline__
	const typename TensorType::DataPtrType data() const {
	return data_;
	}

	/// Cast to a different datatype.
	template <typename T>
	__host__ __device__ T& as() {
	return *dataAs<T>();
	}

	/// Cast to a different datatype (const).
	template <typename T>
	__host__ __device__ const T& as() const {
	return *dataAs<T>();
	}

	/// Cast to a different datatype
	template <typename T>
	__host__ __device__ __forceinline__
	typename PtrTraits<T>::PtrType dataAs() {
	return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
	}

	/// Cast to a different datatype (const)
	template <typename T>
	__host__ __device__ __forceinline__
	typename PtrTraits<const T>::PtrType dataAs() const {
	return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
	}

	/// Use the texture cache for reads
	__device__ __forceinline__ typename TensorType::DataType ldg() const {
	#if __CUDA_ARCH__ >= 350
	return __ldg(data_);
	#else
	return *data_;
	#endif
	}

	/// Use the texture cache for reads; cast as a particular type
	template <typename T>
	__device__ __forceinline__ T ldgAs() const {
	#if __CUDA_ARCH__ >= 350
	return __ldg(dataAs<T>());
	#else
	return as<T>();
	#endif
	}

	private:
	/// One dimension greater can create us
	friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;

	/// Our parent tensor can create us
	friend class THCDeviceTensor<typename TensorType::DataType,
	1,
	typename TensorType::IndexType,
	PtrTraits>;

	__host__ __device__ __forceinline__ THCDeviceSubTensor(
	TensorType& t,
	typename TensorType::DataPtrType data)
	: tensor_(t),
	data_(data) {
	}

	/// The tensor we're referencing
	TensorType& tensor_;

	/// Where our value is located
	typename TensorType::DataPtrType const data_;
	};

	/// A `SubDim`-rank slice of a parent THCDeviceTensor
	template <typename TensorType,
	int SubDim,
	template <typename U> class PtrTraits>
	class THCDeviceSubTensor {
	public:
	/// Returns a view of the data located at our offset (the dimension
	/// `SubDim` - 1 tensor).
	__host__ __device__ __forceinline__
	THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
	operator[](typename TensorType::IndexType index) {
	return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
	tensor_,
	data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
	}

	/// Returns a view of the data located at our offset (the dimension
	/// `SubDim` - 1 tensor) (const).
	__host__ __device__ __forceinline__
	const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
	operator[](typename TensorType::IndexType index) const {
	return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
	tensor_,
	data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
	}

	// operator& returning T*
	__host__ __device__ typename TensorType::DataType* operator&() {
	return data_;
	}

	// const operator& returning const T*
	__host__ __device__ const typename TensorType::DataType* operator&() const {
	return data_;
	}

	/// Returns a raw accessor to our slice.
	__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
	return data_;
	}

	/// Returns a raw accessor to our slice (const).
	__host__ __device__ __forceinline__
	const typename TensorType::DataPtrType data() const {
	return data_;
	}

	/// Cast to a different datatype.
	template <typename T>
	__host__ __device__ T& as() {
	return *dataAs<T>();
	}

	/// Cast to a different datatype (const).
	template <typename T>
	__host__ __device__ const T& as() const {
	return *dataAs<T>();
	}

	/// Cast to a different datatype
	template <typename T>
	__host__ __device__ __forceinline__
	typename PtrTraits<T>::PtrType dataAs() {
	return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
	}

	/// Cast to a different datatype (const)
	template <typename T>
	__host__ __device__ __forceinline__
	typename PtrTraits<const T>::PtrType dataAs() const {
	return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
	}

	/// Use the texture cache for reads
	__device__ __forceinline__ typename TensorType::DataType ldg() const {
	#if __CUDA_ARCH__ >= 350
	return __ldg(data_);
	#else
	return *data_;
	#endif
	}

	/// Use the texture cache for reads; cast as a particular type
	template <typename T>
	__device__ __forceinline__ T ldgAs() const {
	#if __CUDA_ARCH__ >= 350
	return __ldg(dataAs<T>());
	#else
	return as<T>();
	#endif
	}

	/// Returns a tensor that is a view of the SubDim-dimensional slice
	/// of this tensor, starting where our data begins
	THCDeviceTensor<typename TensorType::DataType,
	SubDim,
	typename TensorType::IndexType,
	PtrTraits> view() {
	return tensor_.template view<SubDim>(data_);
	}

	private:
	/// One dimension greater can create us
	friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;

	/// Our parent tensor can create us
	friend class
	THCDeviceTensor<typename TensorType::DataType,
	TensorType::NumDim,
	typename TensorType::IndexType,
	PtrTraits>;

	__host__ __device__ __forceinline__ THCDeviceSubTensor(
	TensorType& t,
	typename TensorType::DataPtrType data)
	: tensor_(t),
	data_(data) {
	}

	/// The tensor we're referencing
	TensorType& tensor_;

	/// The start of our sub-region
	typename TensorType::DataPtrType const data_;
	};

	} // namespace detail

	template <typename T, int Dim,
	typename IndexT, template <typename U> class PtrTraits>
	__host__ __device__ __forceinline__
	detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
	Dim - 1, PtrTraits>
	THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
	return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
	detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
	*this, data_)[index]);
	}

	template <typename T, int Dim,
	typename IndexT, template <typename U> class PtrTraits>
	__host__ __device__ __forceinline__
	const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
	Dim - 1, PtrTraits>
	THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
	return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
	detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
	const_cast<TensorType&>(*this), data_)[index]);
	}

	#include "THCDeviceTensor-inl.cuh"

	#endif // THC_DEVICE_TENSOR_INC