#include "THCTensorMath.h"
#include "THCGeneral.h"
#include "THCAtomics.cuh"
#include "THCApply.cuh"
// Compute the offsets into the given tensors for a linear index. For the 't2'
// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
// size (with the exception of 't2' in dimension 'dim').
// This version uses a static (compile-time) number of dimensions.
template <typename IndexType, typename Real, int Dims>
struct IndexToScatterGatherOffsets {
static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<int64_t, IndexType>& index, IndexType* indexOffset,
const TensorInfo<Real, IndexType>& t1, IndexType* t1Offset,
const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
for (int d = Dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
*t1Offset += curDimIndex * t1.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}
static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<int64_t, IndexType>& index, IndexType* indexOffset,
const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
for (int d = Dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}
};
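// Example: suppose 'index' is a contiguous 2x3 tensor (sizes {2, 3}, strides
// {3, 1}) and dim == 1. For linearId == 4 the loop peels dimensions off from
// the innermost:
//   d == 1: curDimIndex = 4 % 3 = 1; indexOffset += 1 * 1; t2 skipped (d == dim)
//           linearId = 4 / 3 = 1
//   d == 0: curDimIndex = 1 % 2 = 1; indexOffset += 1 * 3; t2Offset += 1 * t2.strides[0]
// Linear element 4 is index[1][1] (indexOffset == 4), while t2Offset omits the
// contribution of dimension 'dim'; the kernels below add it back using the
// value read from 'index'.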
// Specialization of the above for Dims == -1: identical logic, but the number
// of dimensions is read from index.dims at runtime rather than fixed at
// compile time.
template <typename IndexType, typename Real>
struct IndexToScatterGatherOffsets<IndexType, Real, -1> {
static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<int64_t, IndexType>& index, IndexType* indexOffset,
const TensorInfo<Real, IndexType>& t1, IndexType* t1Offset,
const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
for (int d = index.dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
*t1Offset += curDimIndex * t1.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}
static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<int64_t, IndexType>& index, IndexType* indexOffset,
const TensorInfo<Real, IndexType>& t2, IndexType* t2Offset) {
for (int d = index.dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}
};
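// Fills 'tensor' by indexing into 'src' along 'dim'; on a 3-d tensor with
// dim == 0 this computes
//   tensor[i][j][k] = src[index[i][j][k]][j][k]
// 'tensor' and 'index' must have the same sizes; 'src' may differ from them
// only in dimension 'dim'. A grid-stride loop lets a bounded grid cover all
// 'totalElements' elements.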
template <typename IndexType, typename Real, int Dims>
__global__ void THCudaTensor_gatherKernel(
TensorInfo<Real, IndexType> tensor,
TensorInfo<Real, IndexType> src,
TensorInfo<int64_t, IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType srcOffset = 0;
IndexType indexOffset = 0;
IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
index, &indexOffset,
tensor, &tensorOffset,
src, &srcOffset);
int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE;
assert(indexValue >= 0 && indexValue < src.sizes[dim]);
srcOffset += indexValue * src.strides[dim];
tensor.data[tensorOffset] = src.data[srcOffset];
}
}
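// The inverse of the gather kernel above: every element of 'src' is written
// into 'tensor' at a position selected by 'index' along 'dim'; with dim == 0
// on a 3-d tensor this computes
//   tensor[index[i][j][k]][j][k] = src[i][j][k]
// If 'index' maps two source elements to the same destination, the surviving
// value is whichever write happens to land last.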
template <typename IndexType, typename Real, int Dims>
__global__ void THCudaTensor_scatterKernel(
TensorInfo<Real, IndexType> tensor,
TensorInfo<Real, IndexType> src,
TensorInfo<int64_t, IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType srcOffset = 0;
IndexType indexOffset = 0;
IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
index, &indexOffset,
src, &srcOffset,
tensor, &tensorOffset);
int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE;
assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
tensor.data[tensorOffset] = src.data[srcOffset];
}
}
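// Same addressing as the scatter kernel above, but accumulates instead of
// overwriting:
//   tensor[index[i][j][k]][j][k] += src[i][j][k]   (dim == 0 case)
// atomicAdd makes colliding destinations safe, at the price of a
// non-deterministic summation order for floating-point types.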
template <typename IndexType, typename Real, int Dims>
__global__ void THCudaTensor_scatterAddKernel(
TensorInfo<Real, IndexType> tensor,
TensorInfo<Real, IndexType> src,
TensorInfo<int64_t, IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType srcOffset = 0;
IndexType indexOffset = 0;
IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
index, &indexOffset,
src, &srcOffset,
tensor, &tensorOffset);
int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE;
assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
atomicAdd(&tensor.data[tensorOffset], src.data[srcOffset]);
}
}
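// Writes the scalar 'value' at every position of 'tensor' selected by
// 'index' along 'dim':
//   tensor[index[i][j][k]][j][k] = value   (dim == 0 case)
// Only 'tensor' and 'index' are traversed here, so the two-argument overload
// of IndexToScatterGatherOffsets::compute is used.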
template <typename IndexType, typename Real, int Dims>
__global__ void THCudaTensor_scatterFillKernel(
TensorInfo<Real, IndexType> tensor,
TensorInfo<int64_t, IndexType> index,
Real value,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType indexOffset = 0;
IndexToScatterGatherOffsets<IndexType, Real, Dims>::compute(linearId, dim,
index, &indexOffset,
tensor, &tensorOffset);
int64_t indexValue = index.data[indexOffset] - TH_INDEX_BASE;
assert(indexValue >= 0 && indexValue < tensor.sizes[dim]);
tensorOffset += indexValue * tensor.strides[dim];
tensor.data[tensorOffset] = value;
}
}
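// A minimal sketch of how a host function might launch one of these kernels.
// This is illustrative only: the real dispatch lives in
// generic/THCTensorScatterGather.cu, and the TensorInfo arguments
// (tensorInfo, srcInfo, indexInfo), the CUDA 'stream', and 'totalElements'
// are assumed to have been constructed and validated by the caller.
//
//   const dim3 block(256);
//   const dim3 grid(min((totalElements + block.x - 1) / block.x,
//                       (uint64_t)65535));
//   THCudaTensor_gatherKernel<unsigned int, float, -1>
//       <<<grid, block, 0, stream>>>(
//           tensorInfo, srcInfo, indexInfo, dim,
//           (unsigned int)totalElements);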
#include "generic/THCTensorScatterGather.cu"
#include "THCGenerateAllTypes.h"