| #include "THCTensorMath.h" |
| #include "THCGeneral.h" |
| #include "THCBlas.h" |
| #include "THCTensorCopy.h" |
| #include "THCApply.cuh" |
| #include "THCReduce.cuh" |
| |
| #include <thrust/functional.h> |
| |
/* Perform an inclusive scan along an outer dimension of a tensor.
 *
 * - num_orows is the size of the flattened outer dimensions;
 * - num_irows is the size of the flattened inner dimensions;
 * - row_size is the size of the dimension along which to scan;
 *
 * The dimensions to the outside and inside of the specified dimension are considered as flattened.
 * Thread blocks with the same blockIdx.x process the same "outer rows" (i.e. elements of the
 * flattened outer dimensions, each of which contains several "inner rows"); blockIdx.y and
 * threadIdx.x together select the inner row.
 * Each thread processes a single inner row at a time.
 */
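/* Illustrative example (added for exposition): scanning a contiguous 2x3x4 tensor along
 * dimension 1 gives num_orows = 2, row_size = 3 and num_irows = 4. Element (o, c, i) then sits at
 * offset o * row_size * num_irows + c * num_irows + i, which is exactly the indexing used below:
 * each thread starts at src_ + orow * row_size * num_irows + irow and advances by num_irows per
 * step of the scan, so consecutive threads (consecutive irow) touch consecutive addresses and the
 * global loads and stores stay coalesced.
 */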
template<class BinaryOp>
__global__ void THCudaTensor_kernel_scanOuterDim(float *tgt_, float *src_,
                                                 unsigned num_orows, unsigned num_irows, unsigned row_size,
                                                 float init, BinaryOp binary_op)
{
  for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
    for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
      float *src = src_ + orow * row_size * num_irows + irow;
      float *tgt = tgt_ + orow * row_size * num_irows + irow;
      float acc = init;

      for (unsigned col = 0; col < row_size; ++col) {
        acc = binary_op(acc, *src);
        *tgt = acc;

        src += num_irows;
        tgt += num_irows;
      }
    }
  }
}

template<class BinaryOp>
__host__ void THCudaTensor_scanOuterDim(THCState *state, THCudaTensor *tgt, THCudaTensor *src, long dimension,
                                        float init, BinaryOp binary_op)
{
  unsigned ndim = THCudaTensor_nDimension(state, src);
  // Treat all outer dimensions (i.e. dim < dimension) as one.
  unsigned num_orows = 1;
  for (unsigned dim = 0; dim < dimension; dim++) {
    num_orows *= THCudaTensor_size(state, src, dim);
  }
  unsigned row_size = THCudaTensor_size(state, src, dimension);
  // Treat all inner dimensions (i.e. dim > dimension) as one.
  unsigned num_irows = 1;
  for (unsigned dim = dimension + 1; dim < ndim; dim++) {
    num_irows *= THCudaTensor_size(state, src, dim);
  }

  dim3 threads(min(512, num_irows));
  unsigned maxGridDim = 1024;
  dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x)));

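  // Illustrative launch shape (comment added for exposition): for the 2x3x4 example above,
  // scanned along dimension 1, this becomes threads = 4 and grid = (2, 1): each block walks
  // one outer row, scanning its 4 inner rows in lock-step.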
  THCudaTensor_kernel_scanOuterDim<<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
      THCudaTensor_data(state, tgt), THCudaTensor_data(state, src), num_orows, num_irows, row_size, init, binary_op);
  cudaError errcode = cudaGetLastError();
  if (errcode != cudaSuccess) {
    THError(cudaGetErrorString(errcode));
  }
}


/* Perform an inclusive scan along the innermost dimension of a tensor.
 *
 * - num_rows is the size of the flattened outer dimensions;
 * - row_size is the size of the innermost dimension;
 *
 * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
 * considered as having 'num_rows' rows of size 'row_size'.
 * Each thread block processes one or more sets of contiguous rows (processing multiple rows
 * per thread block is quicker than processing a single row, especially for short rows).
 */
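/* Worked example (added for exposition): with num_threads_x = 4 and a sum-scan over one tile
 * [a0 .. a7], the shared buffer evolves as follows (s = active threads, d = stride):
 *
 *   up-sweep   s=4,d=1: buf[1]=a0+a1   buf[3]=a2+a3      buf[5]=a4+a5   buf[7]=a6+a7
 *              s=2,d=2: buf[3]=a0+...+a3                 buf[7]=a4+...+a7
 *              s=1,d=4: buf[7]=a0+...+a7
 *   down-sweep s=2,d=2: buf[5]=a0+...+a5
 *              s=4,d=1: buf[2]=a0+...+a2  buf[4]=a0+...+a4  buf[6]=a0+...+a6
 *
 * leaving buf[i] equal to the inclusive prefix sum of a0..ai. This is the work-efficient
 * (Brent-Kung style) scan; folding block_total into the first element of each tile extends it to
 * rows longer than 2 * num_threads_x.
 */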
template<int num_threads_x, int num_threads_y, class BinaryFunction>
__global__ void THCudaTensor_kernel_scanInnermostDim(float *tgt_, float *src_,
                                                     unsigned num_rows, unsigned row_size,
                                                     float init, BinaryFunction binary_op)
{
  __shared__ float sbuf[num_threads_y][2 * num_threads_x];

  float* row_buf = sbuf[threadIdx.y];

  for (unsigned block_row = blockIdx.x * blockDim.y;
       block_row < num_rows;
       block_row += blockDim.y * gridDim.x) {
    unsigned row = block_row + threadIdx.y;
    float block_total = init;

    float *row_src = src_ + row * row_size;
    float *row_tgt = tgt_ + row * row_size;

    // Perform scan on one block at a time, keeping track of the total value of
    // all blocks processed so far.
    for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) {
      // Load data into shared memory (two values per thread).
      unsigned col1 = block_col + threadIdx.x;
      unsigned col2 = block_col + num_threads_x + threadIdx.x;
      if (row < num_rows) {
        if (col1 < row_size) {
          row_buf[threadIdx.x] = row_src[col1];
        } else {
          row_buf[threadIdx.x] = init;
        }

        if (col2 < row_size) {
          row_buf[num_threads_x + threadIdx.x] = row_src[col2];
        } else {
          row_buf[num_threads_x + threadIdx.x] = init;
        }

        // Add the total value of all previous blocks to the first value of this block.
        if (threadIdx.x == 0) {
          row_buf[0] = binary_op(row_buf[0], block_total);
        }
      }
      __syncthreads();

      // Parallel reduction (up-sweep).
      for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) {
        if (row < num_rows && threadIdx.x < s) {
          unsigned offset = (2 * threadIdx.x + 1) * d - 1;
          row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
        }
        __syncthreads();
      }

      // Down-sweep.
      for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) {
        if (row < num_rows && threadIdx.x < s - 1) {
          unsigned offset = 2 * (threadIdx.x + 1) * d - 1;
          row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
        }
        __syncthreads();
      }

      // Write back to output.
      if (row < num_rows) {
        if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x];
        if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x];
      }
      block_total = row_buf[2 * num_threads_x - 1];
      __syncthreads();
    }
  }
}

template<class BinaryFunction>
__host__ void THCudaTensor_scanInnermostDim(THCState *state, THCudaTensor *tgt, THCudaTensor *src, float init, BinaryFunction binary_op)
{
  unsigned ndim = THCudaTensor_nDimension(state, src);
  // Treat all outer dimensions as a single dimension.
  unsigned num_rows = 1;
  for (unsigned dim = 0; dim < ndim - 1; dim++) {
    num_rows *= THCudaTensor_size(state, src, dim);
  }
  unsigned row_size = THCudaTensor_size(state, src, ndim - 1);

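  // Illustrative geometry (comment added for exposition): each block scans threads.y = 32 rows at
  // a time and, per iteration of the tile loop, 2 * 16 = 32 consecutive elements of each row; the
  // grid is capped at 1024 blocks and strides over any remaining rows.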
  dim3 threads(16, 32);
  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));

  THCudaTensor_kernel_scanInnermostDim<16, 32><<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
      THCudaTensor_data(state, tgt), THCudaTensor_data(state, src), num_rows, row_size, init, binary_op);
  cudaError errcode = cudaGetLastError();
  if (errcode != cudaSuccess) {
    THError(cudaGetErrorString(errcode));
  }
}

template<class BinaryFunction>
void THCudaTensor_scanDim(THCState *state, THCudaTensor *self_, THCudaTensor *src, long dimension, float init, BinaryFunction binary_op)
{
  THCudaTensor_resizeAs(state, self_, src);

  THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
  src = THCudaTensor_newContiguous(state, src);

  if (dimension == THCudaTensor_nDimension(state, src) - 1) {
    THCudaTensor_scanInnermostDim(state, self, src, init, binary_op);
  } else {
    THCudaTensor_scanOuterDim(state, self, src, dimension, init, binary_op);
  }

  THCudaTensor_free(state, src);
  THCudaTensor_freeCopyTo(state, self, self_);
}

void THCudaTensor_cumsum(THCState *state, THCudaTensor *self, THCudaTensor *src, long dimension)
{
  THAssert(THCudaTensor_checkGPU(state, 2, self, src));
  return THCudaTensor_scanDim(state, self, src, dimension, 0.0f, thrust::plus<float>());
}

void THCudaTensor_cumprod(THCState *state, THCudaTensor *self, THCudaTensor *src, long dimension)
{
  THAssert(THCudaTensor_checkGPU(state, 2, self, src));
  return THCudaTensor_scanDim(state, self, src, dimension, 1.0f, thrust::multiplies<float>());
}
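
/* Illustrative extension (hypothetical, not part of this file): any associative operation with an
 * identity element can be plugged into THCudaTensor_scanDim the same way. Assuming <cfloat> for
 * FLT_MAX and thrust::maximum<float> from <thrust/functional.h>, a running-maximum scan could be
 * sketched as:
 *
 *   void THCudaTensor_cummax_sketch(THCState *state, THCudaTensor *self, THCudaTensor *src, long dimension)
 *   {
 *     THAssert(THCudaTensor_checkGPU(state, 2, self, src));
 *     // -FLT_MAX acts as the identity element for max over finite floats.
 *     THCudaTensor_scanDim(state, self, src, dimension, -FLT_MAX, thrust::maximum<float>());
 *   }
 */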