torch/lib/THC/THCTensorMathScan.cu - platform/external/pytorch - Git at Google

 #include "THCTensorMath.h"
 #include "THCGeneral.h"
 #include "THCBlas.h"
 #include "THCTensorCopy.h"
 #include "THCApply.cuh"
 #include "THCReduce.cuh"
 #include "THCNumerics.cuh"
 #include "THCTensorMathReduce.cuh"
 #include <thrust/scan.h>
 #include <thrust/execution_policy.h>

 /* Perform an inclusive scan along an outer dimension of a tensor.
  *
  * - num_orows is the size of the flattened outer dimensions;
  * - num_irows is the size of the flattened inner dimensions;
  * - row_size is the size of the dimension along which to compute the variance;
  *
  * The dimensions to the outside and inside of the specified dimension are considered as flattened.
  * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened
  * outer dimensions, which contains several "inner rows").
  * Each thread processes a single inner row at a time.
  */
 template<typename T, class BinaryOp>
 __global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_,
                                               unsigned num_orows, unsigned num_irows, unsigned row_size,
                                               T init, BinaryOp binary_op)
 {
   for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
     for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
       T *src = src_ + orow * row_size * num_irows + irow;
       T *tgt = tgt_ + orow * row_size * num_irows + irow;
       T acc = init;

       for (unsigned col = 0; col < row_size; ++col) {
         acc = binary_op(acc, *src);
         *tgt = acc;

         src += num_irows;
         tgt += num_irows;
       }
     }
   }
 }

 /* Perform an inclusive scan along the innermost dimension of a tensor.
  *
  * - num_rows is the size of the flattened outer dimensions;
  * - row_size is the size of the innermost dimension;
  *
  * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
  * considered as having 'num_rows' rows of size 'row_size'.
  * Each thread block processes one or more sets of contiguous rows (processing multiple rows
  * per thread block is quicker than processing a single row, especially for short rows).
  */
 template<typename T, int num_threads_x, int num_threads_y, class BinaryFunction>
 __global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_,
                                                   unsigned num_rows, unsigned row_size,
                                                   T init, BinaryFunction binary_op)
 {
   __shared__ T sbuf[num_threads_y][2 * num_threads_x];

   T* row_buf = sbuf[threadIdx.y];

   for (unsigned block_row = blockIdx.x * blockDim.y;
        block_row < num_rows;
        block_row += blockDim.y * gridDim.x) {
     unsigned row = block_row + threadIdx.y;
     T block_total = init;

     T *row_src = src_ + row * row_size;
     T *row_tgt = tgt_ + row * row_size;

     // Perform scan on one block at a time, keeping track of the total value of
     // all blocks processed so far.
     for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) {
       // Load data into shared memory (two values per thread).
       unsigned col1 = block_col + threadIdx.x;
       unsigned col2 = block_col + num_threads_x + threadIdx.x;
       if (row < num_rows) {
         if (col1 < row_size) {
           row_buf[threadIdx.x] = row_src[col1];
         } else {
           row_buf[threadIdx.x] = init;
         }

         if (col2 < row_size) {
           row_buf[num_threads_x + threadIdx.x] = row_src[col2];
         } else {
           row_buf[num_threads_x + threadIdx.x] = init;
         }

         // Add the total value of all previous blocks to the first value of this block.
         if (threadIdx.x == 0) {
           row_buf[0] = binary_op(row_buf[0], block_total);
         }
       }
       __syncthreads();

       // Parallel reduction (up-sweep).
       for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) {
         if (row < num_rows && threadIdx.x < s) {
           unsigned offset = (2 * threadIdx.x + 1) * d - 1;
           row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
         }
         __syncthreads();
       }

       // Down-sweep.
       for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) {
         if (row < num_rows && threadIdx.x < s - 1) {
           unsigned offset = 2 * (threadIdx.x + 1) * d - 1;
           row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
         }
         __syncthreads();
       }

       // Write back to output.
       if (row < num_rows) {
         if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x];
         if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x];
       }
       block_total = row_buf[2 * num_threads_x - 1];
       __syncthreads();
     }
   }
 }

 #include "generic/THCTensorMathScan.cu"
 #include "THCGenerateAllTypes.h"
	#include "THCTensorMath.h"
	#include "THCGeneral.h"
	#include "THCBlas.h"
	#include "THCTensorCopy.h"
	#include "THCApply.cuh"
	#include "THCReduce.cuh"
	#include "THCNumerics.cuh"
	#include "THCTensorMathReduce.cuh"
	#include <thrust/scan.h>
	#include <thrust/execution_policy.h>

	/* Perform an inclusive scan along an outer dimension of a tensor.
	*
	* - num_orows is the size of the flattened outer dimensions;
	* - num_irows is the size of the flattened inner dimensions;
	* - row_size is the size of the dimension along which to compute the variance;
	*
	* The dimensions to the outside and inside of the specified dimension are considered as flattened.
	* Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened
	* outer dimensions, which contains several "inner rows").
	* Each thread processes a single inner row at a time.
	*/
	template<typename T, class BinaryOp>
	__global__ void THCTensor_kernel_scanOuterDim(T tgt_, T src_,
	unsigned num_orows, unsigned num_irows, unsigned row_size,
	T init, BinaryOp binary_op)
	{
	for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
	for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
	T src = src_ + orow row_size * num_irows + irow;
	T tgt = tgt_ + orow row_size * num_irows + irow;
	T acc = init;

	for (unsigned col = 0; col < row_size; ++col) {
	acc = binary_op(acc, *src);
	*tgt = acc;

	src += num_irows;
	tgt += num_irows;
	}
	}
	}
	}

	/* Perform an inclusive scan along the innermost dimension of a tensor.
	*
	* - num_rows is the size of the flattened outer dimensions;
	* - row_size is the size of the innermost dimension;
	*
	* The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
	* considered as having 'num_rows' rows of size 'row_size'.
	* Each thread block processes one or more sets of contiguous rows (processing multiple rows
	* per thread block is quicker than processing a single row, especially for short rows).
	*/
	template<typename T, int num_threads_x, int num_threads_y, class BinaryFunction>
	__global__ void THCTensor_kernel_scanInnermostDim(T tgt_, T src_,
	unsigned num_rows, unsigned row_size,
	T init, BinaryFunction binary_op)
	{
	__shared__ T sbuf[num_threads_y][2 * num_threads_x];

	T* row_buf = sbuf[threadIdx.y];

	for (unsigned block_row = blockIdx.x * blockDim.y;
	block_row < num_rows;
	block_row += blockDim.y * gridDim.x) {
	unsigned row = block_row + threadIdx.y;
	T block_total = init;

	T row_src = src_ + row row_size;
	T row_tgt = tgt_ + row row_size;

	// Perform scan on one block at a time, keeping track of the total value of
	// all blocks processed so far.
	for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) {
	// Load data into shared memory (two values per thread).
	unsigned col1 = block_col + threadIdx.x;
	unsigned col2 = block_col + num_threads_x + threadIdx.x;
	if (row < num_rows) {
	if (col1 < row_size) {
	row_buf[threadIdx.x] = row_src[col1];
	} else {
	row_buf[threadIdx.x] = init;
	}

	if (col2 < row_size) {
	row_buf[num_threads_x + threadIdx.x] = row_src[col2];
	} else {
	row_buf[num_threads_x + threadIdx.x] = init;
	}

	// Add the total value of all previous blocks to the first value of this block.
	if (threadIdx.x == 0) {
	row_buf[0] = binary_op(row_buf[0], block_total);
	}
	}
	__syncthreads();

	// Parallel reduction (up-sweep).
	for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) {
	if (row < num_rows && threadIdx.x < s) {
	unsigned offset = (2 * threadIdx.x + 1) * d - 1;
	row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
	}
	__syncthreads();
	}

	// Down-sweep.
	for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) {
	if (row < num_rows && threadIdx.x < s - 1) {
	unsigned offset = 2 * (threadIdx.x + 1) * d - 1;
	row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]);
	}
	__syncthreads();
	}

	// Write back to output.
	if (row < num_rows) {
	if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x];
	if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x];
	}
	block_total = row_buf[2 * num_threads_x - 1];
	__syncthreads();
	}
	}
	}

	#include "generic/THCTensorMathScan.cu"
	#include "THCGenerateAllTypes.h"