| #include "THCTensorMath.h" |
| #include "THCGeneral.h" |
| #include "THCBlas.h" |
| #include "THCTensorCopy.h" |
| #include "THCApply.cuh" |
| #include "THCReduce.cuh" |
| #include "THCNumerics.cuh" |
| #include "THCTensorMathReduce.cuh" |
| #include <thrust/scan.h> |
| #include <thrust/execution_policy.h> |
| |
| /* Perform an inclusive scan along an outer dimension of a tensor. |
| * |
| * - num_orows is the size of the flattened outer dimensions; |
| * - num_irows is the size of the flattened inner dimensions; |
| * - row_size is the size of the dimension along which to compute the variance; |
| * |
| * The dimensions to the outside and inside of the specified dimension are considered as flattened. |
| * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened |
| * outer dimensions, which contains several "inner rows"). |
| * Each thread processes a single inner row at a time. |
| */ |
| template<typename T, class BinaryOp> |
| __global__ void THCTensor_kernel_scanOuterDim(T *tgt_, T *src_, |
| unsigned num_orows, unsigned num_irows, unsigned row_size, |
| T init, BinaryOp binary_op) |
| { |
| for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { |
| for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { |
| T *src = src_ + orow * row_size * num_irows + irow; |
| T *tgt = tgt_ + orow * row_size * num_irows + irow; |
| T acc = init; |
| |
| for (unsigned col = 0; col < row_size; ++col) { |
| acc = binary_op(acc, *src); |
| *tgt = acc; |
| |
| src += num_irows; |
| tgt += num_irows; |
| } |
| } |
| } |
| } |
| |
| /* Perform an inclusive scan along the innermost dimension of a tensor. |
| * |
| * - num_rows is the size of the flattened outer dimensions; |
| * - row_size is the size of the innermost dimension; |
| * |
| * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is |
| * considered as having 'num_rows' rows of size 'row_size'. |
| * Each thread block processes one or more sets of contiguous rows (processing multiple rows |
| * per thread block is quicker than processing a single row, especially for short rows). |
| */ |
| template<typename T, int num_threads_x, int num_threads_y, class BinaryFunction> |
| __global__ void THCTensor_kernel_scanInnermostDim(T *tgt_, T *src_, |
| unsigned num_rows, unsigned row_size, |
| T init, BinaryFunction binary_op) |
| { |
| __shared__ T sbuf[num_threads_y][2 * num_threads_x]; |
| |
| T* row_buf = sbuf[threadIdx.y]; |
| |
| for (unsigned block_row = blockIdx.x * blockDim.y; |
| block_row < num_rows; |
| block_row += blockDim.y * gridDim.x) { |
| unsigned row = block_row + threadIdx.y; |
| T block_total = init; |
| |
| T *row_src = src_ + row * row_size; |
| T *row_tgt = tgt_ + row * row_size; |
| |
| // Perform scan on one block at a time, keeping track of the total value of |
| // all blocks processed so far. |
| for (unsigned block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { |
| // Load data into shared memory (two values per thread). |
| unsigned col1 = block_col + threadIdx.x; |
| unsigned col2 = block_col + num_threads_x + threadIdx.x; |
| if (row < num_rows) { |
| if (col1 < row_size) { |
| row_buf[threadIdx.x] = row_src[col1]; |
| } else { |
| row_buf[threadIdx.x] = init; |
| } |
| |
| if (col2 < row_size) { |
| row_buf[num_threads_x + threadIdx.x] = row_src[col2]; |
| } else { |
| row_buf[num_threads_x + threadIdx.x] = init; |
| } |
| |
| // Add the total value of all previous blocks to the first value of this block. |
| if (threadIdx.x == 0) { |
| row_buf[0] = binary_op(row_buf[0], block_total); |
| } |
| } |
| __syncthreads(); |
| |
| // Parallel reduction (up-sweep). |
| for (unsigned s = num_threads_x, d = 1; s >= 1; s >>= 1, d <<= 1) { |
| if (row < num_rows && threadIdx.x < s) { |
| unsigned offset = (2 * threadIdx.x + 1) * d - 1; |
| row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); |
| } |
| __syncthreads(); |
| } |
| |
| // Down-sweep. |
| for (unsigned s = 2, d = num_threads_x / 2; d >= 1; s <<= 1, d >>= 1) { |
| if (row < num_rows && threadIdx.x < s - 1) { |
| unsigned offset = 2 * (threadIdx.x + 1) * d - 1; |
| row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); |
| } |
| __syncthreads(); |
| } |
| |
| // Write back to output. |
| if (row < num_rows) { |
| if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; |
| if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; |
| } |
| block_total = row_buf[2 * num_threads_x - 1]; |
| __syncthreads(); |
| } |
| } |
| } |
| |
| #include "generic/THCTensorMathScan.cu" |
| #include "THCGenerateAllTypes.h" |