| #include "THCTensorConv.h" |
| #include "THCTensorMath.h" |
| #include "THCTensorCopy.h" |
| #include "THCGeneral.h" |
| #include <stdio.h> |
| |
| /* |
| * Description: |
| * This code provides convolutions and xcorrelations that are API compatible with |
| * the ones in THLabConv. |
| * |
| * History: |
| * Sept 11, 2011, 11:59PM - Clement Farabet - Optimized RevConv by a good x2 |
| * July 22, 2011, 8:38PM - Clement Farabet - All Valid/Full/XCORR/CONV implemented |
| * July 22, 2011, 4:00PM - Clement Farabet - Rewrote for loop to ensure memory coalescing |
| * July 21, 2011, 11:21PM - Clement Farabet - Creation, based on conv2d routine |
| */ |
| |
| #define CUDA_SHARED_MEM_SIZE (12*1024-32) // number of floats in the NVIDIA-documented max shared mem per block (48KB), minus a small margin |
| |
| /* |
| * Description: |
| * base conv2D routine: 3D input, 3D output, 4D kernel |
| * |
| * - all chunks of data should be contiguous |
| * - the swapkernel flag can be used to generate a conv2 instead of xcorr2 |
| * - the templated kernel size is useful to generate code that's 2x faster |
| * but can be set to 0 to allow arbitrary kernel sizes |
| */ |
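| /* |
| * Worked size example (illustrative numbers only): for a 3x32x32 input, a |
| * 16x3x5x5 kernel stack and stride 1x1, the "valid" output size computed at |
| * the top of the kernel is |
| * output_h = (32 - 5) / 1 + 1 = 28 |
| * output_w = (32 - 5) / 1 + 1 = 28 |
| * i.e. a 16x28x28 output tensor. |
| */ |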
| template <bool swapkernel, int T_kernel_h, int T_kernel_w> |
| __global__ void conv2generic(float *input, float *kernel, float *output, |
| int input_n, int input_h, int input_w, |
| int kernel_n, int kernel_h, int kernel_w, |
| int stride_h, int stride_w) |
| { |
| // output dimensions |
| int output_h = (input_h - kernel_h) / stride_h + 1; |
| int output_w = (input_w - kernel_w) / stride_w + 1; |
| |
| // xcorr or conv |
| int koffset = swapkernel ? kernel_w*kernel_h-1 : 0; |
| |
| // nb outputs |
| int output_n = kernel_n / input_n; |
| |
| // generate offsets according to block/thread ids |
| int xx_start = threadIdx.x; |
| int xx_end = output_w; |
| int xx_step = blockDim.x; |
| |
| int yy_start = blockDim.y*blockIdx.y + threadIdx.y; |
| int yy_end = output_h; |
| int yy_step = blockDim.y*gridDim.y; |
| |
| int oo_start = blockIdx.x; |
| int oo_end = oo_start+1; |
| |
| int ii_start = (blockIdx.x / output_n) * input_n; |
| int ii_end = ii_start + input_n; |
| |
| // nb threads, unique thread id |
| int tid = blockDim.x*blockDim.y*threadIdx.z + blockDim.x * threadIdx.y + threadIdx.x; |
| int nthreads = blockDim.x * blockDim.y * blockDim.z; |
| |
| // iterators |
| int oo, ii, xx, yy, kx, ky, kk; |
| |
| // do the kernels fit in shared mem ? |
| if (input_n*kernel_w*kernel_h <= CUDA_SHARED_MEM_SIZE) { |
| |
| // put the kernel in shared memory |
| __shared__ float shared_kernel[CUDA_SHARED_MEM_SIZE]; |
| |
| // all threads in the block cooperatively copy the kernel into shared memory |
| for (kk = tid; kk < kernel_w*kernel_h*input_n; kk += nthreads) { |
| shared_kernel[kk] = kernel[input_n*kernel_w*kernel_h*(oo_start % output_n) + kk]; |
| } |
| __syncthreads(); |
| |
| // templated kernel size |
| if ((T_kernel_w > 0) && (T_kernel_h > 0)) { |
| // unrolled convolution loop |
| for(oo = oo_start; oo < oo_end; oo++) { |
| for(ii = ii_start; ii < ii_end; ii++) { |
| for(yy = yy_start; yy < yy_end; yy+=yy_step) { |
| for(xx = xx_start; xx < xx_end; xx+=xx_step) { |
| // Dot product in two dimensions... (between input image and the mask) |
| float *input_p = input + ii*input_h*input_w + yy*stride_h*input_w + xx*stride_w; |
| float *output_p = output + oo*output_h*output_w + yy*output_w + xx; |
| float *kernel_p = shared_kernel + (ii % input_n)*kernel_w*kernel_h + koffset; |
| float sum = 0; |
| if (swapkernel) { |
| #pragma unroll |
| for(ky = 0; ky < T_kernel_h; ky++) { |
| #pragma unroll |
| for(kx = 0; kx < T_kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p--); |
| } |
| input_p += input_w; |
| } |
| } else { |
| #pragma unroll |
| for(ky = 0; ky < T_kernel_h; ky++) { |
| #pragma unroll |
| for(kx = 0; kx < T_kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p++); |
| } |
| input_p += input_w; |
| } |
| } |
| *output_p += sum; |
| } |
| } |
| } |
| } |
| } else { |
| // default convolution loop |
| for(oo = oo_start; oo < oo_end; oo++) { |
| for(ii = ii_start; ii < ii_end; ii++) { |
| for(yy = yy_start; yy < yy_end; yy+=yy_step) { |
| for(xx = xx_start; xx < xx_end; xx+=xx_step) { |
| // Dot product in two dimensions... (between input image and the mask) |
| float *input_p = input + ii*input_h*input_w + yy*stride_h*input_w + xx*stride_w; |
| float *output_p = output + oo*output_h*output_w + yy*output_w + xx; |
| float *kernel_p = shared_kernel + (ii % input_n) * kernel_w * kernel_h + koffset; |
| float sum = 0; |
| if (swapkernel) { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p--); |
| } |
| input_p += input_w; |
| } |
| } else { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p++); |
| } |
| input_p += input_w; |
| } |
| } |
| *output_p += sum; |
| } |
| } |
| } |
| } |
| } |
| |
| } else { // not enough shared mem for kernels, simply stream them |
| |
| // convolution loop |
| for(oo = oo_start; oo < oo_end; oo++) { |
| for(ii = ii_start; ii < ii_end; ii++) { |
| for(yy = yy_start; yy < yy_end; yy+=yy_step) { |
| for(xx = xx_start; xx < xx_end; xx+=xx_step) { |
| // Dot product in two dimensions... (between input image and the mask) |
| float *input_p = input + ii*input_h*input_w + yy*stride_h*input_w + xx*stride_w; |
| float *output_p = output + oo*output_h*output_w + yy*output_w + xx; |
| float *kernel_p = kernel + ((oo % output_n) * input_n + (ii % input_n))*kernel_w*kernel_h + koffset; |
| float sum = 0; |
| if (swapkernel) { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p--); |
| } |
| input_p += input_w; |
| } |
| } else { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p++); |
| } |
| input_p += input_w; |
| } |
| } |
| *output_p += sum; |
| } |
| } |
| } |
| } |
| } |
| } |
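| /* |
| * Launch-configuration note (as used by the host wrappers further down): |
| * conv2Dmv/conv2Dmm launch conv2generic with a (nOutputPlane [* nbatch], yblocks) |
| * grid and 32x8 threads per block, so each block owns a single output plane |
| * and its threads tile the output rows/columns through the strided xx/yy loops. |
| */ |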
| |
| /* |
| * Description: |
| * base conv2D routine with reversed stride: 3D input, 4D output, 3D kernel |
| * this is useful for computing gradients with respect to kernels, where: |
| * input=input, kernel=gradOutput, output=gradWeight |
| * |
| * - all chunks of data should be contiguous |
| * - the swapkernel flag can be used to generate a conv2 instead of xcorr2 |
| */ |
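| /* |
| * Worked size example (illustrative numbers only): with a 3x32x32 input, a |
| * 16x28x28 gradOutput passed in as "kernel", and stride 1x1, |
| * output_h = 32 - (28 - 1) * 1 = 5 |
| * output_w = 32 - (28 - 1) * 1 = 5 |
| * i.e. the accumulated result has the 16x3x5x5 shape of the forward kernel. |
| */ |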
| __global__ void conv2genericrev(float *input, float *kernel, float *output, |
| int input_n, int input_h, int input_w, |
| int kernel_n, int kernel_h, int kernel_w, |
| float alpha, int stride_h, int stride_w) |
| { |
| // output dimensions |
| int output_h = input_h - (kernel_h - 1) * stride_h; |
| int output_w = input_w - (kernel_w - 1) * stride_w; |
| |
| // each block processes one (kernel plane, input plane) output slice, selected by the block ids |
| int kk = blockIdx.x; |
| int ii = blockIdx.y; |
| |
| // batch id |
| int batch = threadIdx.z; |
| |
| // kernel id |
| int kid = threadIdx.x; |
| int nkids = blockDim.x; |
| |
| // thread ID |
| int tid = kid + batch*blockDim.x; |
| int nthreads = blockDim.x * blockDim.z; |
| |
| // this block only writes one output slice |
| output = output + (kk * input_n + ii) * output_h*output_w; |
| |
| // accumulate partial outputs in shared memory |
| __shared__ float shared_output[CUDA_SHARED_MEM_SIZE]; |
| |
| // each thread accumulates its partial sums at its own offset (tid) in shared memory |
| float *output_s = shared_output + tid*output_w*output_h; |
| |
| // convolution loop |
| int xx, yy, kx, ky; |
| yy = threadIdx.y; |
| float *output_p = output_s + yy * output_w; |
| for(xx=0; xx<output_w; xx++) { |
| // Dot product in two dimensions... (between input image and kernel) |
| float *input_p = input + (ii + batch*input_n)*input_h*input_w + yy*stride_h*input_w + xx*stride_w; |
| float *kernel_p = kernel + (kk + batch*kernel_n)*kernel_w*kernel_h; |
| float sum = 0; |
| for(ky=0; ky<kernel_h; ky++) { |
| for(kx=kid; kx<kernel_w; kx+=nkids) { |
| sum += input_p[kx]*kernel_p[kx]; |
| } |
| input_p += input_w; |
| kernel_p += kernel_w; |
| } |
| *(output_p++) = sum; |
| } |
| __syncthreads(); |
| |
| // reduce and write back |
| if (yy == 0) { |
| // reduce outputs |
| for (int k=1; k<nthreads; k++) { |
| for (int i=tid; i<output_w*output_h; i+=nthreads) { |
| shared_output[i] += shared_output[k*output_h*output_w + i]; |
| } |
| } |
| __syncthreads(); |
| |
| // add existing output, and write back |
| for (int i=tid; i<output_w*output_h; i+=nthreads) { |
| output[i] += alpha*shared_output[i]; |
| } |
| } |
| } |
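| /* |
| * Launch-configuration note: the host wrappers below launch conv2genericrev |
| * with blockDim.y == output_h (one threadIdx.y per output row), and the |
| * shared_output buffer has to hold blockDim.x * blockDim.z partial copies of |
| * the output_h x output_w slice, so small reverse-conv outputs are assumed. |
| */ |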
| |
| // A helper macro for the common pattern of checking the input |
| // rows/columns for a small number of values, specializing the kernel |
| // template parameters if rows/columns are equal and small, and |
| // otherwise just passing zero to the kernel. |
| #define FOR_KERNEL_SPECIALIZED_DIMENSION(ROWS, COLUMNS, KERNEL) \ |
| if ((ROWS) == (COLUMNS)) { \ |
| switch ((ROWS)) { \ |
| case 3: { KERNEL(3); break; } \ |
| case 4: { KERNEL(4); break; } \ |
| case 5: { KERNEL(5); break; } \ |
| case 6: { KERNEL(6); break; } \ |
| case 7: { KERNEL(7); break; } \ |
| case 8: { KERNEL(8); break; } \ |
| case 9: { KERNEL(9); break; } \ |
| case 10: { KERNEL(10); break; } \ |
| case 11: { KERNEL(11); break; } \ |
| case 12: { KERNEL(12); break; } \ |
| case 13: { KERNEL(13); break; } \ |
| default: { KERNEL(0); break; } \ |
| } \ |
| } else { \ |
| KERNEL(0); \ |
| } |
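| // For example, with nKernelRows == nKernelCols == 5, |
| // FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelRows, nKernelCols, X_CONV_KERNEL) |
| // expands to a switch whose case 5 invokes X_CONV_KERNEL(5), i.e. launches the |
| // conv2generic<..., 5, 5> instantiation; non-square or unlisted sizes fall |
| // back to the fully generic <..., 0, 0> version. |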
| |
| /* |
| * API-compatible with THRealTensor_conv2Dmv |
| * 3D input, 4D kernel, 3D output |
| * matrix vector product like: y <- Ax + beta*y |
| */ |
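| /* |
| * Usage sketch (illustrative only; `state`, `output`, `input` and `kernel` are |
| * assumed to be a valid THCState and CUDA float tensors with the shapes above): |
| * |
| * // output <- valid cross-correlation of a 3xHxW input with a 16x3x5x5 |
| * // kernel, stride 1x1, discarding any previous output content (beta = 0) |
| * THCudaTensor_conv2Dmv(state, output, 0, input, kernel, 1, 1, "vx"); |
| */ |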
| THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input, |
| THCudaTensor *kernel, long srow, long scol, const char *type) |
| { |
| THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel)); |
| long nInputPlane, nInputRows, nInputCols; |
| long nKernelRows, nKernelCols; |
| long nOutputPlane, nOutputRows, nOutputCols; |
| |
| THArgCheck(kernel->nDimension == 4 , 4, "kernel: 4D Tensor expected"); |
| THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); |
| THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); |
| THArgCheck(type[0] == 'v' || type[0] == 'f', 7, "type of convolution can be 'v' or 'f'"); |
| THArgCheck(type[1] == 'c' || type[1] == 'x', 7, "type of convolution can be 'x' or 'c'"); |
| |
| input = THCudaTensor_newContiguous(state, input); |
| kernel = THCudaTensor_newContiguous(state, kernel); |
| |
| nInputPlane = input->size[0]; |
| nInputRows = input->size[1]; |
| nInputCols = input->size[2]; |
| |
| nKernelRows = kernel->size[2]; |
| nKernelCols = kernel->size[3]; |
| nOutputPlane = kernel->size[0]; |
| THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); |
| |
| THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *type == 'f', 2, |
| "conv2Dmv : Input image is smaller than kernel"); |
| |
| if (*type == 'f') { |
| // output dims |
| nOutputRows = (nInputRows - 1) * srow + nKernelRows; |
| nOutputCols = (nInputCols - 1) * scol + nKernelCols; |
| |
| // use temp buffer |
| THCudaTensor *inputP = THCudaTensor_new(state); |
| |
| // create a zero-padded input |
| long nInputRowsPadded = (nOutputRows - 1) * srow + nKernelRows; |
| long nInputColsPadded = (nOutputCols - 1) * scol + nKernelCols; |
| THCudaTensor_resize3d(state, inputP, nInputPlane, nInputRowsPadded, nInputColsPadded); |
| THCudaTensor_zero(state, inputP); |
| |
| THCudaTensor *centered = THCudaTensor_new(state); |
| THCudaTensor_narrow(state, centered, inputP, 2, nKernelCols-1, nInputCols); |
| THCudaTensor_narrow(state, centered, NULL, 1, nKernelRows-1, nInputRows); |
| THCudaTensor_copy(state, centered, input); |
| THCudaTensor_free(state, centered); |
| |
| // remap input to newly created tensor |
| THCudaTensor_free(state, input); |
| input = inputP; |
| nInputRows = nInputRowsPadded; |
| nInputCols = nInputColsPadded; |
| |
| } else { // 'v' |
| // output dims |
| nOutputRows = (nInputRows - nKernelRows) / srow + 1; |
| nOutputCols = (nInputCols - nKernelCols) / scol + 1; |
| } |
| |
| ptrdiff_t nelem = THCudaTensor_nElement(state, output); |
| THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols); |
| |
| if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) { |
| THCudaTensor_zero(state, output); |
| } else if (beta != 1) { |
| THCudaTensor_mul(state, output, output, beta); |
| } |
| |
| float *input_data = THCudaTensor_data(state, input); |
| float *weight_data = THCudaTensor_data(state, kernel); |
| float *output_data = THCudaTensor_data(state, output); |
| |
| // cuda blocks & threads: |
| int yblocks = (int)(16L / nOutputPlane); |
| yblocks = yblocks < 1 ? 1 : yblocks; |
| dim3 blocks(nOutputPlane,yblocks); |
| dim3 threads(32,8); |
| |
| // convolution: xcorr2 or conv2 |
| if (type[1] == 'x') { |
| #define X_CONV_KERNEL(dim) \ |
| conv2generic <false, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \ |
| input_data, weight_data, output_data, \ |
| nInputPlane, nInputRows, nInputCols, \ |
| nOutputPlane*nInputPlane, nKernelRows, nKernelCols, \ |
| srow, scol); |
| |
| FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelRows, nKernelCols, X_CONV_KERNEL); |
| #undef X_CONV_KERNEL |
| } else { // 'c' |
| #define C_CONV_KERNEL(dim) \ |
| conv2generic <true, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \ |
| input_data, weight_data, output_data, \ |
| nInputPlane, nInputRows, nInputCols, \ |
| nOutputPlane*nInputPlane, nKernelRows, nKernelCols, \ |
| srow, scol); |
| |
| FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelRows, nKernelCols, C_CONV_KERNEL); |
| #undef C_CONV_KERNEL |
| } |
| |
| // clean |
| // free the contiguous copy ('v') or the padded buffer ('f') now backing `input` |
| THCudaTensor_free(state, input); |
| THCudaTensor_free(state, kernel); |
| |
| // check for errors |
| cudaError_t err = cudaGetLastError(); |
| if (err != cudaSuccess) { |
| printf("error in conv2Dmv: %s\n", cudaGetErrorString(err)); |
| THError("aborting"); |
| } |
| } |
| |
| /* |
| * API-compatible with THRealTensor_conv2Dmm |
| * 4D input, 4D kernel, 4D output |
| * batched matrix vector product like: y <- Ax + beta*y (applied per batch element) |
| */ |
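| /* |
| * Usage sketch (illustrative; same conventions as conv2Dmv, with a leading |
| * batch dimension on input and output): |
| * |
| * // output[b] <- output[b] + valid xcorr2(input[b], kernel) for each b (beta = 1) |
| * THCudaTensor_conv2Dmm(state, output, 1, input, kernel, 1, 1, "vx"); |
| */ |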
| THC_API void THCudaTensor_conv2Dmm(THCState *state, THCudaTensor *output, float beta, THCudaTensor *input, |
| THCudaTensor *kernel, long srow, long scol, const char *type) |
| { |
| THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel)); |
| long nbatch, nInputPlane, nInputRows, nInputCols; |
| long nKernelRows, nKernelCols; |
| long nOutputPlane, nOutputRows, nOutputCols; |
| |
| THArgCheck(kernel->nDimension == 4 , 4, "kernel: 4D Tensor expected"); |
| THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); |
| THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); |
| THArgCheck(type[0] == 'v' || type[0] == 'f', 7, "type of convolution can be 'v' or 'f'"); |
| THArgCheck(type[1] == 'c' || type[1] == 'x', 7, "type of convolution can be 'x' or 'c'"); |
| |
| input = THCudaTensor_newContiguous(state, input); |
| kernel = THCudaTensor_newContiguous(state, kernel); |
| |
| nbatch = input->size[0]; |
| nInputPlane = input->size[1]; |
| nInputRows = input->size[2]; |
| nInputCols = input->size[3]; |
| |
| nKernelRows = kernel->size[2]; |
| nKernelCols = kernel->size[3]; |
| nOutputPlane = kernel->size[0]; |
| THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); |
| |
| THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *type == 'f', 2, |
| "conv2Dmm : Input image is smaller than kernel"); |
| |
| if (*type == 'f') { |
| // output dims |
| nOutputRows = (nInputRows - 1) * srow + nKernelRows; |
| nOutputCols = (nInputCols - 1) * scol + nKernelCols; |
| |
| // use temp buffer |
| THCudaTensor *inputP = THCudaTensor_new(state); |
| |
| // create a zero-padded input |
| long nInputRowsPadded = (nOutputRows - 1) * srow + nKernelRows; |
| long nInputColsPadded = (nOutputCols - 1) * scol + nKernelCols; |
| THCudaTensor_resize4d(state, inputP, nbatch, nInputPlane, nInputRowsPadded, nInputColsPadded); |
| THCudaTensor_zero(state, inputP); |
| |
| THCudaTensor *centered = THCudaTensor_new(state); |
| THCudaTensor_narrow(state, centered, inputP, 3, nKernelCols-1, nInputCols); |
| THCudaTensor_narrow(state, centered, NULL, 2, nKernelRows-1, nInputRows); |
| THCudaTensor_copy(state, centered, input); |
| THCudaTensor_free(state, centered); |
| |
| // remap input to newly created tensor |
| THCudaTensor_free(state, input); |
| input = inputP; |
| nInputRows = nInputRowsPadded; |
| nInputCols = nInputColsPadded; |
| |
| } else { // 'v' |
| // output dims |
| nOutputRows = (nInputRows - nKernelRows) / srow + 1; |
| nOutputCols = (nInputCols - nKernelCols) / scol + 1; |
| } |
| |
| ptrdiff_t nelem = THCudaTensor_nElement(state, output); |
| THCudaTensor_resize4d(state, output, nbatch, nOutputPlane, nOutputRows, nOutputCols); |
| |
| if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) { |
| THCudaTensor_zero(state, output); |
| } else if (beta != 1) { |
| THCudaTensor_mul(state, output, output, beta); |
| } |
| |
| float *input_data = THCudaTensor_data(state, input); |
| float *weight_data = THCudaTensor_data(state, kernel); |
| float *output_data = THCudaTensor_data(state, output); |
| |
| // cuda blocks & threads: |
| int yblocks = (int)(16L / nOutputPlane); |
| yblocks = yblocks < 1 ? 1 : yblocks; |
| dim3 blocks(nOutputPlane*nbatch,yblocks); |
| dim3 threads(32,8); |
| |
| // convolution: xcorr2 or conv2 |
| if (type[1] == 'x') { |
| #define X_CONV_KERNEL(dim) \ |
| conv2generic <false, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \ |
| input_data, weight_data, output_data, \ |
| nInputPlane, nInputRows, nInputCols, \ |
| nOutputPlane*nInputPlane, nKernelRows, nKernelCols, \ |
| srow, scol); |
| |
| FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelCols, nKernelRows, X_CONV_KERNEL); |
| #undef X_CONV_KERNEL |
| } else { // 'c' |
| #define C_CONV_KERNEL(dim) \ |
| conv2generic <true, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \ |
| input_data, weight_data, output_data, \ |
| nInputPlane, nInputRows, nInputCols, \ |
| nOutputPlane*nInputPlane, nKernelRows, nKernelCols, \ |
| srow, scol); |
| |
| FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelCols, nKernelRows, C_CONV_KERNEL); |
| #undef C_CONV_KERNEL |
| } |
| |
| // clean |
| // free the contiguous copy ('v') or the padded buffer ('f') now backing `input` |
| THCudaTensor_free(state, input); |
| THCudaTensor_free(state, kernel); |
| |
| // check for errors |
| cudaError_t err = cudaGetLastError(); |
| if (err != cudaSuccess) { |
| cudaDeviceProp deviceProp; |
| cudaGetDeviceProperties(&deviceProp, 0); |
| printf("error in conv2Dmm: %s\n", cudaGetErrorString(err)); |
| printf("requested grid size: %dx%dx%d, max allowed: %dx%dx%d\n", |
| blocks.x, blocks.y, blocks.z, |
| deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); |
| printf("requested block size: %dx%dx%d, max allowed: %dx%dx%d\n", |
| threads.x, threads.y, threads.z, |
| deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); |
| THError("aborting"); |
| } |
| } |
| |
| /* |
| * API-compatible with THRealTensor_conv2DRevger |
| * 3D input, 3D kernel, 4D output |
| * like a rank-1 update: |
| * A <- alpha*xx' + beta*A |
| * for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for |
| * calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 |
| */ |
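| /* |
| * Usage sketch (illustrative): accumulating a weight gradient, where the |
| * gradOutput tensor plays the role of the "kernel" argument and `scale` is a |
| * hypothetical scaling factor: |
| * |
| * // gradWeight <- 1*gradWeight + scale * revconv(input, gradOutput) |
| * THCudaTensor_conv2DRevger(state, gradWeight, 1, scale, input, gradOutput, srow, scol); |
| */ |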
| THC_API void THCudaTensor_conv2DRevger(THCState *state, THCudaTensor *output, float beta, float alpha, |
| THCudaTensor *input, THCudaTensor *kernel, |
| long srow, long scol) |
| { |
| THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel)); |
| long nInputPlane, nInputRows, nInputCols; |
| long nKernelPlane, nKernelRows, nKernelCols; |
| long nOutputRows, nOutputCols; |
| |
| THArgCheck(input->nDimension == 3 , 3, "input: 3D Tensor expected"); |
| THArgCheck(kernel->nDimension == 3 , 4, "kernel: 3D Tensor expected"); |
| THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); |
| THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); |
| |
| input = THCudaTensor_newContiguous(state, input); |
| kernel = THCudaTensor_newContiguous(state, kernel); |
| |
| nInputPlane = input->size[0]; |
| nInputRows = input->size[1]; |
| nInputCols = input->size[2]; |
| |
| nKernelPlane = kernel->size[0]; |
| nKernelRows = kernel->size[1]; |
| nKernelCols = kernel->size[2]; |
| |
| THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, |
| "conv2DRevger : Input image is smaller than kernel"); |
| |
| nOutputRows = nInputRows - (nKernelRows - 1) * srow; |
| nOutputCols = nInputCols - (nKernelCols - 1) * scol; |
| |
| ptrdiff_t nelem = THCudaTensor_nElement(state, output); |
| THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); |
| |
| if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) { |
| THCudaTensor_zero(state, output); |
| } else if (beta != 1) { |
| THCudaTensor_mul(state, output, output, beta); |
| } |
| |
| float *input_data = THCudaTensor_data(state, input); |
| float *kernel_data = THCudaTensor_data(state, kernel); |
| float *output_data = THCudaTensor_data(state, output); |
| |
| // auto compute nb of blocks and threads |
| dim3 blocks(nKernelPlane, nInputPlane); |
| dim3 threads(128/nOutputRows, nOutputRows); |
| |
| // compute rev conv |
| conv2genericrev <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>( |
| input_data, kernel_data, output_data, |
| nInputPlane, nInputRows, nInputCols, |
| nKernelPlane, nKernelRows, nKernelCols, |
| alpha, srow, scol); |
| |
| // clean |
| THCudaTensor_free(state, input); |
| THCudaTensor_free(state, kernel); |
| |
| // check for errors |
| cudaError_t err = cudaGetLastError(); |
| if (err != cudaSuccess) { |
| printf("error in conv2DRevger: %s\n", cudaGetErrorString(err)); |
| THError("aborting"); |
| } |
| } |
| |
| /* |
| * API-compatible with THRealTensor_conv2DRevgerm |
| * 4D input, 4D kernel, 4D output |
| * conv2DRevgerm does the same thing as conv2DRevger, but with batched inputs |
| */ |
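| /* |
| * Note: every batch element accumulates into the same gradient buffer; the |
| * kernel below is launched once per sub-batch but always writes to a single |
| * nKernelPlane x nInputPlane x nOutputRows x nOutputCols output. |
| */ |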
| THC_API void THCudaTensor_conv2DRevgerm(THCState *state, THCudaTensor *output, float beta, float alpha, |
| THCudaTensor *input, THCudaTensor *kernel, |
| long srow, long scol) |
| { |
| THAssert(THCudaTensor_checkGPU(state, 3, output, input, kernel)); |
| long nInputPlane, nInputRows, nInputCols; |
| long nKernelPlane, nKernelRows, nKernelCols; |
| long nOutputRows, nOutputCols; |
| long nbatch; |
| |
| THArgCheck(input->nDimension == 4 , 3, "input: 4D Tensor expected"); |
| THArgCheck(kernel->nDimension == 4 , 4, "kernel: 4D Tensor expected"); |
| THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); |
| THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); |
| |
| input = THCudaTensor_newContiguous(state, input); |
| kernel = THCudaTensor_newContiguous(state, kernel); |
| |
| nbatch = input->size[0]; |
| nInputPlane = input->size[1]; |
| nInputRows = input->size[2]; |
| nInputCols = input->size[3]; |
| |
| nKernelPlane = kernel->size[1]; |
| nKernelRows = kernel->size[2]; |
| nKernelCols = kernel->size[3]; |
| |
| THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, |
| "conv2DRevger : Input image is smaller than kernel"); |
| |
| nOutputRows = nInputRows - (nKernelRows - 1) * srow; |
| nOutputCols = nInputCols - (nKernelCols - 1) * scol; |
| |
| ptrdiff_t nelem = THCudaTensor_nElement(state, output); |
| THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); |
| |
| if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) { |
| THCudaTensor_zero(state, output); |
| } else if (beta != 1) { |
| THCudaTensor_mul(state, output, output, beta); |
| } |
| |
| float *input_data = THCudaTensor_data(state, input); |
| float *kernel_data = THCudaTensor_data(state, kernel); |
| float *output_data = THCudaTensor_data(state, output); |
| |
| // kernel is called multiple times |
| // (the arbitrary split below just makes sure we don't go over 256 threads) |
| for (int sl=0; sl<nbatch; sl+=6) { |
| // auto compute nb of blocks and threads |
| dim3 blocks(nKernelPlane, nInputPlane); |
| int subbatch = 6; |
| if (sl+subbatch > nbatch) subbatch = nbatch - sl; |
| int cst = 256 / (subbatch * nOutputRows); |
| dim3 threads(cst, nOutputRows, subbatch); |
| |
| // compute rev conv |
| conv2genericrev <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>( |
| input_data + input->stride[0]*sl, |
| kernel_data + kernel->stride[0]*sl, |
| output_data, |
| nInputPlane, nInputRows, nInputCols, |
| nKernelPlane, nKernelRows, nKernelCols, |
| alpha, srow, scol); |
| } |
| |
| // clean |
| THCudaTensor_free(state, input); |
| THCudaTensor_free(state, kernel); |
| |
| // check for errors |
| cudaError_t err = cudaGetLastError(); |
| if (err != cudaSuccess) { |
| printf("error in conv2DRevger: %s\n", cudaGetErrorString(err)); |
| THError("aborting"); |
| } |
| } |
| |
| |
| /////////////////////////////////// |
| ///// ConvolutionMap |
| /* |
| * Description: |
| * base conv2D routine with mapped connectivity: 3D input, 3D output, 3D kernel |
| * (a stack of 2D kernels addressed through a connection table) |
| * |
| * - all chunks of data should be contiguous |
| * - the swapkernel flag can be used to generate a conv2 instead of xcorr2 |
| * - the templated kernel size is useful to generate code that's 2x faster |
| * but can be set to 0 to allow arbitrary kernel sizes |
| * ---- the table's first dimension indexes the outputs; each output owns |
| * ---- a contiguous set of fanin (input, kernel) entries |
| */ |
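| /* |
| * Table layout, as consumed below: for each output plane o the table stores |
| * fanin (input plane, kernel slice) pairs, flattened and 1-based (the code |
| * subtracts 1 before indexing): |
| * table[o*fanin*2 + 2*k] -> input plane of connection k |
| * table[o*fanin*2 + 2*k + 1] -> kernel slice of connection k |
| * for k in [0, fanin). |
| */ |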
| template <bool swapkernel, int T_kernel_h, int T_kernel_w> |
| __global__ void conv2mapgeneric(float *input, float *kernel, float *output, |
| int input_n, int input_h, int input_w, |
| int kernel_n, int kernel_h, int kernel_w, |
| int stride_w, int stride_h, |
| float *table, int fanin) |
| { |
| // output dimensions |
| int output_h = (input_h - kernel_h) / stride_h + 1; |
| int output_w = (input_w - kernel_w) / stride_w + 1; |
| |
| // xcorr or conv |
| int koffset = swapkernel ? kernel_w*kernel_h-1 : 0; |
| |
| // nb outputs |
| // int output_n = kernel_n / fanin; |
| |
| // generate offsets according to block/thread ids |
| int xx_start = threadIdx.x; |
| int xx_end = output_w; |
| int xx_step = blockDim.x; |
| |
| int yy_start = blockDim.y*blockIdx.y + threadIdx.y; |
| int yy_end = output_h; |
| int yy_step = blockDim.y*gridDim.y; |
| |
| int oo_start = blockIdx.x; |
| int oo_end = oo_start+1; |
| |
| int table_start = blockIdx.x * (fanin * 2); |
| int table_end = table_start + (fanin * 2); |
| |
| // nb threads, unique thread id |
| int tid = blockDim.x*blockDim.y*threadIdx.z |
| + blockDim.x * threadIdx.y + threadIdx.x; |
| int nthreads = blockDim.x * blockDim.y * blockDim.z; |
| |
| // iterators |
| int oo, ii, xx, yy, kx, ky, kk; |
| |
| // do the kernels fit in shared mem ? |
| if (kernel_w*kernel_h*kernel_n <= CUDA_SHARED_MEM_SIZE) { |
| // put the kernel in shared memory |
| __shared__ float shared_kernel[CUDA_SHARED_MEM_SIZE]; |
| |
| // all threads in the block cooperatively copy the kernel into shared memory |
| for (kk = tid; kk < kernel_w*kernel_h*kernel_n; kk += nthreads) { |
| shared_kernel[kk] = kernel[kk]; |
| } |
| __syncthreads(); |
| |
| // templated kernel size |
| if ((T_kernel_w > 0) && (T_kernel_h > 0)) { |
| // unrolled convolution loop |
| for(oo = oo_start; oo < oo_end; oo++) { |
| for (ii = table_start; ii < table_end; ii = ii + 2) { |
| for(yy = yy_start; yy < yy_end; yy+=yy_step) { |
| for(xx = xx_start; xx < xx_end; xx+=xx_step) { |
| // Dot product in two dimensions... (between input image and the mask) |
| float *input_p = input + ((long)table[ii]-1)*input_h*input_w |
| + yy*stride_h*input_w + xx*stride_w; |
| float *output_p = output + oo*output_h*output_w + yy*output_w + xx; |
| float *kernel_p = shared_kernel |
| + ((long)table[ii + 1]-1) *kernel_w*kernel_h + koffset; |
| float sum = 0; |
| if (swapkernel) { |
| #pragma unroll |
| for(ky = 0; ky < T_kernel_h; ky++) { |
| #pragma unroll |
| for(kx = 0; kx < T_kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p--); |
| } |
| input_p += input_w; |
| } |
| } else { |
| #pragma unroll |
| for(ky = 0; ky < T_kernel_h; ky++) { |
| #pragma unroll |
| for(kx = 0; kx < T_kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p++); |
| } |
| input_p += input_w; |
| } |
| } |
| *output_p += sum; |
| } |
| } |
| } |
| } |
| } else { |
| // default convolution loop |
| for(oo = oo_start; oo < oo_end; oo++) { |
| for (ii = table_start; ii < table_end; ii = ii + 2) { // step over (input plane, kernel slice) pairs, as in the other loops |
| for(yy = yy_start; yy < yy_end; yy+=yy_step) { |
| for(xx = xx_start; xx < xx_end; xx+=xx_step) { |
| // Dot product in two dims (between input image and the mask) |
| float *input_p = input + ((long)table[ii]-1)*input_h*input_w |
| + yy*stride_h*input_w + xx*stride_w; |
| float *output_p = output + oo*output_h*output_w + yy*output_w |
| + xx; |
| float *kernel_p = shared_kernel |
| + ((long)table[ii + 1]-1) *kernel_w*kernel_h + koffset; |
| float sum = 0; |
| if (swapkernel) { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p--); |
| } |
| input_p += input_w; |
| } |
| } else { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p++); |
| } |
| input_p += input_w; |
| } |
| } |
| *output_p += sum; |
| } |
| } |
| } |
| } |
| } |
| |
| } else { // not enough shared mem for kernels, simply stream them |
| |
| // convolution loop |
| for(oo = oo_start; oo < oo_end; oo++) { |
| for (ii = table_start; ii < table_end; ii = ii + 2) { |
| for(yy = yy_start; yy < yy_end; yy+=yy_step) { |
| for(xx = xx_start; xx < xx_end; xx+=xx_step) { |
| // Dot product in two dimensions... (between input image and the mask) |
| float *input_p = input + ((long)table[ii]-1)*input_h*input_w |
| + yy*stride_h*input_w + xx*stride_w; |
| float *output_p = output + oo*output_h*output_w + yy*output_w + xx; |
| float *kernel_p = kernel + ((long)table[ii + 1]-1) *kernel_w*kernel_h + koffset; |
| float sum = 0; |
| if (swapkernel) { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p--); |
| } |
| input_p += input_w; |
| } |
| } else { |
| for(ky = 0; ky < kernel_h; ky++) { |
| #pragma unroll 5 |
| for(kx = 0; kx < kernel_w; kx++) { |
| sum += input_p[kx]*(*kernel_p++); |
| } |
| input_p += input_w; |
| } |
| } |
| *output_p += sum; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| |
| /* |
| * conv2Dmap: convolution with a connection table |
| * 3D input, 3D kernel (stack of nOutputPlane*fanin 2D kernels), 3D output |
| * connectivity between input and output planes is given by `table` |
| */ |
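| /* |
| * Usage sketch (illustrative; `kernel` is a stack of nOutputPlane*fanin 2D |
| * kernels and `table` is the 1-based connection table described above): |
| * |
| * THCudaTensor_conv2Dmap(state, output, input, kernel, 1, 1, table, fanin); |
| */ |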
| THC_API void THCudaTensor_conv2Dmap(THCState *state, THCudaTensor *output, THCudaTensor *input, |
| THCudaTensor *kernel, long stride_x, long stride_y, |
| THCudaTensor *table, long fanin) |
| { |
| THAssert(THCudaTensor_checkGPU(state, 4, output, input, kernel, table)); |
| long nInputPlane, nInputRows, nInputCols; |
| long nKernelRows, nKernelCols; |
| long nOutputPlane, nOutputRows, nOutputCols; |
| |
| THArgCheck(kernel->nDimension == 3 , 4, "kernel: 3D Tensor expected"); |
| THArgCheck(stride_x >= 1, 5, "Stride should be a positive integer"); |
| THArgCheck(stride_y >= 1, 6, "Stride should be a positive integer"); |
| |
| input = THCudaTensor_newContiguous(state, input); |
| kernel = THCudaTensor_newContiguous(state, kernel); |
| table = THCudaTensor_newContiguous(state, table); |
| |
| nInputPlane = input->size[0]; |
| nInputRows = input->size[1]; |
| nInputCols = input->size[2]; |
| |
| nKernelRows = kernel->size[1]; |
| nKernelCols = kernel->size[2]; |
| nOutputPlane = kernel->size[0] / fanin; |
| // THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); |
| |
| THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols), 2, |
| "conv2Dmap : Input image is smaller than kernel"); |
| |
| // output dims |
| nOutputRows = (nInputRows - nKernelRows) / stride_y + 1; |
| nOutputCols = (nInputCols - nKernelCols) / stride_x + 1; |
| |
| // ptrdiff_t nelem = THCudaTensor_nElement(state, output); |
| THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols); |
| |
| float *input_data = THCudaTensor_data(state, input); |
| float *kernel_data = THCudaTensor_data(state, kernel); |
| float *output_data = THCudaTensor_data(state, output); |
| float *table_data = THCudaTensor_data(state, table); |
| |
| // set the number of blocks and threads |
| int nthreads_x = 32; |
| int nthreads_y = 8; |
| int block_height = (int)(16L / nOutputPlane); |
| if (block_height < 1) |
| block_height = 1; |
| dim3 blocks(nOutputPlane,block_height); |
| dim3 threads(nthreads_x,nthreads_y); |
| |
| #define GENERIC_MAP_KERNEL(dim) \ |
| conv2mapgeneric <false, (dim), (dim)> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( \ |
| input_data, kernel_data, output_data, nInputPlane, nInputRows, \ |
| nInputCols, nOutputPlane*fanin, nKernelRows, nKernelCols, \ |
| stride_x, stride_y, table_data, fanin); |
| |
| FOR_KERNEL_SPECIALIZED_DIMENSION(nKernelCols, nKernelRows, GENERIC_MAP_KERNEL); |
| #undef GENERIC_MAP_KERNEL |
| // clean |
| THCudaTensor_free(state, input); |
| THCudaTensor_free(state, kernel); |
| THCudaTensor_free(state, table); |
| |
| // check for errors |
| cudaError_t err = cudaGetLastError(); |
| if (err != cudaSuccess) { |
| printf("error in conv2Dmap: %s\n", cudaGetErrorString(err)); |
| THError("aborting"); |
| } |
| } |
| |
| #undef FOR_KERNEL_SPECIALIZED_DIMENSION |