#include "THCUNN.h"
#include "common.h"
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

/*
 * Description: maps a flat index into the upsampled output to the flat index
 * of the input element it is copied from. d1/d2/d3 are the channel, height,
 * and width sizes of the *output*; dividing the spatial coordinates and
 * sizes by scale_factor recovers the input location. Indices are assumed to
 * fit in an int.
 */
__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor)
{
  int x, y, z, w;
  // decompose the flat index into (x, y, z, w) = (batch, channel, row, column);
  // x is 0 for 3-D (unbatched) input
  w = ii % d3;
  ii = ii/d3;
  z = ii % d2;
  ii = ii/d2;
  y = ii % d1;
  ii = ii/d1;
  x = ii;
  // nearest neighbor: integer-divide the spatial coordinates by the scale
  w = w/scale_factor;
  z = z/scale_factor;
  d2 /= scale_factor;
  d3 /= scale_factor;
  // re-linearize against the (smaller) input sizes
  return (((x*d1+y)*d2)+z)*d3+w;
}
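
/*
 * Worked example: a 1-channel 2x2 input upscaled by 2 gives a 4x4 output,
 * so d1 = 1, d2 = 4, d3 = 4. Output index ii = 6, i.e. pixel (row 1, col 2),
 * decomposes to w = 2, z = 1, y = 0, x = 0; after dividing by scale_factor
 * it re-linearizes against the 2x2 input as ((0*1 + 0)*2 + 0)*2 + 1 = 1,
 * i.e. input pixel (row 0, col 1).
 */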

/*
 * Description: inverse of translate_idx. Maps a flat input index to the flat
 * output index of one of the scale_factor x scale_factor output elements
 * copied from it; here d1/d2/d3 are the sizes of the *input* (gradInput).
 */
__device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y)
{
  int x, y, z, w;
  // decompose the flat index into (x, y, z, w) = (batch, channel, row, column)
  w = ii % d3;
  ii = ii/d3;
  z = ii % d2;
  ii = ii/d2;
  y = ii % d1;
  ii = ii/d1;
  x = ii;
  // (off_x, off_y) selects the sub-pixel within the upscaled block
  w = w*scale_factor+off_x;
  z = z*scale_factor+off_y;
  d2 *= scale_factor;
  d3 *= scale_factor;
  // re-linearize against the (larger) output sizes
  return (((x*d1+y)*d2)+z)*d3+w;
}
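
/*
 * Worked example: for the 2x2 input / 4x4 output above, d1 = 1, d2 = 2,
 * d3 = 2. Input index ii = 1, i.e. pixel (row 0, col 1), maps to output
 * index (0*2 + off_y)*4 + (1*2 + off_x) = off_y*4 + 2 + off_x. Sweeping
 * (off_x, off_y) over {0,1}^2 yields output indices 2, 3, 6, 7: exactly the
 * 2x2 block of output pixels copied from that input pixel.
 */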

/*
 * Description: nearest-neighbor upsampling kernel. One thread per output
 * element; each thread reads the single input element it is a copy of.
 */
__global__ void upscale(float *input, float *output, long no_elements,
                        int scale_factor, int d1, int d2, int d3)
{
  // flatten the 2-D grid into a linear output offset
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
  output[ii] = input[ipidx];
}
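
/*
 * Grid-flattening example (hypothetical launch): with blocks(2, 3) and 256
 * threads per block, blockDim.y == 1 and threadIdx.y == 0, so the thread
 * with threadIdx.x = 5, blockIdx.x = 1, blockIdx.y = 2 computes
 * ii = 5 + 256*1 + 1*(256*2)*2 = 1285. Consecutive threads write
 * consecutive output elements, so the stores are coalesced.
 */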

void THNN_CudaSpatialUpSamplingNearest_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int scale_factor)
{
  THCUNN_assertSameGPU(state, 2, input, output);
  THCudaTensor_zero(state, output);
  input = THCudaTensor_newContiguous(state, input);

  // total number of output elements; one kernel thread is launched per
  // element (output itself is assumed to be resized by the caller)
  long no_elements = 1;
  for (int i = 0; i < input->nDimension; i++) {
    no_elements *= input->size[i];
  }
  no_elements *= scale_factor * scale_factor;

  // output sizes: d1/d2/d3 = channel/height/width for 3-D input, or the
  // corresponding trailing dimensions of a 4-D batched input
  int d1;
  int d2;
  int d3;
  if (input->nDimension == 3) {
    d1 = output->size[0];
    d2 = output->size[1];
    d3 = output->size[2];
  } else {
    d1 = output->size[1];
    d2 = output->size[2];
    d3 = output->size[3];
  }

  float *input_data = THCudaTensor_data(state, input);
  float *output_data = THCudaTensor_data(state, output);

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 - 1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
  if (n_yblocks > 65535) {
    THError("Input size is too large! aborting");
  }
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);
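
  /*
   * Launch-geometry example (hypothetical size): for no_elements =
   * 20,000,000, ceil(20e6 / 256) = 78,125 exceeds the 65,535 per-dimension
   * block limit, so n_xblocks = 65535 and
   * n_yblocks = ceil(20e6 / (65535 * 256)) = 2. The grid then covers
   * 65535 * 2 * 256 = 33,553,920 threads, and the `ii >= no_elements`
   * guard in the kernel discards the excess.
   */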
  // kernel:
  upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(input_data, output_data, no_elements, scale_factor, d1, d2, d3);
  THCudaCheck(cudaGetLastError());

  // release the contiguous copy
  THCudaTensor_free(state, input);
}
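
/*
 * Illustrative call sequence (a minimal sketch, not part of this file; it
 * assumes the standard THC tensor API and that the caller, normally the
 * THNN wrapper, pre-sizes the output tensor):
 *
 *   THCudaTensor *in  = THCudaTensor_newWithSize4d(state, n, c, h, w);
 *   THCudaTensor *out = THCudaTensor_newWithSize4d(state, n, c,
 *                                                  h * scale, w * scale);
 *   THNN_CudaSpatialUpSamplingNearest_updateOutput(state, in, out, scale);
 */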

/*
 * Description: backward pass. One thread per gradInput element; each thread
 * gathers the gradients of the scale_factor x scale_factor output elements
 * that were copied from it.
 */
__global__ void downscale(float *gradInput_data, float *gradOutput_data, long no_elements,
                          int scale_factor, int d1, int d2, int d3)
{
  // flatten the 2-D grid into a linear gradInput offset
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  // accumulate over the scale_factor x scale_factor output block;
  // gradInput is assumed to have been zeroed by the caller
  for (int i = 0; i < scale_factor; i++) {
    for (int j = 0; j < scale_factor; j++) {
      int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
      gradInput_data[ii] += gradOutput_data[ipidx];
    }
  }
}
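
/*
 * Design note: the backward pass is written as a gather (each thread owns
 * one gradInput element and reads scale_factor^2 gradOutput values) rather
 * than as a scatter from gradOutput, so no atomic additions are needed and
 * every gradInput element is written by exactly one thread.
 */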

void THNN_CudaSpatialUpSamplingNearest_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int scale_factor)
{
  THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
  THCudaTensor_zero(state, gradInput);
  // the kernel indexes gradOutput linearly, so ensure it is contiguous
  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
  float *gradInput_data = THCudaTensor_data(state, gradInput);
  float *gradOutput_data = THCudaTensor_data(state, gradOutput);

  // one thread per gradInput element
  long no_elements = 1;
  for (int i = 0; i < gradInput->nDimension; i++) {
    no_elements *= gradInput->size[i];
  }

  // gradInput sizes, laid out as in updateOutput
  int d1;
  int d2;
  int d3;
  if (gradInput->nDimension == 3) {
    d1 = gradInput->size[0];
    d2 = gradInput->size[1];
    d3 = gradInput->size[2];
  } else {
    d1 = gradInput->size[1];
    d2 = gradInput->size[2];
    d3 = gradInput->size[3];
  }

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 - 1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
  if (n_yblocks > 65535) {
    THError("Input size is too large! aborting");
  }
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  // kernel:
  downscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(gradInput_data, gradOutput_data, no_elements,
                                                                      scale_factor, d1, d2, d3);
  THCudaCheck(cudaGetLastError());

  // release the contiguous copy
  THCudaTensor_free(state, gradOutput);
}