#include "caffe2/operators/pool_op.h"
#include <array>
#include <functional>
#include <limits>
#include <numeric>
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
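// Forward average pooling kernels. For the NCHW kernels each CUDA block
// handles one (n, c) slice (1D) or one (n, c, output row) slice (2D/3D) and
// the threads stride over the remaining output positions; for the NHWC
// kernels each block handles one output point and the threads stride over the
// channels. The pooling window [l, r) (and [t, b), [p, a) for the extra
// dimensions) is clipped to the input extent, and count_include_pad selects
// whether the divisor is the full kernel volume or only the clipped window
// size. __ldg is used for read-only loads on architectures that support it
// (sm_35 and newer).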
template <typename T>
__global__ void AveragePool1DForwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const bool count_include_pad,
const T* X,
T* Y) {
const int nc = blockIdx.x;
const T* X_ptr = X + nc * X_size;
T* Y_ptr = Y + nc * Y_size;
for (int y = threadIdx.x; y < Y_size; y += blockDim.x) {
const int x = y * stride - pad;
const int l = max(x, 0);
const int r = min(x + kernel, X_size);
const T scale = T(1) / static_cast<T>(count_include_pad ? kernel : r - l);
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + i);
#else
sum += X_ptr[i];
#endif
}
Y_ptr[y] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool1DForwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const bool count_include_pad,
const T* X,
T* Y) {
const int n = blockIdx.x / Y_size;
const int y = blockIdx.x % Y_size;
const int x = y * stride - pad;
const int l = max(x, 0);
const int r = min(x + kernel, X_size);
const T scale = T(1) / static_cast<T>(count_include_pad ? kernel : r - l);
const T* X_ptr = X + n * X_size * C;
T* Y_ptr = Y + n * Y_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + i * C + c);
#else
sum += X_ptr[i * C + c];
#endif
}
Y_ptr[y * C + c] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool2DForwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / Y_H;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xh = yh * stride_h - pad_t;
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w - pad_l;
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_h * kernel_w
: (b - t) * (r - l));
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + i * X_W + j);
#else
sum += X_ptr[i * X_W + j];
#endif
}
}
Y_ptr[yh * Y_W + yw] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool2DForwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yh = y / Y_W;
const int yw = y % Y_W;
const int xh = yh * stride_h - pad_t;
const int xw = yw * stride_w - pad_l;
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_h * kernel_w
: (b - t) * (r - l));
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + (i * X_W + j) * C + c);
#else
sum += X_ptr[(i * X_W + j) * C + c];
#endif
}
}
Y_ptr[y * C + c] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool3DForwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int yy = blockIdx.x / Y_H;
const int nc = yy / Y_D;
const int yd = yy % Y_D;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xd = yd * stride_d - pad_p;
const int xh = yh * stride_h - pad_t;
const int p = max(xd, 0);
const int a = min(xd + kernel_d, X_D);
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w - pad_l;
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_d * kernel_h * kernel_w
: (a - p) * (b - t) * (r - l));
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + (i * X_H + j) * X_W + k);
#else
sum += X_ptr[(i * X_H + j) * X_W + k];
#endif
}
}
}
Y_ptr[(yd * Y_H + yh) * Y_W + yw] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool3DForwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yy = y / Y_W;
const int yd = yy / Y_H;
const int yh = yy % Y_H;
const int yw = y % Y_W;
const int xd = yd * stride_d - pad_p;
const int xh = yh * stride_h - pad_t;
const int xw = yw * stride_w - pad_l;
const int p = max(xd, 0);
const int a = min(xd + kernel_d, X_D);
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_d * kernel_h * kernel_w
: (a - p) * (b - t) * (r - l));
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + ((i * X_H + j) * X_W + k) * C + c);
#else
sum += X_ptr[((i * X_H + j) * X_W + k) * C + c];
#endif
}
}
}
Y_ptr[y * C + c] = sum * scale;
}
}
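// Backward kernels for global average pooling: every input element of a
// spatial slice receives the same gradient dY / HxW, so the kernels simply
// broadcast the scaled output gradient. For NCHW the HxW positions of one
// (n, c) slice are split across K blocks of CAFFE_CUDA_NUM_THREADS threads;
// for NHWC each block handles one spatial position and the threads stride
// over the channels.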
template <typename T>
__global__ void GlobalAveragePoolingBackwardNCHWCUDAKernel(
const int K,
const int HxW,
const T scale,
const T* dY,
T* dX) {
const int nc = blockIdx.x / K;
const int block = blockIdx.x % K;
const int x = threadIdx.x + block * CAFFE_CUDA_NUM_THREADS;
if (x < HxW) {
#if __CUDA_ARCH__ >= 350
dX[nc * HxW + x] = __ldg(dY + nc) * scale;
#else
dX[nc * HxW + x] = dY[nc] * scale;
#endif
}
}
template <typename T>
__global__ void GlobalAveragePoolingBackwardNHWCCUDAKernel(
const int C,
const int HxW,
const T scale,
const T* dY,
T* dX) {
const int n = blockIdx.x / HxW;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
#if __CUDA_ARCH__ >= 350
dX[blockIdx.x * C + c] = __ldg(dY + n * C + c) * scale;
#else
dX[blockIdx.x * C + c] = dY[n * C + c] * scale;
#endif
}
}
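// Backward kernels for windowed average pooling. Each thread accumulates the
// gradient for one input element by summing over every output window that
// covers it. An input index x (with padding applied, w = x + pad) is covered
// by output indices i in [l, r) where
//   l = (w < kernel) ? 0 : (w - kernel) / stride + 1
//   r = min(w / stride + 1, Y_size)
// For example, with kernel = 3, stride = 2, pad = 1 and x = 4: w = 5, so
// l = 2 and r = 3 (assuming Y_size >= 3), and only output 2 (covering inputs
// 3..5) contributes. The kCountIncludePad template parameter is resolved at
// compile time: when true the accumulated gradient is divided once by the
// full kernel volume, otherwise each contribution is divided by the size of
// its own clipped window.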
template <typename T, bool kCountIncludePad>
__global__ void AveragePool1DBackwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
T* dX) {
const int nc = blockIdx.x;
const T* dY_ptr = dY + nc * Y_size;
T* dX_ptr = dX + nc * X_size;
for (int x = threadIdx.x; x < X_size; x += blockDim.x) {
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
T sum = 0;
for (int i = l; i < r; ++i) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i);
#else
sum += dY_ptr[i];
#endif
} else {
const int xx = i * stride - pad;
const int xl = max(xx, 0);
const int xr = min(xx + kernel, X_size);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i) / static_cast<T>(xr - xl);
#else
sum += dY_ptr[i] / static_cast<T>(xr - xl);
#endif
}
}
if (kCountIncludePad) {
dX_ptr[x] = sum / static_cast<T>(kernel);
} else {
dX_ptr[x] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool1DBackwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
T* dX) {
const int n = blockIdx.x / X_size;
const int x = blockIdx.x % X_size;
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
const T scale = T(1) / static_cast<T>(kernel);
const T* dY_ptr = dY + n * Y_size * C;
T* dX_ptr = dX + n * X_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = l; i < r; ++i) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * C + c);
#else
sum += dY_ptr[i * C + c];
#endif
} else {
const int xx = i * stride - pad;
const int xl = max(xx, 0);
const int xr = min(xx + kernel, X_size);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * C + c) / static_cast<T>(xr - xl);
#else
sum += dY_ptr[i * C + c] / static_cast<T>(xr - xl);
#endif
}
}
if (kCountIncludePad) {
dX_ptr[x * C + c] = sum * scale;
} else {
dX_ptr[x * C + c] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool2DBackwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / X_H;
const int hh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int h = hh + pad_t;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int ww = threadIdx.x; ww < X_W; ww += blockDim.x) {
const int w = ww + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * Y_W + j);
#else
sum += dY_ptr[i * Y_W + j];
#endif
} else {
const int xh = i * stride_h - pad_t;
const int xw = j * stride_w - pad_l;
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * Y_W + j) /
static_cast<T>((xb - xt) * (xr - xl));
#else
sum += dY_ptr[i * Y_W + j] / static_cast<T>((xb - xt) * (xr - xl));
#endif
}
}
}
if (kCountIncludePad) {
dX_ptr[hh * X_W + ww] = sum / static_cast<T>(kernel_h * kernel_w);
} else {
dX_ptr[hh * X_W + ww] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool2DBackwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int h = x / X_W + pad_t;
const int w = x % X_W + pad_l;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T scale = T(1) / static_cast<T>(kernel_h * kernel_w);
const T* dY_ptr = dY + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_W + j) * C + c);
#else
sum += dY_ptr[(i * Y_W + j) * C + c];
#endif
} else {
const int xh = i * stride_h - pad_t;
const int xw = j * stride_w - pad_l;
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_W + j) * C + c) /
static_cast<T>((xb - xt) * (xr - xl));
#else
sum += dY_ptr[(i * Y_W + j) * C + c] /
static_cast<T>((xb - xt) * (xr - xl));
#endif
}
}
}
if (kCountIncludePad) {
dX_ptr[x * C + c] = sum * scale;
} else {
dX_ptr[x * C + c] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool3DBackwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int xx = blockIdx.x / X_H;
const int nc = xx / X_D;
const int dd = xx % X_D;
const int hh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int d = dd + pad_p;
const int h = hh + pad_t;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int ww = threadIdx.x; ww < X_W; ww += blockDim.x) {
const int w = ww + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_H + j) * Y_W + k);
#else
sum += dY_ptr[(i * Y_H + j) * Y_W + k];
#endif
} else {
const int xd = i * stride_d - pad_p;
const int xh = j * stride_h - pad_t;
const int xw = k * stride_w - pad_l;
const int xp = max(xd, 0);
const int xa = min(xd + kernel_d, X_D);
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_H + j) * Y_W + k) /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#else
sum += dY_ptr[(i * Y_H + j) * Y_W + k] /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#endif
}
}
}
}
if (kCountIncludePad) {
dX_ptr[(dd * X_H + hh) * X_W + ww] =
sum / static_cast<T>(kernel_d * kernel_h * kernel_w);
} else {
dX_ptr[(dd * X_H + hh) * X_W + ww] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool3DBackwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int xx = x / X_W;
const int d = xx / X_H + pad_p;
const int h = xx % X_H + pad_t;
const int w = x % X_W + pad_l;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T scale = T(1) / static_cast<T>(kernel_d * kernel_h * kernel_w);
const T* dY_ptr = dY + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + ((i * Y_H + j) * Y_W + k) * C + c);
#else
sum += dY_ptr[((i * Y_H + j) * Y_W + k) * C + c];
#endif
} else {
const int xd = i * stride_d - pad_p;
const int xh = j * stride_h - pad_t;
const int xw = k * stride_w - pad_l;
const int xp = max(xd, 0);
const int xa = min(xd + kernel_d, X_D);
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + ((i * Y_H + j) * Y_W + k) * C + c) /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#else
sum += dY_ptr[((i * Y_H + j) * Y_W + k) * C + c] /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#endif
}
}
}
}
if (kCountIncludePad) {
dX_ptr[x * C + c] = sum * scale;
} else {
dX_ptr[x * C + c] = sum;
}
}
}
} // namespace
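// Global average pooling forward. For NCHW the problem is a row-wise mean
// over HxW contiguous elements, handled by math::ReduceMean. For NHWC the
// mean over the HxW spatial positions of each channel is computed as a
// strided batched GEMM of the input against a cached vector of ones, with
// alpha = 1 / HxW.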
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CUDAContext>(
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
if (ones.numel() != HxW) {
ones.Resize(HxW);
math::Set<float, CUDAContext>(
HxW, 1.0f, ones.mutable_data<float>(), context);
}
math::GemmStridedBatched<float, CUDAContext>(
CblasTrans,
CblasNoTrans,
N,
C,
1,
HxW,
1.0f / static_cast<float>(HxW),
X,
HxW * C,
ones.data<float>(),
0,
0.0f,
Y,
C,
context);
return true;
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Forward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
AveragePool1DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
count_include_pad,
X,
Y);
return true;
}
case 2: {
const int num_blocks = N * C * Y_dims[0];
AveragePool2DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
count_include_pad,
X,
Y);
return true;
}
case 3: {
const int num_blocks = N * C * Y_dims[0] * Y_dims[1];
AveragePool3DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
count_include_pad,
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Forward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
// Each CUDA block handles one output point; threads within the block stride
// over the channels.

const int ndim = X_dims.size();
const int Y_HxW = std::accumulate(
Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies<int>());
switch (ndim) {
case 1: {
AveragePool1DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
count_include_pad,
X,
Y);
return true;
}
case 2: {
AveragePool2DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
count_include_pad,
X,
Y);
return true;
}
case 3: {
AveragePool3DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
count_include_pad,
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const float scale = 1.0f / static_cast<float>(HxW);
const int K = (HxW + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
GlobalAveragePoolingBackwardNCHWCUDAKernel<float>
<<<N * C * K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
K, HxW, scale, dY, dX);
return true;
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const float scale = 1.0f / static_cast<float>(HxW);
GlobalAveragePoolingBackwardNHWCCUDAKernel<float>
<<<N * HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C, HxW, scale, dY, dX);
return true;
}
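// Dispatches a kernel templated on a compile-time bool: the runtime
// count_include_pad flag selects between the <T, true> and <T, false>
// instantiations, so the per-element branch inside the backward kernels is
// resolved at compile time.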
#define DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1( \
cond, Func, T, num_blocks, threads_per_block, cuda_stream, ...) \
do { \
if (cond) { \
Func<T, true> \
<<<num_blocks, threads_per_block, 0, cuda_stream>>>(__VA_ARGS__); \
} else { \
Func<T, false> \
<<<num_blocks, threads_per_block, 0, cuda_stream>>>(__VA_ARGS__); \
} \
} while (false)
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Backward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool1DBackwardNCHWCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
dX);
return true;
}
case 2: {
const int num_blocks = N * C * X_dims[0];
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool2DBackwardNCHWCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
dX);
return true;
}
case 3: {
const int num_blocks = N * C * X_dims[0] * X_dims[1];
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool3DBackwardNCHWCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Backward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
const int X_HxW = std::accumulate(
X_dims.cbegin(), X_dims.cend(), 1, std::multiplies<int>());
const int num_blocks = N * X_HxW;
switch (ndim) {
case 1: {
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool1DBackwardNHWCCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
C,
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
dX);
return true;
}
case 2: {
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool2DBackwardNHWCCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
dX);
return true;
}
case 3: {
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool3DBackwardNHWCCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
#undef DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1
namespace {
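// Forward max pooling kernels. The grid/thread mapping mirrors the average
// pooling kernels above; each output value is the maximum over its clipped
// window, accumulated from std::numeric_limits<T>::lowest().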
template <typename T>
__global__ void MaxPool1DForwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* X,
T* Y) {
const int nc = blockIdx.x;
const T* X_ptr = X + nc * X_size;
T* Y_ptr = Y + nc * Y_size;
for (int y = threadIdx.x; y < Y_size; y += blockDim.x) {
const int x = y * stride;
const int l = max(x - pad, 0);
const int r = min(x - pad + kernel, X_size);
T val = std::numeric_limits<T>::lowest();
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + i));
#else
val = max(val, X_ptr[i]);
#endif
}
Y_ptr[y] = val;
}
}
template <typename T>
__global__ void MaxPool1DForwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* X,
T* Y) {
const int n = blockIdx.x / Y_size;
const int y = blockIdx.x % Y_size;
const int x = y * stride;
const int l = max(x - pad, 0);
const int r = min(x - pad + kernel, X_size);
const T* X_ptr = X + n * X_size * C;
T* Y_ptr = Y + n * Y_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T val = std::numeric_limits<T>::lowest();
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + i * C + c));
#else
val = max(val, X_ptr[i * C + c]);
#endif
}
Y_ptr[y * C + c] = val;
}
}
template <typename T>
__global__ void MaxPool2DForwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / Y_H;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xh = yh * stride_h;
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w;
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
T val = std::numeric_limits<T>::lowest();
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + i * X_W + j));
#else
val = max(val, X_ptr[i * X_W + j]);
#endif
}
}
Y_ptr[yh * Y_W + yw] = val;
}
}
template <typename T>
__global__ void MaxPool2DForwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yh = y / Y_W;
const int yw = y % Y_W;
const int xh = yh * stride_h;
const int xw = yw * stride_w;
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T val = std::numeric_limits<T>::lowest();
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + (i * X_W + j) * C + c));
#else
val = max(val, X_ptr[(i * X_W + j) * C + c]);
#endif
}
}
Y_ptr[y * C + c] = val;
}
}
template <typename T>
__global__ void MaxPool3DForwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int yy = blockIdx.x / Y_H;
const int nc = yy / Y_D;
const int yd = yy % Y_D;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xd = yd * stride_d;
const int xh = yh * stride_h;
const int p = max(xd - pad_p, 0);
const int a = min(xd - pad_p + kernel_d, X_D);
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w;
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
T val = std::numeric_limits<T>::lowest();
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + (i * X_H + j) * X_W + k));
#else
val = max(val, X_ptr[(i * X_H + j) * X_W + k]);
#endif
}
}
}
Y_ptr[(yd * Y_H + yh) * Y_W + yw] = val;
}
}
template <typename T>
__global__ void MaxPool3DForwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yy = y / Y_W;
const int yw = y % Y_W;
const int yh = yy % Y_H;
const int yd = yy / Y_H;
const int xd = yd * stride_d;
const int xh = yh * stride_h;
const int xw = yw * stride_w;
const int p = max(xd - pad_p, 0);
const int a = min(xd - pad_p + kernel_d, X_D);
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T val = std::numeric_limits<T>::lowest();
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + ((i * X_H + j) * X_W + k) * C + c));
#else
val = max(val, X_ptr[((i * X_H + j) * X_W + k) * C + c]);
#endif
}
}
}
Y_ptr[y * C + c] = val;
}
}
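// Backward kernels for max pooling. The gradient of an output is routed to
// the input positions whose value equals the pooled maximum, found by
// comparing X against Y; if several inputs tie for the maximum, each of them
// receives the full gradient of that window. The global variants compare each
// input element against the single pooled value of its slice; the windowed
// variants visit every output window covering the input element, using the
// same window-inversion bounds as the average pooling backward kernels.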
template <typename T>
__global__ void GlobalMaxPoolingBackwardNCHWCUDAKernel(
const int K,
const int HxW,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int nc = blockIdx.x / K;
const int block = blockIdx.x % K;
const int x = threadIdx.x + block * CAFFE_CUDA_NUM_THREADS;
if (x < HxW) {
#if __CUDA_ARCH__ >= 350
dX[nc * HxW + x] =
(__ldg(X + nc * HxW + x) == __ldg(Y + nc)) ? __ldg(dY + nc) : T(0);
#else
dX[nc * HxW + x] = (X[nc * HxW + x] == Y[nc]) ? dY[nc] : T(0);
#endif
}
}
template <typename T>
__global__ void GlobalMaxPoolingBackwardNHWCCUDAKernel(
const int C,
const int HxW,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int n = blockIdx.x / HxW;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
#if __CUDA_ARCH__ >= 350
dX[blockIdx.x * C + c] =
(__ldg(X + blockIdx.x * C + c) == __ldg(Y + n * C + c))
? __ldg(dY + n * C + c)
: T(0);
#else
dX[blockIdx.x * C + c] =
(X[blockIdx.x * C + c] == Y[n * C + c]) ? dY[n * C + c] : T(0);
#endif
}
}
template <typename T>
__global__ void MaxPool1DBackwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int nc = blockIdx.x;
const T* dY_ptr = dY + nc * Y_size;
const T* X_ptr = X + nc * X_size;
const T* Y_ptr = Y + nc * Y_size;
T* dX_ptr = dX + nc * X_size;
for (int x = threadIdx.x; x < X_size; x += blockDim.x) {
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x) == __ldg(Y_ptr + i)) {
sum += __ldg(dY_ptr + i);
}
#else
if (X_ptr[x] == Y_ptr[i]) {
sum += dY_ptr[i];
}
#endif
}
dX_ptr[x] = sum;
}
}
template <typename T>
__global__ void MaxPool1DBackwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int n = blockIdx.x / X_size;
const int x = blockIdx.x % X_size;
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
const T* dY_ptr = dY + n * Y_size * C;
const T* X_ptr = X + n * X_size * C;
const T* Y_ptr = Y + n * Y_size * C;
T* dX_ptr = dX + n * X_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x * C + c) == __ldg(Y_ptr + i * C + c)) {
sum += __ldg(dY_ptr + i * C + c);
}
#else
if (X_ptr[x * C + c] == Y_ptr[i * C + c]) {
sum += dY_ptr[i * C + c];
}
#endif
}
dX_ptr[x * C + c] = sum;
}
}
template <typename T>
__global__ void MaxPool2DBackwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / X_H;
const int xh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
const T* X_ptr = X + nc * X_HxW;
const T* Y_ptr = Y + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int h = xh + pad_t;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int xw = threadIdx.x; xw < X_W; xw += blockDim.x) {
const int w = xw + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const int x = xh * X_W + xw;
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
const int y = i * Y_W + j;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x) == __ldg(Y_ptr + y)) {
sum += __ldg(dY_ptr + y);
}
#else
if (X_ptr[x] == Y_ptr[y]) {
sum += dY_ptr[y];
}
#endif
}
}
dX_ptr[x] = sum;
}
}
template <typename T>
__global__ void MaxPool2DBackwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int h = x / X_W + pad_t;
const int w = x % X_W + pad_l;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T* dY_ptr = dY + n * Y_HxW * C;
const T* X_ptr = X + n * X_HxW * C;
const T* Y_ptr = Y + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
const int y = i * Y_W + j;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x * C + c) == __ldg(Y_ptr + y * C + c)) {
sum += __ldg(dY_ptr + y * C + c);
}
#else
if (X_ptr[x * C + c] == Y_ptr[y * C + c]) {
sum += dY_ptr[y * C + c];
}
#endif
}
}
dX_ptr[x * C + c] = sum;
}
}
template <typename T>
__global__ void MaxPool3DBackwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int xx = blockIdx.x / X_H;
const int nc = xx / X_D;
const int xd = xx % X_D;
const int xh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
const T* X_ptr = X + nc * X_HxW;
const T* Y_ptr = Y + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int d = xd + pad_p;
const int h = xh + pad_t;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int xw = threadIdx.x; xw < X_W; xw += blockDim.x) {
const int w = xw + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const int x = (xd * X_H + xh) * X_W + xw;
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
const int y = (i * Y_H + j) * Y_W + k;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x) == __ldg(Y_ptr + y)) {
sum += __ldg(dY_ptr + y);
}
#else
if (X_ptr[x] == Y_ptr[y]) {
sum += dY_ptr[y];
}
#endif
}
}
}
dX_ptr[x] = sum;
}
}
template <typename T>
__global__ void MaxPool3DBackwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int xx = x / X_W;
const int d = xx / X_H + pad_p;
const int h = xx % X_H + pad_t;
const int w = x % X_W + pad_l;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T* dY_ptr = dY + n * Y_HxW * C;
const T* X_ptr = X + n * X_HxW * C;
const T* Y_ptr = Y + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
const int y = (i * Y_H + j) * Y_W + k;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x * C + c) == __ldg(Y_ptr + y * C + c)) {
sum += __ldg(dY_ptr + y * C + c);
}
#else
if (X_ptr[x * C + c] == Y_ptr[y * C + c]) {
sum += dY_ptr[y * C + c];
}
#endif
}
}
}
dX_ptr[x * C + c] = sum;
}
}
} // namespace
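// Global max pooling forward reduces over the spatial dimensions with
// math::ReduceMax: over the trailing HxW axis of the (N*C, HxW) view for
// NCHW, and over the middle HxW axis of the (N, HxW, C) view for NHWC.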
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CUDAContext>(
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 3> X_dims = {N, HxW, C};
const std::array<int, 3> Y_dims = {N, 1, C};
math::ReduceMax<float, CUDAContext>(
3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Forward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
MaxPool1DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0], Y_dims[0], kernel[0], stride[0], pads[0], X, Y);
return true;
}
case 2: {
const int num_blocks = N * C * Y_dims[0];
MaxPool2DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
X,
Y);
return true;
}
case 3: {
const int num_blocks = N * C * Y_dims[0] * Y_dims[1];
MaxPool3DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Forward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
// Each CUDA block handles one output point; threads within the block stride
// over the channels.
const int ndim = X_dims.size();
const int Y_HxW = std::accumulate(
Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies<int>());
switch (ndim) {
case 1: {
MaxPool1DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C, X_dims[0], Y_dims[0], kernel[0], stride[0], pads[0], X, Y);
return true;
}
case 2: {
MaxPool2DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
X,
Y);
return true;
}
case 3: {
MaxPool3DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
const int K = (HxW + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
GlobalMaxPoolingBackwardNCHWCUDAKernel<float>
<<<N * C * K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
K, HxW, dY, X, Y, dX);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
GlobalMaxPoolingBackwardNHWCCUDAKernel<float>
<<<N * HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C, HxW, dY, X, Y, dX);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Backward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
MaxPool1DBackwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
X,
Y,
dX);
return true;
}
case 2: {
const int num_blocks = N * C * X_dims[0];
MaxPool2DBackwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
X,
Y,
dX);
return true;
}
case 3: {
const int num_blocks = N * C * X_dims[0] * X_dims[1];
MaxPool3DBackwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
X,
Y,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Backward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
const int X_HxW = std::accumulate(
X_dims.cbegin(), X_dims.cend(), 1, std::multiplies<int>());
switch (ndim) {
case 1: {
MaxPool1DBackwardNHWCCUDAKernel<float>
<<<N * X_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
X,
Y,
dX);
return true;
}
case 2: {
MaxPool2DBackwardNHWCCUDAKernel<float>
<<<N * X_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
X,
Y,
dX);
return true;
}
case 3: {
MaxPool3DBackwardNHWCCUDAKernel<float>
<<<N * X_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
X,
Y,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
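// Register the CUDA implementations of the pooling operators and their
// gradients; the dimension-specific names (AveragePool1D/2D/3D,
// MaxPool1D/2D/3D) map to the same functors as the generic operators.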
REGISTER_CUDA_OPERATOR(
AveragePool,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePoolGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool1D,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool1DGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool2D,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool2DGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool3D,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool3DGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPoolGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool1D,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool1DGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool2D,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool2DGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool3D,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool3DGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
} // namespace caffe2