#include "caffe2/operators/pool_op.h"
#include <array>
#include <functional>
#include <limits>
#include <numeric>
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
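// Forward average pooling kernels. For the NCHW kernels each CUDA block
// handles one (n, c) slice (1D) or one (n, c, output row) slice (2D/3D) and
// the threads stride over the remaining output positions; for the NHWC
// kernels each block handles one output point and the threads stride over the
// channels. The pooling window [l, r) (and [t, b), [p, a) for the extra
// dimensions) is clipped to the input extent, and count_include_pad selects
// whether the divisor is the full kernel volume or only the clipped window
// size. __ldg is used for read-only loads on architectures that support it
// (sm_35 and newer).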
template <typename T>
__global__ void AveragePool1DForwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const bool count_include_pad,
const T* X,
T* Y) {
const int nc = blockIdx.x;
const T* X_ptr = X + nc * X_size;
T* Y_ptr = Y + nc * Y_size;
for (int y = threadIdx.x; y < Y_size; y += blockDim.x) {
const int x = y * stride - pad;
const int l = max(x, 0);
const int r = min(x + kernel, X_size);
const T scale = T(1) / static_cast<T>(count_include_pad ? kernel : r - l);
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + i);
#else
sum += X_ptr[i];
#endif
}
Y_ptr[y] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool1DForwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const bool count_include_pad,
const T* X,
T* Y) {
const int n = blockIdx.x / Y_size;
const int y = blockIdx.x % Y_size;
const int x = y * stride - pad;
const int l = max(x, 0);
const int r = min(x + kernel, X_size);
const T scale = T(1) / static_cast<T>(count_include_pad ? kernel : r - l);
const T* X_ptr = X + n * X_size * C;
T* Y_ptr = Y + n * Y_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + i * C + c);
#else
sum += X_ptr[i * C + c];
#endif
}
Y_ptr[y * C + c] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool2DForwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / Y_H;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xh = yh * stride_h - pad_t;
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w - pad_l;
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_h * kernel_w
: (b - t) * (r - l));
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + i * X_W + j);
#else
sum += X_ptr[i * X_W + j];
#endif
}
}
Y_ptr[yh * Y_W + yw] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool2DForwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yh = y / Y_W;
const int yw = y % Y_W;
const int xh = yh * stride_h - pad_t;
const int xw = yw * stride_w - pad_l;
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_h * kernel_w
: (b - t) * (r - l));
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + (i * X_W + j) * C + c);
#else
sum += X_ptr[(i * X_W + j) * C + c];
#endif
}
}
Y_ptr[y * C + c] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool3DForwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int yy = blockIdx.x / Y_H;
const int nc = yy / Y_D;
const int yd = yy % Y_D;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xd = yd * stride_d - pad_p;
const int xh = yh * stride_h - pad_t;
const int p = max(xd, 0);
const int a = min(xd + kernel_d, X_D);
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w - pad_l;
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_d * kernel_h * kernel_w
: (a - p) * (b - t) * (r - l));
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + (i * X_H + j) * X_W + k);
#else
sum += X_ptr[(i * X_H + j) * X_W + k];
#endif
}
}
}
Y_ptr[(yd * Y_H + yh) * Y_W + yw] = sum * scale;
}
}
template <typename T>
__global__ void AveragePool3DForwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const bool count_include_pad,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yy = y / Y_W;
const int yd = yy / Y_H;
const int yh = yy % Y_H;
const int yw = y % Y_W;
const int xd = yd * stride_d - pad_p;
const int xh = yh * stride_h - pad_t;
const int xw = yw * stride_w - pad_l;
const int p = max(xd, 0);
const int a = min(xd + kernel_d, X_D);
const int t = max(xh, 0);
const int b = min(xh + kernel_h, X_H);
const int l = max(xw, 0);
const int r = min(xw + kernel_w, X_W);
const T scale = T(1) /
static_cast<T>(count_include_pad ? kernel_d * kernel_h * kernel_w
: (a - p) * (b - t) * (r - l));
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(X_ptr + ((i * X_H + j) * X_W + k) * C + c);
#else
sum += X_ptr[((i * X_H + j) * X_W + k) * C + c];
#endif
}
}
}
Y_ptr[y * C + c] = sum * scale;
}
}
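// Backward kernels for global average pooling: every input element of a
// spatial slice receives the same gradient dY / HxW, so the kernels simply
// broadcast the scaled output gradient. For NCHW the HxW positions of one
// (n, c) slice are split across K blocks of CAFFE_CUDA_NUM_THREADS threads;
// for NHWC each block handles one spatial position and the threads stride
// over the channels.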
template <typename T>
__global__ void GlobalAveragePoolingBackwardNCHWCUDAKernel(
const int K,
const int HxW,
const T scale,
const T* dY,
T* dX) {
const int nc = blockIdx.x / K;
const int block = blockIdx.x % K;
const int x = threadIdx.x + block * CAFFE_CUDA_NUM_THREADS;
if (x < HxW) {
#if __CUDA_ARCH__ >= 350
dX[nc * HxW + x] = __ldg(dY + nc) * scale;
#else
dX[nc * HxW + x] = dY[nc] * scale;
#endif
}
}
template <typename T>
__global__ void GlobalAveragePoolingBackwardNHWCCUDAKernel(
const int C,
const int HxW,
const T scale,
const T* dY,
T* dX) {
const int n = blockIdx.x / HxW;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
#if __CUDA_ARCH__ >= 350
dX[blockIdx.x * C + c] = __ldg(dY + n * C + c) * scale;
#else
dX[blockIdx.x * C + c] = dY[n * C + c] * scale;
#endif
}
}
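// Backward kernels for windowed average pooling. Each thread accumulates the
// gradient for one input element by summing over every output window that
// covers it. An input index x (with padding applied, w = x + pad) is covered
// by output indices i in [l, r) where
//   l = (w < kernel) ? 0 : (w - kernel) / stride + 1
//   r = min(w / stride + 1, Y_size)
// For example, with kernel = 3, stride = 2, pad = 1 and x = 4: w = 5, so
// l = 2 and r = 3 (assuming Y_size >= 3), and only output 2 (covering inputs
// 3..5) contributes. The kCountIncludePad template parameter is resolved at
// compile time: when true the accumulated gradient is divided once by the
// full kernel volume, otherwise each contribution is divided by the size of
// its own clipped window.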
template <typename T, bool kCountIncludePad>
__global__ void AveragePool1DBackwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
T* dX) {
const int nc = blockIdx.x;
const T* dY_ptr = dY + nc * Y_size;
T* dX_ptr = dX + nc * X_size;
for (int x = threadIdx.x; x < X_size; x += blockDim.x) {
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
T sum = 0;
for (int i = l; i < r; ++i) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i);
#else
sum += dY_ptr[i];
#endif
} else {
const int xx = i * stride - pad;
const int xl = max(xx, 0);
const int xr = min(xx + kernel, X_size);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i) / static_cast<T>(xr - xl);
#else
sum += dY_ptr[i] / static_cast<T>(xr - xl);
#endif
}
}
if (kCountIncludePad) {
dX_ptr[x] = sum / static_cast<T>(kernel);
} else {
dX_ptr[x] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool1DBackwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
T* dX) {
const int n = blockIdx.x / X_size;
const int x = blockIdx.x % X_size;
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
const T scale = T(1) / static_cast<T>(kernel);
const T* dY_ptr = dY + n * Y_size * C;
T* dX_ptr = dX + n * X_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = l; i < r; ++i) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * C + c);
#else
sum += dY_ptr[i * C + c];
#endif
} else {
const int xx = i * stride - pad;
const int xl = max(xx, 0);
const int xr = min(xx + kernel, X_size);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * C + c) / static_cast<T>(xr - xl);
#else
sum += dY_ptr[i * C + c] / static_cast<T>(xr - xl);
#endif
}
}
if (kCountIncludePad) {
dX_ptr[x * C + c] = sum * scale;
} else {
dX_ptr[x * C + c] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool2DBackwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / X_H;
const int hh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int h = hh + pad_t;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int ww = threadIdx.x; ww < X_W; ww += blockDim.x) {
const int w = ww + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * Y_W + j);
#else
sum += dY_ptr[i * Y_W + j];
#endif
} else {
const int xh = i * stride_h - pad_t;
const int xw = j * stride_w - pad_l;
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + i * Y_W + j) /
static_cast<T>((xb - xt) * (xr - xl));
#else
sum += dY_ptr[i * Y_W + j] / static_cast<T>((xb - xt) * (xr - xl));
#endif
}
}
}
if (kCountIncludePad) {
dX_ptr[hh * X_W + ww] = sum / static_cast<T>(kernel_h * kernel_w);
} else {
dX_ptr[hh * X_W + ww] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool2DBackwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int h = x / X_W + pad_t;
const int w = x % X_W + pad_l;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T scale = T(1) / static_cast<T>(kernel_h * kernel_w);
const T* dY_ptr = dY + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_W + j) * C + c);
#else
sum += dY_ptr[(i * Y_W + j) * C + c];
#endif
} else {
const int xh = i * stride_h - pad_t;
const int xw = j * stride_w - pad_l;
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_W + j) * C + c) /
static_cast<T>((xb - xt) * (xr - xl));
#else
sum += dY_ptr[(i * Y_W + j) * C + c] /
static_cast<T>((xb - xt) * (xr - xl));
#endif
}
}
}
if (kCountIncludePad) {
dX_ptr[x * C + c] = sum * scale;
} else {
dX_ptr[x * C + c] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool3DBackwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int xx = blockIdx.x / X_H;
const int nc = xx / X_D;
const int dd = xx % X_D;
const int hh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int d = dd + pad_p;
const int h = hh + pad_t;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int ww = threadIdx.x; ww < X_W; ww += blockDim.x) {
const int w = ww + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_H + j) * Y_W + k);
#else
sum += dY_ptr[(i * Y_H + j) * Y_W + k];
#endif
} else {
const int xd = i * stride_d - pad_p;
const int xh = j * stride_h - pad_t;
const int xw = k * stride_w - pad_l;
const int xp = max(xd, 0);
const int xa = min(xd + kernel_d, X_D);
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + (i * Y_H + j) * Y_W + k) /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#else
sum += dY_ptr[(i * Y_H + j) * Y_W + k] /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#endif
}
}
}
}
if (kCountIncludePad) {
dX_ptr[(dd * X_H + hh) * X_W + ww] =
sum / static_cast<T>(kernel_d * kernel_h * kernel_w);
} else {
dX_ptr[(dd * X_H + hh) * X_W + ww] = sum;
}
}
}
template <typename T, bool kCountIncludePad>
__global__ void AveragePool3DBackwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int xx = x / X_W;
const int d = xx / X_H + pad_p;
const int h = xx % X_H + pad_t;
const int w = x % X_W + pad_l;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T scale = T(1) / static_cast<T>(kernel_d * kernel_h * kernel_w);
const T* dY_ptr = dY + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
if (kCountIncludePad) {
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + ((i * Y_H + j) * Y_W + k) * C + c);
#else
sum += dY_ptr[((i * Y_H + j) * Y_W + k) * C + c];
#endif
} else {
const int xd = i * stride_d - pad_p;
const int xh = j * stride_h - pad_t;
const int xw = k * stride_w - pad_l;
const int xp = max(xd, 0);
const int xa = min(xd + kernel_d, X_D);
const int xt = max(xh, 0);
const int xb = min(xh + kernel_h, X_H);
const int xl = max(xw, 0);
const int xr = min(xw + kernel_w, X_W);
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY_ptr + ((i * Y_H + j) * Y_W + k) * C + c) /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#else
sum += dY_ptr[((i * Y_H + j) * Y_W + k) * C + c] /
static_cast<T>((xa - xp) * (xb - xt) * (xr - xl));
#endif
}
}
}
}
if (kCountIncludePad) {
dX_ptr[x * C + c] = sum * scale;
} else {
dX_ptr[x * C + c] = sum;
}
}
}
} // namespace
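// Global average pooling forward. For NCHW the problem is a row-wise mean
// over HxW contiguous elements, handled by math::ReduceMean. For NHWC the
// mean over the HxW spatial positions of each channel is computed as a
// strided batched GEMM of the input against a cached vector of ones, with
// alpha = 1 / HxW.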
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CUDAContext>(
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
if (ones.numel() != HxW) {
ones.Resize(HxW);
math::Set<float, CUDAContext>(
HxW, 1.0f, ones.mutable_data<float>(), context);
}
math::GemmStridedBatched<float, CUDAContext>(
CblasTrans,
CblasNoTrans,
N,
C,
1,
HxW,
1.0f / static_cast<float>(HxW),
X,
HxW * C,
ones.data<float>(),
0,
0.0f,
Y,
C,
context);
return true;
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Forward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
AveragePool1DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
count_include_pad,
X,
Y);
return true;
}
case 2: {
const int num_blocks = N * C * Y_dims[0];
AveragePool2DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
count_include_pad,
X,
Y);
return true;
}
case 3: {
const int num_blocks = N * C * Y_dims[0] * Y_dims[1];
AveragePool3DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
count_include_pad,
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Forward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
// Each CUDA block handles one output point; threads within the block stride
// over the channels.

const int ndim = X_dims.size();
const int Y_HxW = std::accumulate(
Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies<int>());
switch (ndim) {
case 1: {
AveragePool1DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
count_include_pad,
X,
Y);
return true;
}
case 2: {
AveragePool2DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
count_include_pad,
X,
Y);
return true;
}
case 3: {
AveragePool3DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
count_include_pad,
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const float scale = 1.0f / static_cast<float>(HxW);
const int K = (HxW + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
GlobalAveragePoolingBackwardNCHWCUDAKernel<float>
<<<N * C * K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
K, HxW, scale, dY, dX);
return true;
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const float scale = 1.0f / static_cast<float>(HxW);
GlobalAveragePoolingBackwardNHWCCUDAKernel<float>
<<<N * HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C, HxW, scale, dY, dX);
return true;
}
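// Dispatches a kernel templated on a compile-time bool: the runtime
// count_include_pad flag selects between the <T, true> and <T, false>
// instantiations, so the per-element branch inside the backward kernels is
// resolved at compile time.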
#define DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1( \
cond, Func, T, num_blocks, threads_per_block, cuda_stream, ...) \
do { \
if (cond) { \
Func<T, true> \
<<<num_blocks, threads_per_block, 0, cuda_stream>>>(__VA_ARGS__); \
} else { \
Func<T, false> \
<<<num_blocks, threads_per_block, 0, cuda_stream>>>(__VA_ARGS__); \
} \
} while (false)
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Backward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool1DBackwardNCHWCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
dX);
return true;
}
case 2: {
const int num_blocks = N * C * X_dims[0];
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool2DBackwardNCHWCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
dX);
return true;
}
case 3: {
const int num_blocks = N * C * X_dims[0] * X_dims[1];
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool3DBackwardNCHWCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool AveragePoolFunctor<CUDAContext>::Backward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* /* X */,
const float* /* Y */,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
const int X_HxW = std::accumulate(
X_dims.cbegin(), X_dims.cend(), 1, std::multiplies<int>());
const int num_blocks = N * X_HxW;
switch (ndim) {
case 1: {
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool1DBackwardNHWCCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
C,
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
dX);
return true;
}
case 2: {
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool2DBackwardNHWCCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
dX);
return true;
}
case 3: {
DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1(
count_include_pad,
AveragePool3DBackwardNHWCCUDAKernel,
float,
num_blocks,
CAFFE_CUDA_NUM_THREADS,
context->cuda_stream(),
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
#undef DISPATCH_KERNEL_FUNCTION_BY_BOOL_WITH_TYPE_1
namespace {
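// Forward max pooling kernels. The grid/thread mapping mirrors the average
// pooling kernels above; each output value is the maximum over its clipped
// window, accumulated from std::numeric_limits<T>::lowest().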
template <typename T>
__global__ void MaxPool1DForwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* X,
T* Y) {
const int nc = blockIdx.x;
const T* X_ptr = X + nc * X_size;
T* Y_ptr = Y + nc * Y_size;
for (int y = threadIdx.x; y < Y_size; y += blockDim.x) {
const int x = y * stride;
const int l = max(x - pad, 0);
const int r = min(x - pad + kernel, X_size);
T val = std::numeric_limits<T>::lowest();
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + i));
#else
val = max(val, X_ptr[i]);
#endif
}
Y_ptr[y] = val;
}
}
template <typename T>
__global__ void MaxPool1DForwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* X,
T* Y) {
const int n = blockIdx.x / Y_size;
const int y = blockIdx.x % Y_size;
const int x = y * stride;
const int l = max(x - pad, 0);
const int r = min(x - pad + kernel, X_size);
const T* X_ptr = X + n * X_size * C;
T* Y_ptr = Y + n * Y_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T val = std::numeric_limits<T>::lowest();
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + i * C + c));
#else
val = max(val, X_ptr[i * C + c]);
#endif
}
Y_ptr[y * C + c] = val;
}
}
template <typename T>
__global__ void MaxPool2DForwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / Y_H;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xh = yh * stride_h;
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w;
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
T val = std::numeric_limits<T>::lowest();
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + i * X_W + j));
#else
val = max(val, X_ptr[i * X_W + j]);
#endif
}
}
Y_ptr[yh * Y_W + yw] = val;
}
}
template <typename T>
__global__ void MaxPool2DForwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yh = y / Y_W;
const int yw = y % Y_W;
const int xh = yh * stride_h;
const int xw = yw * stride_w;
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T val = std::numeric_limits<T>::lowest();
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + (i * X_W + j) * C + c));
#else
val = max(val, X_ptr[(i * X_W + j) * C + c]);
#endif
}
}
Y_ptr[y * C + c] = val;
}
}
template <typename T>
__global__ void MaxPool3DForwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int yy = blockIdx.x / Y_H;
const int nc = yy / Y_D;
const int yd = yy % Y_D;
const int yh = blockIdx.x % Y_H;
const T* X_ptr = X + nc * X_HxW;
T* Y_ptr = Y + nc * Y_HxW;
const int xd = yd * stride_d;
const int xh = yh * stride_h;
const int p = max(xd - pad_p, 0);
const int a = min(xd - pad_p + kernel_d, X_D);
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
for (int yw = threadIdx.x; yw < Y_W; yw += blockDim.x) {
const int xw = yw * stride_w;
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
T val = std::numeric_limits<T>::lowest();
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + (i * X_H + j) * X_W + k));
#else
val = max(val, X_ptr[(i * X_H + j) * X_W + k]);
#endif
}
}
}
Y_ptr[(yd * Y_H + yh) * Y_W + yw] = val;
}
}
template <typename T>
__global__ void MaxPool3DForwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* X,
T* Y) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / Y_HxW;
const int y = blockIdx.x % Y_HxW;
const int yy = y / Y_W;
const int yw = y % Y_W;
const int yh = yy % Y_H;
const int yd = yy / Y_H;
const int xd = yd * stride_d;
const int xh = yh * stride_h;
const int xw = yw * stride_w;
const int p = max(xd - pad_p, 0);
const int a = min(xd - pad_p + kernel_d, X_D);
const int t = max(xh - pad_t, 0);
const int b = min(xh - pad_t + kernel_h, X_H);
const int l = max(xw - pad_l, 0);
const int r = min(xw - pad_l + kernel_w, X_W);
const T* X_ptr = X + n * X_HxW * C;
T* Y_ptr = Y + n * Y_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T val = std::numeric_limits<T>::lowest();
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
#if __CUDA_ARCH__ >= 350
val = max(val, __ldg(X_ptr + ((i * X_H + j) * X_W + k) * C + c));
#else
val = max(val, X_ptr[((i * X_H + j) * X_W + k) * C + c]);
#endif
}
}
}
Y_ptr[y * C + c] = val;
}
}
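// Backward kernels for max pooling. The gradient of an output is routed to
// the input positions whose value equals the pooled maximum, found by
// comparing X against Y; if several inputs tie for the maximum, each of them
// receives the full gradient of that window. The global variants compare each
// input element against the single pooled value of its slice; the windowed
// variants visit every output window covering the input element, using the
// same window-inversion bounds as the average pooling backward kernels.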
template <typename T>
__global__ void GlobalMaxPoolingBackwardNCHWCUDAKernel(
const int K,
const int HxW,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int nc = blockIdx.x / K;
const int block = blockIdx.x % K;
const int x = threadIdx.x + block * CAFFE_CUDA_NUM_THREADS;
if (x < HxW) {
#if __CUDA_ARCH__ >= 350
dX[nc * HxW + x] =
(__ldg(X + nc * HxW + x) == __ldg(Y + nc)) ? __ldg(dY + nc) : T(0);
#else
dX[nc * HxW + x] = (X[nc * HxW + x] == Y[nc]) ? dY[nc] : T(0);
#endif
}
}
template <typename T>
__global__ void GlobalMaxPoolingBackwardNHWCCUDAKernel(
const int C,
const int HxW,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int n = blockIdx.x / HxW;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
#if __CUDA_ARCH__ >= 350
dX[blockIdx.x * C + c] =
(__ldg(X + blockIdx.x * C + c) == __ldg(Y + n * C + c))
? __ldg(dY + n * C + c)
: T(0);
#else
dX[blockIdx.x * C + c] =
(X[blockIdx.x * C + c] == Y[n * C + c]) ? dY[n * C + c] : T(0);
#endif
}
}
template <typename T>
__global__ void MaxPool1DBackwardNCHWCUDAKernel(
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int nc = blockIdx.x;
const T* dY_ptr = dY + nc * Y_size;
const T* X_ptr = X + nc * X_size;
const T* Y_ptr = Y + nc * Y_size;
T* dX_ptr = dX + nc * X_size;
for (int x = threadIdx.x; x < X_size; x += blockDim.x) {
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x) == __ldg(Y_ptr + i)) {
sum += __ldg(dY_ptr + i);
}
#else
if (X_ptr[x] == Y_ptr[i]) {
sum += dY_ptr[i];
}
#endif
}
dX_ptr[x] = sum;
}
}
template <typename T>
__global__ void MaxPool1DBackwardNHWCCUDAKernel(
const int C,
const int X_size,
const int Y_size,
const int kernel,
const int stride,
const int pad,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int n = blockIdx.x / X_size;
const int x = blockIdx.x % X_size;
const int w = x + pad;
const int l = w < kernel ? 0 : (w - kernel) / stride + 1;
const int r = min(w / stride + 1, Y_size);
const T* dY_ptr = dY + n * Y_size * C;
const T* X_ptr = X + n * X_size * C;
const T* Y_ptr = Y + n * Y_size * C;
T* dX_ptr = dX + n * X_size * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = l; i < r; ++i) {
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x * C + c) == __ldg(Y_ptr + i * C + c)) {
sum += __ldg(dY_ptr + i * C + c);
}
#else
if (X_ptr[x * C + c] == Y_ptr[i * C + c]) {
sum += dY_ptr[i * C + c];
}
#endif
}
dX_ptr[x * C + c] = sum;
}
}
template <typename T>
__global__ void MaxPool2DBackwardNCHWCUDAKernel(
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int nc = blockIdx.x / X_H;
const int xh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
const T* X_ptr = X + nc * X_HxW;
const T* Y_ptr = Y + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int h = xh + pad_t;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int xw = threadIdx.x; xw < X_W; xw += blockDim.x) {
const int w = xw + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const int x = xh * X_W + xw;
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
const int y = i * Y_W + j;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x) == __ldg(Y_ptr + y)) {
sum += __ldg(dY_ptr + y);
}
#else
if (X_ptr[x] == Y_ptr[y]) {
sum += dY_ptr[y];
}
#endif
}
}
dX_ptr[x] = sum;
}
}
template <typename T>
__global__ void MaxPool2DBackwardNHWCCUDAKernel(
const int C,
const int X_H,
const int X_W,
const int Y_H,
const int Y_W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_H * X_W;
const int Y_HxW = Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int h = x / X_W + pad_t;
const int w = x % X_W + pad_l;
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T* dY_ptr = dY + n * Y_HxW * C;
const T* X_ptr = X + n * X_HxW * C;
const T* Y_ptr = Y + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = t; i < b; ++i) {
for (int j = l; j < r; ++j) {
const int y = i * Y_W + j;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x * C + c) == __ldg(Y_ptr + y * C + c)) {
sum += __ldg(dY_ptr + y * C + c);
}
#else
if (X_ptr[x * C + c] == Y_ptr[y * C + c]) {
sum += dY_ptr[y * C + c];
}
#endif
}
}
dX_ptr[x * C + c] = sum;
}
}
template <typename T>
__global__ void MaxPool3DBackwardNCHWCUDAKernel(
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int xx = blockIdx.x / X_H;
const int nc = xx / X_D;
const int xd = xx % X_D;
const int xh = blockIdx.x % X_H;
const T* dY_ptr = dY + nc * Y_HxW;
const T* X_ptr = X + nc * X_HxW;
const T* Y_ptr = Y + nc * Y_HxW;
T* dX_ptr = dX + nc * X_HxW;
const int d = xd + pad_p;
const int h = xh + pad_t;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
for (int xw = threadIdx.x; xw < X_W; xw += blockDim.x) {
const int w = xw + pad_l;
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const int x = (xd * X_H + xh) * X_W + xw;
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
const int y = (i * Y_H + j) * Y_W + k;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x) == __ldg(Y_ptr + y)) {
sum += __ldg(dY_ptr + y);
}
#else
if (X_ptr[x] == Y_ptr[y]) {
sum += dY_ptr[y];
}
#endif
}
}
}
dX_ptr[x] = sum;
}
}
template <typename T>
__global__ void MaxPool3DBackwardNHWCCUDAKernel(
const int C,
const int X_D,
const int X_H,
const int X_W,
const int Y_D,
const int Y_H,
const int Y_W,
const int kernel_d,
const int kernel_h,
const int kernel_w,
const int stride_d,
const int stride_h,
const int stride_w,
const int pad_p,
const int pad_t,
const int pad_l,
const T* dY,
const T* X,
const T* Y,
T* dX) {
const int X_HxW = X_D * X_H * X_W;
const int Y_HxW = Y_D * Y_H * Y_W;
const int n = blockIdx.x / X_HxW;
const int x = blockIdx.x % X_HxW;
const int xx = x / X_W;
const int d = xx / X_H + pad_p;
const int h = xx % X_H + pad_t;
const int w = x % X_W + pad_l;
const int p = d < kernel_d ? 0 : (d - kernel_d) / stride_d + 1;
const int a = min(d / stride_d + 1, Y_D);
const int t = h < kernel_h ? 0 : (h - kernel_h) / stride_h + 1;
const int b = min(h / stride_h + 1, Y_H);
const int l = w < kernel_w ? 0 : (w - kernel_w) / stride_w + 1;
const int r = min(w / stride_w + 1, Y_W);
const T* dY_ptr = dY + n * Y_HxW * C;
const T* X_ptr = X + n * X_HxW * C;
const T* Y_ptr = Y + n * Y_HxW * C;
T* dX_ptr = dX + n * X_HxW * C;
for (int c = threadIdx.x; c < C; c += blockDim.x) {
T sum = 0;
for (int i = p; i < a; ++i) {
for (int j = t; j < b; ++j) {
for (int k = l; k < r; ++k) {
const int y = (i * Y_H + j) * Y_W + k;
#if __CUDA_ARCH__ >= 350
if (__ldg(X_ptr + x * C + c) == __ldg(Y_ptr + y * C + c)) {
sum += __ldg(dY_ptr + y * C + c);
}
#else
if (X_ptr[x * C + c] == Y_ptr[y * C + c]) {
sum += dY_ptr[y * C + c];
}
#endif
}
}
}
dX_ptr[x * C + c] = sum;
}
}
} // namespace
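// Global max pooling forward reduces over the spatial dimensions with
// math::ReduceMax: over the trailing HxW axis of the (N*C, HxW) view for
// NCHW, and over the middle HxW axis of the (N, HxW, C) view for NHWC.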
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CUDAContext>(
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingForward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 3> X_dims = {N, HxW, C};
const std::array<int, 3> Y_dims = {N, 1, C};
math::ReduceMax<float, CUDAContext>(
3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Forward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
MaxPool1DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0], Y_dims[0], kernel[0], stride[0], pads[0], X, Y);
return true;
}
case 2: {
const int num_blocks = N * C * Y_dims[0];
MaxPool2DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
X,
Y);
return true;
}
case 3: {
const int num_blocks = N * C * Y_dims[0] * Y_dims[1];
MaxPool3DForwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Forward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* X,
float* Y,
CUDAContext* context) const {
// Each CUDA block handles one output point; threads within the block stride
// over the channels.
const int ndim = X_dims.size();
const int Y_HxW = std::accumulate(
Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies<int>());
switch (ndim) {
case 1: {
MaxPool1DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C, X_dims[0], Y_dims[0], kernel[0], stride[0], pads[0], X, Y);
return true;
}
case 2: {
MaxPool2DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
X,
Y);
return true;
}
case 3: {
MaxPool3DForwardNHWCCUDAKernel<float>
<<<N * Y_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
X,
Y);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NCHW>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
const int K = (HxW + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
GlobalMaxPoolingBackwardNCHWCUDAKernel<float>
<<<N * C * K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
K, HxW, dY, X, Y, dX);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::
GlobalPoolingBackward<float, StorageOrder::NHWC>(
const int N,
const int C,
const int HxW,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
GlobalMaxPoolingBackwardNHWCCUDAKernel<float>
<<<N * HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C, HxW, dY, X, Y, dX);
return true;
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Backward<float, StorageOrder::NCHW>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
switch (ndim) {
case 1: {
const int num_blocks = N * C;
MaxPool1DBackwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
X,
Y,
dX);
return true;
}
case 2: {
const int num_blocks = N * C * X_dims[0];
MaxPool2DBackwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
X,
Y,
dX);
return true;
}
case 3: {
const int num_blocks = N * C * X_dims[0] * X_dims[1];
MaxPool3DBackwardNCHWCUDAKernel<float>
<<<num_blocks, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
X,
Y,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
template <>
template <>
bool MaxPoolFunctor<CUDAContext>::Backward<float, StorageOrder::NHWC>(
const int N,
const int C,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const std::vector<int>& kernel,
const std::vector<int>& /* dilation */,
const std::vector<int>& stride,
const std::vector<int>& pads,
const float* dY,
const float* X,
const float* Y,
float* dX,
CUDAContext* context) const {
const int ndim = X_dims.size();
const int X_HxW = std::accumulate(
X_dims.cbegin(), X_dims.cend(), 1, std::multiplies<int>());
switch (ndim) {
case 1: {
MaxPool1DBackwardNHWCCUDAKernel<float>
<<<N * X_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
Y_dims[0],
kernel[0],
stride[0],
pads[0],
dY,
X,
Y,
dX);
return true;
}
case 2: {
MaxPool2DBackwardNHWCCUDAKernel<float>
<<<N * X_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
Y_dims[0],
Y_dims[1],
kernel[0],
kernel[1],
stride[0],
stride[1],
pads[0],
pads[1],
dY,
X,
Y,
dX);
return true;
}
case 3: {
MaxPool3DBackwardNHWCCUDAKernel<float>
<<<N * X_HxW, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
C,
X_dims[0],
X_dims[1],
X_dims[2],
Y_dims[0],
Y_dims[1],
Y_dims[2],
kernel[0],
kernel[1],
kernel[2],
stride[0],
stride[1],
stride[2],
pads[0],
pads[1],
pads[2],
dY,
X,
Y,
dX);
return true;
}
default: {
CAFFE_THROW("Unsupported pooling dim: ", ndim);
return false;
}
}
}
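// Register the CUDA implementations of the pooling operators and their
// gradients; the dimension-specific names (AveragePool1D/2D/3D,
// MaxPool1D/2D/3D) map to the same functors as the generic operators.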
REGISTER_CUDA_OPERATOR(
AveragePool,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePoolGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool1D,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool1DGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool2D,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool2DGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool3D,
PoolOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
AveragePool3DGradient,
PoolGradientOp<float, CUDAContext, AveragePoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPoolGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool1D,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool1DGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool2D,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool2DGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool3D,
PoolOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
MaxPool3DGradient,
PoolGradientOp<float, CUDAContext, MaxPoolFunctor<CUDAContext>>);
} // namespace caffe2