// blob: a5bc914980f2ebc1a51656908706fb808494f8fb [file] [log] [blame]
#include "caffe2/operators/pool_op.h"

#include <array>
#include <limits>
#include <string>
#include <type_traits>

#include "caffe2/operators/pool_op_util.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
// 1-D average pooling kernel: reduces the input window [l, r) into output
// index y, multiplying the window sum by the caller-precomputed `scale`.
// Only the float NCHW/NHWC specializations below are defined.
template <typename T, StorageOrder kOrder>
void ComputeAveragePool1D(
    int l,
    int r,
    int y,
    T scale,
    const ConstEigenArrayMap<T>& X_arr,
    EigenArrayMap<T>* Y_arr);
// NCHW 1-D average pooling: X_arr is a single (size, 1) column; average the
// [l, r) slice into output element y.
template <>
void ComputeAveragePool1D<float, StorageOrder::NCHW>(
    const int l,
    const int r,
    const int y,
    const float scale,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  const float window_sum = X_arr.col(0).segment(l, r - l).sum();
  (*Y_arr)(y) = window_sum * scale;
}
// NHWC 1-D average pooling: each column of X_arr holds all C channels of one
// spatial position, so accumulate the window column-wise and scale once.
template <>
void ComputeAveragePool1D<float, StorageOrder::NHWC>(
    const int l,
    const int r,
    const int y,
    const float scale,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  Y_arr->col(y) = X_arr.col(l);
  int pos = l;
  while (++pos < r) {
    Y_arr->col(y) += X_arr.col(pos);
  }
  Y_arr->col(y) *= scale;
}
// 2-D average pooling kernel: reduces the window rows [t, b) x cols [l, r)
// into output index y. W is the input width; it is only needed by the NHWC
// variant to linearize (h, w) coordinates into column indices.
template <typename T, StorageOrder kOrder>
void ComputeAveragePool2D(
    int W,
    int t,
    int b,
    int l,
    int r,
    int y,
    T scale,
    const ConstEigenArrayMap<T>& X_arr,
    EigenArrayMap<T>* Y_arr);
// NCHW 2-D average pooling: X_arr is mapped (W, H), so rows index width and
// columns index height; one block-sum covers the whole window.
template <>
void ComputeAveragePool2D<float, StorageOrder::NCHW>(
    const int /* W */,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const float scale,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  const float window_sum = X_arr.block(l, t, r - l, b - t).sum();
  (*Y_arr)(y) = window_sum * scale;
}
// NHWC 2-D average pooling: column (h * W + w) of X_arr holds the C channels
// at spatial position (h, w); accumulate the window, then scale once.
template <>
void ComputeAveragePool2D<float, StorageOrder::NHWC>(
    const int W,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const float scale,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  Y_arr->col(y).setZero();
  for (int h = t; h < b; ++h) {
    const int row_offset = h * W;
    for (int w = l; w < r; ++w) {
      Y_arr->col(y) += X_arr.col(row_offset + w);
    }
  }
  Y_arr->col(y) *= scale;
}
// 3-D average pooling kernel: reduces the window depth [p, a) x rows [t, b)
// x cols [l, r) into output index y. H and W are the input height and width,
// used to linearize (d, h, w) coordinates for the chosen layout.
template <typename T, StorageOrder kOrder>
void ComputeAveragePool3D(
    int H,
    int W,
    int p,
    int a,
    int t,
    int b,
    int l,
    int r,
    int y,
    T scale,
    const ConstEigenArrayMap<T>& X_arr,
    EigenArrayMap<T>* Y_arr);
// NCHW 3-D average pooling: X_arr is mapped (W, D * H); depth slice d
// occupies columns [d * H, (d + 1) * H), so sum one 2-D block per slice.
template <>
void ComputeAveragePool3D<float, StorageOrder::NCHW>(
    const int H,
    const int /* W */,
    const int p,
    const int a,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const float scale,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  float window_sum = 0.0f;
  for (int d = p; d < a; ++d) {
    window_sum += X_arr.block(l, d * H + t, r - l, b - t).sum();
  }
  (*Y_arr)(y) = window_sum * scale;
}
// NHWC 3-D average pooling: column (d * H * W + h * W + w) of X_arr holds the
// C channels at (d, h, w); accumulate the window, then scale once.
template <>
void ComputeAveragePool3D<float, StorageOrder::NHWC>(
    const int H,
    const int W,
    const int p,
    const int a,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const float scale,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  Y_arr->col(y).setZero();
  for (int d = p; d < a; ++d) {
    const int depth_offset = d * H * W;
    for (int h = t; h < b; ++h) {
      const int row_offset = depth_offset + h * W;
      for (int w = l; w < r; ++w) {
        Y_arr->col(y) += X_arr.col(row_offset + w);
      }
    }
  }
  Y_arr->col(y) *= scale;
}
// Runs 1-D average pooling over a full tensor. In NCHW each (n, c) row is
// pooled independently; in NHWC all C channels of a sample are pooled
// together, so the outer loop runs only N times.
template <typename T, StorageOrder kOrder>
void RunAveragePool1D(
    const int N,
    const int C,
    const int X_size,
    const int Y_size,
    const int kernel,
    const int stride,
    const int pad,
    const bool count_include_pad,
    const T* X,
    T* Y) {
  const bool is_nchw = kOrder == StorageOrder::NCHW;
  const int batch_size = is_nchw ? N * C : N;
  const int X_stride = is_nchw ? X_size : X_size * C;
  const int Y_stride = is_nchw ? Y_size : Y_size * C;
  const T* X_ptr = X;
  T* Y_ptr = Y;
  for (int i = 0; i < batch_size; ++i) {
    ConstEigenArrayMap<T> X_arr = is_nchw
        ? ConstEigenArrayMap<T>(X_ptr, X_size, 1)
        : ConstEigenArrayMap<T>(X_ptr, C, X_size);
    EigenArrayMap<T> Y_arr = is_nchw ? EigenArrayMap<T>(Y_ptr, Y_size, 1)
                                     : EigenArrayMap<T>(Y_ptr, C, Y_size);
    for (int y = 0; y < Y_size; ++y) {
      // Clip the pooling window [l, r) to the valid input range.
      const int window_start = y * stride - pad;
      const int l = std::max(window_start, 0);
      const int r = std::min(window_start + kernel, X_size);
      // count_include_pad divides by the full kernel size even when part of
      // the window fell into the padding; otherwise only in-bounds elements
      // count.
      const T scale = T(1) / static_cast<T>(count_include_pad ? kernel : r - l);
      ComputeAveragePool1D<T, kOrder>(l, r, y, scale, X_arr, &Y_arr);
    }
    X_ptr += X_stride;
    Y_ptr += Y_stride;
  }
}
// Runs 2-D average pooling over a full tensor. In NCHW each (n, c) plane is
// pooled independently (batch loop runs N * C times); in NHWC all C channels
// of a sample are pooled together (batch loop runs N times).
template <typename T, StorageOrder kOrder>
void RunAveragePool2D(
    const int N,
    const int C,
    const int X_H,
    const int X_W,
    const int Y_H,
    const int Y_W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const bool count_include_pad,
    const T* X,
    T* Y) {
  const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N;
  const int X_HxW = X_H * X_W;
  const int Y_HxW = Y_H * Y_W;
  const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C;
  const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C;
  const T* X_ptr = X;
  T* Y_ptr = Y;
  for (int i = 0; i < batch_size; ++i) {
    // NCHW maps the plane as (W, H); NHWC maps it as (C, H * W).
    ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW
        ? ConstEigenArrayMap<T>(X_ptr, X_W, X_H)
        : ConstEigenArrayMap<T>(X_ptr, C, X_HxW);
    EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW
        ? EigenArrayMap<T>(Y_ptr, Y_W, Y_H)
        : EigenArrayMap<T>(Y_ptr, C, Y_HxW);
    for (int h = 0; h < Y_H; ++h) {
      // Vertical window [t, b), clipped to the input height.
      const int t = std::max(h * stride_h - pad_t, 0);
      const int b = std::min(h * stride_h - pad_t + kernel_h, X_H);
      for (int w = 0; w < Y_W; ++w) {
        // Horizontal window [l, r), clipped to the input width.
        const int l = std::max(w * stride_w - pad_l, 0);
        const int r = std::min(w * stride_w - pad_l + kernel_w, X_W);
        const int y = h * Y_W + w;
        // count_include_pad divides by the full kernel area; otherwise only
        // the in-bounds part of the window counts.
        const T scale = T(1) /
            static_cast<T>(count_include_pad ? kernel_h * kernel_w
                                             : (b - t) * (r - l));
        ComputeAveragePool2D<T, kOrder>(
            X_W, t, b, l, r, y, scale, X_arr, &Y_arr);
      }
    }
    X_ptr += X_stride;
    Y_ptr += Y_stride;
  }
}
// Runs 3-D average pooling over a full tensor. In NCHW each (n, c) volume is
// pooled independently; in NHWC all C channels of a sample are pooled
// together.
template <typename T, StorageOrder kOrder>
void RunAveragePool3D(
    const int N,
    const int C,
    const int X_D,
    const int X_H,
    const int X_W,
    const int Y_D,
    const int Y_H,
    const int Y_W,
    const int kernel_d,
    const int kernel_h,
    const int kernel_w,
    const int stride_d,
    const int stride_h,
    const int stride_w,
    const int pad_p,
    const int pad_t,
    const int pad_l,
    const bool count_include_pad,
    const T* X,
    T* Y) {
  const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N;
  const int X_HxW = X_D * X_H * X_W;
  const int Y_HxW = Y_D * Y_H * Y_W;
  const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C;
  const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C;
  const T* X_ptr = X;
  T* Y_ptr = Y;
  for (int i = 0; i < batch_size; ++i) {
    // NCHW maps the volume as (W, D * H); NHWC maps it as (C, D * H * W).
    ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW
        ? ConstEigenArrayMap<T>(X_ptr, X_W, X_D * X_H)
        : ConstEigenArrayMap<T>(X_ptr, C, X_HxW);
    EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW
        ? EigenArrayMap<T>(Y_ptr, Y_W, Y_D * Y_H)
        : EigenArrayMap<T>(Y_ptr, C, Y_HxW);
    for (int d = 0; d < Y_D; ++d) {
      // Depth window [p, a), clipped to the input depth.
      const int p = std::max(d * stride_d - pad_p, 0);
      const int a = std::min(d * stride_d - pad_p + kernel_d, X_D);
      for (int h = 0; h < Y_H; ++h) {
        // Vertical window [t, b), clipped to the input height.
        const int t = std::max(h * stride_h - pad_t, 0);
        const int b = std::min(h * stride_h - pad_t + kernel_h, X_H);
        for (int w = 0; w < Y_W; ++w) {
          // Horizontal window [l, r), clipped to the input width.
          const int l = std::max(w * stride_w - pad_l, 0);
          const int r = std::min(w * stride_w - pad_l + kernel_w, X_W);
          const int y = d * Y_H * Y_W + h * Y_W + w;
          // count_include_pad divides by the full kernel volume; otherwise
          // only the in-bounds part of the window counts.
          const T scale = T(1) /
              static_cast<T>(count_include_pad ? kernel_d * kernel_h * kernel_w
                                               : (a - p) * (b - t) * (r - l));
          ComputeAveragePool3D<T, kOrder>(
              X_H, X_W, p, a, t, b, l, r, y, scale, X_arr, &Y_arr);
        }
      }
    }
    X_ptr += X_stride;
    Y_ptr += Y_stride;
  }
}
// 1-D max pooling kernel: writes the maximum over input window [l, r) to
// output index y. Only the float NCHW/NHWC specializations below are defined.
template <typename T, StorageOrder kOrder>
void ComputeMaxPool1D(
    int l,
    int r,
    int y,
    const ConstEigenArrayMap<T>& X_arr,
    EigenArrayMap<T>* Y_arr);
// NCHW 1-D max pooling: X_arr is a single (size, 1) column; take the max of
// the [l, r) slice.
template <>
void ComputeMaxPool1D<float, StorageOrder::NCHW>(
    const int l,
    const int r,
    const int y,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  const float window_max = X_arr.col(0).segment(l, r - l).maxCoeff();
  (*Y_arr)(y) = window_max;
}
// NHWC 1-D max pooling: each column of X_arr holds all C channels of one
// spatial position; fold the window into a per-channel running maximum.
template <>
void ComputeMaxPool1D<float, StorageOrder::NHWC>(
    const int l,
    const int r,
    const int y,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  Y_arr->col(y) = X_arr.col(l);
  int pos = l;
  while (++pos < r) {
    Y_arr->col(y) = Y_arr->col(y).max(X_arr.col(pos));
  }
}
// 2-D max pooling kernel: writes the maximum over window rows [t, b) x cols
// [l, r) to output index y. W is the input width, used only by the NHWC
// variant to linearize (h, w) coordinates.
template <typename T, StorageOrder kOrder>
void ComputeMaxPool2D(
    int W,
    int t,
    int b,
    int l,
    int r,
    int y,
    const ConstEigenArrayMap<T>& X_arr,
    EigenArrayMap<T>* Y_arr);
// NCHW 2-D max pooling: X_arr is mapped (W, H); a single block-max covers the
// whole window.
template <>
void ComputeMaxPool2D<float, StorageOrder::NCHW>(
    const int /* W */,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  const float window_max = X_arr.block(l, t, r - l, b - t).maxCoeff();
  (*Y_arr)(y) = window_max;
}
// NHWC 2-D max pooling: column (h * W + w) of X_arr holds the C channels at
// (h, w); fold the window into a per-channel running maximum.
template <>
void ComputeMaxPool2D<float, StorageOrder::NHWC>(
    const int W,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  Y_arr->col(y).setConstant(std::numeric_limits<float>::lowest());
  for (int h = t; h < b; ++h) {
    const int row_offset = h * W;
    for (int w = l; w < r; ++w) {
      Y_arr->col(y) = Y_arr->col(y).max(X_arr.col(row_offset + w));
    }
  }
}
// 3-D max pooling kernel: writes the maximum over window depth [p, a) x rows
// [t, b) x cols [l, r) to output index y. H and W are the input height and
// width, used to linearize (d, h, w) coordinates for the chosen layout.
template <typename T, StorageOrder kOrder>
void ComputeMaxPool3D(
    int H,
    int W,
    int p,
    int a,
    int t,
    int b,
    int l,
    int r,
    int y,
    const ConstEigenArrayMap<T>& X_arr,
    EigenArrayMap<T>* Y_arr);
// NCHW 3-D max pooling: X_arr is mapped (W, D * H); take the block-max of one
// 2-D slice per depth index and keep the largest.
template <>
void ComputeMaxPool3D<float, StorageOrder::NCHW>(
    const int H,
    const int /* W */,
    const int p,
    const int a,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  float window_max = std::numeric_limits<float>::lowest();
  for (int d = p; d < a; ++d) {
    window_max =
        std::max(window_max, X_arr.block(l, d * H + t, r - l, b - t).maxCoeff());
  }
  (*Y_arr)(y) = window_max;
}
// NHWC 3-D max pooling: column (d * H * W + h * W + w) of X_arr holds the C
// channels at (d, h, w); fold the window into a per-channel running maximum.
template <>
void ComputeMaxPool3D<float, StorageOrder::NHWC>(
    const int H,
    const int W,
    const int p,
    const int a,
    const int t,
    const int b,
    const int l,
    const int r,
    const int y,
    const ConstEigenArrayMap<float>& X_arr,
    EigenArrayMap<float>* Y_arr) {
  Y_arr->col(y).setConstant(std::numeric_limits<float>::lowest());
  for (int d = p; d < a; ++d) {
    const int depth_offset = d * H * W;
    for (int h = t; h < b; ++h) {
      const int row_offset = depth_offset + h * W;
      for (int w = l; w < r; ++w) {
        Y_arr->col(y) = Y_arr->col(y).max(X_arr.col(row_offset + w));
      }
    }
  }
}
// Runs 1-D max pooling over a full tensor. In NCHW each (n, c) row is pooled
// independently; in NHWC all C channels of a sample are pooled together, so
// the outer loop runs only N times.
template <typename T, StorageOrder kOrder>
void RunMaxPool1D(
    const int N,
    const int C,
    const int X_size,
    const int Y_size,
    const int kernel,
    const int stride,
    const int pad,
    const T* X,
    T* Y) {
  const bool is_nchw = kOrder == StorageOrder::NCHW;
  const int batch_size = is_nchw ? N * C : N;
  const int X_stride = is_nchw ? X_size : X_size * C;
  const int Y_stride = is_nchw ? Y_size : Y_size * C;
  const T* X_ptr = X;
  T* Y_ptr = Y;
  for (int i = 0; i < batch_size; ++i) {
    ConstEigenArrayMap<T> X_arr = is_nchw
        ? ConstEigenArrayMap<T>(X_ptr, X_size, 1)
        : ConstEigenArrayMap<T>(X_ptr, C, X_size);
    EigenArrayMap<T> Y_arr = is_nchw ? EigenArrayMap<T>(Y_ptr, Y_size, 1)
                                     : EigenArrayMap<T>(Y_ptr, C, Y_size);
    for (int y = 0; y < Y_size; ++y) {
      // Clip the pooling window [l, r) to the valid input range.
      const int window_start = y * stride - pad;
      const int l = std::max(window_start, 0);
      const int r = std::min(window_start + kernel, X_size);
      ComputeMaxPool1D<T, kOrder>(l, r, y, X_arr, &Y_arr);
    }
    X_ptr += X_stride;
    Y_ptr += Y_stride;
  }
}
// Runs 2-D max pooling over a full tensor. In NCHW each (n, c) plane is
// pooled independently; in NHWC all C channels of a sample are pooled
// together.
template <typename T, StorageOrder kOrder>
void RunMaxPool2D(
    const int N,
    const int C,
    const int X_H,
    const int X_W,
    const int Y_H,
    const int Y_W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const T* X,
    T* Y) {
  const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N;
  const int X_HxW = X_H * X_W;
  const int Y_HxW = Y_H * Y_W;
  const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C;
  const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C;
  const T* X_ptr = X;
  T* Y_ptr = Y;
  for (int i = 0; i < batch_size; ++i) {
    // NCHW maps the plane as (W, H); NHWC maps it as (C, H * W).
    ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW
        ? ConstEigenArrayMap<T>(X_ptr, X_W, X_H)
        : ConstEigenArrayMap<T>(X_ptr, C, X_HxW);
    EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW
        ? EigenArrayMap<T>(Y_ptr, Y_W, Y_H)
        : EigenArrayMap<T>(Y_ptr, C, Y_HxW);
    for (int h = 0; h < Y_H; ++h) {
      // Vertical window [t, b), clipped to the input height.
      const int t = std::max(h * stride_h - pad_t, 0);
      const int b = std::min(h * stride_h - pad_t + kernel_h, X_H);
      for (int w = 0; w < Y_W; ++w) {
        // Horizontal window [l, r), clipped to the input width.
        const int l = std::max(w * stride_w - pad_l, 0);
        const int r = std::min(w * stride_w - pad_l + kernel_w, X_W);
        const int y = h * Y_W + w;
        ComputeMaxPool2D<T, kOrder>(X_W, t, b, l, r, y, X_arr, &Y_arr);
      }
    }
    X_ptr += X_stride;
    Y_ptr += Y_stride;
  }
}
// Runs 3-D max pooling over a full tensor. In NCHW each (n, c) volume is
// pooled independently; in NHWC all C channels of a sample are pooled
// together.
template <typename T, StorageOrder kOrder>
void RunMaxPool3D(
    const int N,
    const int C,
    const int X_D,
    const int X_H,
    const int X_W,
    const int Y_D,
    const int Y_H,
    const int Y_W,
    const int kernel_d,
    const int kernel_h,
    const int kernel_w,
    const int stride_d,
    const int stride_h,
    const int stride_w,
    const int pad_p,
    const int pad_t,
    const int pad_l,
    const T* X,
    T* Y) {
  const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N;
  const int X_HxW = X_D * X_H * X_W;
  const int Y_HxW = Y_D * Y_H * Y_W;
  const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C;
  const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C;
  const T* X_ptr = X;
  T* Y_ptr = Y;
  for (int i = 0; i < batch_size; ++i) {
    // NCHW maps the volume as (W, D * H); NHWC maps it as (C, D * H * W).
    ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW
        ? ConstEigenArrayMap<T>(X_ptr, X_W, X_D * X_H)
        : ConstEigenArrayMap<T>(X_ptr, C, X_HxW);
    EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW
        ? EigenArrayMap<T>(Y_ptr, Y_W, Y_D * Y_H)
        : EigenArrayMap<T>(Y_ptr, C, Y_HxW);
    for (int d = 0; d < Y_D; ++d) {
      // Depth window [p, a), clipped to the input depth.
      const int p = std::max(d * stride_d - pad_p, 0);
      const int a = std::min(d * stride_d - pad_p + kernel_d, X_D);
      for (int h = 0; h < Y_H; ++h) {
        // Vertical window [t, b), clipped to the input height.
        const int t = std::max(h * stride_h - pad_t, 0);
        const int b = std::min(h * stride_h - pad_t + kernel_h, X_H);
        for (int w = 0; w < Y_W; ++w) {
          // Horizontal window [l, r), clipped to the input width.
          const int l = std::max(w * stride_w - pad_l, 0);
          const int r = std::min(w * stride_w - pad_l + kernel_w, X_W);
          const int y = d * Y_H * Y_W + h * Y_W + w;
          ComputeMaxPool3D<T, kOrder>(
              X_H, X_W, p, a, t, b, l, r, y, X_arr, &Y_arr);
        }
      }
    }
    X_ptr += X_stride;
    Y_ptr += Y_stride;
  }
}
} // namespace
// Global average pooling, NCHW layout: each of the N * C planes reduces to
// the mean of its HxW elements via a single row-wise ReduceMean on an
// (N * C, HxW) view of X.
template <>
template <>
bool AveragePoolFunctor<CPUContext>::
    GlobalPoolingForward<float, StorageOrder::NCHW>(
        const int N,
        const int C,
        const int HxW,
        const float* X,
        float* Y,
        CPUContext* context) const {
  const std::array<int, 2> X_dims = {N * C, HxW};
  const std::array<int, 2> Y_dims = {N * C, 1};
  math::ReduceMean<float, CPUContext>(
      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
  return true;
}
// Global average pooling, NHWC layout: X is viewed as (N, HxW, C) and the
// spatial axis is reduced in a single math::ReduceMean call, mirroring the
// one-reduction NCHW specialization above instead of issuing HxW separate
// math::Add calls followed by a Scale. The result is identical: each output
// channel is the mean of its HxW values.
template <>
template <>
bool AveragePoolFunctor<CPUContext>::
    GlobalPoolingForward<float, StorageOrder::NHWC>(
        const int N,
        const int C,
        const int HxW,
        const float* X,
        float* Y,
        CPUContext* context) const {
  const std::array<int, 3> X_dims = {N, HxW, C};
  const std::array<int, 3> Y_dims = {N, 1, C};
  math::ReduceMean<float, CPUContext>(
      3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
  return true;
}
// Implements AveragePoolFunctor<CPUContext>::Forward for (T, kOrder) by
// dispatching on the spatial rank of X_dims to the RunAveragePool{1,2,3}D
// helpers above. The 2-D float/NCHW case first tries the NEON
// "4x4p0s0" fast path when pool_op_util deems the shape eligible.
// NOTE: comments must stay outside the macro body — a '//' comment on a
// continued line would swallow its trailing backslash.
#define CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD(T, kOrder)        \
  template <>                                                             \
  template <>                                                             \
  bool AveragePoolFunctor<CPUContext>::Forward<T, kOrder>(                \
      const int N,                                                        \
      const int C,                                                        \
      const std::vector<int>& X_dims,                                     \
      const std::vector<int>& Y_dims,                                     \
      const std::vector<int>& kernel,                                     \
      const std::vector<int>& dilation,                                   \
      const std::vector<int>& stride,                                     \
      const std::vector<int>& pads,                                       \
      const T* X,                                                         \
      T* Y,                                                               \
      CPUContext* /* context */) const {                                  \
    const int ndim = X_dims.size();                                       \
    switch (ndim) {                                                       \
      case 1: {                                                           \
        RunAveragePool1D<T, kOrder>(                                      \
            N,                                                            \
            C,                                                            \
            X_dims[0],                                                    \
            Y_dims[0],                                                    \
            kernel[0],                                                    \
            stride[0],                                                    \
            pads[0],                                                      \
            count_include_pad,                                            \
            X,                                                            \
            Y);                                                           \
        return true;                                                      \
      }                                                                   \
      case 2: {                                                           \
        if (std::is_same<T, float>::value && kOrder == StorageOrder::NCHW && \
            pool_op_util::IsNeon4x4p0s0Eligible(                          \
                X_dims[0],                                                \
                X_dims[1],                                                \
                Y_dims[0],                                                \
                Y_dims[1],                                                \
                kernel[0],                                                \
                kernel[1],                                                \
                stride[0],                                                \
                stride[1],                                                \
                pads[0],                                                  \
                pads[1],                                                  \
                pads[2],                                                  \
                pads[3],                                                  \
                dilation[0],                                              \
                dilation[1],                                              \
                X,                                                        \
                Y)) {                                                     \
          pool_op_util::RunNeonAveragePool4x4p0s0NCHW(                    \
              N, C, X_dims[0], X_dims[1], X, Y);                          \
        } else {                                                          \
          RunAveragePool2D<T, kOrder>(                                    \
              N,                                                          \
              C,                                                          \
              X_dims[0],                                                  \
              X_dims[1],                                                  \
              Y_dims[0],                                                  \
              Y_dims[1],                                                  \
              kernel[0],                                                  \
              kernel[1],                                                  \
              stride[0],                                                  \
              stride[1],                                                  \
              pads[0],                                                    \
              pads[1],                                                    \
              count_include_pad,                                          \
              X,                                                          \
              Y);                                                         \
        }                                                                 \
        return true;                                                      \
      }                                                                   \
      case 3: {                                                           \
        RunAveragePool3D<T, kOrder>(                                      \
            N,                                                            \
            C,                                                            \
            X_dims[0],                                                    \
            X_dims[1],                                                    \
            X_dims[2],                                                    \
            Y_dims[0],                                                    \
            Y_dims[1],                                                    \
            Y_dims[2],                                                    \
            kernel[0],                                                    \
            kernel[1],                                                    \
            kernel[2],                                                    \
            stride[0],                                                    \
            stride[1],                                                    \
            stride[2],                                                    \
            pads[0],                                                      \
            pads[1],                                                      \
            pads[2],                                                      \
            count_include_pad,                                            \
            X,                                                            \
            Y);                                                           \
        return true;                                                      \
      }                                                                   \
      default: {                                                          \
        CAFFE_THROW("Unsupported pooling dim: ", ndim);                   \
        return false;                                                     \
      }                                                                   \
    }                                                                     \
  }
// Instantiate the Forward specializations for the supported layouts.
CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD(float, StorageOrder::NCHW)
CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD(float, StorageOrder::NHWC)
#undef CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD
// Global max pooling, NCHW layout: each of the N * C planes reduces to the
// maximum of its HxW elements via a single row-wise ReduceMax on an
// (N * C, HxW) view of X.
template <>
template <>
bool MaxPoolFunctor<CPUContext>::
    GlobalPoolingForward<float, StorageOrder::NCHW>(
        const int N,
        const int C,
        const int HxW,
        const float* X,
        float* Y,
        CPUContext* context) const {
  const std::array<int, 2> X_dims = {N * C, HxW};
  const std::array<int, 2> Y_dims = {N * C, 1};
  math::ReduceMax<float, CPUContext>(
      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
  return true;
}
// Global max pooling, NHWC layout: for each sample, fold the HxW columns of
// its (C, HxW) view into a per-channel running maximum, starting from the
// lowest representable float.
template <>
template <>
bool MaxPoolFunctor<CPUContext>::
    GlobalPoolingForward<float, StorageOrder::NHWC>(
        const int N,
        const int C,
        const int HxW,
        const float* X,
        float* Y,
        CPUContext* context) const {
  math::Set<float, CPUContext>(
      N * C, std::numeric_limits<float>::lowest(), Y, context);
  const float* X_ptr = X;
  float* Y_ptr = Y;
  for (int n = 0; n < N; ++n) {
    ConstEigenArrayMap<float> X_arr(X_ptr, C, HxW);
    EigenVectorArrayMap<float> Y_arr(Y_ptr, C);
    for (int s = 0; s < HxW; ++s) {
      Y_arr = Y_arr.max(X_arr.col(s));
    }
    X_ptr += HxW * C;
    Y_ptr += C;
  }
  return true;
}
// Implements MaxPoolFunctor<CPUContext>::Forward for (T, kOrder) by
// dispatching on the spatial rank of X_dims to the RunMaxPool{1,2,3}D
// helpers above. The 2-D float/NCHW case first tries the NEON
// "2x2p0s0" fast path when pool_op_util deems the shape eligible.
// NOTE: comments must stay outside the macro body — a '//' comment on a
// continued line would swallow its trailing backslash.
#define CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD(T, kOrder)            \
  template <>                                                             \
  template <>                                                             \
  bool MaxPoolFunctor<CPUContext>::Forward<T, kOrder>(                    \
      const int N,                                                        \
      const int C,                                                        \
      const std::vector<int>& X_dims,                                     \
      const std::vector<int>& Y_dims,                                     \
      const std::vector<int>& kernel,                                     \
      const std::vector<int>& dilation,                                   \
      const std::vector<int>& stride,                                     \
      const std::vector<int>& pads,                                       \
      const T* X,                                                         \
      T* Y,                                                               \
      CPUContext* /* context */) const {                                  \
    const int ndim = X_dims.size();                                       \
    switch (ndim) {                                                       \
      case 1: {                                                           \
        RunMaxPool1D<T, kOrder>(                                          \
            N, C, X_dims[0], Y_dims[0], kernel[0], stride[0], pads[0], X, Y); \
        return true;                                                      \
      }                                                                   \
      case 2: {                                                           \
        if (std::is_same<T, float>::value && kOrder == StorageOrder::NCHW && \
            pool_op_util::IsNeon2x2p0s0Eligible(                          \
                X_dims[0],                                                \
                X_dims[1],                                                \
                Y_dims[0],                                                \
                Y_dims[1],                                                \
                kernel[0],                                                \
                kernel[1],                                                \
                stride[0],                                                \
                stride[1],                                                \
                pads[0],                                                  \
                pads[1],                                                  \
                pads[2],                                                  \
                pads[3],                                                  \
                dilation[0],                                              \
                dilation[1],                                              \
                X,                                                        \
                Y)) {                                                     \
          pool_op_util::RunNeonMaxPool2x2p0s0NCHW(                        \
              N, C, X_dims[0], X_dims[1], X, Y);                          \
        } else {                                                          \
          RunMaxPool2D<T, kOrder>(                                        \
              N,                                                          \
              C,                                                          \
              X_dims[0],                                                  \
              X_dims[1],                                                  \
              Y_dims[0],                                                  \
              Y_dims[1],                                                  \
              kernel[0],                                                  \
              kernel[1],                                                  \
              stride[0],                                                  \
              stride[1],                                                  \
              pads[0],                                                    \
              pads[1],                                                    \
              X,                                                          \
              Y);                                                         \
        }                                                                 \
        return true;                                                      \
      }                                                                   \
      case 3: {                                                           \
        RunMaxPool3D<T, kOrder>(                                          \
            N,                                                            \
            C,                                                            \
            X_dims[0],                                                    \
            X_dims[1],                                                    \
            X_dims[2],                                                    \
            Y_dims[0],                                                    \
            Y_dims[1],                                                    \
            Y_dims[2],                                                    \
            kernel[0],                                                    \
            kernel[1],                                                    \
            kernel[2],                                                    \
            stride[0],                                                    \
            stride[1],                                                    \
            stride[2],                                                    \
            pads[0],                                                      \
            pads[1],                                                      \
            pads[2],                                                      \
            X,                                                            \
            Y);                                                           \
        return true;                                                      \
      }                                                                   \
      default: {                                                          \
        CAFFE_THROW("Unsupported pooling dim: ", ndim);                   \
        return false;                                                     \
      }                                                                   \
    }                                                                     \
  }
// Instantiate the Forward specializations for the supported layouts.
CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD(float, StorageOrder::NCHW)
CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD(float, StorageOrder::NHWC)
#undef CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD
// Operator-schema documentation shared by AveragePool{,1D,2D,3D}.
// Fixes the doubled word "the the" in the first sentence.
constexpr char kAveragePoolDoc[] = R"DOC(
consumes an input blob and applies average pooling across the blob according
to kernel sizes, stride sizes, pad lengths and dilation. Average pooling consists
of taking the average value of a subset of the input tensor according to the kernel
size and downsampling the data into the output blob for further processing. The
`brew` module has a wrapper for this operator for use in a `ModelHelper` object.
Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:
$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"AveragePool",
["X"],
["Y"],
kernel=2,
stride=2,
)
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[[-0.2883434 0.43498734 0.05417408 1.912558 0.09390241
-0.33173105]
[ 1.633709 1.2047161 0.36964908 0.99961185 0.4184147
0.9989975 ]
[ 1.7644193 0.1789665 1.5812988 -0.6038542 -0.36090398
0.33195344]
[ 0.9457722 -0.95174325 -0.78124577 1.2062047 1.1903144
0.2586746 ]
[ 1.252104 0.32645547 1.8073524 -0.78397465 0.9978303
-0.97614396]
[ 0.5440196 1.5778259 -0.76750124 0.5051756 0.8838398
-0.37085298]]]]
Y:
[[[[0.7462672 0.83399826 0.2948959 ]
[0.4843537 0.3506009 0.35500962]
[0.9251013 0.19026303 0.13366827]]]]
```
</details>
)DOC";
// Operator-schema documentation shared by MaxPool{,1D,2D,3D}.
// Fixes the doubled word "the the" in the first sentence.
constexpr char kMaxPoolDoc[] = R"DOC(
consumes an input blob and applies max pooling across the blob according to
kernel sizes, stride sizes, pad lengths and dilation. Max pooling consists of
taking the maximum value of a subset of the input tensor according to the kernel
size and downsampling the data into the output blob for further processing. The
`brew` module has a wrapper for this operator for use in a `ModelHelper` object.
Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:
$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"MaxPool",
["X"],
["Y"],
kernel=2,
stride=2,
)
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[[-2.8534958e-01 -1.7719941e+00 -8.2277227e-04 1.1088650e+00
-2.1476576e+00 -3.5070452e-01]
[-9.0058845e-01 -3.0070004e-01 -1.7907504e+00 -7.1746534e-01
1.2798511e+00 -3.2214901e-01]
[ 1.5806322e+00 1.6845188e+00 -2.6633200e-01 -3.8576153e-01
-9.6424848e-02 -3.9696163e-01]
[ 1.2572408e-01 6.3612902e-01 -3.9554062e-01 -6.9735396e-01
-9.1898698e-01 -1.9609968e-01]
[-1.1587460e+00 2.4605224e+00 -1.5497679e+00 1.3020347e-01
-8.1293899e-01 -7.8803545e-01]
[ 1.4323474e+00 1.3618395e+00 9.8975077e-02 -1.1307785e-01
7.2035044e-01 2.7642491e-01]]]]
Y:
[[[[-0.28534958 1.108865 1.2798511 ]
[ 1.6845188 -0.266332 -0.09642485]
[ 2.4605224 0.13020347 0.72035044]]]]
```
</details>
)DOC";
// Returns a callback that fills an AveragePool{dim} operator schema with its
// doc text and input/output descriptions. `dim` ("", "1D", "2D", "3D") is
// spliced into the doc template; the lambda captures the pointer by value.
std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
  return [dim](OpSchema& schema) {
    std::string doc = "AveragePool{dim} {pool_doc}";
    c10::ReplaceAll(doc, "{dim}", dim);
    c10::ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC.");
    schema.Output(0, "Y", "*(type: Tensor`<float>`)* Output data tensor.");
    // Per-argument docs (kernel, stride, pad, dilation, order,
    // count_include_pad) are intentionally not declared here.
  };
}
// Returns a callback that fills a MaxPool{dim} operator schema with its doc
// text and input/output descriptions. `dim` ("", "1D", "2D", "3D") is spliced
// into the doc template; the lambda captures the pointer by value.
std::function<void(OpSchema&)> MaxPoolDocGenerator(const char* dim) {
  return [dim](OpSchema& schema) {
    std::string doc = "MaxPool{dim} {pool_doc}";
    c10::ReplaceAll(doc, "{dim}", dim);
    c10::ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC.");
    schema.Output(0, "Y", "*(type: Tensor`<float>`)* Output data tensor.");
    // Per-argument docs (kernel, stride, pad, dilation, order) are
    // intentionally not declared here.
  };
}
// CPU registrations and schemas for the AveragePool operator family. The
// generic "AveragePool" and the explicit-rank 1D/2D/3D variants all
// instantiate the same PoolOp<float, CPUContext, AveragePoolFunctor>; they
// differ only in doc text and the inherited ONNX schema name.
REGISTER_CPU_OPERATOR(
    AveragePool,
    PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(AveragePool)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator(""))
    .InheritOnnxSchema();
REGISTER_CPU_OPERATOR(
    AveragePool1D,
    PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(AveragePool1D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("1D"))
    .InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(
    AveragePool2D,
    PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(AveragePool2D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("2D"))
    .InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(
    AveragePool3D,
    PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(AveragePool3D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("3D"))
    .InheritOnnxSchema("AveragePool");
// CPU registrations and schemas for the MaxPool operator family. The generic
// "MaxPool" and the explicit-rank 1D/2D/3D variants all instantiate the same
// PoolOp<float, CPUContext, MaxPoolFunctor>; they differ only in doc text and
// the inherited ONNX schema name.
REGISTER_CPU_OPERATOR(
    MaxPool,
    PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(MaxPool)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator(""))
    .InheritOnnxSchema();
REGISTER_CPU_OPERATOR(
    MaxPool1D,
    PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(MaxPool1D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("1D"))
    .InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(
    MaxPool2D,
    PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(MaxPool2D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("2D"))
    .InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(
    MaxPool3D,
    PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>);
OPERATOR_SCHEMA(MaxPool3D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("3D"))
    .InheritOnnxSchema("MaxPool");
} // namespace caffe2