// TODO(ataei): reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
using std::max;
using std::min;
namespace {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
bool isNeon4x4p0s0Eligible(
int inputH,
int inputW,
int outputH,
int outputW,
int kH,
int kW,
int strideH,
int strideW,
int padT,
int padL,
int padB,
int padR,
int dilationH,
int dilationW,
const float* input,
float* output) {
// Use this kernel only if:
// Kernel size is 4x4
// Stride is 4x4
// Padding is 0
// Dilation is 1
// Output width and height evenly divide input width and height
// Input width and height are divisible by 4 (implied by the above,
// but checked again for safety)
// Input and output pointers are aligned to float32x4_t
bool kernelOk = (kH == 4) && (kW == 4);
bool strideOk = (strideH == 4) && (strideW == 4);
bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
bool dilationOk = (dilationH == 1) && (dilationW == 1);
bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
isPointerAligned(output, sizeof(float32x4_t));
return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
alignOk;
}
// Vectorizes 4x4p0s0 average pooling for ARM NEON
void avgPoolNeon4x4p0s0Plane(
int inputH,
int inputW,
const float* input,
float* output) {
constexpr int kKernelHeight = 4;
constexpr int kKernelWidth = 4;
constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));
// Handle portion that can be unrolled by 4
constexpr int kUnroll = 4;
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
if (inputW % kLoadCols == 0) {
//
// Manually unroll by 4 (kUnroll)
//
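// Each of the four blocks in the inner loop below loads one 4x4 input
// window (four rows of four contiguous floats), sums all 16 values with
// horizontal_sum_f32, and writes the sum into one lane of `out`, so each
// iteration produces four pooled outputs at once.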
for (int h = 0; h < inputH; h += kKernelHeight) {
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
const float* curInput = input + h * inputW;
for (int w = 0; w < inputW; w += kLoadCols) {
float32x4_t out = {};
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 0);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 1);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 2);
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
out = vsetq_lane_f32(v0, out, 3);
}
curInput += kLoadSizeFloat;
out = vmulq_f32(out, vdupq_n_f32(kDiv));
vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
}
}
} else {
//
// Not unrolled
//
for (int h = 0; h < inputH; h += kKernelHeight) {
const float* inputRow = input + h * inputW;
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
for (int w = 0; w < inputW; w += kKernelWidth) {
const float* curInput = inputRow + w;
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
outputRow[w / kKernelWidth] = v0;
}
}
}
}
void runNeonAveragePool4x4p0s0NCHW(
int N,
int C,
int inputH,
int inputW,
const float* input,
float* output) {
// We only have the 4x4p0s0 implementation at present, which is
// checked at a higher level
int outputH = inputH / 4;
int outputW = inputW / 4;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
const float* curInput = input + (n * C + c) * inputH * inputW;
float* curOutput = output + (n * C + c) * outputH * outputW;
avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
}
}
}
bool isNeon2x2p0s0Eligible(
int inputH,
int inputW,
int outputH,
int outputW,
int kH,
int kW,
int strideH,
int strideW,
int padT,
int padL,
int padB,
int padR,
int dilationH,
int dilationW,
const float* input,
float* output) {
// Use this kernel only if:
// Kernel size is 2x2
// Stride is 2x2
// Padding is 0
// Dilation is 1
// Output width and height evenly divide input width and height
// Input width and height are divisible by 4 (required for the
// float32x4_t vector loads)
// Input and output pointers are aligned to float32x4_t
bool kernelOk = (kH == 2) && (kW == 2);
bool strideOk = (strideH == 2) && (strideW == 2);
bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
bool dilationOk = (dilationH == 1) && (dilationW == 1);
bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
isPointerAligned(output, sizeof(float32x4_t));
return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
alignOk;
}
// Vectorizes 2x2p0s0 max pooling for ARM NEON
void maxPoolNeon2x2p0s0Plane(
int inputH,
int inputW,
const float* input,
float* output) {
constexpr int kKernelHeight = 2;
constexpr int kKernelWidth = 2;
// Handle portion that can be unrolled by 4
constexpr int kUnroll = 4;
constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
if (inputW % kLoadCols == 0) {
for (int h = 0; h < inputH; h += kKernelHeight) {
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
const float* curInput = input + h * inputW;
for (int w = 0; w < inputW; w += kLoadCols) {
float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
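// Each block below loads two rows of four floats (two adjacent 2x2
// windows), takes the elementwise row max with vmaxq_f32, then reduces
// horizontally with vpmax_f32 so each hmax_i holds two pooled outputs.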
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
{
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
}
curInput += kLoadSizeFloat;
float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
}
}
} else {
// Not unrolled
for (int h = 0; h < inputH; h += kKernelHeight) {
const float* inputRow = input + h * inputW;
float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
for (int w = 0; w < inputW; w += kKernelWidth * 2) {
const float* curInput = inputRow + w;
float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
vst1_f32(&outputRow[w / kKernelWidth], hmax);
}
}
}
}
void runNeonMaxPool2x2p0s0NCHW(
int N,
int C,
int inputH,
int inputW,
const float* input,
float* output) {
// We only have the 2x2p0s0 implementation at present, which is
// checked at a higher level
int outputH = inputH / 2;
int outputW = inputW / 2;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
const float* curInput = input + (n * C + c) * inputH * inputW;
float* curOutput = output + (n * C + c) * outputH * outputW;
maxPoolNeon2x2p0s0Plane(inputH, inputW, curInput, curOutput);
}
}
}
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
} // namespace
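// AveragePool and MaxPool are reduction functors consumed by PoolOp below:
// initialize() returns the reduction's identity value, process() folds one
// input element (or one Eigen column in the NHWC path) into the running
// result, and finalize() normalizes it (divides by the window size for
// average pooling; a no-op for max pooling). runSpecialized() dispatches to
// the NEON fast paths when the parameters are eligible.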
template <typename T>
class AveragePool {
public:
static float initialize() {
return 0.0;
}
static void process(
const int x_col,
const int y_col,
ConstEigenMatrixMap<float>& x_mat,
EigenMatrixMap<float>& y_mat) {
y_mat.col(y_col) += x_mat.col(x_col);
}
static void process(const T& x_data, T& y_data) {
y_data += x_data;
}
static void finalize(const int size, T& y_data) {
y_data /= size;
}
static void
finalize(const int size, const int col, EigenMatrixMap<float>& y_mat) {
y_mat.col(col) /= size;
}
static bool runSpecialized(
int N,
int C,
int inputH,
int inputW,
int outputH,
int outputW,
int kH,
int kW,
int strideH,
int strideW,
int padT,
int padL,
int padB,
int padR,
int dilationH,
int dilationW,
const float* input,
float* output) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
if (isNeon4x4p0s0Eligible(
inputH,
inputW,
outputH,
outputW,
kH,
kW,
strideH,
strideW,
padT,
padL,
padB,
padR,
dilationH,
dilationW,
input,
output)) {
runNeonAveragePool4x4p0s0NCHW(N, C, inputH, inputW, input, output);
return true;
}
#else
(void)N;
(void)C;
(void)inputH;
(void)inputW;
(void)outputH;
(void)outputW;
(void)kH;
(void)kW;
(void)strideH;
(void)strideW;
(void)padT;
(void)padL;
(void)padB;
(void)padR;
(void)dilationH;
(void)dilationW;
(void)input;
(void)output;
#endif
return false;
}
};
template <typename T>
class MaxPool {
public:
static float initialize() {
return std::numeric_limits<float>::lowest();
}
static void process(
const int x_col,
const int y_col,
ConstEigenMatrixMap<float>& x_mat,
EigenMatrixMap<float>& y_mat) {
y_mat.col(y_col) = y_mat.col(y_col).cwiseMax(x_mat.col(x_col));
}
static void process(const T& x_data, T& y_data) {
if (x_data > y_data) {
y_data = x_data;
}
}
static void finalize(const int /*size*/, T& /*y_data*/) {}
static void finalize(
const int /*size*/,
const int /*col*/,
EigenMatrixMap<float>& /*y_mat*/) {}
static bool runSpecialized(
int N,
int C,
int inputH,
int inputW,
int outputH,
int outputW,
int kH,
int kW,
int strideH,
int strideW,
int padT,
int padL,
int padB,
int padR,
int dilationH,
int dilationW,
const float* input,
float* output) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
if (isNeon2x2p0s0Eligible(
inputH,
inputW,
outputH,
outputW,
kH,
kW,
strideH,
strideW,
padT,
padL,
padB,
padR,
dilationH,
dilationW,
input,
output)) {
runNeonMaxPool2x2p0s0NCHW(N, C, inputH, inputW, input, output);
return true;
}
#else
(void)N;
(void)C;
(void)inputH;
(void)inputW;
(void)outputH;
(void)outputW;
(void)kH;
(void)kW;
(void)strideH;
(void)strideW;
(void)padT;
(void)padL;
(void)padB;
(void)padR;
(void)dilationH;
(void)dilationW;
(void)input;
(void)output;
#endif
return false;
}
};
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto* Y = Output(0);
ConvPoolOpBase<Context>::SetOutputSize(X, Y, X.dim32(1));
const float* Xdata = X.template data<float>();
float* Ydata = Y->template mutable_data<float>();
// The main loop
int channels = X.dim32(1);
int height = X.dim32(2);
int width = kernel_.size() > 1 ? X.dim32(3) : 1;
int depth = kernel_.size() > 2 ? X.dim32(4) : 1;
int pooled_height = Y->dim32(2);
int pooled_width = kernel_.size() > 1 ? Y->dim32(3) : 1;
int pooled_depth = kernel_.size() > 2 ? Y->dim32(4) : 1;
// We specialize certain variants on ARM for vectorization
if (kernel_.size() == 2 &&
PoolType::runSpecialized(
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
Y->dim32(2),
Y->dim32(3),
kernel_h(),
kernel_w(),
stride_h(),
stride_w(),
pad_t(),
pad_l(),
pad_b(),
pad_r(),
dilation_h(),
dilation_w(),
Xdata,
Ydata)) {
return true;
}
switch (kernel_.size()) {
case 1:
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
T Yh = PoolType::initialize();
for (int h = hstart; h < hend; ++h) {
PoolType::process(Xdata[h], Yh);
}
PoolType::finalize(hend - hstart, Yh);
Ydata[ph] = Yh;
}
// Do offset.
Xdata += height;
Ydata += pooled_height;
}
}
break;
case 2:
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
for (int pw = 0; pw < pooled_width; ++pw) {
int wstart = pw * stride_w() - pad_l();
int wend = min(wstart + kernel_w(), width);
wstart = max(wstart, 0);
const int pool_index = ph * pooled_width + pw;
T Yh = PoolType::initialize();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int input_index = h * width + w;
PoolType::process(Xdata[input_index], Yh);
}
}
PoolType::finalize((hend - hstart) * (wend - wstart), Yh);
Ydata[pool_index] = Yh;
}
}
// Do offset.
Xdata += height * width;
Ydata += pooled_height * pooled_width;
}
}
break;
case 3:
for (int n = 0; n < X.dim32(0); ++n) {
for (int c = 0; c < channels; ++c) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
for (int pw = 0; pw < pooled_width; ++pw) {
int wstart = pw * stride_w() - pad_l();
int wend = min(wstart + kernel_w(), width);
wstart = max(wstart, 0);
for (int pd = 0; pd < pooled_depth; ++pd) {
int dstart = pd * stride_[2] - pads_[2];
int dend = min(dstart + kernel_[2], depth);
dstart = max(dstart, 0);
const int pool_index =
ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
T Yh = PoolType::initialize();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int d = dstart; d < dend; ++d) {
const int input_index = h * width * depth + w * depth + d;
PoolType::process(Xdata[input_index], Yh);
}
}
}
PoolType::finalize(
(hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
Ydata[pool_index] = Yh;
}
}
}
// Do offset.
Xdata += height * width * depth;
Ydata += pooled_height * pooled_width * pooled_depth;
}
}
break;
default:
CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
return false;
}
return true;
}
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto* Y = Output(0);
int height = X.dim32(1);
int width = kernel_.size() > 1 ? X.dim32(2) : 1;
int depth = kernel_.size() > 2 ? X.dim32(3) : 1;
int channels = X.dim32(X.ndim() - 1);
ConvPoolOpBase<Context>::SetOutputSize(X, Y, channels);
EigenMatrixMap<float> Ymat(
Y->template mutable_data<float>(), channels, Y->size() / channels);
ConstEigenMatrixMap<float> Xmat(
X.template data<float>(), channels, X.size() / channels);
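// With NHWC layout, the channel vector at each spatial location is a
// contiguous column of these maps, so pooling operates on whole columns.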
int pooled_height = Y->dim32(1);
int pooled_width = kernel_.size() > 1 ? Y->dim32(2) : 1;
int pooled_depth = kernel_.size() > 2 ? Y->dim32(3) : 1;
// The main loop
switch (kernel_.size()) {
case 1:
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
const int y_col = n * pooled_height + ph;
Ymat.col(y_col).setConstant(PoolType::initialize());
for (int h = hstart; h < hend; ++h) {
const int x_col = n * height + h;
PoolType::process(x_col, y_col, Xmat, Ymat);
}
PoolType::finalize((hend - hstart), y_col, Ymat);
}
}
break;
case 2:
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
for (int pw = 0; pw < pooled_width; ++pw) {
int wstart = pw * stride_w() - pad_l();
int wend = min(wstart + kernel_w(), width);
wstart = max(wstart, 0);
const int y_col = (n * pooled_height + ph) * pooled_width + pw;
Ymat.col(y_col).setConstant(PoolType::initialize());
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int x_col = (n * height + h) * width + w;
PoolType::process(x_col, y_col, Xmat, Ymat);
}
}
PoolType::finalize((hend - hstart) * (wend - wstart), y_col, Ymat);
}
}
}
break;
case 3:
for (int n = 0; n < X.dim32(0); ++n) {
for (int ph = 0; ph < pooled_height; ++ph) {
int hstart = ph * stride_h() - pad_t();
int hend = min(hstart + kernel_h(), height);
hstart = max(hstart, 0);
for (int pw = 0; pw < pooled_width; ++pw) {
int wstart = pw * stride_w() - pad_l();
int wend = min(wstart + kernel_w(), width);
wstart = max(wstart, 0);
for (int pd = 0; pd < pooled_depth; ++pd) {
int dstart = pd * stride_[2] - pads_[2];
int dend = min(dstart + kernel_[2], depth);
dstart = max(dstart, 0);
const int y_col = ((n * pooled_height + ph) * pooled_width + pw) *
pooled_depth +
pd;
Ymat.col(y_col).setConstant(PoolType::initialize());
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
for (int d = dstart; d < dend; ++d) {
const int x_col =
((n * height + h) * width + w) * depth + d;
PoolType::process(x_col, y_col, Xmat, Ymat);
}
}
}
PoolType::finalize(
(hend - hstart) * (wend - wstart) * (dend - dstart),
y_col,
Ymat);
}
}
}
}
break;
default:
CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
return false;
}
return true;
}
const char kAveragePoolDoc[] = R"DOC(
consumes an input blob and applies average pooling across the blob according
to kernel sizes, stride sizes, pad lengths and dilation. Average pooling consists
of taking the average value of a subset of the input tensor according to the kernel
size and downsampling the data into the output blob for further processing. The
`brew` module has a wrapper for this operator for use in a `ModelHelper` object.
Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:
$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$
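For example, pooling a 6x6 input with kernel=2, stride=2, and pad=0 gives
(6 - 2 + 2*0)/2 + 1 = 3 along each spatial dimension, which matches the 3x3
output in the example below.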
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
"AveragePool",
["X"],
["Y"],
kernel=2,
stride=2,
)
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32))  # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[[-0.2883434 0.43498734 0.05417408 1.912558 0.09390241
-0.33173105]
[ 1.633709 1.2047161 0.36964908 0.99961185 0.4184147
0.9989975 ]
[ 1.7644193 0.1789665 1.5812988 -0.6038542 -0.36090398
0.33195344]
[ 0.9457722 -0.95174325 -0.78124577 1.2062047 1.1903144
0.2586746 ]
[ 1.252104 0.32645547 1.8073524 -0.78397465 0.9978303
-0.97614396]
[ 0.5440196 1.5778259 -0.76750124 0.5051756 0.8838398
-0.37085298]]]]
Y:
[[[[0.7462672 0.83399826 0.2948959 ]
[0.4843537 0.3506009 0.35500962]
[0.9251013 0.19026303 0.13366827]]]]
```
</details>
)DOC";
const char kMaxPoolDoc[] = R"DOC(
consumes an input blob and applies max pooling across the blob according to
kernel sizes, stride sizes, pad lengths and dilation. Max pooling consists of
taking the maximum value of a subset of the input tensor according to the kernel
size and downsampling the data into the output blob for further processing. The
`brew` module has a wrapper for this operator for use in a `ModelHelper` object.
Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:
$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$
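For example, pooling a 6x6 input with kernel=2, stride=2, and pad=0 gives
(6 - 2 + 2*0)/2 + 1 = 3 along each spatial dimension, so the 6x6 input in the
example below pools down to 3x3.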
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
"MaxPool",
["X"],
["Y"],
kernel=2,
stride=2,
)
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32))  # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[[-2.8534958e-01 -1.7719941e+00 -8.2277227e-04 1.1088650e+00
-2.1476576e+00 -3.5070452e-01]
[-9.0058845e-01 -3.0070004e-01 -1.7907504e+00 -7.1746534e-01
1.2798511e+00 -3.2214901e-01]
[ 1.5806322e+00 1.6845188e+00 -2.6633200e-01 -3.8576153e-01
-9.6424848e-02 -3.9696163e-01]
[ 1.2572408e-01 6.3612902e-01 -3.9554062e-01 -6.9735396e-01
-9.1898698e-01 -1.9609968e-01]
[-1.1587460e+00 2.4605224e+00 -1.5497679e+00 1.3020347e-01
-8.1293899e-01 -7.8803545e-01]
[ 1.4323474e+00 1.3618395e+00 9.8975077e-02 -1.1307785e-01
7.2035044e-01 2.7642491e-01]]]]
Y:
[[[[-0.28534958 1.108865 1.2798511 ]
[ 1.6845188 -0.266332 -0.09642485]
[ 2.4605224 0.13020347 0.72035044]]]]
```
</details>
)DOC";
std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
return [=](OpSchema& schema) {
string doc = "AveragePool{dim} {pool_doc}";
c10::ReplaceAll(doc, "{dim}", dim);
c10::ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc);
schema.SetDoc(doc);
schema.Input(
0,
"X",
"*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC.");
schema.Output(
0,
"Y",
"*(type: Tensor`<float>`)* Output data tensor.");
/*
schema.Arg("kernel", "*(type: int)* Size of the window to take an average over.");
schema.Arg("stride", "*(type: int)* Stride of the window.");
schema.Arg("pad", "*(type: int)* Implicit zero padding to be added on both sides.");
schema.Arg("dilation", "*(type: int)* Parameter that controls the stride of elements in the window.");
schema.Arg("order", "*(type: string; default: 'NCHW')* Order of the blob dimensions.");
*/
};
}
std::function<void(OpSchema&)> MaxPoolDocGenerator(const char* dim) {
return [=](OpSchema& schema) {
string doc = "MaxPool{dim} {pool_doc}";
c10::ReplaceAll(doc, "{dim}", dim);
c10::ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc);
schema.SetDoc(doc);
schema.Input(
0,
"X",
"*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC.");
schema.Output(
0,
"Y",
"*(type: Tensor`<float>`)* Output data tensor.");
/*
schema.Arg("kernel", "*(type: int)* Size of the window to take an average over.");
schema.Arg("stride", "*(type: int)* Stride of the window.");
schema.Arg("pad", "*(type: int)* Implicit zero padding to be added on both sides.");
schema.Arg("dilation", "*(type: int)* Parameter that controls the stride of elements in the window.");
schema.Arg("order", "*(type: string; default: 'NCHW')* Order of the blob dimensions.");
*/
};
}
REGISTER_CPU_OPERATOR(
AveragePool,
PoolOp<float, CPUContext, AveragePool<float>>);
OPERATOR_SCHEMA(AveragePool)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(AveragePoolDocGenerator(""))
.InheritOnnxSchema();
REGISTER_CPU_OPERATOR(
AveragePool1D,
PoolOp<float, CPUContext, AveragePool<float>>);
OPERATOR_SCHEMA(AveragePool1D)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(AveragePoolDocGenerator("1D"))
.InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(
AveragePool2D,
PoolOp<float, CPUContext, AveragePool<float>>);
OPERATOR_SCHEMA(AveragePool2D)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(AveragePoolDocGenerator("2D"))
.InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(
AveragePool3D,
PoolOp<float, CPUContext, AveragePool<float>>);
OPERATOR_SCHEMA(AveragePool3D)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(AveragePoolDocGenerator("3D"))
.InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(MaxPool, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(MaxPoolDocGenerator(""))
.InheritOnnxSchema();
REGISTER_CPU_OPERATOR(MaxPool1D, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool1D)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(MaxPoolDocGenerator("1D"))
.InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(MaxPool2D, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool2D)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(MaxPoolDocGenerator("2D"))
.InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(MaxPool3D, PoolOp<float, CPUContext, MaxPool<float>>);
OPERATOR_SCHEMA(MaxPool3D)
.NumInputs(1)
.NumOutputs(1)
.TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
.FillUsing(MaxPoolDocGenerator("3D"))
.InheritOnnxSchema("MaxPool");
} // namespace caffe2