| // TODO: reduce the apparent redundancy of all the code below. | 
 | #include "caffe2/operators/pool_op.h" | 
 |  | 
 | namespace caffe2 { | 
 |  | 
 | using std::max; | 
 | using std::min; | 
 |  | 
 | struct LpPoolFunctor { | 
 |   explicit LpPoolFunctor(const OperatorBase& /* op */) {} | 
 | }; | 
 |  | 
 | template <> | 
 | bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() { | 
 |   auto& X = Input(0); | 
 |   auto* Y = Output(0); | 
 |   ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); | 
 |   const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0); | 
 |   const auto inv_p = 1.0 / p; | 
 |  | 
 |   const float* Xdata = X.data<float>(); | 
 |   float* Ydata = Y->template mutable_data<float>(); | 
 |   math::Set<float, CPUContext>(Y->numel(), 0, Ydata, &context_); | 
 |   // The main loop | 
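  // For each pooling window, accumulate sum(|x|^p) into Y and then take the
  // 1/p power, i.e. Y = (sum_{i in window} |x_i|^p)^(1/p).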
 |   int channels = X.dim32(1); | 
 |   int height = X.dim32(2); | 
 |   int width = X.dim32(3); | 
 |   int pooled_height = Y->dim32(2); | 
 |   int pooled_width = Y->dim32(3); | 
 |  | 
 |   for (int n = 0; n < X.dim32(0); ++n) { | 
 |     for (int c = 0; c < channels; ++c) { | 
 |       for (int ph = 0; ph < pooled_height; ++ph) { | 
 |         for (int pw = 0; pw < pooled_width; ++pw) { | 
 |           int hstart = ph * stride_[0] - pads_[0]; | 
 |           int wstart = pw * stride_[1] - pads_[1]; | 
 |           int hend = min(hstart + kernel_[0], height); | 
 |           int wend = min(wstart + kernel_[1], width); | 
 |           hstart = max(hstart, 0); | 
 |           wstart = max(wstart, 0); | 
 |           const int pool_index = ph * pooled_width + pw; | 
 |           for (int h = hstart; h < hend; ++h) { | 
 |             for (int w = wstart; w < wend; ++w) { | 
 |               const int input_index = h * width + w; | 
 |               Ydata[pool_index] += std::pow(std::abs(Xdata[input_index]), p); | 
 |             } | 
 |           } | 
 |           Ydata[pool_index] = std::pow(Ydata[pool_index], inv_p); | 
 |         } | 
 |       } | 
 |       // Do offset. | 
 |       Xdata += height * width; | 
 |       Ydata += pooled_height * pooled_width; | 
 |     } | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | template <> | 
 | bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() { | 
 |   auto& X = Input(0); | 
 |   auto* Y = Output(0); | 
 |   int height = X.dim32(1); | 
 |   int width = X.dim32(2); | 
 |   int channels = X.dim32(3); | 
 |   ConvPoolOpBase::SetOutputSize(X, Y, channels); | 
 |  | 
 |   const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0); | 
 |   const auto inv_p = 1.0 / p; | 
 |  | 
 |   const float* Xdata = X.data<float>(); | 
 |   float* Ydata = Y->template mutable_data<float>(); | 
 |   math::Set<float, CPUContext>(Y->numel(), 0, Ydata, &context_); | 
 |   // The main loop | 
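  // Same computation as the NCHW case, but with channels innermost: the
  // |x|^p accumulation and the final 1/p power are applied per channel.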
 |   int pooled_height = Y->dim32(1); | 
 |   int pooled_width = Y->dim32(2); | 
 |   for (int n = 0; n < X.dim32(0); ++n) { | 
 |     for (int ph = 0; ph < pooled_height; ++ph) { | 
 |       for (int pw = 0; pw < pooled_width; ++pw) { | 
 |         int hstart = ph * stride_[0] - pads_[0]; | 
 |         int wstart = pw * stride_[1] - pads_[1]; | 
 |         int hend = min(hstart + kernel_[0], height); | 
 |         int wend = min(wstart + kernel_[1], width); | 
 |         hstart = max(hstart, 0); | 
 |         wstart = max(wstart, 0); | 
 |         const int pool_index = (ph * pooled_width + pw) * channels; | 
 |         for (int h = hstart; h < hend; ++h) { | 
 |           for (int w = wstart; w < wend; ++w) { | 
 |             const int input_index = (h * width + w) * channels; | 
 |             for (int c = 0; c < channels; ++c) { | 
 |               Ydata[pool_index + c] += | 
 |                   std::pow(std::abs(Xdata[input_index + c]), p); | 
 |             } | 
 |           } | 
 |         } | 
 |         for (int c = 0; c < channels; ++c) { | 
 |           Ydata[pool_index + c] = std::pow(Ydata[pool_index + c], inv_p); | 
 |         } | 
 |       } | 
 |     } | 
 |     // Do offset. | 
 |     Xdata += X.numel() / X.dim32(0); | 
 |     Ydata += Y->numel() / Y->dim32(0); | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | template <> | 
 | bool PoolGradientOp<float, CPUContext, LpPoolFunctor>:: | 
 |     RunOnDeviceWithOrderNCHW() { | 
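  // Inputs: X (forward input), Y (forward output) and dY (gradient w.r.t. Y);
  // see GetPoolGradient::GetGradientDefs at the end of this file.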
 |   const auto& X = Input(0); | 
 |   const auto& Y = Input(1); | 
  const auto& dY = Input(2);
 |  | 
 |   const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0); | 
 |  | 
 |   // TODO(Yangqing): Add shape checks. | 
 |   auto* dX = Output(0, X.sizes(), at::dtype<float>()); | 
 |   math::Set<float, CPUContext>( | 
 |       X.numel(), 0, dX->template mutable_data<float>(), &context_); | 
 |   const float* dYdata = dY.data<float>(); | 
 |   const float* Xdata = X.data<float>(); | 
 |   const float* Ydata = Y.data<float>(); | 
 |   float* dXdata = dX->template mutable_data<float>(); | 
 |  | 
 |   int channels = X.dim32(1); | 
 |   CAFFE_ENFORCE_EQ(channels, dY.dim32(1)); | 
 |   int height = X.dim32(2); | 
 |   int width = X.dim32(3); | 
 |   ConvPoolOpBase<CPUContext>::ComputePads({height, width}); | 
 |   int pooled_height = dY.dim32(2); | 
 |   int pooled_width = dY.dim32(3); | 
 |   // The main loop | 
 |   for (int n = 0; n < X.dim32(0); ++n) { | 
 |     for (int c = 0; c < channels; ++c) { | 
 |       for (int ph = 0; ph < pooled_height; ++ph) { | 
 |         for (int pw = 0; pw < pooled_width; ++pw) { | 
 |           int hstart = ph * stride_[0] - pads_[0]; | 
 |           int wstart = pw * stride_[1] - pads_[1]; | 
 |           int hend = min(hstart + kernel_[0], height); | 
 |           int wend = min(wstart + kernel_[1], width); | 
 |           hstart = max(hstart, 0); | 
 |           wstart = max(wstart, 0); | 
 |           for (int h = hstart; h < hend; ++h) { | 
 |             for (int w = wstart; w < wend; ++w) { | 
 |               // gradient of p-norm is x_j * |x_j|^{p-2} / |x|_p^{p-1} | 
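              // i.e. dY/dx_j for Y = (sum_i |x_i|^p)^{1/p}, by the chain rule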
 |               dXdata[h * width + w] += dYdata[ph * pooled_width + pw] * | 
 |                   Xdata[h * width + w] * | 
 |                   std::pow(std::abs(Xdata[h * width + w]), p - 2) / | 
 |                   std::pow(Ydata[ph * pooled_width + pw], p - 1); | 
 |             } | 
 |           } | 
 |         } | 
 |       } | 
 |       // offset | 
 |       dXdata += height * width; | 
 |       dYdata += pooled_height * pooled_width; | 
 |       Ydata += pooled_height * pooled_width; | 
 |       Xdata += height * width; | 
 |     } | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | template <> | 
 | bool PoolGradientOp<float, CPUContext, LpPoolFunctor>:: | 
 |     RunOnDeviceWithOrderNHWC() { | 
 |   const auto& X = Input(0); | 
 |   const auto& Y = Input(1); | 
  const auto& dY = Input(2);
 |   CAFFE_ENFORCE_EQ(dY.dim(), 4); | 
 |  | 
 |   // TODO(Yangqing): Add shape checks. | 
 |   auto* dX = Output(0, X.sizes(), at::dtype<float>()); | 
 |   math::Set<float, CPUContext>( | 
 |       X.numel(), 0, dX->template mutable_data<float>(), &context_); | 
 |   const float* dYdata = dY.data<float>(); | 
 |   float* dXdata = dX->template mutable_data<float>(); | 
 |   const float* Xdata = X.data<float>(); | 
 |   const float* Ydata = Y.data<float>(); | 
 |   // The main loop | 
 |   int height = X.dim32(1); | 
 |   int width = X.dim32(2); | 
 |   ConvPoolOpBase<CPUContext>::ComputePads({height, width}); | 
 |   const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0); | 
 |  | 
 |   int pooled_height = dY.dim32(1); | 
 |   int pooled_width = dY.dim32(2); | 
 |   int channels = X.dim32(3); | 
 |   CAFFE_ENFORCE_EQ(channels, dY.dim32(3)); | 
 |   for (int n = 0; n < X.dim32(0); ++n) { | 
 |     for (int ph = 0; ph < pooled_height; ++ph) { | 
 |       for (int pw = 0; pw < pooled_width; ++pw) { | 
 |         int hstart = ph * stride_[0] - pads_[0]; | 
 |         int wstart = pw * stride_[1] - pads_[1]; | 
 |         int hend = min(hstart + kernel_[0], height); | 
 |         int wend = min(wstart + kernel_[1], width); | 
 |         hstart = max(hstart, 0); | 
 |         wstart = max(wstart, 0); | 
 |         for (int h = hstart; h < hend; ++h) { | 
 |           for (int w = wstart; w < wend; ++w) { | 
 |             for (int c = 0; c < channels; ++c) { | 
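              // gradient of p-norm, as in the NCHW branch above:
              // x_j * |x_j|^{p-2} / |x|_p^{p-1}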
 |               dXdata[(h * width + w) * channels + c] += | 
 |                   dYdata[(ph * pooled_width + pw) * channels + c] * | 
 |                   Xdata[(h * width + w) * channels + c] * | 
 |                   std::pow( | 
 |                       std::abs(Xdata[(h * width + w) * channels + c]), p - 2) / | 
 |                   std::pow( | 
 |                       Ydata[(ph * pooled_width + pw) * channels + c], p - 1); | 
 |             } | 
 |           } | 
 |         } | 
 |       } | 
 |     } | 
 |     // offset | 
 |     dXdata += X.numel() / X.dim32(0); | 
 |     dYdata += dY.numel() / dY.dim32(0); | 
 |     Xdata += X.numel() / X.dim32(0); | 
 |     Ydata += Y.numel() / Y.dim32(0); | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | REGISTER_CPU_OPERATOR(LpPool, PoolOp<float, CPUContext, LpPoolFunctor>); | 
 | REGISTER_CPU_OPERATOR( | 
 |     LpPoolGradient, | 
 |     PoolGradientOp<float, CPUContext, LpPoolFunctor>); | 
 |  | 
 | OPERATOR_SCHEMA(LpPool) | 
 |     .NumInputs(1) | 
 |     .NumOutputs(1) | 
 |     .SetDoc(R"DOC( | 
`LpPool` consumes an input blob and applies $L_p$ pooling across the blob according to kernel sizes, stride sizes, pad lengths, and dilation. $L_p$ pooling consists of taking the $L_p$ norm of a subset of the input tensor according to the kernel size and downsampling the data into the output blob for further processing.
 |  | 
Pooling layers reduce the spatial dimensionality of the input blob. Each of the output blob's spatial dimensions is reduced according to:
 |  | 
 | $$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$ | 
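
For example, pooling the $6 \times 6$ input from the example below with kernel=2, stride=2, and pad=0 gives $\frac{6-2+2*0}{2}+1=3$, i.e. a $3 \times 3$ output.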
 |  | 
 | Github Links: | 
 | - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/lp_pool_op.cc | 
 |  | 
 | <details> | 
 |  | 
 | <summary> <b>Example</b> </summary> | 
 |  | 
 | **Code** | 
 |  | 
 | ``` | 
 |  | 
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
 |  | 
 | op = core.CreateOperator( | 
 |     "LpPool", | 
 |     ["X"], | 
 |     ["Y"], | 
 |     kernel=2, | 
 |     stride=2, | 
 |     p=2.0 | 
 | ) | 
 |  | 
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32))  # NCHW
 | print("X:\n", workspace.FetchBlob("X"), "\n") | 
 | workspace.RunOperatorOnce(op) | 
 | print("Y:\n", workspace.FetchBlob("Y")) | 
 |  | 
 | ``` | 
 |  | 
 | **Result** | 
 |  | 
 | ``` | 
 |  | 
 | X: | 
 |  [[[[-1.1113514  -1.1173418  -0.1504435   0.1327146  -1.2221841  -0.5654315 ] | 
 |    [-1.9209646  -0.04675794  0.8604731   1.2042469   0.28154245   0.38656202] | 
 |    [-0.8772837  -0.03264008  0.26222762  0.28526652  0.321102    -2.5891325 ] | 
 |    [-0.9248281   1.440776   -0.56832    -0.6017927   1.2262512   -2.1443934 ] | 
 |    [ 0.5194415  -1.6858683   0.45221648  0.65029615 -0.8574544    0.8121054 ] | 
 |    [ 0.25902653  0.4934758   0.49870652 -0.48134378 -0.9178449   -0.07626943]]]] | 
 |  | 
 | Y: | 
 |  [[[[2.4851248 1.49361   1.4290358] | 
 |    [1.9240153 0.9139378 3.5928857] | 
 |    [1.8500228 1.0525136 1.4976646]]]] | 
 |  | 
 | ``` | 
 |  | 
 | </details> | 
 |  | 
 | )DOC") | 
 |     .Arg("p", "(*float*): type of $L_p$ norm to use (default=2.0)") | 
    .Arg("kernel", "(*int*): the size of the pooling window")
 |     .Arg("stride", "(*int*): the stride of the window") | 
 |     .Arg("pad", "(*int*): implicit zero padding to be added on both sides") | 
 |     .Arg( | 
 |         "dilation", | 
 |         "(*int*): parameter that controls the stride of elements in the window") | 
 |     .Arg("order", "(*string*): order of blob dimensions (default=\"NCHW\")") | 
 |     .Input(0, "X", "(*Tensor`<float>`*): input tensor") | 
 |     .Output(0, "Y", "(*Tensor`<float>`*): output tensor"); | 
 |  | 
 | OPERATOR_SCHEMA(LpPoolGradient).NumInputs(3).NumOutputs(1); | 
 |  | 
 | class GetPoolGradient : public GradientMakerBase { | 
 |   using GradientMakerBase::GradientMakerBase; | 
 |   vector<OperatorDef> GetGradientDefs() override { | 
 |     return SingleGradientDef( | 
 |         def_.type() + "Gradient", | 
 |         "", | 
 |         vector<string>{I(0), O(0), GO(0)}, | 
 |         vector<string>{GI(0)}); | 
 |   } | 
 | }; | 
 | REGISTER_GRADIENT(LpPool, GetPoolGradient); | 
 | } // namespace caffe2 |