| #include "caffe2/operators/pool_op.h" |
| |
| #include <limits> |
| #include <string> |
| #include <type_traits> |
| |
| #include "caffe2/operators/pool_op_util.h" |
| #include "caffe2/utils/eigen_utils.h" |
| #include "caffe2/utils/math.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
| |
| template <typename T, StorageOrder kOrder> |
| void ComputeAveragePool1D( |
| int l, |
| int r, |
| int y, |
| T scale, |
| const ConstEigenArrayMap<T>& X_arr, |
| EigenArrayMap<T>* Y_arr); |
| |
| template <> |
| void ComputeAveragePool1D<float, StorageOrder::NCHW>( |
| const int l, |
| const int r, |
| const int y, |
| const float scale, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| (*Y_arr)(y) = X_arr.col(0).segment(l, r - l).sum() * scale; |
| } |
| |
| template <> |
| void ComputeAveragePool1D<float, StorageOrder::NHWC>( |
| const int l, |
| const int r, |
| const int y, |
| const float scale, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| Y_arr->col(y) = X_arr.col(l); |
| for (int i = l + 1; i < r; ++i) { |
| Y_arr->col(y) += X_arr.col(i); |
| } |
| Y_arr->col(y) *= scale; |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void ComputeAveragePool2D( |
| int W, |
| int t, |
| int b, |
| int l, |
| int r, |
| int y, |
| T scale, |
| const ConstEigenArrayMap<T>& X_arr, |
| EigenArrayMap<T>* Y_arr); |
| |
| template <> |
| void ComputeAveragePool2D<float, StorageOrder::NCHW>( |
| const int /* W */, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const float scale, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| (*Y_arr)(y) = X_arr.block(l, t, r - l, b - t).sum() * scale; |
| } |
| |
| template <> |
| void ComputeAveragePool2D<float, StorageOrder::NHWC>( |
| const int W, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const float scale, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| Y_arr->col(y).setZero(); |
| for (int i = t; i < b; ++i) { |
| for (int j = l; j < r; ++j) { |
| Y_arr->col(y) += X_arr.col(i * W + j); |
| } |
| } |
| Y_arr->col(y) *= scale; |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void ComputeAveragePool3D( |
| int H, |
| int W, |
| int p, |
| int a, |
| int t, |
| int b, |
| int l, |
| int r, |
| int y, |
| T scale, |
| const ConstEigenArrayMap<T>& X_arr, |
| EigenArrayMap<T>* Y_arr); |
| |
| template <> |
| void ComputeAveragePool3D<float, StorageOrder::NCHW>( |
| const int H, |
| const int /* W */, |
| const int p, |
| const int a, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const float scale, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| (*Y_arr)(y) = 0; |
| for (int i = p; i < a; ++i) { |
| (*Y_arr)(y) += X_arr.block(l, i * H + t, r - l, b - t).sum(); |
| } |
| (*Y_arr)(y) *= scale; |
| } |
| |
| template <> |
| void ComputeAveragePool3D<float, StorageOrder::NHWC>( |
| const int H, |
| const int W, |
| const int p, |
| const int a, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const float scale, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| Y_arr->col(y).setZero(); |
| for (int i = p; i < a; ++i) { |
| for (int j = t; j < b; ++j) { |
| for (int k = l; k < r; ++k) { |
| Y_arr->col(y) += X_arr.col(i * H * W + j * W + k); |
| } |
| } |
| } |
| Y_arr->col(y) *= scale; |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void RunAveragePool1D( |
| const int N, |
| const int C, |
| const int X_size, |
| const int Y_size, |
| const int kernel, |
| const int stride, |
| const int pad, |
| const bool count_include_pad, |
| const T* X, |
| T* Y) { |
| const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N; |
| const int X_stride = kOrder == StorageOrder::NCHW ? X_size : X_size * C; |
| const int Y_stride = kOrder == StorageOrder::NCHW ? Y_size : Y_size * C; |
| const T* X_ptr = X; |
| T* Y_ptr = Y; |
| for (int i = 0; i < batch_size; ++i) { |
| ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW |
| ? ConstEigenArrayMap<T>(X_ptr, X_size, 1) |
| : ConstEigenArrayMap<T>(X_ptr, C, X_size); |
| EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW |
| ? EigenArrayMap<T>(Y_ptr, Y_size, 1) |
| : EigenArrayMap<T>(Y_ptr, C, Y_size); |
| for (int y = 0; y < Y_size; ++y) { |
| const int l = std::max(y * stride - pad, 0); |
| const int r = std::min(y * stride - pad + kernel, X_size); |
| const T scale = T(1) / static_cast<T>(count_include_pad ? kernel : r - l); |
| ComputeAveragePool1D<T, kOrder>(l, r, y, scale, X_arr, &Y_arr); |
| } |
| X_ptr += X_stride; |
| Y_ptr += Y_stride; |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void RunAveragePool2D( |
| const int N, |
| const int C, |
| const int X_H, |
| const int X_W, |
| const int Y_H, |
| const int Y_W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const int pad_t, |
| const int pad_l, |
| const bool count_include_pad, |
| const T* X, |
| T* Y) { |
| const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N; |
| const int X_HxW = X_H * X_W; |
| const int Y_HxW = Y_H * Y_W; |
| const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C; |
| const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C; |
| const T* X_ptr = X; |
| T* Y_ptr = Y; |
| for (int i = 0; i < batch_size; ++i) { |
| ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW |
| ? ConstEigenArrayMap<T>(X_ptr, X_W, X_H) |
| : ConstEigenArrayMap<T>(X_ptr, C, X_HxW); |
| EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW |
| ? EigenArrayMap<T>(Y_ptr, Y_W, Y_H) |
| : EigenArrayMap<T>(Y_ptr, C, Y_HxW); |
| for (int h = 0; h < Y_H; ++h) { |
| const int t = std::max(h * stride_h - pad_t, 0); |
| const int b = std::min(h * stride_h - pad_t + kernel_h, X_H); |
| for (int w = 0; w < Y_W; ++w) { |
| const int l = std::max(w * stride_w - pad_l, 0); |
| const int r = std::min(w * stride_w - pad_l + kernel_w, X_W); |
| const int y = h * Y_W + w; |
| const T scale = T(1) / |
| static_cast<T>(count_include_pad ? kernel_h * kernel_w |
| : (b - t) * (r - l)); |
| ComputeAveragePool2D<T, kOrder>( |
| X_W, t, b, l, r, y, scale, X_arr, &Y_arr); |
| } |
| } |
| X_ptr += X_stride; |
| Y_ptr += Y_stride; |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void RunAveragePool3D( |
| const int N, |
| const int C, |
| const int X_D, |
| const int X_H, |
| const int X_W, |
| const int Y_D, |
| const int Y_H, |
| const int Y_W, |
| const int kernel_d, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_d, |
| const int stride_h, |
| const int stride_w, |
| const int pad_p, |
| const int pad_t, |
| const int pad_l, |
| const bool count_include_pad, |
| const T* X, |
| T* Y) { |
| const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N; |
| const int X_HxW = X_D * X_H * X_W; |
| const int Y_HxW = Y_D * Y_H * Y_W; |
| const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C; |
| const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C; |
| const T* X_ptr = X; |
| T* Y_ptr = Y; |
| for (int i = 0; i < batch_size; ++i) { |
| ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW |
| ? ConstEigenArrayMap<T>(X_ptr, X_W, X_D * X_H) |
| : ConstEigenArrayMap<T>(X_ptr, C, X_HxW); |
| EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW |
| ? EigenArrayMap<T>(Y_ptr, Y_W, Y_D * Y_H) |
| : EigenArrayMap<T>(Y_ptr, C, Y_HxW); |
| for (int d = 0; d < Y_D; ++d) { |
| const int p = std::max(d * stride_d - pad_p, 0); |
| const int a = std::min(d * stride_d - pad_p + kernel_d, X_D); |
| for (int h = 0; h < Y_H; ++h) { |
| const int t = std::max(h * stride_h - pad_t, 0); |
| const int b = std::min(h * stride_h - pad_t + kernel_h, X_H); |
| for (int w = 0; w < Y_W; ++w) { |
| const int l = std::max(w * stride_w - pad_l, 0); |
| const int r = std::min(w * stride_w - pad_l + kernel_w, X_W); |
| const int y = d * Y_H * Y_W + h * Y_W + w; |
| const T scale = T(1) / |
| static_cast<T>(count_include_pad ? kernel_d * kernel_h * kernel_w |
| : (a - p) * (b - t) * (r - l)); |
| ComputeAveragePool3D<T, kOrder>( |
| X_H, X_W, p, a, t, b, l, r, y, scale, X_arr, &Y_arr); |
| } |
| } |
| } |
| X_ptr += X_stride; |
| Y_ptr += Y_stride; |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void ComputeMaxPool1D( |
| int l, |
| int r, |
| int y, |
| const ConstEigenArrayMap<T>& X_arr, |
| EigenArrayMap<T>* Y_arr); |
| |
| template <> |
| void ComputeMaxPool1D<float, StorageOrder::NCHW>( |
| const int l, |
| const int r, |
| const int y, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| (*Y_arr)(y) = X_arr.col(0).segment(l, r - l).maxCoeff(); |
| } |
| |
| template <> |
| void ComputeMaxPool1D<float, StorageOrder::NHWC>( |
| const int l, |
| const int r, |
| const int y, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| Y_arr->col(y) = X_arr.col(l); |
| for (int i = l + 1; i < r; ++i) { |
| Y_arr->col(y) = Y_arr->col(y).max(X_arr.col(i)); |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void ComputeMaxPool2D( |
| int W, |
| int t, |
| int b, |
| int l, |
| int r, |
| int y, |
| const ConstEigenArrayMap<T>& X_arr, |
| EigenArrayMap<T>* Y_arr); |
| |
| template <> |
| void ComputeMaxPool2D<float, StorageOrder::NCHW>( |
| const int /* W */, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| (*Y_arr)(y) = X_arr.block(l, t, r - l, b - t).maxCoeff(); |
| } |
| |
| template <> |
| void ComputeMaxPool2D<float, StorageOrder::NHWC>( |
| const int W, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| Y_arr->col(y).setConstant(std::numeric_limits<float>::lowest()); |
| for (int i = t; i < b; ++i) { |
| for (int j = l; j < r; ++j) { |
| Y_arr->col(y) = Y_arr->col(y).max(X_arr.col(i * W + j)); |
| } |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void ComputeMaxPool3D( |
| int H, |
| int W, |
| int p, |
| int a, |
| int t, |
| int b, |
| int l, |
| int r, |
| int y, |
| const ConstEigenArrayMap<T>& X_arr, |
| EigenArrayMap<T>* Y_arr); |
| |
| template <> |
| void ComputeMaxPool3D<float, StorageOrder::NCHW>( |
| const int H, |
| const int /* W */, |
| const int p, |
| const int a, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| (*Y_arr)(y) = std::numeric_limits<float>::lowest(); |
| for (int i = p; i < a; ++i) { |
| (*Y_arr)(y) = std::max( |
| (*Y_arr)(y), X_arr.block(l, i * H + t, r - l, b - t).maxCoeff()); |
| } |
| } |
| |
| template <> |
| void ComputeMaxPool3D<float, StorageOrder::NHWC>( |
| const int H, |
| const int W, |
| const int p, |
| const int a, |
| const int t, |
| const int b, |
| const int l, |
| const int r, |
| const int y, |
| const ConstEigenArrayMap<float>& X_arr, |
| EigenArrayMap<float>* Y_arr) { |
| Y_arr->col(y).setConstant(std::numeric_limits<float>::lowest()); |
| for (int i = p; i < a; ++i) { |
| for (int j = t; j < b; ++j) { |
| for (int k = l; k < r; ++k) { |
| Y_arr->col(y) = Y_arr->col(y).max(X_arr.col(i * H * W + j * W + k)); |
| } |
| } |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void RunMaxPool1D( |
| const int N, |
| const int C, |
| const int X_size, |
| const int Y_size, |
| const int kernel, |
| const int stride, |
| const int pad, |
| const T* X, |
| T* Y) { |
| const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N; |
| const int X_stride = kOrder == StorageOrder::NCHW ? X_size : X_size * C; |
| const int Y_stride = kOrder == StorageOrder::NCHW ? Y_size : Y_size * C; |
| const T* X_ptr = X; |
| T* Y_ptr = Y; |
| for (int i = 0; i < batch_size; ++i) { |
| ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW |
| ? ConstEigenArrayMap<T>(X_ptr, X_size, 1) |
| : ConstEigenArrayMap<T>(X_ptr, C, X_size); |
| EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW |
| ? EigenArrayMap<T>(Y_ptr, Y_size, 1) |
| : EigenArrayMap<T>(Y_ptr, C, Y_size); |
| for (int y = 0; y < Y_size; ++y) { |
| const int l = std::max(y * stride - pad, 0); |
| const int r = std::min(y * stride - pad + kernel, X_size); |
| ComputeMaxPool1D<T, kOrder>(l, r, y, X_arr, &Y_arr); |
| } |
| X_ptr += X_stride; |
| Y_ptr += Y_stride; |
| } |
| } |
| |
| template <typename T, StorageOrder kOrder> |
| void RunMaxPool2D( |
| const int N, |
| const int C, |
| const int X_H, |
| const int X_W, |
| const int Y_H, |
| const int Y_W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const int pad_t, |
| const int pad_l, |
| const T* X, |
| T* Y) { |
| const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N; |
| const int X_HxW = X_H * X_W; |
| const int Y_HxW = Y_H * Y_W; |
| const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C; |
| const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C; |
| const T* X_ptr = X; |
| T* Y_ptr = Y; |
| for (int i = 0; i < batch_size; ++i) { |
| ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW |
| ? ConstEigenArrayMap<T>(X_ptr, X_W, X_H) |
| : ConstEigenArrayMap<T>(X_ptr, C, X_HxW); |
| EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW |
| ? EigenArrayMap<T>(Y_ptr, Y_W, Y_H) |
| : EigenArrayMap<T>(Y_ptr, C, Y_HxW); |
| for (int h = 0; h < Y_H; ++h) { |
| const int t = std::max(h * stride_h - pad_t, 0); |
| const int b = std::min(h * stride_h - pad_t + kernel_h, X_H); |
| for (int w = 0; w < Y_W; ++w) { |
| const int l = std::max(w * stride_w - pad_l, 0); |
| const int r = std::min(w * stride_w - pad_l + kernel_w, X_W); |
| const int y = h * Y_W + w; |
| ComputeMaxPool2D<T, kOrder>(X_W, t, b, l, r, y, X_arr, &Y_arr); |
| } |
| } |
| X_ptr += X_stride; |
| Y_ptr += Y_stride; |
| } |
| } |
| template <typename T, StorageOrder kOrder> |
| void RunMaxPool3D( |
| const int N, |
| const int C, |
| const int X_D, |
| const int X_H, |
| const int X_W, |
| const int Y_D, |
| const int Y_H, |
| const int Y_W, |
| const int kernel_d, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_d, |
| const int stride_h, |
| const int stride_w, |
| const int pad_p, |
| const int pad_t, |
| const int pad_l, |
| const T* X, |
| T* Y) { |
| const int batch_size = kOrder == StorageOrder::NCHW ? N * C : N; |
| const int X_HxW = X_D * X_H * X_W; |
| const int Y_HxW = Y_D * Y_H * Y_W; |
| const int X_stride = kOrder == StorageOrder::NCHW ? X_HxW : X_HxW * C; |
| const int Y_stride = kOrder == StorageOrder::NCHW ? Y_HxW : Y_HxW * C; |
| const T* X_ptr = X; |
| T* Y_ptr = Y; |
| for (int i = 0; i < batch_size; ++i) { |
| ConstEigenArrayMap<T> X_arr = kOrder == StorageOrder::NCHW |
| ? ConstEigenArrayMap<T>(X_ptr, X_W, X_D * X_H) |
| : ConstEigenArrayMap<T>(X_ptr, C, X_HxW); |
| EigenArrayMap<T> Y_arr = kOrder == StorageOrder::NCHW |
| ? EigenArrayMap<T>(Y_ptr, Y_W, Y_D * Y_H) |
| : EigenArrayMap<T>(Y_ptr, C, Y_HxW); |
| for (int d = 0; d < Y_D; ++d) { |
| const int p = std::max(d * stride_d - pad_p, 0); |
| const int a = std::min(d * stride_d - pad_p + kernel_d, X_D); |
| for (int h = 0; h < Y_H; ++h) { |
| const int t = std::max(h * stride_h - pad_t, 0); |
| const int b = std::min(h * stride_h - pad_t + kernel_h, X_H); |
| for (int w = 0; w < Y_W; ++w) { |
| const int l = std::max(w * stride_w - pad_l, 0); |
| const int r = std::min(w * stride_w - pad_l + kernel_w, X_W); |
| const int y = d * Y_H * Y_W + h * Y_W + w; |
| ComputeMaxPool3D<T, kOrder>( |
| X_H, X_W, p, a, t, b, l, r, y, X_arr, &Y_arr); |
| } |
| } |
| } |
| X_ptr += X_stride; |
| Y_ptr += Y_stride; |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| template <> |
| bool AveragePoolFunctor<CPUContext>:: |
| GlobalPoolingForward<float, StorageOrder::NCHW>( |
| const int N, |
| const int C, |
| const int HxW, |
| const float* X, |
| float* Y, |
| CPUContext* context) const { |
| const std::array<int, 2> X_dims = {N * C, HxW}; |
| const std::array<int, 2> Y_dims = {N * C, 1}; |
| math::ReduceMean<float, CPUContext>( |
| 2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context); |
| return true; |
| } |
| |
| template <> |
| template <> |
| bool AveragePoolFunctor<CPUContext>:: |
| GlobalPoolingForward<float, StorageOrder::NHWC>( |
| const int N, |
| const int C, |
| const int HxW, |
| const float* X, |
| float* Y, |
| CPUContext* context) const { |
| math::Set<float, CPUContext>(N * C, 0.0f, Y, context); |
| const float* X_ptr = X; |
| float* Y_ptr = Y; |
| for (int i = 0; i < N; ++i) { |
| for (int j = 0; j < HxW; ++j) { |
| math::Add<float, CPUContext>(C, Y_ptr, X_ptr + j * C, Y_ptr, context); |
| } |
| X_ptr += HxW * C; |
| Y_ptr += C; |
| } |
| math::Scale<float, float, CPUContext>( |
| N * C, 1.0f / static_cast<float>(HxW), Y, Y, context); |
| return true; |
| } |
| |
| #define CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD(T, kOrder) \ |
| template <> \ |
| template <> \ |
| bool AveragePoolFunctor<CPUContext>::Forward<T, kOrder>( \ |
| const int N, \ |
| const int C, \ |
| const std::vector<int>& X_dims, \ |
| const std::vector<int>& Y_dims, \ |
| const std::vector<int>& kernel, \ |
| const std::vector<int>& dilation, \ |
| const std::vector<int>& stride, \ |
| const std::vector<int>& pads, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* /* context */) const { \ |
| const int ndim = X_dims.size(); \ |
| switch (ndim) { \ |
| case 1: { \ |
| RunAveragePool1D<T, kOrder>( \ |
| N, \ |
| C, \ |
| X_dims[0], \ |
| Y_dims[0], \ |
| kernel[0], \ |
| stride[0], \ |
| pads[0], \ |
| count_include_pad, \ |
| X, \ |
| Y); \ |
| return true; \ |
| } \ |
| case 2: { \ |
| if (std::is_same<T, float>::value && kOrder == StorageOrder::NCHW && \ |
| pool_op_util::IsNeon4x4p0s0Eligible( \ |
| X_dims[0], \ |
| X_dims[1], \ |
| Y_dims[0], \ |
| Y_dims[1], \ |
| kernel[0], \ |
| kernel[1], \ |
| stride[0], \ |
| stride[1], \ |
| pads[0], \ |
| pads[1], \ |
| pads[2], \ |
| pads[3], \ |
| dilation[0], \ |
| dilation[1], \ |
| X, \ |
| Y)) { \ |
| pool_op_util::RunNeonAveragePool4x4p0s0NCHW( \ |
| N, C, X_dims[0], X_dims[1], X, Y); \ |
| } else { \ |
| RunAveragePool2D<T, kOrder>( \ |
| N, \ |
| C, \ |
| X_dims[0], \ |
| X_dims[1], \ |
| Y_dims[0], \ |
| Y_dims[1], \ |
| kernel[0], \ |
| kernel[1], \ |
| stride[0], \ |
| stride[1], \ |
| pads[0], \ |
| pads[1], \ |
| count_include_pad, \ |
| X, \ |
| Y); \ |
| } \ |
| return true; \ |
| } \ |
| case 3: { \ |
| RunAveragePool3D<T, kOrder>( \ |
| N, \ |
| C, \ |
| X_dims[0], \ |
| X_dims[1], \ |
| X_dims[2], \ |
| Y_dims[0], \ |
| Y_dims[1], \ |
| Y_dims[2], \ |
| kernel[0], \ |
| kernel[1], \ |
| kernel[2], \ |
| stride[0], \ |
| stride[1], \ |
| stride[2], \ |
| pads[0], \ |
| pads[1], \ |
| pads[2], \ |
| count_include_pad, \ |
| X, \ |
| Y); \ |
| return true; \ |
| } \ |
| default: { \ |
| CAFFE_THROW("Unsupported pooling dim: ", ndim); \ |
| return false; \ |
| } \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD(float, StorageOrder::NCHW) |
| CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD(float, StorageOrder::NHWC) |
| #undef CAFFE2_SPECIALIZED_AVERAGE_POOL_FUNCTOR_FORWARD |
| |
| template <> |
| template <> |
| bool MaxPoolFunctor<CPUContext>:: |
| GlobalPoolingForward<float, StorageOrder::NCHW>( |
| const int N, |
| const int C, |
| const int HxW, |
| const float* X, |
| float* Y, |
| CPUContext* context) const { |
| const std::array<int, 2> X_dims = {N * C, HxW}; |
| const std::array<int, 2> Y_dims = {N * C, 1}; |
| math::ReduceMax<float, CPUContext>( |
| 2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context); |
| return true; |
| } |
| |
| template <> |
| template <> |
| bool MaxPoolFunctor<CPUContext>:: |
| GlobalPoolingForward<float, StorageOrder::NHWC>( |
| const int N, |
| const int C, |
| const int HxW, |
| const float* X, |
| float* Y, |
| CPUContext* context) const { |
| math::Set<float, CPUContext>( |
| N * C, std::numeric_limits<float>::lowest(), Y, context); |
| const float* X_ptr = X; |
| float* Y_ptr = Y; |
| for (int i = 0; i < N; ++i) { |
| ConstEigenArrayMap<float> X_arr(X_ptr, C, HxW); |
| EigenVectorArrayMap<float> Y_arr(Y_ptr, C); |
| for (int j = 0; j < HxW; ++j) { |
| Y_arr = Y_arr.max(X_arr.col(j)); |
| } |
| X_ptr += HxW * C; |
| Y_ptr += C; |
| } |
| return true; |
| } |
| |
| #define CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD(T, kOrder) \ |
| template <> \ |
| template <> \ |
| bool MaxPoolFunctor<CPUContext>::Forward<T, kOrder>( \ |
| const int N, \ |
| const int C, \ |
| const std::vector<int>& X_dims, \ |
| const std::vector<int>& Y_dims, \ |
| const std::vector<int>& kernel, \ |
| const std::vector<int>& dilation, \ |
| const std::vector<int>& stride, \ |
| const std::vector<int>& pads, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* /* context */) const { \ |
| const int ndim = X_dims.size(); \ |
| switch (ndim) { \ |
| case 1: { \ |
| RunMaxPool1D<T, kOrder>( \ |
| N, C, X_dims[0], Y_dims[0], kernel[0], stride[0], pads[0], X, Y); \ |
| return true; \ |
| } \ |
| case 2: { \ |
| if (std::is_same<T, float>::value && kOrder == StorageOrder::NCHW && \ |
| pool_op_util::IsNeon2x2p0s0Eligible( \ |
| X_dims[0], \ |
| X_dims[1], \ |
| Y_dims[0], \ |
| Y_dims[1], \ |
| kernel[0], \ |
| kernel[1], \ |
| stride[0], \ |
| stride[1], \ |
| pads[0], \ |
| pads[1], \ |
| pads[2], \ |
| pads[3], \ |
| dilation[0], \ |
| dilation[1], \ |
| X, \ |
| Y)) { \ |
| pool_op_util::RunNeonMaxPool2x2p0s0NCHW( \ |
| N, C, X_dims[0], X_dims[1], X, Y); \ |
| } else { \ |
| RunMaxPool2D<T, kOrder>( \ |
| N, \ |
| C, \ |
| X_dims[0], \ |
| X_dims[1], \ |
| Y_dims[0], \ |
| Y_dims[1], \ |
| kernel[0], \ |
| kernel[1], \ |
| stride[0], \ |
| stride[1], \ |
| pads[0], \ |
| pads[1], \ |
| X, \ |
| Y); \ |
| } \ |
| return true; \ |
| } \ |
| case 3: { \ |
| RunMaxPool3D<T, kOrder>( \ |
| N, \ |
| C, \ |
| X_dims[0], \ |
| X_dims[1], \ |
| X_dims[2], \ |
| Y_dims[0], \ |
| Y_dims[1], \ |
| Y_dims[2], \ |
| kernel[0], \ |
| kernel[1], \ |
| kernel[2], \ |
| stride[0], \ |
| stride[1], \ |
| stride[2], \ |
| pads[0], \ |
| pads[1], \ |
| pads[2], \ |
| X, \ |
| Y); \ |
| return true; \ |
| } \ |
| default: { \ |
| CAFFE_THROW("Unsupported pooling dim: ", ndim); \ |
| return false; \ |
| } \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD(float, StorageOrder::NCHW) |
| CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD(float, StorageOrder::NHWC) |
| #undef CAFFE2_SPECIALIZED_MAX_POOL_FUNCTOR_FORWARD |
| |
// Documentation body shared by the AveragePool, AveragePool1D, AveragePool2D
// and AveragePool3D schemas. AveragePoolDocGenerator prefixes it with the
// operator name, so the text deliberately starts with a lowercase verb.
// The Python example uses `#` for its trailing comment so it can be pasted
// and run as-is.
constexpr char kAveragePoolDoc[] = R"DOC(
consumes an input blob and applies average pooling across the blob according
to kernel sizes, stride sizes, pad lengths and dilation. Average pooling consists
of taking the average value of a subset of the input tensor according to the kernel
size and downsampling the data into the output blob for further processing. The
`brew` module has a wrapper for this operator for use in a `ModelHelper` object.

Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:

$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h


<details>

<summary> <b>Example</b> </summary>

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
"AveragePool",
["X"],
["Y"],
kernel=2,
stride=2,
)

workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```

**Result**

```
X:
[[[[-0.2883434 0.43498734 0.05417408 1.912558 0.09390241
-0.33173105]
[ 1.633709 1.2047161 0.36964908 0.99961185 0.4184147
0.9989975 ]
[ 1.7644193 0.1789665 1.5812988 -0.6038542 -0.36090398
0.33195344]
[ 0.9457722 -0.95174325 -0.78124577 1.2062047 1.1903144
0.2586746 ]
[ 1.252104 0.32645547 1.8073524 -0.78397465 0.9978303
-0.97614396]
[ 0.5440196 1.5778259 -0.76750124 0.5051756 0.8838398
-0.37085298]]]]

Y:
[[[[0.7462672 0.83399826 0.2948959 ]
[0.4843537 0.3506009 0.35500962]
[0.9251013 0.19026303 0.13366827]]]]
```

</details>

)DOC";
| |
// Documentation body shared by the MaxPool, MaxPool1D, MaxPool2D and
// MaxPool3D schemas. MaxPoolDocGenerator prefixes it with the operator name,
// so the text deliberately starts with a lowercase verb.
// The Python example uses `#` for its trailing comment so it can be pasted
// and run as-is.
constexpr char kMaxPoolDoc[] = R"DOC(
consumes an input blob and applies max pooling across the blob according to
kernel sizes, stride sizes, pad lengths and dilation. Max pooling consists of
taking the maximum value of a subset of the input tensor according to the kernel
size and downsampling the data into the output blob for further processing. The
`brew` module has a wrapper for this operator for use in a `ModelHelper` object.

Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:

$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h

<details>

<summary> <b>Example</b> </summary>

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
"MaxPool",
["X"],
["Y"],
kernel=2,
stride=2,
)

workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```

**Result**

```
X:
[[[[-2.8534958e-01 -1.7719941e+00 -8.2277227e-04 1.1088650e+00
-2.1476576e+00 -3.5070452e-01]
[-9.0058845e-01 -3.0070004e-01 -1.7907504e+00 -7.1746534e-01
1.2798511e+00 -3.2214901e-01]
[ 1.5806322e+00 1.6845188e+00 -2.6633200e-01 -3.8576153e-01
-9.6424848e-02 -3.9696163e-01]
[ 1.2572408e-01 6.3612902e-01 -3.9554062e-01 -6.9735396e-01
-9.1898698e-01 -1.9609968e-01]
[-1.1587460e+00 2.4605224e+00 -1.5497679e+00 1.3020347e-01
-8.1293899e-01 -7.8803545e-01]
[ 1.4323474e+00 1.3618395e+00 9.8975077e-02 -1.1307785e-01
7.2035044e-01 2.7642491e-01]]]]

Y:
[[[[-0.28534958 1.108865 1.2798511 ]
[ 1.6845188 -0.266332 -0.09642485]
[ 2.4605224 0.13020347 0.72035044]]]]

```

</details>

)DOC";
| |
| std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) { |
| return [=](OpSchema& schema) { |
| std::string doc = "AveragePool{dim} {pool_doc}"; |
| c10::ReplaceAll(doc, "{dim}", dim); |
| c10::ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc); |
| schema.SetDoc(doc); |
| schema.Input( |
| 0, |
| "X", |
| "*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC."); |
| schema.Output(0, "Y", "*(type: Tensor`<float>`)* Output data tensor."); |
| // schema.Arg( |
| // "kernel", "*(type: int)* Size of the window to take an average |
| // over."); |
| // schema.Arg("stride", "*(type: int)* Stride of the window."); |
| // schema.Arg( |
| // "pad", |
| // "*(type: int)* Implicit zero padding to be added on both sides."); |
| // schema.Arg( |
| // "dilation", |
| // "*(type: int)* Parameter that controls the stride of elements in the |
| // " "window."); |
| // schema.Arg( |
| // "order", |
| // "*(type: string; default: 'NCHW')* Order of the blob dimensions."); |
| // schema.Arg( |
| // "count_include_pad", |
| // "*(type: bool; default: False)* When True, will include the " |
| // "zero-padding in the averaging."); |
| }; |
| } |
| |
| std::function<void(OpSchema&)> MaxPoolDocGenerator(const char* dim) { |
| return [=](OpSchema& schema) { |
| std::string doc = "MaxPool{dim} {pool_doc}"; |
| c10::ReplaceAll(doc, "{dim}", dim); |
| c10::ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc); |
| schema.SetDoc(doc); |
| schema.Input( |
| 0, |
| "X", |
| "*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC."); |
| schema.Output(0, "Y", "*(type: Tensor`<float>`)* Output data tensor."); |
| /* |
| schema.Arg("kernel", "*(type: int)* Size of the window to take an average |
| over."); schema.Arg("stride", "*(type: int)* Stride of the window."); |
| schema.Arg("pad", "*(type: int)* Implicit zero padding to be added on both |
| sides."); schema.Arg("dilation", "*(type: int)* Parameter that controls |
| the stride of elements in the window."); schema.Arg("order", "*(type: |
| string; default: 'NCHW')* Order of the blob dimensions."); |
| */ |
| }; |
| } |
| REGISTER_CPU_OPERATOR( |
| AveragePool, |
| PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(AveragePool) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(AveragePoolDocGenerator("")) |
| .InheritOnnxSchema(); |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePool1D, |
| PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(AveragePool1D) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(AveragePoolDocGenerator("1D")) |
| .InheritOnnxSchema("AveragePool"); |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePool2D, |
| PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(AveragePool2D) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(AveragePoolDocGenerator("2D")) |
| .InheritOnnxSchema("AveragePool"); |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePool3D, |
| PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(AveragePool3D) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(AveragePoolDocGenerator("3D")) |
| .InheritOnnxSchema("AveragePool"); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool, |
| PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(MaxPool) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(MaxPoolDocGenerator("")) |
| .InheritOnnxSchema(); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool1D, |
| PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(MaxPool1D) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(MaxPoolDocGenerator("1D")) |
| .InheritOnnxSchema("MaxPool"); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool2D, |
| PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(MaxPool2D) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(MaxPoolDocGenerator("2D")) |
| .InheritOnnxSchema("MaxPool"); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool3D, |
| PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>); |
| |
| OPERATOR_SCHEMA(MaxPool3D) |
| .NumInputs(1) |
| .NumOutputs(1) |
| .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool) |
| .FillUsing(MaxPoolDocGenerator("3D")) |
| .InheritOnnxSchema("MaxPool"); |
| |
| } // namespace caffe2 |