// caffe2/operators/roi_align_op.cc - platform/external/pytorch - Git at Google

 #include "caffe2/operators/roi_align_op.h"

 #include <vector>

 #include "caffe2/utils/eigen_utils.h"
 #include "caffe2/utils/math.h"

 namespace caffe2 {

 namespace {

 // Precomputed bilinear-interpolation recipe for one sampling point: flat
 // offsets (y * W + x) of the four neighbouring pixels inside a single H x W
 // plane, and the weight applied to each of them. An all-zero entry marks a
 // sample that lies outside the feature map and contributes nothing.
 template <typename T>
 struct BilinearInterpolationParam {
   int64_t p1; // offset of (yl, xl)
   int64_t p2; // offset of (yl, xh)
   int64_t p3; // offset of (yh, xl)
   int64_t p4; // offset of (yh, xh)
   T w1; // weight of p1
   T w2; // weight of p2
   T w3; // weight of p3
   T w4; // weight of p4
 };

 // Builds the table of sampling points for one RoI.
 //
 // H, W           - extent of one feature plane; used for the bounds checks
 //                  and for the flat offsets.
 // pooled_h/_w    - number of output bins along each axis.
 // bin_size_h/_w  - extent of one output bin along each axis.
 // bin_grid_h/_w  - number of samples taken per bin along each axis.
 // roi_start_h/_w - coordinate of the RoI's starting corner.
 //
 // Returns pooled_h * pooled_w * bin_grid_h * bin_grid_w entries laid out in
 // [ph][pw][iy][ix] order. Samples more than one pixel before the plane, or
 // past its extent, keep their all-zero entry; all other samples are clamped
 // into [0, H - 1] x [0, W - 1] before the neighbours are chosen.
 template <typename T>
 std::vector<BilinearInterpolationParam<T>> MakeBilinearInterpolationParams(
     int64_t H,
     int64_t W,
     int64_t pooled_h,
     int64_t pooled_w,
     T bin_size_h,
     T bin_size_w,
     int64_t bin_grid_h,
     int64_t bin_grid_w,
     T roi_start_h,
     T roi_start_w) {
   // The sized constructor value-initialises every element, i.e. all offsets
   // and weights start at zero. Out-of-range samples are therefore handled by
   // simply skipping over their slots.
   std::vector<BilinearInterpolationParam<T>> params(
       pooled_h * pooled_w * bin_grid_h * bin_grid_w);
   // Spacing between neighbouring samples inside one bin.
   const T ch = bin_size_h / static_cast<T>(bin_grid_h);
   const T cw = bin_size_w / static_cast<T>(bin_grid_w);
   int64_t cnt = 0;
   for (int64_t ph = 0; ph < pooled_h; ++ph) {
     for (int64_t pw = 0; pw < pooled_w; ++pw) {
       for (int64_t iy = 0; iy < bin_grid_h; ++iy) {
         // Samples sit at the centre of each grid cell, hence the + 0.5.
         const T yy = roi_start_h + static_cast<T>(ph) * bin_size_h +
             (static_cast<T>(iy) + T(0.5)) * ch;
         if (yy < T(-1) || yy > static_cast<T>(H)) {
           // The whole row of samples is outside; leave its entries zeroed.
           cnt += bin_grid_w;
           continue;
         }
         for (int64_t ix = 0; ix < bin_grid_w; ++ix) {
           const T xx = roi_start_w + static_cast<T>(pw) * bin_size_w +
               (static_cast<T>(ix) + T(0.5)) * cw;
           BilinearInterpolationParam<T>& param = params[cnt++];
           if (xx < T(-1) || xx > static_cast<T>(W)) {
             // Outside horizontally; the entry stays zeroed.
             continue;
           }
           const T y = std::min(std::max(yy, T(0)), static_cast<T>(H - 1));
           const T x = std::min(std::max(xx, T(0)), static_cast<T>(W - 1));
           const int64_t yl = static_cast<int64_t>(std::floor(y));
           const int64_t xl = static_cast<int64_t>(std::floor(x));
           // On the last row/column the "high" neighbour collapses onto the
           // "low" one so that no offset leaves the plane.
           const int64_t yh = std::min(yl + 1, H - 1);
           const int64_t xh = std::min(xl + 1, W - 1);
           const T py = y - static_cast<T>(yl);
           const T px = x - static_cast<T>(xl);
           const T qy = T(1) - py;
           const T qx = T(1) - px;
           param.p1 = yl * W + xl;
           param.p2 = yl * W + xh;
           param.p3 = yh * W + xl;
           param.p4 = yh * W + xh;
           param.w1 = qy * qx;
           param.w2 = qy * px;
           param.w3 = py * qx;
           param.w4 = py * px;
         }
       }
     }
   }
   return params;
 }

 } // namespace

 // Forward RoIAlign on CPU for features stored in NCHW order.
 //
 // N        - number of RoI rows in R; one pooled map is written per row.
 // C, H, W  - channels, height and width of each image in X.
 // roi_cols - 4 (x1, y1, x2, y2) or 5 (batch index, x1, y1, x2, y2).
 // X        - input features; image b starts at X + b * C * H * W.
 // R        - RoI rows, N * roi_cols floats.
 // Y        - output, N * C * pooled_h_ * pooled_w_ floats.
 //
 // Returns true. In aligned mode a RoI with negative width or height fails
 // CAFFE_ENFORCE; otherwise width and height are clamped to at least 1.
 template <>
 C10_EXPORT bool RoIAlignOp<float, CPUContext>::RunOnDeviceWithOrderNCHW(
     int64_t N,
     int64_t C,
     int64_t H,
     int64_t W,
     int64_t roi_cols,
     const float* X,
     const float* R,
     float* Y) {
   DCHECK(roi_cols == 4 || roi_cols == 5);
   // Aligned mode shifts the scaled box by half a pixel.
   const float roi_offset = aligned_ ? 0.5f : 0.0f;

 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
   for (int64_t n = 0; n < N; ++n) {
     // With 4 columns every RoI refers to image 0; with 5 the first column is
     // the image index and the box starts one float later.
     const int64_t roi_batch_idx = roi_cols == 4 ? 0 : R[n * roi_cols];
     const float* X_ptr = X + roi_batch_idx * C * H * W;
     const float* R_ptr = R + n * roi_cols + (roi_cols == 5);
     float* Y_ptr = Y + n * C * pooled_h_ * pooled_w_;

     // Do not use rounding; this implementation detail is critical
     const float roi_w1 = R_ptr[0] * spatial_scale_ - roi_offset;
     const float roi_h1 = R_ptr[1] * spatial_scale_ - roi_offset;
     const float roi_w2 = R_ptr[2] * spatial_scale_ - roi_offset;
     const float roi_h2 = R_ptr[3] * spatial_scale_ - roi_offset;
     float roi_w = roi_w2 - roi_w1;
     float roi_h = roi_h2 - roi_h1;
     if (aligned_) {
       // NOTE(review): this check runs inside the OpenMP loop. If
       // CAFFE_ENFORCE throws (presumably it does), the exception would
       // escape the parallel region, which OpenMP does not permit. Consider
       // validating all RoIs before entering the loop.
       CAFFE_ENFORCE(
           roi_w >= 0.0f && roi_h >= 0.0f,
           "ROIs in ROIAlign do not have non-negative size!");
     } else { // backward compatibility
       // Force malformed ROIs to be 1x1
       roi_w = std::max(roi_w, 1.0f);
       roi_h = std::max(roi_h, 1.0f);
     }
     const float bin_size_h = roi_h / static_cast<float>(pooled_h_);
     const float bin_size_w = roi_w / static_cast<float>(pooled_w_);

     // We use roi_bin_grid to sample the grid and mimic integral. A positive
     // sampling_ratio_ fixes the grid; otherwise it adapts to the bin size.
     const int64_t bin_grid_h = (sampling_ratio_ > 0)
         ? sampling_ratio_
         : static_cast<int64_t>(std::ceil(bin_size_h));
     const int64_t bin_grid_w = (sampling_ratio_ > 0)
         ? sampling_ratio_
         : static_cast<int64_t>(std::ceil(bin_size_w));

     // The sampling table depends only on the RoI, so it is built once and
     // reused for every channel.
     const std::vector<BilinearInterpolationParam<float>> params =
         MakeBilinearInterpolationParams(
             H,
             W,
             pooled_h_,
             pooled_w_,
             bin_size_h,
             bin_size_w,
             bin_grid_h,
             bin_grid_w,
             roi_h1,
             roi_w1);

     // An aligned RoI with zero width or height yields an empty adaptive
     // grid. Clamp the divisor to 1 so such bins come out as 0 instead of
     // 0 * (1 / 0) == NaN.
     const int64_t sample_count =
         std::max<int64_t>(bin_grid_h * bin_grid_w, 1);
     const float scale = 1.0f / static_cast<float>(sample_count);
     for (int64_t c = 0; c < C; ++c) {
       // params is consumed in the same [ph][pw][iy][ix] order it was built.
       int64_t cnt = 0;
       for (int64_t ph = 0; ph < pooled_h_; ++ph) {
         for (int64_t pw = 0; pw < pooled_w_; ++pw) {
           float sum = 0.0f;
           for (int64_t iy = 0; iy < bin_grid_h; ++iy) {
             for (int64_t ix = 0; ix < bin_grid_w; ++ix) {
               const BilinearInterpolationParam<float>& param = params[cnt++];
               sum += param.w1 * X_ptr[param.p1] + param.w2 * X_ptr[param.p2] +
                   param.w3 * X_ptr[param.p3] + param.w4 * X_ptr[param.p4];
             }
           }
           // Average over the samples of this bin.
           Y_ptr[ph * pooled_w_ + pw] = sum * scale;
         }
       }
       // Advance to the next channel plane of both input and output.
       X_ptr += H * W;
       Y_ptr += pooled_h_ * pooled_w_;
     }
   }

   return true;
 }

 // Forward RoIAlign on CPU for features stored in NHWC order.
 //
 // N        - number of RoI rows in R; one pooled map is written per row.
 // C, H, W  - channels, height and width of each image in X.
 // roi_cols - 4 (x1, y1, x2, y2) or 5 (batch index, x1, y1, x2, y2).
 // X        - input features; image b starts at X + b * C * H * W and the C
 //            channel values of pixel p start at offset p * C within it.
 // R        - RoI rows, N * roi_cols floats.
 // Y        - output, N * pooled_h_ * pooled_w_ * C floats.
 //
 // Returns true. In aligned mode a RoI with negative width or height fails
 // CAFFE_ENFORCE; otherwise width and height are clamped to at least 1.
 template <>
 C10_EXPORT bool RoIAlignOp<float, CPUContext>::RunOnDeviceWithOrderNHWC(
     int64_t N,
     int64_t C,
     int64_t H,
     int64_t W,
     int64_t roi_cols,
     const float* X,
     const float* R,
     float* Y) {
   DCHECK(roi_cols == 4 || roi_cols == 5);
   // Aligned mode shifts the scaled box by half a pixel.
   const float roi_offset = aligned_ ? 0.5f : 0.0f;

 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
   for (int64_t n = 0; n < N; ++n) {
     // With 4 columns every RoI refers to image 0; with 5 the first column is
     // the image index and the box starts one float later.
     const int64_t roi_batch_idx = roi_cols == 4 ? 0 : R[n * roi_cols];
     const float* X_ptr = X + roi_batch_idx * C * H * W;
     const float* R_ptr = R + n * roi_cols + (roi_cols == 5);
     float* Y_ptr = Y + n * C * pooled_h_ * pooled_w_;

     // Do not use rounding; this implementation detail is critical
     const float roi_w1 = R_ptr[0] * spatial_scale_ - roi_offset;
     const float roi_h1 = R_ptr[1] * spatial_scale_ - roi_offset;
     const float roi_w2 = R_ptr[2] * spatial_scale_ - roi_offset;
     const float roi_h2 = R_ptr[3] * spatial_scale_ - roi_offset;
     float roi_w = roi_w2 - roi_w1;
     float roi_h = roi_h2 - roi_h1;
     if (aligned_) {
       // NOTE(review): this check runs inside the OpenMP loop. If
       // CAFFE_ENFORCE throws (presumably it does), the exception would
       // escape the parallel region, which OpenMP does not permit. Consider
       // validating all RoIs before entering the loop.
       CAFFE_ENFORCE(
           roi_w >= 0.0f && roi_h >= 0.0f,
           "ROIs in ROIAlign do not have non-negative size!");
     } else { // backward compatibility
       // Force malformed ROIs to be 1x1
       roi_w = std::max(roi_w, 1.0f);
       roi_h = std::max(roi_h, 1.0f);
     }
     const float bin_size_h = roi_h / static_cast<float>(pooled_h_);
     const float bin_size_w = roi_w / static_cast<float>(pooled_w_);

     // We use roi_bin_grid to sample the grid and mimic integral. A positive
     // sampling_ratio_ fixes the grid; otherwise it adapts to the bin size.
     const int64_t bin_grid_h = (sampling_ratio_ > 0)
         ? sampling_ratio_
         : static_cast<int64_t>(std::ceil(bin_size_h));
     const int64_t bin_grid_w = (sampling_ratio_ > 0)
         ? sampling_ratio_
         : static_cast<int64_t>(std::ceil(bin_size_w));

     const std::vector<BilinearInterpolationParam<float>> params =
         MakeBilinearInterpolationParams(
             H,
             W,
             pooled_h_,
             pooled_w_,
             bin_size_h,
             bin_size_w,
             bin_grid_h,
             bin_grid_w,
             roi_h1,
             roi_w1);

     // An aligned RoI with zero width or height yields an empty adaptive
     // grid. Clamp the divisor to 1 so such bins come out as 0 instead of
     // 0 * (1 / 0) == NaN.
     const int64_t sample_count =
         std::max<int64_t>(bin_grid_h * bin_grid_w, 1);
     const float scale = 1.0f / static_cast<float>(sample_count);
     // params is consumed in the same [ph][pw][iy][ix] order it was built.
     int64_t cnt = 0;
     for (int64_t ph = 0; ph < pooled_h_; ++ph) {
       for (int64_t pw = 0; pw < pooled_w_; ++pw) {
         // All C channels of one output bin are contiguous, so each bin is
         // accumulated as a length-C vector.
         EigenVectorArrayMap<float> Y_arr(Y_ptr + (ph * pooled_w_ + pw) * C, C);
         Y_arr.setZero();
         for (int64_t iy = 0; iy < bin_grid_h; ++iy) {
           for (int64_t ix = 0; ix < bin_grid_w; ++ix) {
             const BilinearInterpolationParam<float>& param = params[cnt++];
             // Pixel offsets are per-plane, so scale by C to reach the
             // pixel's channel vector.
             ConstEigenVectorArrayMap<float> x1_arr(X_ptr + param.p1 * C, C);
             ConstEigenVectorArrayMap<float> x2_arr(X_ptr + param.p2 * C, C);
             ConstEigenVectorArrayMap<float> x3_arr(X_ptr + param.p3 * C, C);
             ConstEigenVectorArrayMap<float> x4_arr(X_ptr + param.p4 * C, C);
             Y_arr += param.w1 * x1_arr + param.w2 * x2_arr + param.w3 * x3_arr +
                 param.w4 * x4_arr;
           }
         }
         // Average over the samples of this bin.
         Y_arr *= scale;
       }
     }
   }

   return true;
 }

 // Registers RoIAlignOp<float, CPUContext> as the CPU operator "RoIAlign".
 REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);

 // Schema for RoIAlign: two inputs (X, RoIs) and one output (Y). The strings
 // passed to SetDoc/Arg/Input/Output are the operator's documentation text.
 // NOTE(review): the kernels in this file also read aligned_, and the c10
 // export at the end of the file lists `order` and `aligned`; neither argument
 // is documented here. The X/Y shapes below are written for NCHW although an
 // NHWC kernel exists as well - confirm and extend the docs if appropriate.
 OPERATOR_SCHEMA(RoIAlign)
     .NumInputs(2)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Region of Interest (RoI) align operation as used in Mask R-CNN.
 )DOC")
     .Arg(
         "spatial_scale",
         "(float) default 1.0; Spatial scale of the input feature map X "
         "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
         "w.r.t. the input image.")
     .Arg("pooled_h", "(int) default 1; Pooled output Y's height.")
     .Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
     .Arg(
         "sampling_ratio",
         "(int) default -1; number of sampling points in the interpolation grid "
         "used to compute the output value of each pooled output bin. If > 0, "
         "then exactly sampling_ratio x sampling_ratio grid points are used. If "
         "<= 0, then an adaptive number of grid points are used (computed as "
         "ceil(roi_width / pooled_w), and likewise for height).")
     .Input(0, "X", "4D feature map input of shape (N, C, H, W).")
     .Input(
         1,
         "RoIs",
         "2D input of shape (R, 4 or 5) specifying R RoIs "
         "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
         "coordinates are in the coordinate system of the input image. For "
         "inputs corresponding to a single image, batch index can be excluded "
         "to have just 4 columns.")
     .Output(
         0,
         "Y",
         "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
         "is a pooled feature map cooresponding to the r-th RoI.");

 // Shorthand for RoIAlignOp bound to CPUContext; the c10 export at the end of
 // this file refers to it as caffe2::RoIAlignCPUOp<float>.
 template <typename T>
 using RoIAlignCPUOp = caffe2::RoIAlignOp<T, CPUContext>;

 } // namespace caffe2

 // Exports the float CPU RoIAlign operator to c10 (per the macro name) under
 // the function schema spelled out in the string below.
 // NOTE(review): the argument list in the schema string presumably has to stay
 // in sync with what the operator reads - confirm before editing it.
 C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
     RoIAlign,
     "_caffe2::RoIAlign("
     "    Tensor features,"
     "    Tensor rois,"
     "    str order,"
     "    float spatial_scale,"
     "    int pooled_h,"
     "    int pooled_w,"
     "    int sampling_ratio,"
     "    bool aligned"
     ") -> Tensor",
     caffe2::RoIAlignCPUOp<float>);
	#include "caffe2/operators/roi_align_op.h"

	#include <vector>

	#include "caffe2/utils/eigen_utils.h"
	#include "caffe2/utils/math.h"

	namespace caffe2 {

	namespace {

	// Precomputed bilinear-interpolation recipe for one sampling point: flat
	// offsets (y * W + x) of the four neighbouring pixels inside a single H x W
	// plane, and the weight applied to each of them. An all-zero entry marks a
	// sample that lies outside the feature map and contributes nothing.
	template <typename T>
	struct BilinearInterpolationParam {
	  int64_t p1; // offset of (yl, xl)
	  int64_t p2; // offset of (yl, xh)
	  int64_t p3; // offset of (yh, xl)
	  int64_t p4; // offset of (yh, xh)
	  T w1; // weight of p1
	  T w2; // weight of p2
	  T w3; // weight of p3
	  T w4; // weight of p4
	};

	// Builds the table of sampling points for one RoI.
	//
	// H, W           - extent of one feature plane; used for the bounds checks
	//                  and for the flat offsets.
	// pooled_h/_w    - number of output bins along each axis.
	// bin_size_h/_w  - extent of one output bin along each axis.
	// bin_grid_h/_w  - number of samples taken per bin along each axis.
	// roi_start_h/_w - coordinate of the RoI's starting corner.
	//
	// Returns pooled_h * pooled_w * bin_grid_h * bin_grid_w entries laid out in
	// [ph][pw][iy][ix] order. Samples more than one pixel before the plane, or
	// past its extent, keep their all-zero entry; all other samples are clamped
	// into [0, H - 1] x [0, W - 1] before the neighbours are chosen.
	template <typename T>
	std::vector<BilinearInterpolationParam<T>> MakeBilinearInterpolationParams(
	    int64_t H,
	    int64_t W,
	    int64_t pooled_h,
	    int64_t pooled_w,
	    T bin_size_h,
	    T bin_size_w,
	    int64_t bin_grid_h,
	    int64_t bin_grid_w,
	    T roi_start_h,
	    T roi_start_w) {
	  // The sized constructor value-initialises every element, i.e. all offsets
	  // and weights start at zero. Out-of-range samples are therefore handled by
	  // simply skipping over their slots.
	  std::vector<BilinearInterpolationParam<T>> params(
	      pooled_h * pooled_w * bin_grid_h * bin_grid_w);
	  // Spacing between neighbouring samples inside one bin.
	  const T ch = bin_size_h / static_cast<T>(bin_grid_h);
	  const T cw = bin_size_w / static_cast<T>(bin_grid_w);
	  int64_t cnt = 0;
	  for (int64_t ph = 0; ph < pooled_h; ++ph) {
	    for (int64_t pw = 0; pw < pooled_w; ++pw) {
	      for (int64_t iy = 0; iy < bin_grid_h; ++iy) {
	        // Samples sit at the centre of each grid cell, hence the + 0.5.
	        const T yy = roi_start_h + static_cast<T>(ph) * bin_size_h +
	            (static_cast<T>(iy) + T(0.5)) * ch;
	        if (yy < T(-1) || yy > static_cast<T>(H)) {
	          // The whole row of samples is outside; leave its entries zeroed.
	          cnt += bin_grid_w;
	          continue;
	        }
	        for (int64_t ix = 0; ix < bin_grid_w; ++ix) {
	          const T xx = roi_start_w + static_cast<T>(pw) * bin_size_w +
	              (static_cast<T>(ix) + T(0.5)) * cw;
	          BilinearInterpolationParam<T>& param = params[cnt++];
	          if (xx < T(-1) || xx > static_cast<T>(W)) {
	            // Outside horizontally; the entry stays zeroed.
	            continue;
	          }
	          const T y = std::min(std::max(yy, T(0)), static_cast<T>(H - 1));
	          const T x = std::min(std::max(xx, T(0)), static_cast<T>(W - 1));
	          const int64_t yl = static_cast<int64_t>(std::floor(y));
	          const int64_t xl = static_cast<int64_t>(std::floor(x));
	          // On the last row/column the "high" neighbour collapses onto the
	          // "low" one so that no offset leaves the plane.
	          const int64_t yh = std::min(yl + 1, H - 1);
	          const int64_t xh = std::min(xl + 1, W - 1);
	          const T py = y - static_cast<T>(yl);
	          const T px = x - static_cast<T>(xl);
	          const T qy = T(1) - py;
	          const T qx = T(1) - px;
	          param.p1 = yl * W + xl;
	          param.p2 = yl * W + xh;
	          param.p3 = yh * W + xl;
	          param.p4 = yh * W + xh;
	          param.w1 = qy * qx;
	          param.w2 = qy * px;
	          param.w3 = py * qx;
	          param.w4 = py * px;
	        }
	      }
	    }
	  }
	  return params;
	}

	} // namespace

	// Forward RoIAlign on CPU for features stored in NCHW order.
	//
	// N        - number of RoI rows in R; one pooled map is written per row.
	// C, H, W  - channels, height and width of each image in X.
	// roi_cols - 4 (x1, y1, x2, y2) or 5 (batch index, x1, y1, x2, y2).
	// X        - input features; image b starts at X + b * C * H * W.
	// R        - RoI rows, N * roi_cols floats.
	// Y        - output, N * C * pooled_h_ * pooled_w_ floats.
	//
	// Returns true. In aligned mode a RoI with negative width or height fails
	// CAFFE_ENFORCE; otherwise width and height are clamped to at least 1.
	template <>
	C10_EXPORT bool RoIAlignOp<float, CPUContext>::RunOnDeviceWithOrderNCHW(
	    int64_t N,
	    int64_t C,
	    int64_t H,
	    int64_t W,
	    int64_t roi_cols,
	    const float* X,
	    const float* R,
	    float* Y) {
	  DCHECK(roi_cols == 4 || roi_cols == 5);
	  // Aligned mode shifts the scaled box by half a pixel.
	  const float roi_offset = aligned_ ? 0.5f : 0.0f;

	#ifdef _OPENMP
	#pragma omp parallel for
	#endif
	  for (int64_t n = 0; n < N; ++n) {
	    // With 4 columns every RoI refers to image 0; with 5 the first column is
	    // the image index and the box starts one float later.
	    const int64_t roi_batch_idx = roi_cols == 4 ? 0 : R[n * roi_cols];
	    const float* X_ptr = X + roi_batch_idx * C * H * W;
	    const float* R_ptr = R + n * roi_cols + (roi_cols == 5);
	    float* Y_ptr = Y + n * C * pooled_h_ * pooled_w_;

	    // Do not use rounding; this implementation detail is critical
	    const float roi_w1 = R_ptr[0] * spatial_scale_ - roi_offset;
	    const float roi_h1 = R_ptr[1] * spatial_scale_ - roi_offset;
	    const float roi_w2 = R_ptr[2] * spatial_scale_ - roi_offset;
	    const float roi_h2 = R_ptr[3] * spatial_scale_ - roi_offset;
	    float roi_w = roi_w2 - roi_w1;
	    float roi_h = roi_h2 - roi_h1;
	    if (aligned_) {
	      // NOTE(review): this check runs inside the OpenMP loop. If
	      // CAFFE_ENFORCE throws (presumably it does), the exception would
	      // escape the parallel region, which OpenMP does not permit. Consider
	      // validating all RoIs before entering the loop.
	      CAFFE_ENFORCE(
	          roi_w >= 0.0f && roi_h >= 0.0f,
	          "ROIs in ROIAlign do not have non-negative size!");
	    } else { // backward compatibility
	      // Force malformed ROIs to be 1x1
	      roi_w = std::max(roi_w, 1.0f);
	      roi_h = std::max(roi_h, 1.0f);
	    }
	    const float bin_size_h = roi_h / static_cast<float>(pooled_h_);
	    const float bin_size_w = roi_w / static_cast<float>(pooled_w_);

	    // We use roi_bin_grid to sample the grid and mimic integral. A positive
	    // sampling_ratio_ fixes the grid; otherwise it adapts to the bin size.
	    const int64_t bin_grid_h = (sampling_ratio_ > 0)
	        ? sampling_ratio_
	        : static_cast<int64_t>(std::ceil(bin_size_h));
	    const int64_t bin_grid_w = (sampling_ratio_ > 0)
	        ? sampling_ratio_
	        : static_cast<int64_t>(std::ceil(bin_size_w));

	    // The sampling table depends only on the RoI, so it is built once and
	    // reused for every channel.
	    const std::vector<BilinearInterpolationParam<float>> params =
	        MakeBilinearInterpolationParams(
	            H,
	            W,
	            pooled_h_,
	            pooled_w_,
	            bin_size_h,
	            bin_size_w,
	            bin_grid_h,
	            bin_grid_w,
	            roi_h1,
	            roi_w1);

	    // An aligned RoI with zero width or height yields an empty adaptive
	    // grid. Clamp the divisor to 1 so such bins come out as 0 instead of
	    // 0 * (1 / 0) == NaN.
	    const int64_t sample_count =
	        std::max<int64_t>(bin_grid_h * bin_grid_w, 1);
	    const float scale = 1.0f / static_cast<float>(sample_count);
	    for (int64_t c = 0; c < C; ++c) {
	      // params is consumed in the same [ph][pw][iy][ix] order it was built.
	      int64_t cnt = 0;
	      for (int64_t ph = 0; ph < pooled_h_; ++ph) {
	        for (int64_t pw = 0; pw < pooled_w_; ++pw) {
	          float sum = 0.0f;
	          for (int64_t iy = 0; iy < bin_grid_h; ++iy) {
	            for (int64_t ix = 0; ix < bin_grid_w; ++ix) {
	              const BilinearInterpolationParam<float>& param = params[cnt++];
	              sum += param.w1 * X_ptr[param.p1] + param.w2 * X_ptr[param.p2] +
	                  param.w3 * X_ptr[param.p3] + param.w4 * X_ptr[param.p4];
	            }
	          }
	          // Average over the samples of this bin.
	          Y_ptr[ph * pooled_w_ + pw] = sum * scale;
	        }
	      }
	      // Advance to the next channel plane of both input and output.
	      X_ptr += H * W;
	      Y_ptr += pooled_h_ * pooled_w_;
	    }
	  }

	  return true;
	}

	// Forward RoIAlign on CPU for features stored in NHWC order.
	//
	// N        - number of RoI rows in R; one pooled map is written per row.
	// C, H, W  - channels, height and width of each image in X.
	// roi_cols - 4 (x1, y1, x2, y2) or 5 (batch index, x1, y1, x2, y2).
	// X        - input features; image b starts at X + b * C * H * W and the C
	//            channel values of pixel p start at offset p * C within it.
	// R        - RoI rows, N * roi_cols floats.
	// Y        - output, N * pooled_h_ * pooled_w_ * C floats.
	//
	// Returns true. In aligned mode a RoI with negative width or height fails
	// CAFFE_ENFORCE; otherwise width and height are clamped to at least 1.
	template <>
	C10_EXPORT bool RoIAlignOp<float, CPUContext>::RunOnDeviceWithOrderNHWC(
	    int64_t N,
	    int64_t C,
	    int64_t H,
	    int64_t W,
	    int64_t roi_cols,
	    const float* X,
	    const float* R,
	    float* Y) {
	  DCHECK(roi_cols == 4 || roi_cols == 5);
	  // Aligned mode shifts the scaled box by half a pixel.
	  const float roi_offset = aligned_ ? 0.5f : 0.0f;

	#ifdef _OPENMP
	#pragma omp parallel for
	#endif
	  for (int64_t n = 0; n < N; ++n) {
	    // With 4 columns every RoI refers to image 0; with 5 the first column is
	    // the image index and the box starts one float later.
	    const int64_t roi_batch_idx = roi_cols == 4 ? 0 : R[n * roi_cols];
	    const float* X_ptr = X + roi_batch_idx * C * H * W;
	    const float* R_ptr = R + n * roi_cols + (roi_cols == 5);
	    float* Y_ptr = Y + n * C * pooled_h_ * pooled_w_;

	    // Do not use rounding; this implementation detail is critical
	    const float roi_w1 = R_ptr[0] * spatial_scale_ - roi_offset;
	    const float roi_h1 = R_ptr[1] * spatial_scale_ - roi_offset;
	    const float roi_w2 = R_ptr[2] * spatial_scale_ - roi_offset;
	    const float roi_h2 = R_ptr[3] * spatial_scale_ - roi_offset;
	    float roi_w = roi_w2 - roi_w1;
	    float roi_h = roi_h2 - roi_h1;
	    if (aligned_) {
	      // NOTE(review): this check runs inside the OpenMP loop. If
	      // CAFFE_ENFORCE throws (presumably it does), the exception would
	      // escape the parallel region, which OpenMP does not permit. Consider
	      // validating all RoIs before entering the loop.
	      CAFFE_ENFORCE(
	          roi_w >= 0.0f && roi_h >= 0.0f,
	          "ROIs in ROIAlign do not have non-negative size!");
	    } else { // backward compatibility
	      // Force malformed ROIs to be 1x1
	      roi_w = std::max(roi_w, 1.0f);
	      roi_h = std::max(roi_h, 1.0f);
	    }
	    const float bin_size_h = roi_h / static_cast<float>(pooled_h_);
	    const float bin_size_w = roi_w / static_cast<float>(pooled_w_);

	    // We use roi_bin_grid to sample the grid and mimic integral. A positive
	    // sampling_ratio_ fixes the grid; otherwise it adapts to the bin size.
	    const int64_t bin_grid_h = (sampling_ratio_ > 0)
	        ? sampling_ratio_
	        : static_cast<int64_t>(std::ceil(bin_size_h));
	    const int64_t bin_grid_w = (sampling_ratio_ > 0)
	        ? sampling_ratio_
	        : static_cast<int64_t>(std::ceil(bin_size_w));

	    const std::vector<BilinearInterpolationParam<float>> params =
	        MakeBilinearInterpolationParams(
	            H,
	            W,
	            pooled_h_,
	            pooled_w_,
	            bin_size_h,
	            bin_size_w,
	            bin_grid_h,
	            bin_grid_w,
	            roi_h1,
	            roi_w1);

	    // An aligned RoI with zero width or height yields an empty adaptive
	    // grid. Clamp the divisor to 1 so such bins come out as 0 instead of
	    // 0 * (1 / 0) == NaN.
	    const int64_t sample_count =
	        std::max<int64_t>(bin_grid_h * bin_grid_w, 1);
	    const float scale = 1.0f / static_cast<float>(sample_count);
	    // params is consumed in the same [ph][pw][iy][ix] order it was built.
	    int64_t cnt = 0;
	    for (int64_t ph = 0; ph < pooled_h_; ++ph) {
	      for (int64_t pw = 0; pw < pooled_w_; ++pw) {
	        // All C channels of one output bin are contiguous, so each bin is
	        // accumulated as a length-C vector.
	        EigenVectorArrayMap<float> Y_arr(Y_ptr + (ph * pooled_w_ + pw) * C, C);
	        Y_arr.setZero();
	        for (int64_t iy = 0; iy < bin_grid_h; ++iy) {
	          for (int64_t ix = 0; ix < bin_grid_w; ++ix) {
	            const BilinearInterpolationParam<float>& param = params[cnt++];
	            // Pixel offsets are per-plane, so scale by C to reach the
	            // pixel's channel vector.
	            ConstEigenVectorArrayMap<float> x1_arr(X_ptr + param.p1 * C, C);
	            ConstEigenVectorArrayMap<float> x2_arr(X_ptr + param.p2 * C, C);
	            ConstEigenVectorArrayMap<float> x3_arr(X_ptr + param.p3 * C, C);
	            ConstEigenVectorArrayMap<float> x4_arr(X_ptr + param.p4 * C, C);
	            Y_arr += param.w1 * x1_arr + param.w2 * x2_arr + param.w3 * x3_arr +
	                param.w4 * x4_arr;
	          }
	        }
	        // Average over the samples of this bin.
	        Y_arr *= scale;
	      }
	    }
	  }

	  return true;
	}

	// Registers RoIAlignOp<float, CPUContext> as the CPU operator "RoIAlign".
	REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);

	// Schema for RoIAlign: two inputs (X, RoIs) and one output (Y). The strings
	// passed to SetDoc/Arg/Input/Output are the operator's documentation text.
	// NOTE(review): the kernels in this file also read aligned_, and the c10
	// export at the end of the file lists `order` and `aligned`; neither argument
	// is documented here. The X/Y shapes below are written for NCHW although an
	// NHWC kernel exists as well - confirm and extend the docs if appropriate.
	OPERATOR_SCHEMA(RoIAlign)
	.NumInputs(2)
	.NumOutputs(1)
	.SetDoc(R"DOC(
	Region of Interest (RoI) align operation as used in Mask R-CNN.
	)DOC")
	.Arg(
	"spatial_scale",
	"(float) default 1.0; Spatial scale of the input feature map X "
	"relative to the input image. E.g., 0.0625 if X has a stride of 16 "
	"w.r.t. the input image.")
	.Arg("pooled_h", "(int) default 1; Pooled output Y's height.")
	.Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
	.Arg(
	"sampling_ratio",
	"(int) default -1; number of sampling points in the interpolation grid "
	"used to compute the output value of each pooled output bin. If > 0, "
	"then exactly sampling_ratio x sampling_ratio grid points are used. If "
	"<= 0, then an adaptive number of grid points are used (computed as "
	"ceil(roi_width / pooled_w), and likewise for height).")
	.Input(0, "X", "4D feature map input of shape (N, C, H, W).")
	.Input(
	1,
	"RoIs",
	"2D input of shape (R, 4 or 5) specifying R RoIs "
	"representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
	"coordinates are in the coordinate system of the input image. For "
	"inputs corresponding to a single image, batch index can be excluded "
	"to have just 4 columns.")
	.Output(
	0,
	"Y",
	"4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
	"is a pooled feature map cooresponding to the r-th RoI.");

	// Shorthand for RoIAlignOp bound to CPUContext; the c10 export at the end of
	// this file refers to it as caffe2::RoIAlignCPUOp<float>.
	template <typename T>
	using RoIAlignCPUOp = caffe2::RoIAlignOp<T, CPUContext>;

	} // namespace caffe2

	// Exports the float CPU RoIAlign operator to c10 (per the macro name) under
	// the function schema spelled out in the string below.
	// NOTE(review): the argument list in the schema string presumably has to stay
	// in sync with what the operator reads - confirm before editing it.
	C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
	RoIAlign,
	"_caffe2::RoIAlign("
	" Tensor features,"
	" Tensor rois,"
	" str order,"
	" float spatial_scale,"
	" int pooled_h,"
	" int pooled_w,"
	" int sampling_ratio,"
	" bool aligned"
	") -> Tensor",
	caffe2::RoIAlignCPUOp<float>);