#include "caffe2/operators/generate_proposals_op.h"
#include "caffe2/operators/generate_proposals_op_util_boxes.h"
#include "generate_proposals_op_util_nms.h"
#ifdef CAFFE2_USE_MKL
#include "caffe2/mkl/operators/operator_fallback_mkl.h"
#endif // CAFFE2_USE_MKL
namespace caffe2 {
namespace {
// Compute the 1-d index of an n-dimensional contiguous row-major tensor for
// a given n-dimensional index 'index'
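// For example, a tensor with dims {2, 3, 4} and index {1, 2, 0} yields
// 1 * 12 + 2 * 4 + 0 * 1 = 20.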
size_t ComputeStartIndex(
const TensorCPU& tensor,
const std::vector<int>& index) {
DCHECK_EQ(index.size(), tensor.ndim());
size_t ret = 0;
for (int i = 0; i < index.size(); i++) {
ret += index[i] * tensor.size_from_dim(i + 1);
}
return ret;
}
// Get a sub-tensor view into 'tensor' along its first dimension, reusing the
// data pointer of 'tensor' (no copy is made)
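// For example, for a tensor of dims {N, C, H, W} and dim0_start_index = i,
// this returns a {C, H, W} view over the i-th slice of the data.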
template <class T>
utils::ConstTensorView<T> GetSubTensorView(
const TensorCPU& tensor,
int dim0_start_index) {
DCHECK_EQ(tensor.meta().itemsize(), sizeof(T));
if (tensor.size() == 0) {
return utils::ConstTensorView<T>(nullptr, {});
}
std::vector<int> start_dims(tensor.ndim(), 0);
start_dims.at(0) = dim0_start_index;
auto st_idx = ComputeStartIndex(tensor, start_dims);
auto ptr = tensor.data<T>() + st_idx;
auto& input_dims = tensor.dims();
std::vector<int> ret_dims(input_dims.begin() + 1, input_dims.end());
utils::ConstTensorView<T> ret(ptr, ret_dims);
return ret;
}
} // namespace
namespace utils {
ERMatXf ComputeAllAnchors(
const TensorCPU& anchors,
int height,
int width,
float feat_stride) {
const auto K = height * width;
const auto A = anchors.dim(0);
const auto box_dim = anchors.dim(1);
CAFFE_ENFORCE(box_dim == 4 || box_dim == 5);
ERMatXf shift_x = (ERVecXf::LinSpaced(width, 0.0, width - 1.0) * feat_stride)
.replicate(height, 1);
ERMatXf shift_y = (EVecXf::LinSpaced(height, 0.0, height - 1.0) * feat_stride)
.replicate(1, width);
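// For illustration: with feat_stride = 16 and a 2 x 2 (H x W) grid, the
// flattened shift_x values are [0, 16, 0, 16] and the shift_y values are
// [0, 0, 16, 16], i.e. positions are enumerated in (h, w) row-major order.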
Eigen::MatrixXf shifts(K, box_dim);
if (box_dim == 4) {
// Upright boxes in [x1, y1, x2, y2] format
shifts << ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
ConstEigenVectorMap<float>(shift_y.data(), shift_y.size()),
ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
ConstEigenVectorMap<float>(shift_y.data(), shift_y.size());
} else {
// Rotated boxes in [ctr_x, ctr_y, w, h, angle] format.
// Zero shift for width, height and angle.
ERMatXf shift_zero = ERMatXf::Constant(height, width, 0.0);
shifts << ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
ConstEigenVectorMap<float>(shift_y.data(), shift_y.size()),
ConstEigenVectorMap<float>(shift_zero.data(), shift_zero.size()),
ConstEigenVectorMap<float>(shift_zero.data(), shift_zero.size()),
ConstEigenVectorMap<float>(shift_zero.data(), shift_zero.size());
}
// Broadcast anchors over shifts to enumerate all anchors at all positions
// in the (H, W) grid:
// - add A anchors of shape (1, A, box_dim) to
// - K shifts of shape (K, 1, box_dim) to get
// - all shifted anchors of shape (K, A, box_dim)
// - reshape to (K*A, box_dim) shifted anchors
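// For example, with A = 3 anchors on a 2 x 2 grid (K = 4) there are 12
// shifted anchors; after the final reshape to (K * A, box_dim), row
// k * A + a holds anchor a shifted to spatial position k, where k = h * W + w.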
ConstEigenMatrixMap<float> anchors_vec(
anchors.template data<float>(), 1, A * box_dim);
// equivalent to python code
// all_anchors = (
// self._model.anchors.reshape((1, A, box_dim)) +
// shifts.reshape((1, K, box_dim)).transpose((1, 0, 2)))
// all_anchors = all_anchors.reshape((K * A, box_dim))
// all_anchors_vec: (K, A * box_dim)
ERMatXf all_anchors_vec =
anchors_vec.replicate(K, 1) + shifts.rowwise().replicate(A);
// use the following to reshape to (K * A, box_dim)
// Eigen::Map<const ERMatXf> all_anchors(
// all_anchors_vec.data(), K * A, box_dim);
return all_anchors_vec;
}
} // namespace utils
template <>
void GenerateProposalsOp<CPUContext>::ProposalsForOneImage(
const Eigen::Array3f& im_info,
const Eigen::Map<const ERMatXf>& all_anchors,
const utils::ConstTensorView<float>& bbox_deltas_tensor,
const utils::ConstTensorView<float>& scores_tensor,
ERArrXXf* out_boxes,
EArrXf* out_probs) const {
const auto& post_nms_topN = rpn_post_nms_topN_;
const auto& nms_thresh = rpn_nms_thresh_;
const auto& min_size = rpn_min_size_;
const int box_dim = static_cast<int>(all_anchors.cols());
CAFFE_ENFORCE(box_dim == 4 || box_dim == 5);
// Transpose and reshape predicted bbox transformations to get them
// into the same order as the anchors:
// - bbox deltas will be (box_dim * A, H, W) format from conv output
// - transpose to (H, W, box_dim * A)
// - reshape to (H * W * A, box_dim) where rows are ordered by (H, W, A)
// in slowest to fastest order to match the enumerated anchors
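// For illustration (assuming the conv output lays out its channels as A
// anchor blocks of box_dim deltas each, as the reshape below implies): the
// delta for anchor a, coordinate c at position (h, w) sits in channel
// a * box_dim + c and ends up in row (h * W + w) * A + a, column c of
// 'bbox_deltas'.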
CAFFE_ENFORCE_EQ(bbox_deltas_tensor.ndim(), 3);
CAFFE_ENFORCE_EQ(bbox_deltas_tensor.dim(0) % box_dim, 0);
auto A = bbox_deltas_tensor.dim(0) / box_dim;
auto H = bbox_deltas_tensor.dim(1);
auto W = bbox_deltas_tensor.dim(2);
// equivalent to python code
// bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, box_dim))
ERArrXXf bbox_deltas(H * W * A, box_dim);
Eigen::Map<ERMatXf>(bbox_deltas.data(), H * W, box_dim * A) =
Eigen::Map<const ERMatXf>(bbox_deltas_tensor.data(), A * box_dim, H * W)
.transpose();
CAFFE_ENFORCE_EQ(bbox_deltas.rows(), all_anchors.rows());
// - scores are (A, H, W) format from conv output
// - transpose to (H, W, A)
// - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
// to match the order of anchors and bbox_deltas
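// For illustration: the score of anchor a at position (h, w) moves from
// channel a of the conv output to element (h * W + w) * A + a of 'scores'.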
CAFFE_ENFORCE_EQ(scores_tensor.ndim(), 3);
CAFFE_ENFORCE_EQ(scores_tensor.dims(), (vector<int>{A, H, W}));
// equivalent to python code
// scores = scores.transpose((1, 2, 0)).reshape((-1, 1))
EArrXf scores(scores_tensor.size());
Eigen::Map<ERMatXf>(scores.data(), H * W, A) =
Eigen::Map<const ERMatXf>(scores_tensor.data(), A, H * W).transpose();
std::vector<int> order(scores.size());
std::iota(order.begin(), order.end(), 0);
if (rpn_pre_nms_topN_ <= 0 || rpn_pre_nms_topN_ >= scores.size()) {
// 4. sort all (proposal, score) pairs by score from highest to lowest
// 5. take top pre_nms_topN (e.g. 6000)
std::sort(order.begin(), order.end(), [&scores](int lhs, int rhs) {
return scores[lhs] > scores[rhs];
});
} else {
// Avoid sorting possibly large arrays; first partition to get the top K
// unsorted, then sort just those (~20x faster for 200k scores)
std::partial_sort(
order.begin(),
order.begin() + rpn_pre_nms_topN_,
order.end(),
[&scores](int lhs, int rhs) { return scores[lhs] > scores[rhs]; });
order.resize(rpn_pre_nms_topN_);
}
ERArrXXf bbox_deltas_sorted;
ERArrXXf all_anchors_sorted;
EArrXf scores_sorted;
utils::GetSubArrayRows(
bbox_deltas, utils::AsEArrXt(order), &bbox_deltas_sorted);
utils::GetSubArrayRows(
all_anchors.array(), utils::AsEArrXt(order), &all_anchors_sorted);
utils::GetSubArray(scores, utils::AsEArrXt(order), &scores_sorted);
// Transform anchors into proposals via bbox transformations
static const std::vector<float> bbox_weights{1.0, 1.0, 1.0, 1.0};
auto proposals = utils::bbox_transform(
all_anchors_sorted,
bbox_deltas_sorted,
bbox_weights,
utils::BBOX_XFORM_CLIP_DEFAULT,
correct_transform_coords_,
angle_bound_on_,
angle_bound_lo_,
angle_bound_hi_);
// 2. clip proposals to image (may result in proposals with zero area
// that will be removed in the next step)
proposals =
utils::clip_boxes(proposals, im_info[0], im_info[1], clip_angle_thresh_);
// 3. remove predicted boxes with either height or width < min_size
auto keep = utils::filter_boxes(proposals, min_size, im_info);
DCHECK_LE(keep.size(), scores_sorted.size());
// 6. apply loose nms (e.g. threshold = 0.7)
// 7. take after_nms_topN (e.g. 300)
// 8. return the top proposals (-> RoIs top)
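// For example, with nms_thresh = 0.7, a proposal is suppressed if its IoU
// with a higher-scoring proposal that has already been kept exceeds 0.7.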
if (post_nms_topN > 0 && post_nms_topN < keep.size()) {
keep = utils::nms_cpu(
proposals, scores_sorted, keep, nms_thresh, post_nms_topN);
} else {
keep = utils::nms_cpu(proposals, scores_sorted, keep, nms_thresh);
}
// Generate outputs
utils::GetSubArrayRows(proposals, utils::AsEArrXt(keep), out_boxes);
utils::GetSubArray(scores_sorted, utils::AsEArrXt(keep), out_probs);
}
template <>
bool GenerateProposalsOp<CPUContext>::RunOnDevice() {
const auto& scores = Input(0);
const auto& bbox_deltas = Input(1);
const auto& im_info_tensor = Input(2);
const auto& anchors = Input(3);
auto* out_rois = Output(0);
auto* out_rois_probs = Output(1);
CAFFE_ENFORCE_EQ(scores.ndim(), 4, scores.ndim());
CAFFE_ENFORCE(scores.template IsType<float>(), scores.meta().name());
const auto num_images = scores.dim(0);
const auto A = scores.dim(1);
const auto height = scores.dim(2);
const auto width = scores.dim(3);
const auto K = height * width;
const auto box_dim = anchors.dim(1);
CAFFE_ENFORCE(box_dim == 4 || box_dim == 5);
// bbox_deltas: (num_images, A * box_dim, H, W)
CAFFE_ENFORCE_EQ(
bbox_deltas.dims(),
(vector<TIndex>{num_images, box_dim * A, height, width}));
// im_info_tensor: (num_images, 3), format [height, width, scale; ...]
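// Here 'scale' is the factor by which the original image was resized to
// produce the network input (as used by Detectron).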
CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector<TIndex>{num_images, 3}));
CAFFE_ENFORCE(
im_info_tensor.template IsType<float>(), im_info_tensor.meta().name());
// anchors: (A, box_dim)
CAFFE_ENFORCE_EQ(anchors.dims(), (vector<TIndex>{A, box_dim}));
CAFFE_ENFORCE(anchors.template IsType<float>(), anchors.meta().name());
// Broadcast the anchors to all pixels
auto all_anchors_vec =
utils::ComputeAllAnchors(anchors, height, width, feat_stride_);
Eigen::Map<const ERMatXf> all_anchors(all_anchors_vec.data(), K * A, box_dim);
Eigen::Map<const ERArrXXf> im_info(
im_info_tensor.data<float>(),
im_info_tensor.dim(0),
im_info_tensor.dim(1));
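// Each output ROI row is [batch_index, x1, y1, x2, y2] for upright boxes, or
// [batch_index, ctr_x, ctr_y, w, h, angle] for rotated boxes, hence the extra
// column.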
const int roi_col_count = box_dim + 1;
out_rois->Resize(0, roi_col_count);
out_rois_probs->Resize(0);
std::vector<ERArrXXf> im_boxes(num_images);
std::vector<EArrXf> im_probs(num_images);
for (int i = 0; i < num_images; i++) {
auto cur_im_info = im_info.row(i);
auto cur_bbox_deltas = GetSubTensorView<float>(bbox_deltas, i);
auto cur_scores = GetSubTensorView<float>(scores, i);
ERArrXXf& im_i_boxes = im_boxes[i];
EArrXf& im_i_probs = im_probs[i];
ProposalsForOneImage(
cur_im_info,
all_anchors,
cur_bbox_deltas,
cur_scores,
&im_i_boxes,
&im_i_probs);
}
int roi_counts = 0;
for (int i = 0; i < num_images; i++) {
roi_counts += im_boxes[i].rows();
}
out_rois->Extend(roi_counts, 50, &context_);
out_rois_probs->Extend(roi_counts, 50, &context_);
float* out_rois_ptr = out_rois->mutable_data<float>();
float* out_rois_probs_ptr = out_rois_probs->mutable_data<float>();
for (int i = 0; i < num_images; i++) {
const ERArrXXf& im_i_boxes = im_boxes[i];
const EArrXf& im_i_probs = im_probs[i];
int csz = im_i_boxes.rows();
// write rois
Eigen::Map<ERArrXXf> cur_rois(out_rois_ptr, csz, roi_col_count);
cur_rois.col(0).setConstant(i);
cur_rois.block(0, 1, csz, box_dim) = im_i_boxes;
// write rois_probs
Eigen::Map<EArrXf>(out_rois_probs_ptr, csz) = im_i_probs;
out_rois_ptr += csz * roi_col_count;
out_rois_probs_ptr += csz;
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(GenerateProposals, GenerateProposalsOp<CPUContext>);
// For backward compatibility
REGISTER_CPU_OPERATOR(GenerateProposalsCPP, GenerateProposalsOp<CPUContext>);
#ifdef CAFFE2_HAS_MKL_DNN
REGISTER_MKL_OPERATOR(
GenerateProposals,
mkl::MKLFallbackOp<GenerateProposalsOp<CPUContext>>);
// For backward compatibility
REGISTER_MKL_OPERATOR(
GenerateProposalsCPP,
mkl::MKLFallbackOp<GenerateProposalsOp<CPUContext>>);
#endif // CAFFE2_HAS_MKL_DNN
OPERATOR_SCHEMA(GenerateProposals)
.NumInputs(4)
.NumOutputs(2)
.SetDoc(R"DOC(
Generate bounding box proposals for Faster RCNN. The proposals are generated
for a list of images based on image scores 'scores', bounding box regression
results 'bbox_deltas' as well as predefined bounding box shapes 'anchors'. Greedy
non-maximum suppression is applied to generate the final bounding boxes.
)DOC")
.Arg("spatial_scale", "(float) spatial scale")
.Arg("pre_nms_topN", "(int) RPN_PRE_NMS_TOP_N")
.Arg("post_nms_topN", "(int) RPN_POST_NMS_TOP_N")
.Arg("nms_thresh", "(float) RPN_NMS_THRESH")
.Arg("min_size", "(float) RPN_MIN_SIZE")
.Arg(
"correct_transform_coords",
"bool (default false), Correct bounding box transform coordates,"
" see bbox_transform() in boxes.py "
"Set to true to match the detectron code, set to false for backward"
" compatibility")
.Arg(
"angle_bound_on",
"bool (default true). If set, for rotated boxes, angle is "
"normalized to be within [angle_bound_lo, angle_bound_hi].")
.Arg(
"angle_bound_lo",
"int (default -90 degrees). If set, for rotated boxes, angle is "
"normalized to be within [angle_bound_lo, angle_bound_hi].")
.Arg(
"angle_bound_hi",
"int (default 90 degrees). If set, for rotated boxes, angle is "
"normalized to be within [angle_bound_lo, angle_bound_hi].")
.Arg(
"clip_angle_thresh",
"float (default 1.0 degrees). For RRPN, clip almost horizontal boxes "
"within this threshold of tolerance for backward compatibility. "
"Set to negative value for no clipping.")
.Input(0, "scores", "Scores from conv layer, size (img_count, A, H, W)")
.Input(
1,
"bbox_deltas",
"Bounding box deltas from conv layer, "
"size (img_count, 4 * A, H, W)")
.Input(
2,
"im_info",
"Image info, size (img_count, 3), "
"format (height, width, scale)")
.Input(3, "anchors", "Bounding box anchors, size (A, 4)")
.Output(
0,
"rois",
"Proposals, size (n x 5), "
"format (image_index, x1, y1, x2, y2)")
.Output(1, "rois_probs", "scores of proposals, size (n)");
// For backward compatibility
OPERATOR_SCHEMA(GenerateProposalsCPP).NumInputs(4).NumOutputs(2);
SHOULD_NOT_DO_GRADIENT(GenerateProposals);
// For backward compatibility
SHOULD_NOT_DO_GRADIENT(GenerateProposalsCPP);
} // namespace
} // namespace caffe2