#include "caffe2/operators/generate_proposals_op.h"
#include "caffe2/operators/generate_proposals_op_util_boxes.h"
#include "generate_proposals_op_util_nms.h"
#ifdef CAFFE2_USE_MKL
#include "caffe2/mkl/operators/operator_fallback_mkl.h"
#endif // CAFFE2_USE_MKL
namespace caffe2 {
namespace {
// Compute the 1-d index of an n-dimensional contiguous row-major tensor for
// a given n-dimensional index 'index'
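// For example, a tensor with dims {2, 3, 4} and index {1, 2, 0} yields
// 1 * 12 + 2 * 4 + 0 * 1 = 20.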
size_t ComputeStartIndex(
const TensorCPU& tensor,
const std::vector<int>& index) {
DCHECK_EQ(index.size(), tensor.ndim());
size_t ret = 0;
for (int i = 0; i < index.size(); i++) {
ret += index[i] * tensor.size_from_dim(i + 1);
}
return ret;
}
// Get a sub-tensor view into 'tensor' along its first dimension, reusing the
// data pointer of 'tensor' (no copy is made)
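// For example, for a tensor of dims {N, C, H, W} and dim0_start_index = i,
// this returns a {C, H, W} view over the i-th slice of the data.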
template <class T>
utils::ConstTensorView<T> GetSubTensorView(
const TensorCPU& tensor,
int dim0_start_index) {
DCHECK_EQ(tensor.meta().itemsize(), sizeof(T));
if (tensor.size() == 0) {
return utils::ConstTensorView<T>(nullptr, {});
}
std::vector<int> start_dims(tensor.ndim(), 0);
start_dims.at(0) = dim0_start_index;
auto st_idx = ComputeStartIndex(tensor, start_dims);
auto ptr = tensor.data<T>() + st_idx;
auto& input_dims = tensor.dims();
std::vector<int> ret_dims(input_dims.begin() + 1, input_dims.end());
utils::ConstTensorView<T> ret(ptr, ret_dims);
return ret;
}
} // namespace
namespace utils {
ERMatXf ComputeAllAnchors(
const TensorCPU& anchors,
int height,
int width,
float feat_stride) {
const auto K = height * width;
const auto A = anchors.dim(0);
const auto box_dim = anchors.dim(1);
CAFFE_ENFORCE(box_dim == 4 || box_dim == 5);
ERMatXf shift_x = (ERVecXf::LinSpaced(width, 0.0, width - 1.0) * feat_stride)
.replicate(height, 1);
ERMatXf shift_y = (EVecXf::LinSpaced(height, 0.0, height - 1.0) * feat_stride)
.replicate(1, width);
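// For illustration: with feat_stride = 16 and a 2 x 2 (H x W) grid, the
// flattened shift_x values are [0, 16, 0, 16] and the shift_y values are
// [0, 0, 16, 16], i.e. positions are enumerated in (h, w) row-major order.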
Eigen::MatrixXf shifts(K, box_dim);
if (box_dim == 4) {
// Upright boxes in [x1, y1, x2, y2] format
shifts << ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
ConstEigenVectorMap<float>(shift_y.data(), shift_y.size()),
ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
ConstEigenVectorMap<float>(shift_y.data(), shift_y.size());
} else {
// Rotated boxes in [ctr_x, ctr_y, w, h, angle] format.
// Zero shift for width, height and angle.
ERMatXf shift_zero = ERMatXf::Constant(height, width, 0.0);
shifts << ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
ConstEigenVectorMap<float>(shift_y.data(), shift_y.size()),
ConstEigenVectorMap<float>(shift_zero.data(), shift_zero.size()),
ConstEigenVectorMap<float>(shift_zero.data(), shift_zero.size()),
ConstEigenVectorMap<float>(shift_zero.data(), shift_zero.size());
}
// Broadcast anchors over shifts to enumerate all anchors at all positions
// in the (H, W) grid:
// - add A anchors of shape (1, A, box_dim) to
// - K shifts of shape (K, 1, box_dim) to get
// - all shifted anchors of shape (K, A, box_dim)
// - reshape to (K*A, box_dim) shifted anchors
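// For example, with A = 3 anchors on a 2 x 2 grid (K = 4) there are 12
// shifted anchors; after the final reshape to (K * A, box_dim), row
// k * A + a holds anchor a shifted to spatial position k, where k = h * W + w.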
ConstEigenMatrixMap<float> anchors_vec(
anchors.template data<float>(), 1, A * box_dim);
// equivalent to python code
// all_anchors = (
// self._model.anchors.reshape((1, A, box_dim)) +
// shifts.reshape((1, K, box_dim)).transpose((1, 0, 2)))
// all_anchors = all_anchors.reshape((K * A, box_dim))
// all_anchors_vec: (K, A * box_dim)
ERMatXf all_anchors_vec =
anchors_vec.replicate(K, 1) + shifts.rowwise().replicate(A);
// use the following to reshape to (K * A, box_dim)
// Eigen::Map<const ERMatXf> all_anchors(
// all_anchors_vec.data(), K * A, box_dim);
return all_anchors_vec;
}
} // namespace utils
template <>
void GenerateProposalsOp<CPUContext>::ProposalsForOneImage(
const Eigen::Array3f& im_info,
const Eigen::Map<const ERMatXf>& all_anchors,
const utils::ConstTensorView<float>& bbox_deltas_tensor,
const utils::ConstTensorView<float>& scores_tensor,
ERArrXXf* out_boxes,
EArrXf* out_probs) const {
const auto& post_nms_topN = rpn_post_nms_topN_;
const auto& nms_thresh = rpn_nms_thresh_;
const auto& min_size = rpn_min_size_;
const int box_dim = static_cast<int>(all_anchors.cols());
CAFFE_ENFORCE(box_dim == 4 || box_dim == 5);
// Transpose and reshape predicted bbox transformations to get them
// into the same order as the anchors:
// - bbox deltas will be (box_dim * A, H, W) format from conv output
// - transpose to (H, W, box_dim * A)
// - reshape to (H * W * A, box_dim) where rows are ordered by (H, W, A)
// in slowest to fastest order to match the enumerated anchors
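// For illustration (assuming the conv output lays out its channels as A
// anchor blocks of box_dim deltas each, as the reshape below implies): the
// delta for anchor a, coordinate c at position (h, w) sits in channel
// a * box_dim + c and ends up in row (h * W + w) * A + a, column c of
// 'bbox_deltas'.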
CAFFE_ENFORCE_EQ(bbox_deltas_tensor.ndim(), 3);
CAFFE_ENFORCE_EQ(bbox_deltas_tensor.dim(0) % box_dim, 0);
auto A = bbox_deltas_tensor.dim(0) / box_dim;
auto H = bbox_deltas_tensor.dim(1);
auto W = bbox_deltas_tensor.dim(2);
// equivalent to python code
// bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, box_dim))
ERArrXXf bbox_deltas(H * W * A, box_dim);
Eigen::Map<ERMatXf>(bbox_deltas.data(), H * W, box_dim * A) =
Eigen::Map<const ERMatXf>(bbox_deltas_tensor.data(), A * box_dim, H * W)
.transpose();
CAFFE_ENFORCE_EQ(bbox_deltas.rows(), all_anchors.rows());
// - scores are (A, H, W) format from conv output
// - transpose to (H, W, A)
// - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
// to match the order of anchors and bbox_deltas
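// For illustration: the score of anchor a at position (h, w) moves from
// channel a of the conv output to element (h * W + w) * A + a of 'scores'.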
CAFFE_ENFORCE_EQ(scores_tensor.ndim(), 3);
CAFFE_ENFORCE_EQ(scores_tensor.dims(), (vector<int>{A, H, W}));
// equivalent to python code
// scores = scores.transpose((1, 2, 0)).reshape((-1, 1))
EArrXf scores(scores_tensor.size());
Eigen::Map<ERMatXf>(scores.data(), H * W, A) =
Eigen::Map<const ERMatXf>(scores_tensor.data(), A, H * W).transpose();
std::vector<int> order(scores.size());
std::iota(order.begin(), order.end(), 0);
if (rpn_pre_nms_topN_ <= 0 || rpn_pre_nms_topN_ >= scores.size()) {
// 4. sort all (proposal, score) pairs by score from highest to lowest
// 5. take top pre_nms_topN (e.g. 6000)
std::sort(order.begin(), order.end(), [&scores](int lhs, int rhs) {
return scores[lhs] > scores[rhs];
});
} else {
// Avoid sorting possibly large arrays; first partition to get the top K
// unsorted, then sort just those (~20x faster for 200k scores)
std::partial_sort(
order.begin(),
order.begin() + rpn_pre_nms_topN_,
order.end(),
[&scores](int lhs, int rhs) { return scores[lhs] > scores[rhs]; });
order.resize(rpn_pre_nms_topN_);
}
ERArrXXf bbox_deltas_sorted;
ERArrXXf all_anchors_sorted;
EArrXf scores_sorted;
utils::GetSubArrayRows(
bbox_deltas, utils::AsEArrXt(order), &bbox_deltas_sorted);
utils::GetSubArrayRows(
all_anchors.array(), utils::AsEArrXt(order), &all_anchors_sorted);
utils::GetSubArray(scores, utils::AsEArrXt(order), &scores_sorted);
// Transform anchors into proposals via bbox transformations
static const std::vector<float> bbox_weights{1.0, 1.0, 1.0, 1.0};
auto proposals = utils::bbox_transform(
all_anchors_sorted,
bbox_deltas_sorted,
bbox_weights,
utils::BBOX_XFORM_CLIP_DEFAULT,
correct_transform_coords_,
angle_bound_on_,
angle_bound_lo_,
angle_bound_hi_);
// 2. clip proposals to image (may result in proposals with zero area
// that will be removed in the next step)
proposals =
utils::clip_boxes(proposals, im_info[0], im_info[1], clip_angle_thresh_);
// 3. remove predicted boxes with either height or width < min_size
auto keep = utils::filter_boxes(proposals, min_size, im_info);
DCHECK_LE(keep.size(), scores_sorted.size());
// 6. apply loose nms (e.g. threshold = 0.7)
// 7. take after_nms_topN (e.g. 300)
// 8. return the top proposals (-> RoIs top)
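// For example, with nms_thresh = 0.7, a proposal is suppressed if its IoU
// with a higher-scoring proposal that has already been kept exceeds 0.7.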
if (post_nms_topN > 0 && post_nms_topN < keep.size()) {
keep = utils::nms_cpu(
proposals, scores_sorted, keep, nms_thresh, post_nms_topN);
} else {
keep = utils::nms_cpu(proposals, scores_sorted, keep, nms_thresh);
}
// Generate outputs
utils::GetSubArrayRows(proposals, utils::AsEArrXt(keep), out_boxes);
utils::GetSubArray(scores_sorted, utils::AsEArrXt(keep), out_probs);
}
template <>
bool GenerateProposalsOp<CPUContext>::RunOnDevice() {
const auto& scores = Input(0);
const auto& bbox_deltas = Input(1);
const auto& im_info_tensor = Input(2);
const auto& anchors = Input(3);
auto* out_rois = Output(0);
auto* out_rois_probs = Output(1);
CAFFE_ENFORCE_EQ(scores.ndim(), 4, scores.ndim());
CAFFE_ENFORCE(scores.template IsType<float>(), scores.meta().name());
const auto num_images = scores.dim(0);
const auto A = scores.dim(1);
const auto height = scores.dim(2);
const auto width = scores.dim(3);
const auto K = height * width;
const auto box_dim = anchors.dim(1);
CAFFE_ENFORCE(box_dim == 4 || box_dim == 5);
// bbox_deltas: (num_images, A * box_dim, H, W)
CAFFE_ENFORCE_EQ(
bbox_deltas.dims(),
(vector<TIndex>{num_images, box_dim * A, height, width}));
// im_info_tensor: (num_images, 3), format [height, width, scale; ...]
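// Here 'scale' is the factor by which the original image was resized to
// produce the network input (as used by Detectron).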
CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector<TIndex>{num_images, 3}));
CAFFE_ENFORCE(
im_info_tensor.template IsType<float>(), im_info_tensor.meta().name());
// anchors: (A, box_dim)
CAFFE_ENFORCE_EQ(anchors.dims(), (vector<TIndex>{A, box_dim}));
CAFFE_ENFORCE(anchors.template IsType<float>(), anchors.meta().name());
// Broadcast the anchors to all pixels
auto all_anchors_vec =
utils::ComputeAllAnchors(anchors, height, width, feat_stride_);
Eigen::Map<const ERMatXf> all_anchors(all_anchors_vec.data(), K * A, box_dim);
Eigen::Map<const ERArrXXf> im_info(
im_info_tensor.data<float>(),
im_info_tensor.dim(0),
im_info_tensor.dim(1));
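// Each output ROI row is [batch_index, x1, y1, x2, y2] for upright boxes, or
// [batch_index, ctr_x, ctr_y, w, h, angle] for rotated boxes, hence the extra
// column.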
const int roi_col_count = box_dim + 1;
out_rois->Resize(0, roi_col_count);
out_rois_probs->Resize(0);
std::vector<ERArrXXf> im_boxes(num_images);
std::vector<EArrXf> im_probs(num_images);
for (int i = 0; i < num_images; i++) {
auto cur_im_info = im_info.row(i);
auto cur_bbox_deltas = GetSubTensorView<float>(bbox_deltas, i);
auto cur_scores = GetSubTensorView<float>(scores, i);
ERArrXXf& im_i_boxes = im_boxes[i];
EArrXf& im_i_probs = im_probs[i];
ProposalsForOneImage(
cur_im_info,
all_anchors,
cur_bbox_deltas,
cur_scores,
&im_i_boxes,
&im_i_probs);
}
int roi_counts = 0;
for (int i = 0; i < num_images; i++) {
roi_counts += im_boxes[i].rows();
}
out_rois->Extend(roi_counts, 50, &context_);
out_rois_probs->Extend(roi_counts, 50, &context_);
float* out_rois_ptr = out_rois->mutable_data<float>();
float* out_rois_probs_ptr = out_rois_probs->mutable_data<float>();
for (int i = 0; i < num_images; i++) {
const ERArrXXf& im_i_boxes = im_boxes[i];
const EArrXf& im_i_probs = im_probs[i];
int csz = im_i_boxes.rows();
// write rois
Eigen::Map<ERArrXXf> cur_rois(out_rois_ptr, csz, roi_col_count);
cur_rois.col(0).setConstant(i);
cur_rois.block(0, 1, csz, box_dim) = im_i_boxes;
// write rois_probs
Eigen::Map<EArrXf>(out_rois_probs_ptr, csz) = im_i_probs;
out_rois_ptr += csz * roi_col_count;
out_rois_probs_ptr += csz;
}
return true;
}
namespace {
REGISTER_CPU_OPERATOR(GenerateProposals, GenerateProposalsOp<CPUContext>);
// For backward compatibility
REGISTER_CPU_OPERATOR(GenerateProposalsCPP, GenerateProposalsOp<CPUContext>);
#ifdef CAFFE2_HAS_MKL_DNN
REGISTER_MKL_OPERATOR(
GenerateProposals,
mkl::MKLFallbackOp<GenerateProposalsOp<CPUContext>>);
// For backward compatibility
REGISTER_MKL_OPERATOR(
GenerateProposalsCPP,
mkl::MKLFallbackOp<GenerateProposalsOp<CPUContext>>);
#endif // CAFFE2_HAS_MKL_DNN
OPERATOR_SCHEMA(GenerateProposals)
.NumInputs(4)
.NumOutputs(2)
.SetDoc(R"DOC(
Generate bounding box proposals for Faster RCNN. The proposals are generated
for a list of images based on image scores 'scores', bounding box regression
results 'bbox_deltas' as well as predefined bounding box shapes 'anchors'. Greedy
non-maximum suppression is applied to generate the final bounding boxes.
)DOC")
.Arg("spatial_scale", "(float) spatial scale")
.Arg("pre_nms_topN", "(int) RPN_PRE_NMS_TOP_N")
.Arg("post_nms_topN", "(int) RPN_POST_NMS_TOP_N")
.Arg("nms_thresh", "(float) RPN_NMS_THRESH")
.Arg("min_size", "(float) RPN_MIN_SIZE")
.Arg(
"correct_transform_coords",
"bool (default false), Correct bounding box transform coordates,"
" see bbox_transform() in boxes.py "
"Set to true to match the detectron code, set to false for backward"
" compatibility")
.Arg(
"angle_bound_on",
"bool (default true). If set, for rotated boxes, angle is "
"normalized to be within [angle_bound_lo, angle_bound_hi].")
.Arg(
"angle_bound_lo",
"int (default -90 degrees). If set, for rotated boxes, angle is "
"normalized to be within [angle_bound_lo, angle_bound_hi].")
.Arg(
"angle_bound_hi",
"int (default 90 degrees). If set, for rotated boxes, angle is "
"normalized to be within [angle_bound_lo, angle_bound_hi].")
.Arg(
"clip_angle_thresh",
"float (default 1.0 degrees). For RRPN, clip almost horizontal boxes "
"within this threshold of tolerance for backward compatibility. "
"Set to negative value for no clipping.")
.Input(0, "scores", "Scores from conv layer, size (img_count, A, H, W)")
.Input(
1,
"bbox_deltas",
"Bounding box deltas from conv layer, "
"size (img_count, 4 * A, H, W)")
.Input(
2,
"im_info",
"Image info, size (img_count, 3), "
"format (height, width, scale)")
.Input(3, "anchors", "Bounding box anchors, size (A, 4)")
.Output(
0,
"rois",
"Proposals, size (n x 5), "
"format (image_index, x1, y1, x2, y2)")
.Output(1, "rois_probs", "scores of proposals, size (n)");
// For backward compatibility
OPERATOR_SCHEMA(GenerateProposalsCPP).NumInputs(4).NumOutputs(2);
SHOULD_NOT_DO_GRADIENT(GenerateProposals);
// For backward compatibility
SHOULD_NOT_DO_GRADIENT(GenerateProposalsCPP);
} // namespace
} // namespace caffe2