Adds two new subpel search methods

One is a more aggressive version of the pruned subpel tree
search where only a single halfpel candidate is searched.
The search candidate is based on a surface fit result.
The other is a method to obtain the subpel position at one
shot based on the same surface fit.

The methods have not been deployed in any speed setting yet.

Change-Id: I34fef3f2e34f11396c9d1ba97f4be8c4ffca62d3
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 89c37d9..258b459 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -286,6 +286,190 @@
   bestmv->row *= 8;                                                        \
   bestmv->col *= 8;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define SETUP_CENTER_ERROR                                                   \
+  if (second_pred != NULL) {                                                 \
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {                       \
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);             \
+      vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,     \
+                             y_stride);                                      \
+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,   \
+                        sse1);                                               \
+    } else {                                                                 \
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                \
+      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                  \
+    }                                                                        \
+  } else {                                                                   \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);            \
+  }                                                                          \
+  *distortion = besterr;                                                     \
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+#else
+
+#define SETUP_CENTER_ERROR                                                   \
+  if (second_pred != NULL) {                                                 \
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);                  \
+    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);   \
+    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);                    \
+  } else {                                                                   \
+    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);            \
+  }                                                                          \
+  *distortion = besterr;                                                     \
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
+
+
+static INLINE int divide_and_round(const int n, const int d) {
+  return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_sad_list_wellbehaved(int *sad_list) {
+  return sad_list[0] >= sad_list[1] &&
+         sad_list[0] >= sad_list[2] &&
+         sad_list[0] >= sad_list[3] &&
+         sad_list[0] >= sad_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *sad_list, int *ir, int *ic,
+                              int bits) {
+  *ic = divide_and_round((sad_list[1] - sad_list[3]) * (1 << (bits - 1)),
+                         (sad_list[1] - 2 * sad_list[0] + sad_list[3]));
+  *ir = divide_and_round((sad_list[4] - sad_list[2]) * (1 << (bits - 1)),
+                         (sad_list[4] - 2 * sad_list[0] + sad_list[2]));
+}
+
+int vp9_find_best_sub_pixel_surface_fit(const MACROBLOCK *x,
+                                        MV *bestmv, const MV *ref_mv,
+                                        int allow_hp,
+                                        int error_per_bit,
+                                        const vp9_variance_fn_ptr_t *vfp,
+                                        int forced_stop,
+                                        int iters_per_step,
+                                        int *sad_list,
+                                        int *mvjcost, int *mvcost[2],
+                                        int *distortion,
+                                        unsigned int *sse1,
+                                        const uint8_t *second_pred,
+                                        int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+
+  if (sad_list &&
+      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+      sad_list[4] != INT_MAX &&
+      is_sad_list_wellbehaved(sad_list)) {
+    int ir, ic;
+    unsigned int minpt;
+    get_cost_surf_min(sad_list, &ir, &ic, 3);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir, tc + ic);
+    }
+  }
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+                                             MV *bestmv, const MV *ref_mv,
+                                             int allow_hp,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             int forced_stop,
+                                             int iters_per_step,
+                                             int *sad_list,
+                                             int *mvjcost, int *mvcost[2],
+                                             int *distortion,
+                                             unsigned int *sse1,
+                                             const uint8_t *second_pred,
+                                             int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  SETUP_CENTER_ERROR;
+  if (sad_list &&
+      sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+      sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+      sad_list[4] != INT_MAX &&
+      is_sad_list_wellbehaved(sad_list)) {
+    unsigned int minpt;
+    int ir, ic;
+    get_cost_surf_min(sad_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+    }
+  } else {
+    FIRST_LEVEL_CHECKS;
+    if (halfiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+  }
+
+  tr = br;
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines insure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         MV *bestmv, const MV *ref_mv,
                                         int allow_hp,
@@ -300,30 +484,7 @@
                                         const uint8_t *second_pred,
                                         int w, int h) {
   SETUP_SUBPEL_SEARCH;
-  if (second_pred != NULL) {
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);
-      vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
-                             y_stride);
-      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,
-                        sse1);
-    } else {
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-    }
-#else
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  } else {
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
+  SETUP_CENTER_ERROR;
   if (sad_list &&
       sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
       sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
@@ -415,29 +576,7 @@
                                  const uint8_t *second_pred,
                                  int w, int h) {
   SETUP_SUBPEL_SEARCH;
-  if (second_pred != NULL) {
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64);
-      vp9_high_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
-                             y_stride);
-      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride,
-                        sse1);
-    } else {
-      DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-      besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-    }
-#else
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
-    besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  } else {
-    besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
-  }
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  SETUP_CENTER_ERROR;
   (void) sad_list;  // to silence compiler warning
 
   // Each subsequent iteration checks at least one point in
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index eb3f3eb..3156cb2 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -108,6 +108,8 @@
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_surface_fit;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 92e3149..9368ac1 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -421,6 +421,10 @@
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
   } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+  } else if (sf->mv.subpel_search_method == SUBPEL_SURFACE_FIT) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_surface_fit;
   }
 
   cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index ed84008..f224e65 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -79,6 +79,8 @@
 typedef enum {
   SUBPEL_TREE = 0,
   SUBPEL_TREE_PRUNED = 1,
+  SUBPEL_TREE_PRUNED_MORE = 2,
+  SUBPEL_SURFACE_FIT = 3,
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;