Use the short filter in subpel motion search

While using accurate subpel search (sf->use_accurate_subpel_search=1), we
use an 8-tap interpolation filter, which brings high computational
complexity. This patch replaces it with a 4-tap filter and enables it as a
speed 1 feature. The borg test results showed a negligible change in coding
performance:
           avg_psnr  ovr_psnr   ssim
hdres set:  -0.010   -0.090    -0.001
midres set: -0.009   -0.015     0.015
lowres set:  0.021    0.046     0.005

SIMD optimizations of the 4-tap filters will be added later to give encoder
speedups. The filter itself can also be redesigned for this purpose.

STATS_CHANGED

Change-Id: I296c340a3c4977f43a623a9e7c826f6ea2bf18b8
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e059652..2639e39 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -888,36 +888,37 @@
   #
   add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                           const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref, int ref_stride";
+                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
   specialize qw/aom_upsampled_pred sse2/;
 
   add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
                                                    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                   int ref_stride";
+                                                   int ref_stride, int subpel_search";
   specialize qw/aom_comp_avg_upsampled_pred sse2/;
 
   add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                        const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
                                                        int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+                                                       int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
   specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
 
 
   add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                  const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3,
-                                                 int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+                                                 int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
   specialize qw/aom_highbd_upsampled_pred sse2/;
 
   add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                           const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
-                                                          int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+                                                          int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
+                                                          int bd, int subpel_search";
   specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
 
   add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
                                                               const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
                                                               int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-                                                              int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param";
+                                                              int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
   specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
 
 
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 817ebe1..d567d45 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -302,7 +302,7 @@
                           int mi_row, int mi_col, const MV *const mv,
                           uint8_t *comp_pred, int width, int height,
                           int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                          int ref_stride) {
+                          int ref_stride, int subpel_search) {
   // expect xd == NULL only in tests
   if (xd != NULL) {
     const MB_MODE_INFO *mi = xd->mi[0];
@@ -387,7 +387,9 @@
   }
 
   const InterpFilterParams *filter =
-      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+      (subpel_search == 1)
+          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     for (int i = 0; i < height; i++) {
@@ -429,11 +431,11 @@
                                    uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
-                                   int ref_stride) {
+                                   int ref_stride, int subpel_search) {
   int i, j;
 
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
@@ -466,13 +468,13 @@
     MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
 
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
 
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
@@ -889,7 +891,8 @@
                                  int mi_col, const MV *const mv,
                                  uint16_t *comp_pred, int width, int height,
                                  int subpel_x_q3, int subpel_y_q3,
-                                 const uint8_t *ref8, int ref_stride, int bd) {
+                                 const uint8_t *ref8, int ref_stride, int bd,
+                                 int subpel_search) {
   // expect xd == NULL only in tests
   if (xd != NULL) {
     const MB_MODE_INFO *mi = xd->mi[0];
@@ -975,7 +978,9 @@
   }
 
   const InterpFilterParams *filter =
-      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+      (subpel_search == 1)
+          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     const uint16_t *ref;
@@ -1021,13 +1026,13 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd) {
+    int ref_stride, int bd, int subpel_search) {
   int i, j;
 
   const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
@@ -1063,7 +1068,8 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int subpel_search) {
   int i, j;
   const int fwd_offset = jcp_param->fwd_offset;
   const int bck_offset = jcp_param->bck_offset;
@@ -1071,7 +1077,7 @@
 
   aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
 
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
@@ -1110,10 +1116,12 @@
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride, const uint8_t *mask,
-                                  int mask_stride, int invert_mask) {
+                                  int mask_stride, int invert_mask,
+                                  int subpel_search) {
   if (subpel_x_q3 | subpel_y_q3) {
     aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
     ref = comp_pred;
     ref_stride = width;
   }
@@ -1190,10 +1198,10 @@
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
     int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd) {
+    int bd, int subpel_search) {
   aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
   aom_highbd_comp_mask_pred(comp_pred, pred8, width, height,
                             CONVERT_TO_BYTEPTR(comp_pred), width, mask,
                             mask_stride, invert_mask);
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index b954470..a3e74b9 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -74,14 +74,15 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int subpel_search);
 
 void aom_highbd_comp_mask_upsampled_pred(
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
     int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd);
+    int bd, int subpel_search);
 
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index e9b5e73..8e37811 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -595,8 +595,8 @@
                                     int mi_row, int mi_col, const MV *const mv,
                                     uint16_t *comp_pred, int width, int height,
                                     int subpel_x_q3, int subpel_y_q3,
-                                    const uint8_t *ref8, int ref_stride,
-                                    int bd) {
+                                    const uint8_t *ref8, int ref_stride, int bd,
+                                    int subpel_search) {
   // expect xd == NULL only in tests
   if (xd != NULL) {
     const MB_MODE_INFO *mi = xd->mi[0];
@@ -680,7 +680,9 @@
   }
 
   const InterpFilterParams *filter =
-      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+      (subpel_search == 1)
+          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -746,13 +748,13 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd) {
+    int ref_stride, int bd, int subpel_search) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
   int i;
   aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
   /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
   assert(!(width * height & 7));
   n = width * height >> 3;
@@ -835,13 +837,14 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int subpel_search) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
   int i;
   aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
                             height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd);
+                            bd, subpel_search);
   assert(!(width * height & 7));
   n = width * height >> 3;
 
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index eaf1f34..f9a41a2 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -120,11 +120,11 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
   int n;
   int i;
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 9efddb9..ae6fb34 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -486,7 +486,8 @@
                              int mi_row, int mi_col, const MV *const mv,
                              uint8_t *comp_pred, int width, int height,
                              int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride) {
+                             const uint8_t *ref, int ref_stride,
+                             int subpel_search) {
   // expect xd == NULL only in tests
   if (xd != NULL) {
     const MB_MODE_INFO *mi = xd->mi[0];
@@ -571,7 +572,9 @@
   }
 
   const InterpFilterParams *filter =
-      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+      (subpel_search == 1)
+          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
 
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
@@ -649,11 +652,11 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride) {
+    int ref_stride, int subpel_search) {
   int n;
   int i;
   aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 7f8ad58..48bc49a 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -181,6 +181,11 @@
   return &av1_interp_filter_params_list[interp_filter];
 }
 
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+    const InterpFilter interp_filter) {
+  return &av1_interp_4tap[interp_filter];
+}
+
 static INLINE const int16_t *av1_get_interp_filter_kernel(
     const InterpFilter interp_filter) {
   return av1_interp_filter_params_list[interp_filter].filter_ptr;
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index ee58802..ba66bae 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -220,7 +220,7 @@
     thismse = upsampled_pref_error(                                        \
         xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,    \
         pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
-        mask_stride, invert_mask, w, h, &sse);                             \
+        mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
     v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);     \
     v += thismse;                                                          \
     if (v < besterr) {                                                     \
@@ -649,7 +649,7 @@
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *second_pred, const uint8_t *mask,
                                 int mask_stride, int invert_mask, int w, int h,
-                                unsigned int *sse) {
+                                unsigned int *sse, int subpel_search) {
   unsigned int besterr;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
@@ -657,20 +657,23 @@
       if (mask) {
         aom_highbd_comp_mask_upsampled_pred(
             xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h, subpel_x_q3,
-            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd);
+            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
+            subpel_search);
       } else {
         if (xd->jcp_param.use_jnt_comp_avg)
           aom_highbd_jnt_comp_avg_upsampled_pred(
               xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h,
-              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param);
+              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param,
+              subpel_search);
         else
-          aom_highbd_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16,
-                                             second_pred, w, h, subpel_x_q3,
-                                             subpel_y_q3, y, y_stride, xd->bd);
+          aom_highbd_comp_avg_upsampled_pred(
+              xd, cm, mi_row, mi_col, mv, pred16, second_pred, w, h,
+              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, subpel_search);
       }
     } else {
       aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
-                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
+                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                                subpel_search);
     }
 
     besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
@@ -678,22 +681,23 @@
     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
     if (second_pred != NULL) {
       if (mask) {
-        aom_comp_mask_upsampled_pred(
-            xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
-            subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask);
+        aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+                                     second_pred, w, h, subpel_x_q3,
+                                     subpel_y_q3, y, y_stride, mask,
+                                     mask_stride, invert_mask, subpel_search);
       } else {
         if (xd->jcp_param.use_jnt_comp_avg)
           aom_jnt_comp_avg_upsampled_pred(
               xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
-              subpel_y_q3, y, y_stride, &xd->jcp_param);
+              subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search);
         else
           aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
                                       second_pred, w, h, subpel_x_q3,
-                                      subpel_y_q3, y, y_stride);
+                                      subpel_y_q3, y, y_stride, subpel_search);
       }
     } else {
       aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                         subpel_y_q3, y, y_stride);
+                         subpel_y_q3, y, y_stride, subpel_search);
     }
 
     besterr = vfp->vf(pred, w, src, src_stride, sse);
@@ -708,10 +712,11 @@
     const int src_stride, const uint8_t *const y, int y_stride,
     const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
     int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
-    unsigned int *sse1, int *distortion) {
-  unsigned int besterr = upsampled_pref_error(
-      xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, y + offset,
-      y_stride, 0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1);
+    unsigned int *sse1, int *distortion, int subpel_search) {
+  unsigned int besterr =
+      upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride,
+                           y + offset, y_stride, 0, 0, second_pred, mask,
+                           mask_stride, invert_mask, w, h, sse1, subpel_search);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
@@ -782,7 +787,8 @@
     besterr = upsampled_setup_center_error(
         xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
         src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
-        h, offset, mvjcost, mvcost, sse1, distortion);
+        h, offset, mvjcost, mvcost, sse1, distortion,
+        use_accurate_subpel_search);
   else
     besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                  src_address, src_stride, y, y_stride,
@@ -803,7 +809,8 @@
           thismse = upsampled_pref_error(
               xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
               pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
-              mask, mask_stride, invert_mask, w, h, &sse);
+              mask, mask_stride, invert_mask, w, h, &sse,
+              use_accurate_subpel_search);
         } else {
           thismse = estimate_upsampled_pref_error(
               xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -838,7 +845,8 @@
         thismse = upsampled_pref_error(
             xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
             pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
-            mask, mask_stride, invert_mask, w, h, &sse);
+            mask, mask_stride, invert_mask, w, h, &sse,
+            use_accurate_subpel_search);
       } else {
         thismse = estimate_upsampled_pref_error(
             xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
@@ -2304,7 +2312,8 @@
     MV this_mv = { r, c };                                                    \
     thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv,     \
                                         mask, vfp, z, pre(y, y_stride, r, c), \
-                                        y_stride, sp(c), sp(r), w, h, &sse);  \
+                                        y_stride, sp(c), sp(r), w, h, &sse,   \
+                                        use_accurate_subpel_search);          \
     if ((v = MVC(r, c) + thismse) < besterr) {                                \
       besterr = v;                                                            \
       br = r;                                                                 \
@@ -2332,18 +2341,20 @@
     MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
     const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
     const int32_t *const wsrc, const uint8_t *const y, int y_stride,
-    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse) {
+    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+    int subpel_search) {
   unsigned int besterr;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
     aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred16, w, h,
-                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd);
+                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                              subpel_search);
 
     besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
   } else {
     DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
     aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
-                       subpel_y_q3, y, y_stride);
+                       subpel_y_q3, y, y_stride, subpel_search);
 
     besterr = vfp->ovf(pred, w, wsrc, mask, sse);
   }
@@ -2355,10 +2366,11 @@
     const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
     const uint8_t *const y, int y_stride, int w, int h, int offset,
-    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) {
-  unsigned int besterr =
-      upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc,
-                                y + offset, y_stride, 0, 0, w, h, sse1);
+    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
+    int subpel_search) {
+  unsigned int besterr = upsampled_obmc_pref_error(
+      xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
+      0, w, h, sse1, subpel_search);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
@@ -2413,11 +2425,12 @@
 
   bestmv->row *= 8;
   bestmv->col *= 8;
-  // use_accurate_subpel_search can be 0 or 1
+  // use_accurate_subpel_search can be 0 or 1 or 2
   if (use_accurate_subpel_search)
     besterr = upsampled_setup_obmc_center_error(
         xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
-        y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion);
+        y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
+        use_accurate_subpel_search);
   else
     besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
                                       z, y, y_stride, offset, mvjcost, mvcost,
@@ -2433,7 +2446,8 @@
         if (use_accurate_subpel_search) {
           thismse = upsampled_obmc_pref_error(
               xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
-              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+              pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+              use_accurate_subpel_search);
         } else {
           thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
                               sp(tr), src_address, mask, &sse);
@@ -2464,7 +2478,8 @@
       if (use_accurate_subpel_search) {
         thismse = upsampled_obmc_pref_error(
             xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
-            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse);
+            pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+            use_accurate_subpel_search);
       } else {
         thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
                             src_address, mask, &sse);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 19ef55c..68da931 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -200,6 +200,7 @@
     sf->use_intra_txb_hash = 1;
     sf->optimize_b_precheck = 1;
     sf->dual_sgr_penalty_level = 1;
+    sf->use_accurate_subpel_search = 1;
   }
 
   if (speed >= 2) {
@@ -431,7 +432,7 @@
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
   sf->allow_partition_search_skip = 0;
-  sf->use_accurate_subpel_search = 1;
+  sf->use_accurate_subpel_search = 2;
   sf->disable_wedge_search_var_thresh = 0;
   sf->fast_wedge_sign_estimate = 0;
   sf->drop_ref = 0;
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index ab2004c..7028d22 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -36,7 +36,7 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param);
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search);
 
 typedef void (*highbdjntcompavg_func)(uint16_t *comp_pred, const uint8_t *pred8,
                                       int width, int height,
@@ -47,7 +47,8 @@
     MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
     const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
     int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param);
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+    int subpel_search);
 
 typedef ::testing::tuple<jntcompavg_func, BLOCK_SIZE> JNTCOMPAVGParam;
 
@@ -217,33 +218,39 @@
     JNT_COMP_PARAMS jnt_comp_params;
     jnt_comp_params.use_jnt_comp_avg = 1;
     int sub_x_q3, sub_y_q3;
-    for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
-      for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
-        for (int ii = 0; ii < 2; ii++) {
-          for (int jj = 0; jj < 4; jj++) {
-            jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-            jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+    int subpel_search;
+    for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+      for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+        for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+          for (int ii = 0; ii < 2; ii++) {
+            for (int jj = 0; jj < 4; jj++) {
+              jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+              jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
 
-            const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
-            const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+              const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+              const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
 
-            aom_jnt_comp_avg_upsampled_pred_c(
-                NULL, NULL, 0, 0, NULL, output, pred8 + offset_r * w + offset_c,
-                in_w, in_h, sub_x_q3, sub_y_q3, ref8 + offset_r * w + offset_c,
-                in_w, &jnt_comp_params);
-            test_impl(NULL, NULL, 0, 0, NULL, output2,
-                      pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
-                      sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
-                      &jnt_comp_params);
+              aom_jnt_comp_avg_upsampled_pred_c(
+                  NULL, NULL, 0, 0, NULL, output,
+                  pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+                  sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+                  &jnt_comp_params, subpel_search);
+              test_impl(NULL, NULL, 0, 0, NULL, output2,
+                        pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+                        sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+                        &jnt_comp_params, subpel_search);
 
-            for (int i = 0; i < in_h; ++i) {
-              for (int j = 0; j < in_w; ++j) {
-                int idx = i * in_w + j;
-                ASSERT_EQ(output[idx], output2[idx])
-                    << "Mismatch at unit tests for AV1JNTCOMPAVGUPSAMPLEDTest\n"
-                    << in_w << "x" << in_h << " Pixel mismatch at index " << idx
-                    << " = (" << i << ", " << j << "), sub pixel offset = ("
-                    << sub_y_q3 << ", " << sub_x_q3 << ")";
+              for (int i = 0; i < in_h; ++i) {
+                for (int j = 0; j < in_w; ++j) {
+                  int idx = i * in_w + j;
+                  ASSERT_EQ(output[idx], output2[idx])
+                      << "Mismatch at unit tests for "
+                         "AV1JNTCOMPAVGUPSAMPLEDTest\n"
+                      << in_w << "x" << in_h << " Pixel mismatch at index "
+                      << idx << " = (" << i << ", " << j
+                      << "), sub pixel offset = (" << sub_y_q3 << ", "
+                      << sub_x_q3 << ")";
+                }
               }
             }
           }
@@ -280,11 +287,12 @@
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
+    int subpel_search = 2;  // set to 1 to test 4-tap filter.
 
     for (int i = 0; i < num_loops; ++i)
       aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8,
                                         in_w, in_h, sub_x_q3, sub_y_q3, ref8,
-                                        in_w, &jnt_comp_params);
+                                        in_w, &jnt_comp_params, subpel_search);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -296,7 +304,7 @@
 
     for (int i = 0; i < num_loops; ++i)
       test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
-                sub_y_q3, ref8, in_w, &jnt_comp_params);
+                sub_y_q3, ref8, in_w, &jnt_comp_params, subpel_search);
 
     aom_usec_timer_mark(&timer1);
     const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
@@ -445,38 +453,41 @@
     JNT_COMP_PARAMS jnt_comp_params;
     jnt_comp_params.use_jnt_comp_avg = 1;
     int sub_x_q3, sub_y_q3;
+    int subpel_search;
+    for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+      for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+        for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+          for (int ii = 0; ii < 2; ii++) {
+            for (int jj = 0; jj < 4; jj++) {
+              jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+              jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
 
-    for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
-      for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
-        for (int ii = 0; ii < 2; ii++) {
-          for (int jj = 0; jj < 4; jj++) {
-            jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-            jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+              const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+              const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
 
-            const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
-            const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+              aom_highbd_jnt_comp_avg_upsampled_pred_c(
+                  NULL, NULL, 0, 0, NULL, output,
+                  CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+                  in_h, sub_x_q3, sub_y_q3,
+                  CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
+                  &jnt_comp_params, subpel_search);
+              test_impl(NULL, NULL, 0, 0, NULL, output2,
+                        CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
+                        in_w, in_h, sub_x_q3, sub_y_q3,
+                        CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+                        in_w, bd, &jnt_comp_params, subpel_search);
 
-            aom_highbd_jnt_comp_avg_upsampled_pred_c(
-                NULL, NULL, 0, 0, NULL, output,
-                CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
-                sub_x_q3, sub_y_q3,
-                CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
-                &jnt_comp_params);
-            test_impl(NULL, NULL, 0, 0, NULL, output2,
-                      CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
-                      in_h, sub_x_q3, sub_y_q3,
-                      CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
-                      bd, &jnt_comp_params);
-
-            for (int i = 0; i < in_h; ++i) {
-              for (int j = 0; j < in_w; ++j) {
-                int idx = i * in_w + j;
-                ASSERT_EQ(output[idx], output2[idx])
-                    << "Mismatch at unit tests for "
-                       "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
-                    << in_w << "x" << in_h << " Pixel mismatch at index " << idx
-                    << " = (" << i << ", " << j << "), sub pixel offset = ("
-                    << sub_y_q3 << ", " << sub_x_q3 << ")";
+              for (int i = 0; i < in_h; ++i) {
+                for (int j = 0; j < in_w; ++j) {
+                  int idx = i * in_w + j;
+                  ASSERT_EQ(output[idx], output2[idx])
+                      << "Mismatch at unit tests for "
+                         "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
+                      << in_w << "x" << in_h << " Pixel mismatch at index "
+                      << idx << " = (" << i << ", " << j
+                      << "), sub pixel offset = (" << sub_y_q3 << ", "
+                      << sub_x_q3 << ")";
+                }
               }
             }
           }
@@ -511,12 +522,12 @@
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
-
+    int subpel_search = 2;  // set to 1 to test 4-tap filter.
     for (int i = 0; i < num_loops; ++i)
       aom_highbd_jnt_comp_avg_upsampled_pred_c(
           NULL, NULL, 0, 0, NULL, output, CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
           sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8), in_w, bd,
-          &jnt_comp_params);
+          &jnt_comp_params, subpel_search);
 
     aom_usec_timer_mark(&timer);
     const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
@@ -529,7 +540,7 @@
     for (int i = 0; i < num_loops; ++i)
       test_impl(NULL, NULL, 0, 0, NULL, output2, CONVERT_TO_BYTEPTR(pred8),
                 in_w, in_h, sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8), in_w,
-                bd, &jnt_comp_params);
+                bd, &jnt_comp_params, subpel_search);
 
     aom_usec_timer_mark(&timer1);
     const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index 2d842c9..b2ab496 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -190,26 +190,29 @@
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
   int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int subpel_search;
+  for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+    // loop through subx and suby
+    for (int sub = 0; sub < 8 * 8; ++sub) {
+      int subx = sub & 0x7;
+      int suby = (sub >> 3);
+      for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+        const uint8_t *mask =
+            av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
 
-  // loop through subx and suby
-  for (int sub = 0; sub < 8 * 8; ++sub) {
-    int subx = sub & 0x7;
-    int suby = (sub >> 3);
-    for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
-      const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+        aom_comp_mask_pred = aom_comp_mask_pred_c;  // ref
+        aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
+                                     w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
+                                     w, inv, subpel_search);
 
-      aom_comp_mask_pred = aom_comp_mask_pred_c;  // ref
-      aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
-                                   w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
-                                   inv);
-
-      aom_comp_mask_pred = test_impl;  // test
-      aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
-                                   w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
-                                   inv);
-      ASSERT_EQ(CheckResult(w, h), true)
-          << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
-          << "," << suby << ")";
+        aom_comp_mask_pred = test_impl;  // test
+        aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
+                                     w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
+                                     w, inv, subpel_search);
+        ASSERT_EQ(CheckResult(w, h), true)
+            << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
+            << "," << suby << ")";
+      }
     }
   }
 }
@@ -228,6 +231,7 @@
   const int num_loops = 1000000000 / (w + h);
   comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl };
   double elapsed_time[2] = { 0 };
+  int subpel_search = 2;  // set to 1 to test 4-tap filter.
   for (int i = 0; i < 2; ++i) {
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
@@ -235,7 +239,7 @@
     for (int j = 0; j < num_loops; ++j) {
       aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
                                    w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
-                                   0);
+                                   0, subpel_search);
     }
     aom_usec_timer_mark(&timer);
     double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
@@ -466,25 +470,31 @@
     ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
   }
 
-  // loop through subx and suby
-  for (int sub = 0; sub < 8 * 8; ++sub) {
-    int subx = sub & 0x7;
-    int suby = (sub >> 3);
-    for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
-      const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+  int subpel_search;
+  for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+    // loop through subx and suby
+    for (int sub = 0; sub < 8 * 8; ++sub) {
+      int subx = sub & 0x7;
+      int suby = (sub >> 3);
+      for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+        const uint8_t *mask =
+            av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
 
-      aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c;  // ref
-      aom_highbd_comp_mask_upsampled_pred(
-          NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
-          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_);
+        aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c;  // ref
+        aom_highbd_comp_mask_upsampled_pred(
+            NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w,
+            h, subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv,
+            bd_, subpel_search);
 
-      aom_highbd_comp_mask_pred = test_impl;  // test
-      aom_highbd_comp_mask_upsampled_pred(
-          NULL, NULL, 0, 0, NULL, comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w, h,
-          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_);
-      ASSERT_EQ(CheckResult(w, h), true)
-          << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
-          << "," << suby << ")";
+        aom_highbd_comp_mask_pred = test_impl;  // test
+        aom_highbd_comp_mask_upsampled_pred(
+            NULL, NULL, 0, 0, NULL, comp_pred2_, CONVERT_TO_BYTEPTR(pred_), w,
+            h, subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv,
+            bd_, subpel_search);
+        ASSERT_EQ(CheckResult(w, h), true)
+            << " wedge " << wedge_index << " inv " << inv << "sub (" << subx
+            << "," << suby << ")";
+      }
     }
   }
 }
@@ -516,10 +526,12 @@
     aom_usec_timer timer;
     aom_usec_timer_start(&timer);
     aom_highbd_comp_mask_pred = funcs[i];
+    int subpel_search = 2;  // set to 1 to test 4-tap filter.
     for (int j = 0; j < num_loops; ++j) {
       aom_highbd_comp_mask_upsampled_pred(
           NULL, NULL, 0, 0, NULL, comp_pred1_, CONVERT_TO_BYTEPTR(pred_), w, h,
-          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0, bd_);
+          subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0, bd_,
+          subpel_search);
     }
     aom_usec_timer_mark(&timer);
     double time = static_cast<double>(aom_usec_timer_elapsed(&timer));