Add averaging-SAD functions for 8-point comp-inter motion search.

Makes the first 50 frames of bus @ 1500kbps encode in 3min18.2 instead of
3min22.7, i.e. 2.3% faster. In addition, use the sub_pixel_avg functions
to calculate the variance of the averaging predictor. This is slightly
suboptimal because the function is subpixel-position-aware, but it will
(at least for the SSE2 version) not actually use a bilinear filter for a
full-pixel position, so performance is approximately the same as if we
implemented an actual average-aware full-pixel variance function. That
gains another 0.3 seconds (i.e. encode time goes to 3min17.4), for a
total gain of 2.7%.
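
For context, the new *_avg SAD functions score a candidate block against
the rounded average of the reference block and a second predictor (the
compound predictor, stored contiguously with stride equal to the block
width). A rough C sketch of what they compute, not part of the patch and
with illustrative names only; the rounding matches the pavgb instruction
used in the SSE2 versions, i.e. (a + b + 1) >> 1:

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative only: averaging SAD over an m x n block. */
  static unsigned int avg_sad_mxn(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred,
                                  int m, int n) {
    unsigned int sad = 0;
    int i, j;
    for (i = 0; i < n; i++) {
      for (j = 0; j < m; j++) {
        /* (a + b + 1) >> 1, same rounding as pavgb / comp_avg_pred */
        const int avg = (ref[j] + second_pred[j] + 1) >> 1;
        sad += abs(src[j] - avg);
      }
      src += src_stride;
      ref += ref_stride;
      second_pred += m;  /* compound predictor has stride == block width */
    }
    return sad;
  }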

Change-Id: I3f059d2b04243921868cfed2568d4fa65d7b5acd
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index b956e6a..d71def5 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -369,7 +369,6 @@
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 
-# TODO(jingning): need to covert these functions into mmx/sse2 form
 prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad8x4 sse2
 
@@ -379,6 +378,45 @@
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse
 
+prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x64_avg sse2
+
+prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x64_avg sse2
+
+prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad64x32_avg sse2
+
+prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x16_avg sse2
+
+prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x32_avg sse2
+
+prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad32x32_avg sse2
+
+prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x16_avg sse2
+
+prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad16x8_avg sse2
+
+prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x16_avg sse2
+
+prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x8_avg sse2
+
+prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad8x4_avg sse2
+
+prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x8_avg sse
+
+prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
+specialize vp9_sad4x4_avg sse
+
 prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_h sse2
 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 53b70ad..212dce3 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2353,16 +2353,12 @@
   int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  /* Compound pred buffer */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
-
   fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   /* Get compound pred by averaging two pred blocks. */
-  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-
-  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+  bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
+                         second_pred, 0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
 
   for (i = 0; i < search_range; i++) {
@@ -2380,9 +2376,8 @@
             best_address;
 
         /* Get compound block and use it to calculate SAD. */
-        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
-                      in_what_stride);
-        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+        thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
+                               second_pred, bestsad);
 
         if (thissad < bestsad) {
           this_mv.as_mv.row = this_row_offset;
@@ -2412,10 +2407,11 @@
   this_mv.as_mv.col = ref_mv->as_mv.col << 3;
 
   if (bestsad < INT_MAX) {
-    int besterr;
-    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
-    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
-        (unsigned int *)(&thissad)) +
+    // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
+    // so we don't have to use the subpixel with xoff=0,yoff=0 here.
+    int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
+                               what, what_stride, (unsigned int *)(&thissad),
+                               second_pred) +
         mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
                     xd->allow_high_precision_mv);
     return besterr;
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 48a8b48..d6fa3fa 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1472,8 +1472,10 @@
   for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
 
-#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
+            SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].sdaf           = SDAF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
     cpi->fn_ptr[BT].svaf           = SVAF; \
@@ -1484,67 +1486,80 @@
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
-  BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+      vp9_variance32x16, vp9_sub_pixel_variance32x16,
       vp9_sub_pixel_avg_variance32x16, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x16x4d)
 
-  BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+      vp9_variance16x32, vp9_sub_pixel_variance16x32,
       vp9_sub_pixel_avg_variance16x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad16x32x4d)
 
-  BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+      vp9_variance64x32, vp9_sub_pixel_variance64x32,
       vp9_sub_pixel_avg_variance64x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad64x32x4d)
 
-  BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+      vp9_variance32x64, vp9_sub_pixel_variance32x64,
       vp9_sub_pixel_avg_variance32x64, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x64x4d)
 
-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+      vp9_variance32x32, vp9_sub_pixel_variance32x32,
       vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
       vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
-  BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+      vp9_variance64x64, vp9_sub_pixel_variance64x64,
       vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
       vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+      vp9_variance16x16, vp9_sub_pixel_variance16x16,
       vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
       vp9_variance_halfpixvar16x16_v,
       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
       vp9_sad16x16x4d)
 
-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+      vp9_variance16x8, vp9_sub_pixel_variance16x8,
       vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
       vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+      vp9_variance8x16, vp9_sub_pixel_variance8x16,
       vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
       vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+      vp9_variance8x8, vp9_sub_pixel_variance8x8,
       vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
       vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
-  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+      vp9_variance8x4, vp9_sub_pixel_variance8x4,
       vp9_sub_pixel_avg_variance8x4, NULL, NULL,
       NULL, NULL, vp9_sad8x4x8,
       vp9_sad8x4x4d)
 
-  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+      vp9_variance4x8, vp9_sub_pixel_variance4x8,
       vp9_sub_pixel_avg_variance4x8, NULL, NULL,
       NULL, NULL, vp9_sad4x8x8,
       vp9_sad4x8x4d)
 
-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+      vp9_variance4x4, vp9_sub_pixel_variance4x4,
       vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
       vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 6b1ba49..42ddb21 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -11,25 +11,43 @@
 
 #include <stdlib.h>
 #include "vp9/common/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "./vp9_rtcd.h"
 
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
-                            int  src_stride,
-                            const uint8_t *ref_ptr,
-                            int  ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
+#define sad_mxn_func(m, n) \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
+                                  int  src_stride, \
+                                  const uint8_t *ref_ptr, \
+                                  int  ref_stride, \
+                                  unsigned int max_sad) { \
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+} \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
+                                      int  src_stride, \
+                                      const uint8_t *ref_ptr, \
+                                      int  ref_stride, \
+                                      const uint8_t *second_pred, \
+                                      unsigned int max_sad) { \
+  uint8_t comp_pred[m * n]; \
+  comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+  return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
 }
 
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
-                            int  src_stride,
-                            const uint8_t *ref_ptr,
-                            int  ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
-}
+sad_mxn_func(64, 64)
+sad_mxn_func(64, 32)
+sad_mxn_func(32, 64)
+sad_mxn_func(32, 32)
+sad_mxn_func(32, 16)
+sad_mxn_func(16, 32)
+sad_mxn_func(16, 16)
+sad_mxn_func(16, 8)
+sad_mxn_func(8, 16)
+sad_mxn_func(8, 8)
+sad_mxn_func(8, 4)
+sad_mxn_func(4, 8)
+sad_mxn_func(4, 4)
 
 void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
@@ -46,14 +64,6 @@
                               ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
-                            int  src_stride,
-                            const uint8_t *ref_ptr,
-                            int  ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
-}
-
 void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
                        const uint8_t* const ref_ptr[],
@@ -69,22 +79,6 @@
                               ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
-                            int  src_stride,
-                            const uint8_t *ref_ptr,
-                            int  ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
-                            int   src_stride,
-                            const uint8_t *ref_ptr,
-                            int   ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
-}
-
 void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
                        const uint8_t* const ref_ptr[],
@@ -100,14 +94,6 @@
                               ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
-                            int   src_stride,
-                            const uint8_t *ref_ptr,
-                            int   ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
-}
-
 void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
                        const uint8_t* const ref_ptr[],
@@ -123,63 +109,6 @@
                               ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
-                            int  src_stride,
-                            const uint8_t *ref_ptr,
-                            int  ref_stride,
-                            unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
-                          int  src_stride,
-                          const uint8_t *ref_ptr,
-                          int  ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
-                           int  src_stride,
-                           const uint8_t *ref_ptr,
-                           int  ref_stride,
-                           unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
-                           int  src_stride,
-                           const uint8_t *ref_ptr,
-                           int  ref_stride,
-                           unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
-                          int src_stride,
-                          const uint8_t *ref_ptr,
-                          int ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
-}
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
-                          int src_stride,
-                          const uint8_t *ref_ptr,
-                          int ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
-}
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
-                          int  src_stride,
-                          const uint8_t *ref_ptr,
-                          int  ref_stride,
-                          unsigned int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
 void vp9_sad64x64x3_c(const uint8_t *src_ptr,
                       int  src_stride,
                       const uint8_t *ref_ptr,
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 38808d7..6e686d6 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -20,6 +20,13 @@
                                     int ref_stride,
                                     unsigned int max_sad);
 
+typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
+                                        int source_stride,
+                                        const uint8_t *ref_ptr,
+                                        int ref_stride,
+                                        const uint8_t *second_pred,
+                                        unsigned int max_sad);
+
 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
@@ -74,20 +81,21 @@
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t               sdf;
-    vp9_variance_fn_t          vf;
-    vp9_subpixvariance_fn_t    svf;
-    vp9_subp_avg_variance_fn_t svaf;
-    vp9_variance_fn_t          svf_halfpix_h;
-    vp9_variance_fn_t          svf_halfpix_v;
-    vp9_variance_fn_t          svf_halfpix_hv;
-    vp9_sad_multi_fn_t         sdx3f;
-    vp9_sad_multi1_fn_t        sdx8f;
-    vp9_sad_multi_d_fn_t       sdx4df;
+  vp9_sad_fn_t               sdf;
+  vp9_sad_avg_fn_t           sdaf;
+  vp9_variance_fn_t          vf;
+  vp9_subpixvariance_fn_t    svf;
+  vp9_subp_avg_variance_fn_t svaf;
+  vp9_variance_fn_t          svf_halfpix_h;
+  vp9_variance_fn_t          svf_halfpix_v;
+  vp9_variance_fn_t          svf_halfpix_hv;
+  vp9_sad_multi_fn_t         sdx3f;
+  vp9_sad_multi1_fn_t        sdx8f;
+  vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
 static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, uint8_t *ref, int ref_stride) {
+                          int height, const uint8_t *ref, int ref_stride) {
   int i, j;
 
   for (i = 0; i < height; i++) {
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index 8fb7d41..c4c5c54 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -12,12 +12,42 @@
 
 SECTION .text
 
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD64XN 1
-cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+                                    second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+                                              ref, ref_stride, \
+                                              second_pred, \
+                                              src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+  lea         src_stride3q, [src_strideq*3]
+  lea         ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+  SAD_FN 64, %1, 5, %2
   mov              n_rowsd, %1
   pxor                  m0, m0
 .loop:
@@ -25,6 +55,13 @@
   movu                  m2, [refq+16]
   movu                  m3, [refq+32]
   movu                  m4, [refq+48]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
   psadbw                m1, [srcq]
   psadbw                m2, [srcq+16]
   psadbw                m3, [srcq+32]
@@ -47,21 +84,27 @@
 INIT_XMM sse2
 SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
 
 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-%macro SAD32XN 1
-cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
+%macro SAD32XN 1-2 0
+  SAD_FN 32, %1, 5, %2
   mov              n_rowsd, %1/2
   pxor                  m0, m0
-
 .loop:
   movu                  m1, [refq]
   movu                  m2, [refq+16]
   movu                  m3, [refq+ref_strideq]
   movu                  m4, [refq+ref_strideq+16]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
   psadbw                m1, [srcq]
   psadbw                m2, [srcq+16]
   psadbw                m3, [srcq+src_strideq]
@@ -85,16 +128,14 @@
 SAD32XN 64 ; sad32x64_sse2
 SAD32XN 32 ; sad32x32_sse2
 SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
 
 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
-%macro SAD16XN 1
-cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
-                           src_stride3, ref_stride3, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
+%macro SAD16XN 1-2 0
+  SAD_FN 16, %1, 7, %2
   mov              n_rowsd, %1/4
   pxor                  m0, m0
 
@@ -103,6 +144,13 @@
   movu                  m2, [refq+ref_strideq]
   movu                  m3, [refq+ref_strideq*2]
   movu                  m4, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+  lea         second_predq, [second_predq+mmsize*4]
+%endif
   psadbw                m1, [srcq]
   psadbw                m2, [srcq+src_strideq]
   psadbw                m3, [srcq+src_strideq*2]
@@ -126,16 +174,14 @@
 SAD16XN 32 ; sad16x32_sse2
 SAD16XN 16 ; sad16x16_sse2
 SAD16XN  8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN  8, 1 ; sad16x8_avg_sse2
 
 ; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
-%macro SAD8XN 1
-cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
-                          src_stride3, ref_stride3, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
+%macro SAD8XN 1-2 0
+  SAD_FN 8, %1, 7, %2
   mov              n_rowsd, %1/4
   pxor                  m0, m0
 
@@ -144,6 +190,11 @@
   movhps                m1, [refq+ref_strideq]
   movh                  m2, [refq+ref_strideq*2]
   movhps                m2, [refq+ref_stride3q]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  lea         second_predq, [second_predq+mmsize*2]
+%endif
   movh                  m3, [srcq]
   movhps                m3, [srcq+src_strideq]
   movh                  m4, [srcq+src_strideq*2]
@@ -167,16 +218,14 @@
 SAD8XN 16 ; sad8x16_sse2
 SAD8XN  8 ; sad8x8_sse2
 SAD8XN  4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN  8, 1 ; sad8x8_avg_sse2
+SAD8XN  4, 1 ; sad8x4_avg_sse2
 
 ; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
 ;                                  uint8_t *ref, int ref_stride);
-%macro SAD4XN 1
-cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
-                          src_stride3, ref_stride3, n_rows
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
+%macro SAD4XN 1-2 0
+  SAD_FN 4, %1, 7, %2
   mov              n_rowsd, %1/4
   pxor                  m0, m0
 
@@ -187,6 +236,11 @@
   movd                  m4, [refq+ref_stride3q]
   punpckldq             m1, m2
   punpckldq             m3, m4
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m3, [second_predq+mmsize*1]
+  lea         second_predq, [second_predq+mmsize*2]
+%endif
   movd                  m2, [srcq]
   movd                  m5, [srcq+src_strideq]
   movd                  m4, [srcq+src_strideq*2]
@@ -209,3 +263,5 @@
 INIT_MMX sse
 SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
+SAD4XN  8, 1 ; sad4x8_avg_sse
+SAD4XN  4, 1 ; sad4x4_avg_sse