Add an in-loop deringing experiment

Adds a per-frame, strength-adjustable, in-loop deringing filter. Uses
the existing vp9_post_proc_down_and_across 5-tap thresholded blur
code, with a brute-force search for the threshold.

Results are almost strictly positive on the YT HD set, either having no
effect or helping PSNR in the range of 1-3% (overall average 0.8%).
Results are more mixed for the CIF set (-0.5 min, 1.4 max, 0.1 avg).
This has an almost strictly negative impact on SSIM, so examining a
different filter or a more balanced search heuristic is in order.

Other test set results pending.

Change-Id: I5ca6ee8fe292dfa3f2eab7f65332423fa1710b58
diff --git a/configure b/configure
index 5197079..a791ae5 100755
--- a/configure
+++ b/configure
@@ -248,6 +248,7 @@
     code_nonzerocount
     useselectrefmv
     modelcoefprob
+    loop_dering
 "
 CONFIG_LIST="
     external_build
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 9ce5a63..d23f6f5 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -209,7 +209,8 @@
 void vp9_loop_filter_frame(VP9_COMMON *cm,
                            MACROBLOCKD *xd,
                            int frame_filter_level,
-                           int y_only) {
+                           int y_only,
+                           int dering) {
   YV12_BUFFER_CONFIG *post = cm->frame_to_show;
   loop_filter_info_n *lfi_n = &cm->lf_info;
   struct loop_filter_info lfi;
@@ -302,6 +303,62 @@
                                  post->uv_stride, &lfi);
             }
           }
+#if CONFIG_LOOP_DERING
+          if (dering) {
+            if (mb_row && mb_row < cm->mb_rows - 1 &&
+                mb_col && mb_col < cm->mb_cols - 1) {
+              vp9_post_proc_down_and_across(y_ptr, y_ptr,
+                                            post->y_stride, post->y_stride,
+                                            16, 16, dering);
+              if (!y_only) {
+                vp9_post_proc_down_and_across(u_ptr, u_ptr,
+                                              post->uv_stride, post->uv_stride,
+                                              8, 8, dering);
+                vp9_post_proc_down_and_across(v_ptr, v_ptr,
+                                              post->uv_stride, post->uv_stride,
+                                              8, 8, dering);
+              }
+            } else {
+              // Adjust the filter so that no out-of-frame data is used.
+              uint8_t *dr_y = y_ptr, *dr_u = u_ptr, *dr_v = v_ptr;
+              int w_adjust = 0;
+              int h_adjust = 0;
+
+              if (mb_col == 0) {
+                dr_y += 2;
+                dr_u += 2;
+                dr_v += 2;
+                w_adjust += 2;
+              }
+              if (mb_col == cm->mb_cols - 1)
+                w_adjust += 2;
+              if (mb_row == 0) {
+                dr_y += 2 * post->y_stride;
+                dr_u += 2 * post->uv_stride;
+                dr_v += 2 * post->uv_stride;
+                h_adjust += 2;
+              }
+              if (mb_row == cm->mb_rows - 1)
+                h_adjust += 2;
+              vp9_post_proc_down_and_across_c(dr_y, dr_y,
+                                              post->y_stride, post->y_stride,
+                                              16 - w_adjust, 16 - h_adjust,
+                                              dering);
+              if (!y_only) {
+                vp9_post_proc_down_and_across_c(dr_u, dr_u,
+                                                post->uv_stride,
+                                                post->uv_stride,
+                                                8 - w_adjust, 8 - h_adjust,
+                                                dering);
+                vp9_post_proc_down_and_across_c(dr_v, dr_v,
+                                                post->uv_stride,
+                                                post->uv_stride,
+                                                8 - w_adjust, 8 - h_adjust,
+                                                dering);
+              }
+            }
+          }
+#endif
         } else {
           // FIXME: Not 8x8 aware
           if (mb_col > 0 &&
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 53ec336..458afc5 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -83,7 +83,8 @@
 void vp9_loop_filter_frame(struct VP9Common *cm,
                            struct macroblockd *mbd,
                            int filter_level,
-                           int y_only);
+                           int y_only,
+                           int dering);
 
 void vp9_loop_filter_partial_frame(struct VP9Common *cm,
                                    struct macroblockd *mbd,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index c42e941..a76f4c5 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -244,6 +244,7 @@
   int filter_level;
   int last_sharpness_level;
   int sharpness_level;
+  int dering_enabled;
 
   int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
 
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 16ed9a7..cb6421f 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -1508,6 +1508,12 @@
   pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
   pc->filter_level = vp9_read_literal(&header_bc, 6);
   pc->sharpness_level = vp9_read_literal(&header_bc, 3);
+#if CONFIG_LOOP_DERING
+  if (vp9_read_bit(&header_bc))
+    pc->dering_enabled = 1 + vp9_read_literal(&header_bc, 4);
+  else
+    pc->dering_enabled = 0;
+#endif
 
   /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
   xd->mode_ref_lf_delta_update = 0;
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index a243f41..bcb5897 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -356,7 +356,8 @@
 
     if (cm->filter_level) {
       /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);
+      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0,
+                            cm->dering_enabled);
     }
     vp8_yv12_extend_frame_borders(cm->frame_to_show);
   }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 25df767..c0ae5ba 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -2515,6 +2515,14 @@
   vp9_write_bit(&header_bc, pc->filter_type);
   vp9_write_literal(&header_bc, pc->filter_level, 6);
   vp9_write_literal(&header_bc, pc->sharpness_level, 3);
+#if CONFIG_LOOP_DERING
+  if (pc->dering_enabled) {
+    vp9_write_bit(&header_bc, 1);
+    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
+  } else {
+    vp9_write_bit(&header_bc, 0);
+  }
+#endif
 
   // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
   vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 0fb5c56..490a639 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -2526,7 +2526,8 @@
 
   if (cm->filter_level > 0) {
     vp9_set_alt_lf_level(cpi, cm->filter_level);
-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0);
+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0,
+                          cm->dering_enabled);
   }
 
   vp8_yv12_extend_frame_borders(cm->frame_to_show);
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index d80ea02..645d66b 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -9,6 +9,7 @@
  */
 
 #include <assert.h>
+#include <limits.h>
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_picklpf.h"
@@ -267,7 +268,7 @@
 
   // Get baseline error score
   vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1);
+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, 0);
 
   best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
   filt_best = filt_mid;
@@ -292,7 +293,7 @@
     if ((filt_direction <= 0) && (filt_low != filt_mid)) {
       // Get Low filter error score
       vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, 0);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -312,7 +313,7 @@
     // Now look at filt_high
     if ((filt_direction >= 0) && (filt_high != filt_mid)) {
       vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, 0);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -337,4 +338,30 @@
   }
 
   cm->filter_level = filt_best;
+
+#if CONFIG_LOOP_DERING
+  /* Pick the dering strength (1..16) by brute force; 0 leaves it off. */
+  {  // NOLINT
+    int best_dering = 0;
+    int this_dering;
+    int last_err_diff = INT_MAX;
+    // Every trial filters at filt_best, the level stored in cm->filter_level.
+    for (this_dering = 1; this_dering <= 16; this_dering++) {
+      vp9_set_alt_lf_level(cpi, filt_best);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_best, 1, this_dering);
+      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+      vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+      if (filt_err < best_err) {
+        best_err = filt_err;
+        best_dering = this_dering;
+        last_err_diff = INT_MAX;
+      } else {
+        if (filt_err - best_err > last_err_diff)
+          break;  // gap to the best error grew since the previous trial
+        last_err_diff = filt_err - best_err;
+      }
+    }
+    cm->dering_enabled = best_dering;
+  }
+#endif
 }