Consider quantization factor in plane-wise filter.

In the plane-wise temporal filtering strategy, the filter weight is
assigned independently of the quantization factor used in video
compression. However, when a small q is used (i.e., at high bitrate),
weaker filtering is desirable because we want to preserve more detail.

This CL improves the plane-wise strategy by taking the quantization
factor into account when assigning filter weights. In particular, when
q is large enough (>=16), nothing changes. When q is less than 16, the
filtering strength is reduced according to q: the denominator of the
weight exponent is scaled by q^2 / 256. This change significantly
improves performance for high-bitrate encoding.

NOTE: This CL only affects the performance on midres and hdres datasets.

Experimental results:

Under Speed-4 (two-pass mode):
          avg PSNR   ovr PSNR     SSIM
midres      -0.116     -0.099   -0.040
midres2     -0.047     -0.044   -0.017
hdres       -0.142     -0.197   -0.075
hdres2      -0.010     -0.012   -0.004

Under Speed-1 (two-pass mode):
          avg PSNR   ovr PSNR     SSIM
midres      -0.132     -0.124   -0.040
midres2     -0.057     -0.053   -0.020
hdres       -0.158     -0.177   -0.066
hdres2      -0.014     -0.018   -0.005

STATS_CHANGED

Change-Id: I486e66770a4454fb1f72c3276cd20a4b3ae3dd3d
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f78bde4..296c6c5 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -289,7 +289,7 @@
   }
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
     specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
   }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 37487c3..583cfb7 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -645,6 +645,8 @@
 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
 //   block_mse: Motion search error (MSE) for the entire block.
 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor. This is actually the `q` defined in libaom,
+//             which is converted from `qindex`.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -655,8 +657,8 @@
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const uint8_t *pred,
-    uint32_t *accum, uint16_t *count) {
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   // Block information.
@@ -747,10 +749,11 @@
         // Control factor for non-local mean approach.
         const double r =
             (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
+        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
 
         // Compute filter weight.
         const double scaled_diff =
-            AOMMAX(-(window_error + block_error / 10) / (2 * r * r), -15.0);
+            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
         const int adjusted_weight =
             (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
 
@@ -792,6 +795,7 @@
 //                 strategy)
 //   block_mse: Motion search error (MSE) for the entire block.
 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -804,7 +808,7 @@
     const int num_planes, const int use_planewise_strategy, const int strength,
     const int use_subblock, const int *subblock_filter_weights,
     const double *noise_levels, const int block_mse, const int *subblock_mses,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+    const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   if (use_planewise_strategy) {  // Commonly used for high-resolution video.
@@ -812,13 +816,13 @@
     if (is_frame_high_bitdepth(frame_to_filter)) {
       av1_apply_temporal_filter_planewise_c(
           frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, pred, accum,
-          count);
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
     } else {
-      av1_apply_temporal_filter_planewise(frame_to_filter, mbd, block_size,
-                                          mb_row, mb_col, num_planes,
-                                          noise_levels, use_subblock, block_mse,
-                                          subblock_mses, pred, accum, count);
+      av1_apply_temporal_filter_planewise(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
     }
   } else {  // Commonly used for low-resolution video.
     if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
@@ -1026,11 +1030,17 @@
                                          subblock_filter_weights[0], pred,
                                          accum, count);
         } else {  // Other reference frames.
+          const FRAME_TYPE frame_type =
+              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
+                                                           : KEY_FRAME;
+          const int q_factor =
+              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
+                                           cpi->common.seq_params.bit_depth);
           av1_apply_temporal_filter_others(
               frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
               use_planewise_strategy, strength, use_subblock,
               subblock_filter_weights, noise_levels, block_mse, subblock_mses,
-              pred, accum, count);
+              q_factor, pred, accum, count);
         }
       }
 
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 07e14f7..a11f791 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -131,9 +131,9 @@
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
     const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, unsigned int *accumulator,
-    uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
-    int plane, int ss_x_shift, int ss_y_shift) {
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
+    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
   assert(((block_width == 32) && (block_height == 32)) ||
          ((block_width == 16) && (block_height == 16)));
@@ -141,6 +141,7 @@
 
   uint32_t acc_5x5_sse[BH][BW];
   const double h = decay_control * (0.7 + log(sigma + 1.0));
+  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
   uint16_t *frame_sse =
       (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
@@ -226,7 +227,7 @@
           (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
 
       const double scaled_diff =
-          AOMMAX(-(window_error + block_error / 10) / (2 * h * h), -15.0);
+          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
       const int adjusted_weight =
           (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
 
@@ -240,8 +241,8 @@
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const uint8_t *pred,
-    uint32_t *accum, uint16_t *count) {
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
     assert(0 && "Only support low bit-depth with avx2!");
@@ -275,8 +276,9 @@
     apply_temporal_filter_planewise(
         ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
         noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, accum + mb_pels * plane, count + mb_pels * plane,
-        luma_sq_error, chroma_sq_error, plane, ss_x_shift, ss_y_shift);
+        subblock_mses, q_factor, accum + mb_pels * plane,
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift);
   }
   if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 4fc8738..98a6b82 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -106,9 +106,9 @@
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
     const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, unsigned int *accumulator,
-    uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
-    int plane, int ss_x_shift, int ss_y_shift) {
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
+    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
   assert(((block_width == 32) && (block_height == 32)) ||
          ((block_width == 16) && (block_height == 16)));
@@ -116,6 +116,7 @@
 
   uint32_t acc_5x5_sse[BH][BW];
   const double h = decay_control * (0.7 + log(sigma + 1.0));
+  const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
   uint16_t *frame_sse =
       (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error;
 
@@ -204,7 +205,7 @@
           (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
 
       const double scaled_diff =
-          AOMMAX(-(window_error + block_error / 10) / (2 * h * h), -15.0);
+          AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
       const int adjusted_weight =
           (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
 
@@ -218,8 +219,8 @@
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const uint8_t *pred,
-    uint32_t *accum, uint16_t *count) {
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
     assert(0 && "Only support low bit-depth with sse2!");
@@ -253,8 +254,9 @@
     apply_temporal_filter_planewise(
         ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
         noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, accum + mb_pels * plane, count + mb_pels * plane,
-        luma_sq_error, chroma_sq_error, plane, ss_x_shift, ss_y_shift);
+        subblock_mses, q_factor, accum + mb_pels * plane,
+        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
+        ss_x_shift, ss_y_shift);
   }
   if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/test/temporal_filter_planewise_test.cc b/test/temporal_filter_planewise_test.cc
index b19ec29..c3f3e9e 100644
--- a/test/temporal_filter_planewise_test.cc
+++ b/test/temporal_filter_planewise_test.cc
@@ -41,8 +41,8 @@
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const double *noise_level, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const uint8_t *pred,
-    uint32_t *accum, uint16_t *count);
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
 typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
     TemporalFilterPlanewiseFuncParam;
 
@@ -126,8 +126,9 @@
     assert(width == 32 && height == 32);
     const BLOCK_SIZE block_size = BLOCK_32X32;
     const int use_subblock = 0;
-    const int block_mse = 0;
-    const int subblock_mses[4] = { 0, 0, 0, 0 };
+    const int block_mse = 20;
+    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const int q_factor = 12;
     const int mb_row = 0;
     const int mb_col = 0;
     const int num_planes = 1;
@@ -147,18 +148,18 @@
     mbd->bd = 8;
 
     params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, src2_,
-                     accumulator_ref, count_ref);
+                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
+                     src2_, accumulator_ref, count_ref);
     params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, src2_,
-                     accumulator_mod, count_mod);
+                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
+                     src2_, accumulator_mod, count_mod);
 
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < run_times; j++) {
         params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses, src2_,
-                         accumulator_ref, count_ref);
+                         sigma, use_subblock, block_mse, subblock_mses,
+                         q_factor, src2_, accumulator_ref, count_ref);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -167,8 +168,8 @@
       aom_usec_timer_start(&test_timer);
       for (int j = 0; j < run_times; j++) {
         params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses, src2_,
-                         accumulator_mod, count_mod);
+                         sigma, use_subblock, block_mse, subblock_mses,
+                         q_factor, src2_, accumulator_mod, count_mod);
       }
       aom_usec_timer_mark(&test_timer);
       const int elapsed_time_simd =