Speed up mv cost table setup

Compute in O(n) instead of O(n log(n)) (where n is the number of motion
vector values). Leads to a x15 speedup with clang (~x10 with gcc due to
not autovectorizing).

Change-Id: Ibe650a9e247d864ca3b7b237d64208580a5946bb
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 4a7d874..a03cdfa 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -115,14 +115,21 @@
         2);
 }
 
-static void build_nmv_component_cost_table(int *mvcost,
-                                           const nmv_component *const mvcomp,
-                                           MvSubpelPrecision precision) {
-  int i, v;
+/* TODO(siekyleb@amazon.com): This function writes MV_VALS ints or 128 KiB. This
+ *   is more than most L1D caches and is a significant chunk of L2. Write
+ *   SIMD that uses streaming writes to avoid loading all of that into L1, or
+ *   just don't update the larger component costs every time this called
+ *   (or both).
+ */
+void av1_build_nmv_component_cost_table(int *mvcost,
+                                        const nmv_component *const mvcomp,
+                                        MvSubpelPrecision precision) {
+  int i, j, v, o, mantissa;
   int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
   int bits_cost[MV_OFFSET_BITS][2];
-  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
-  int class0_hp_cost[2], hp_cost[2];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 },
+      fp_cost[MV_FP_SIZE] = { 0 };
+  int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 };
 
   av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
   av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
@@ -131,45 +138,114 @@
     av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
   }
 
-  for (i = 0; i < CLASS0_SIZE; ++i)
-    av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL);
-  av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+  if (precision > MV_SUBPEL_NONE) {
+    for (i = 0; i < CLASS0_SIZE; ++i)
+      av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+                               NULL);
+    av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+  }
 
   if (precision > MV_SUBPEL_LOW_PRECISION) {
     av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
     av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
   }
+
+  // Instead of accumulating the cost of each vector component's bits
+  //   individually, compute the costs based on smaller vectors. Costs for
+  //   [2^exp, 2 * 2^exp - 1] are calculated based on [0, 2^exp - 1]
+  //   respectively. Offsets are maintained to swap both 1) class costs when
+  //   treated as a complete vector component with the highest set bit when
+  //   treated as a mantissa (significand) and 2) leading zeros to account for
+  //   the current exponent.
+
+  // Cost offsets
+  int cost_swap[MV_OFFSET_BITS] = { 0 };
+  // Delta to convert positive vector to negative vector costs
+  int negate_sign = sign_cost[1] - sign_cost[0];
+
+  // Initialize with offsets to swap the class costs with the costs of the
+  //   highest set bit.
+  for (i = 1; i < MV_OFFSET_BITS; ++i) {
+    cost_swap[i] = bits_cost[i - 1][1];
+    if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS];
+  }
+
+  // Seed the fractional costs onto the output (overwritten latter).
+  for (o = 0; o < MV_FP_SIZE; ++o) {
+    int hp;
+    for (hp = 0; hp < 2; ++hp) {
+      v = 2 * o + hp + 1;
+      mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0];
+    }
+  }
+
   mvcost[0] = 0;
-  for (v = 1; v <= MV_MAX; ++v) {
-    int z, c, o, d, e, f, cost = 0;
-    z = v - 1;
-    c = av1_get_mv_class(z, &o);
-    cost += class_cost[c];
-    d = (o >> 3);     /* int mv data */
-    f = (o >> 1) & 3; /* fractional pel mv data */
-    e = (o & 1);      /* high precision mv data */
-    if (c == MV_CLASS_0) {
-      cost += class0_cost[d];
-    } else {
-      const int b = c + CLASS0_BITS - 1; /* number of bits */
-      for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
+  // Fill the costs for each exponent's vectors, using the costs set in the
+  //   previous exponents.
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    const int exponent = (2 * MV_FP_SIZE) << i;
+
+    int class = 0;
+    if (i >= CLASS0_BITS) {
+      class = class_cost[i - CLASS0_BITS + 1];
     }
-    if (precision > MV_SUBPEL_NONE) {
-      if (c == MV_CLASS_0) {
-        cost += class0_fp_cost[d][f];
-      } else {
-        cost += fp_cost[f];
+
+    // Iterate through mantissas, keeping track of the location
+    //   of the highest set bit for the mantissa.
+    // To be clear: in the outer loop, the position of the highest set bit
+    //   (exponent) is tracked and, in this loop, the highest set bit of the
+    //   mantissa is tracked.
+    mantissa = 0;
+    for (j = 0; j <= i; ++j) {
+      for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+        int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+        v = exponent + mantissa + 1;
+        mvcost[v] = cost;
+        mvcost[-v] = cost + negate_sign;
       }
-      if (precision > MV_SUBPEL_LOW_PRECISION) {
-        if (c == MV_CLASS_0) {
-          cost += class0_hp_cost[e];
-        } else {
-          cost += hp_cost[e];
-        }
+      cost_swap[j] += bits_cost[i][0];
+    }
+  }
+
+  // Special case to avoid buffer overrun
+  {
+    int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS;
+    int class = class_cost[MV_CLASSES - 1];
+    mantissa = 0;
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+        int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+        v = exponent + mantissa + 1;
+        mvcost[v] = cost;
+        mvcost[-v] = cost + negate_sign;
       }
     }
-    mvcost[v] = cost + sign_cost[0];
-    mvcost[-v] = cost + sign_cost[1];
+    // t = ((2 * MV_FP_SIZE) << MV_OFFSET_BITS) - 1
+
+    // Manually calculate the final cost offset
+    int cost_swap_hi =
+        bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2];
+    for (; mantissa < exponent - 1; ++mantissa) {
+      int cost = mvcost[mantissa + 1] + class + cost_swap_hi;
+      v = exponent + mantissa + 1;
+      mvcost[v] = cost;
+      mvcost[-v] = cost + negate_sign;
+    }
+  }
+
+  // Fill costs for class0 vectors, overwriting previous placeholder values
+  //   used for calculating the costs of the larger vectors.
+  for (i = 0; i < CLASS0_SIZE; ++i) {
+    const int top = i * 2 * MV_FP_SIZE;
+    for (o = 0; o < MV_FP_SIZE; ++o) {
+      int hp;
+      int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i];
+      for (hp = 0; hp < 2; ++hp) {
+        v = top + 2 * o + hp + 1;
+        mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0];
+        mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1];
+      }
+    }
   }
 }
 
@@ -219,8 +295,8 @@
                               const nmv_context *ctx,
                               MvSubpelPrecision precision) {
   av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
-  build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision);
-  build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
+  av1_build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision);
+  av1_build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
 }
 
 int_mv av1_get_ref_mv_from_stack(int ref_idx,
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 650fc1b..c39001a 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -27,6 +27,9 @@
 void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context *mvctx,
                               MvSubpelPrecision precision);
+void av1_build_nmv_component_cost_table(int *mvcost,
+                                        const nmv_component *const mvcomp,
+                                        MvSubpelPrecision precision);
 
 void av1_update_mv_count(ThreadData *td);
 
diff --git a/test/mv_cost_test.cc b/test/mv_cost_test.cc
new file mode 100644
index 0000000..86e310c
--- /dev/null
+++ b/test/mv_cost_test.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+void ReferenceBuildNmvComponentCostTable(int *mvcost,
+                                         const nmv_component *const mvcomp,
+                                         MvSubpelPrecision precision) {
+  int i, v;
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+  int class0_hp_cost[2], hp_cost[2];
+  av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+  av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+  av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+  }
+  for (i = 0; i < CLASS0_SIZE; ++i)
+    av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL);
+  av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+    av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+  }
+  mvcost[0] = 0;
+  for (v = 1; v <= MV_MAX; ++v) {
+    int z, c, o, d, e, f, cost = 0;
+    z = v - 1;
+    c = av1_get_mv_class(z, &o);
+    cost += class_cost[c];
+    d = (o >> 3);     /* int mv data */
+    f = (o >> 1) & 3; /* fractional pel mv data */
+    e = (o & 1);      /* high precision mv data */
+    if (c == MV_CLASS_0) {
+      cost += class0_cost[d];
+    } else {
+      const int b = c + CLASS0_BITS - 1; /* number of bits */
+      for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
+    }
+    if (precision > MV_SUBPEL_NONE) {
+      if (c == MV_CLASS_0) {
+        cost += class0_fp_cost[d][f];
+      } else {
+        cost += fp_cost[f];
+      }
+      if (precision > MV_SUBPEL_LOW_PRECISION) {
+        if (c == MV_CLASS_0) {
+          cost += class0_hp_cost[e];
+        } else {
+          cost += hp_cost[e];
+        }
+      }
+    }
+    mvcost[v] = cost + sign_cost[0];
+    mvcost[-v] = cost + sign_cost[1];
+  }
+}
+
+// Test using the default context, except for sign
+static const nmv_component kTestComponentContext = {
+  { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, 32762,
+              32767) },  // class_cdf // fp
+  { { AOM_CDF4(16384, 24576, 26624) },
+    { AOM_CDF4(12288, 21248, 24128) } },  // class0_fp_cdf
+  { AOM_CDF4(8192, 17408, 21248) },       // fp_cdf
+  { AOM_CDF2(70 * 128) },                 // sign_cdf
+  { AOM_CDF2(160 * 128) },                // class0_hp_cdf
+  { AOM_CDF2(128 * 128) },                // hp_cdf
+  { AOM_CDF2(216 * 128) },                // class0_cdf
+  { { AOM_CDF2(128 * 136) },
+    { AOM_CDF2(128 * 140) },
+    { AOM_CDF2(128 * 148) },
+    { AOM_CDF2(128 * 160) },
+    { AOM_CDF2(128 * 176) },
+    { AOM_CDF2(128 * 192) },
+    { AOM_CDF2(128 * 224) },
+    { AOM_CDF2(128 * 234) },
+    { AOM_CDF2(128 * 234) },
+    { AOM_CDF2(128 * 240) } },  // bits_cdf
+};
+
+void TestMvComponentCostTable(MvSubpelPrecision precision) {
+  std::unique_ptr<int[]> mvcost_ref_buf(new int[MV_VALS]);
+  std::unique_ptr<int[]> mvcost_buf(new int[MV_VALS]);
+  int *mvcost_ref = mvcost_ref_buf.get() + MV_MAX;
+  int *mvcost = mvcost_buf.get() + MV_MAX;
+
+  ReferenceBuildNmvComponentCostTable(mvcost_ref, &kTestComponentContext,
+                                      precision);
+  av1_build_nmv_component_cost_table(mvcost, &kTestComponentContext, precision);
+
+  for (int v = 0; v <= MV_MAX; ++v) {
+    ASSERT_EQ(mvcost_ref[v], mvcost[v]) << "v = " << v;
+    ASSERT_EQ(mvcost_ref[-v], mvcost[-v]) << "v = " << v;
+  }
+}
+
+TEST(MvCostTest, BuildMvComponentCostTableTest1) {
+  TestMvComponentCostTable(MV_SUBPEL_NONE);
+}
+
+TEST(MvCostTest, BuildMvComponentCostTableTest2) {
+  TestMvComponentCostTable(MV_SUBPEL_LOW_PRECISION);
+}
+
+TEST(MvCostTest, BuildMvComponentCostTableTest3) {
+  TestMvComponentCostTable(MV_SUBPEL_HIGH_PRECISION);
+}
+
+}  // namespace
\ No newline at end of file
diff --git a/test/test.cmake b/test/test.cmake
index 92c8512..9ad832e 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -179,6 +179,7 @@
               "${AOM_ROOT}/test/masked_sad_test.cc"
               "${AOM_ROOT}/test/masked_variance_test.cc"
               "${AOM_ROOT}/test/motion_vector_test.cc"
+              "${AOM_ROOT}/test/mv_cost_test.cc"
               "${AOM_ROOT}/test/noise_model_test.cc"
               "${AOM_ROOT}/test/obmc_sad_test.cc"
               "${AOM_ROOT}/test/obmc_variance_test.cc"