Use SAD for global motion error metric

Replace the custom error metric (sum of |diff|^0.7) with SAD (sum of
absolute differences).
This has three advantages:
1) Better results
2) SAD is faster to calculate
3) We can remove ~1300 lines of code dedicated to the custom error
   metric

Note: Despite point (2), the overall encode time increases slightly.
This is expected - improving the accuracy of global motion leads to
global motion being evaluated more often, and this outweighs the time
saved by the faster error metric evaluations. Overall, though, this
significantly improves the ratio of coding gain to encode time cost for
global motion.

Impact on "good" mode:

 Speed | BDRATE-PSNR | BDRATE-SSIM |   Enc time
-------+-------------+-------------+-------------
   1   |   -0.087%   |   -0.102%   |   +0.085%
   2   |   -0.075%   |   -0.074%   |   +0.331%
   3   |   -0.111%   |   -0.122%   |   +0.445%
   4   |   -0.110%   |   -0.118%   |   +0.598%

Other modes, and good mode speed 5+, are unaffected.

STATS_CHANGED

Change-Id: Ifdc52c5012d6936a97a87cff489062402e81e495
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f5a8a9f..1bb0539 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -314,7 +314,6 @@
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c"
-            "${AOM_ROOT}/av1/encoder/x86/frame_error_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
@@ -344,7 +343,6 @@
             "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
-            "${AOM_ROOT}/av1/encoder/x86/frame_error_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c"
             "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
@@ -358,7 +356,6 @@
             "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
-            "${AOM_ROOT}/av1/encoder/arm/neon/frame_error_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
@@ -487,7 +484,6 @@
               "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
 
   list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
-              "${AOM_ROOT}/av1/encoder/arm/neon/highbd_frame_error_neon.c"
               "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
               "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
               "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
@@ -508,14 +504,10 @@
 endif()
 
 if(CONFIG_REALTIME_ONLY)
-  list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2
-                   "${AOM_ROOT}/av1/encoder/x86/frame_error_sse2.c")
-
   list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1
                    "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
   list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
-                   "${AOM_ROOT}/av1/encoder/x86/frame_error_avx2.c"
                    "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
                    "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
 
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 9941947..bd2acda 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -494,17 +494,6 @@
     add_proto qw/int av1_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
     specialize qw/av1_denoiser_filter neon sse2/;
   }
-
-  # Global motion
-  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/int64_t av1_calc_frame_error/, "const uint8_t *const ref, int ref_stride, const uint8_t *const dst, int dst_stride, int p_width, int p_height";
-    specialize qw/av1_calc_frame_error sse2 avx2 neon/;
-
-    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/int64_t av1_calc_highbd_frame_error/, "const uint16_t *const ref, int ref_stride, const uint16_t *const dst, int dst_stride, int p_width, int p_height, int bd";
-      specialize qw/av1_calc_highbd_frame_error sse2 avx2 neon/;
-    }
-  }
 }
 # end encoder functions
 
diff --git a/av1/encoder/arm/neon/frame_error_neon.c b/av1/encoder/arm/neon/frame_error_neon.c
deleted file mode 100644
index a00d821..0000000
--- a/av1/encoder/arm/neon/frame_error_neon.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-/* clang-format off */
-// Error metric used for global motion evaluation.
-static const uint16_t error_measure_lut[256] = {
-        0,   339,   550,   731,   894,  1045,  1187,  1323,
-     1452,  1577,  1698,  1815,  1929,  2040,  2148,  2255,
-     2359,  2461,  2562,  2661,  2758,  2854,  2948,  3041,
-     3133,  3224,  3314,  3402,  3490,  3577,  3663,  3748,
-     3832,  3916,  3998,  4080,  4162,  4242,  4322,  4401,
-     4480,  4558,  4636,  4713,  4789,  4865,  4941,  5015,
-     5090,  5164,  5237,  5311,  5383,  5456,  5527,  5599,
-     5670,  5741,  5811,  5881,  5950,  6020,  6089,  6157,
-     6225,  6293,  6361,  6428,  6495,  6562,  6628,  6695,
-     6760,  6826,  6891,  6956,  7021,  7086,  7150,  7214,
-     7278,  7341,  7405,  7468,  7531,  7593,  7656,  7718,
-     7780,  7842,  7903,  7965,  8026,  8087,  8148,  8208,
-     8269,  8329,  8389,  8449,  8508,  8568,  8627,  8686,
-     8745,  8804,  8862,  8921,  8979,  9037,  9095,  9153,
-     9211,  9268,  9326,  9383,  9440,  9497,  9553,  9610,
-     9666,  9723,  9779,  9835,  9891,  9947, 10002, 10058,
-    10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
-    10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
-    10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
-    11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
-    11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
-    12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
-    12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
-    13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
-    13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
-    13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
-    14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
-    14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
-    14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
-    15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
-    15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
-    16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
-};
-/* clang-format on */
-
-int64_t av1_calc_frame_error_neon(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int dst_stride,
-                                  int width, int height) {
-  int64_t sum_error[4] = { 0, 0, 0, 0 };
-  int r = 0;
-  int d = 0;
-
-  do {
-    int w = width;
-    int rr = r;
-    int dd = d;
-
-    do {
-      uint8x16_t dst_v = vld1q_u8(&dst[dd]);
-      uint8x16_t ref_v = vld1q_u8(&ref[rr]);
-
-#if AOM_ARCH_AARCH64
-      uint64x2_t abs_v = vreinterpretq_u64_u8(vabdq_u8(dst_v, ref_v));
-
-      uint64_t abs0 = vgetq_lane_u64(abs_v, 0);
-      uint64_t abs1 = vgetq_lane_u64(abs_v, 1);
-
-      sum_error[0] += error_measure_lut[(abs0 >> 0) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs0 >> 8) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs0 >> 16) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs0 >> 24) & 0xFF];
-      sum_error[0] += error_measure_lut[(abs0 >> 32) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs0 >> 40) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs0 >> 48) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs0 >> 56) & 0xFF];
-
-      sum_error[0] += error_measure_lut[(abs1 >> 0) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs1 >> 8) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs1 >> 16) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs1 >> 24) & 0xFF];
-      sum_error[0] += error_measure_lut[(abs1 >> 32) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs1 >> 40) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs1 >> 48) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs1 >> 56) & 0xFF];
-#else   // !AOM_ARCH_AARCH64
-      uint32x4_t abs_v = vreinterpretq_u32_u8(vabdq_u8(dst_v, ref_v));
-
-      uint32_t abs0 = vgetq_lane_u32(abs_v, 0);
-      uint32_t abs1 = vgetq_lane_u32(abs_v, 1);
-      uint32_t abs2 = vgetq_lane_u32(abs_v, 2);
-      uint32_t abs3 = vgetq_lane_u32(abs_v, 3);
-
-      sum_error[0] += error_measure_lut[(abs0 >> 0) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs0 >> 8) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs0 >> 16) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs0 >> 24) & 0xFF];
-      sum_error[0] += error_measure_lut[(abs1 >> 0) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs1 >> 8) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs1 >> 16) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs1 >> 24) & 0xFF];
-
-      sum_error[0] += error_measure_lut[(abs2 >> 0) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs2 >> 8) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs2 >> 16) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs2 >> 24) & 0xFF];
-      sum_error[0] += error_measure_lut[(abs3 >> 0) & 0xFF];
-      sum_error[1] += error_measure_lut[(abs3 >> 8) & 0xFF];
-      sum_error[2] += error_measure_lut[(abs3 >> 16) & 0xFF];
-      sum_error[3] += error_measure_lut[(abs3 >> 24) & 0xFF];
-#endif  // AOM_ARCH_AARCH64
-
-      dd += 16;
-      rr += 16;
-      w -= 16;
-    } while (w >= 16);
-
-    while (w-- != 0) {
-      sum_error[0] += error_measure_lut[abs(dst[dd] - ref[rr])];
-      dd++;
-      rr++;
-    }
-
-    r += ref_stride;
-    d += dst_stride;
-  } while (--height != 0);
-
-  return sum_error[0] + sum_error[1] + sum_error[2] + sum_error[3];
-}
diff --git a/av1/encoder/arm/neon/highbd_frame_error_neon.c b/av1/encoder/arm/neon/highbd_frame_error_neon.c
deleted file mode 100644
index 58875f6..0000000
--- a/av1/encoder/arm/neon/highbd_frame_error_neon.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2023, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-static const uint16_t error_measure_lut_diff[257] = {
-  339, 211, 181, 163, 151, 142, 136, 129, 125, 121, 117, 114, 111, 108, 107,
-  104, 102, 101, 99,  97,  96,  94,  93,  92,  91,  90,  88,  88,  87,  86,
-  85,  84,  84,  82,  82,  82,  80,  80,  79,  79,  78,  78,  77,  76,  76,
-  76,  74,  75,  74,  73,  74,  72,  73,  71,  72,  71,  71,  70,  70,  69,
-  70,  69,  68,  68,  68,  68,  67,  67,  67,  66,  67,  65,  66,  65,  65,
-  65,  65,  64,  64,  64,  63,  64,  63,  63,  62,  63,  62,  62,  62,  61,
-  62,  61,  61,  61,  60,  61,  60,  60,  60,  59,  60,  59,  59,  59,  59,
-  58,  59,  58,  58,  58,  58,  58,  57,  58,  57,  57,  57,  56,  57,  56,
-  57,  56,  56,  56,  56,  55,  56,  55,  55,  56,  55,  54,  55,  55,  54,
-  55,  54,  54,  54,  54,  54,  53,  54,  53,  54,  53,  53,  53,  53,  53,
-  52,  53,  52,  53,  52,  52,  52,  52,  52,  52,  52,  51,  52,  51,  51,
-  52,  51,  51,  51,  50,  51,  51,  50,  51,  50,  51,  50,  50,  50,  50,
-  50,  50,  49,  50,  50,  49,  50,  49,  49,  49,  49,  49,  49,  49,  49,
-  49,  48,  49,  48,  49,  48,  48,  49,  48,  48,  48,  48,  47,  48,  48,
-  48,  47,  48,  47,  47,  48,  47,  47,  47,  47,  47,  47,  47,  47,  47,
-  46,  47,  46,  47,  46,  47,  46,  46,  46,  47,  46,  46,  46,  45,  46,
-  46,  46,  45,  46,  46,  45,  45,  46,  45,  45,  46,  45,  45,  45,  45,
-  0
-};
-
-static const int hbd_12_error_measure_lut[257] = {
-  0,      5424,   8800,   11696,  14304,  16720,  18992,  21168,  23232,
-  25232,  27168,  29040,  30864,  32640,  34368,  36080,  37744,  39376,
-  40992,  42576,  44128,  45664,  47168,  48656,  50128,  51584,  53024,
-  54432,  55840,  57232,  58608,  59968,  61312,  62656,  63968,  65280,
-  66592,  67872,  69152,  70416,  71680,  72928,  74176,  75408,  76624,
-  77840,  79056,  80240,  81440,  82624,  83792,  84976,  86128,  87296,
-  88432,  89584,  90720,  91856,  92976,  94096,  95200,  96320,  97424,
-  98512,  99600,  100688, 101776, 102848, 103920, 104992, 106048, 107120,
-  108160, 109216, 110256, 111296, 112336, 113376, 114400, 115424, 116448,
-  117456, 118480, 119488, 120496, 121488, 122496, 123488, 124480, 125472,
-  126448, 127440, 128416, 129392, 130368, 131328, 132304, 133264, 134224,
-  135184, 136128, 137088, 138032, 138976, 139920, 140864, 141792, 142736,
-  143664, 144592, 145520, 146448, 147376, 148288, 149216, 150128, 151040,
-  151952, 152848, 153760, 154656, 155568, 156464, 157360, 158256, 159152,
-  160032, 160928, 161808, 162688, 163584, 164464, 165328, 166208, 167088,
-  167952, 168832, 169696, 170560, 171424, 172288, 173152, 174000, 174864,
-  175712, 176576, 177424, 178272, 179120, 179968, 180816, 181648, 182496,
-  183328, 184176, 185008, 185840, 186672, 187504, 188336, 189168, 190000,
-  190816, 191648, 192464, 193280, 194112, 194928, 195744, 196560, 197360,
-  198176, 198992, 199792, 200608, 201408, 202224, 203024, 203824, 204624,
-  205424, 206224, 207024, 207808, 208608, 209408, 210192, 210992, 211776,
-  212560, 213344, 214128, 214912, 215696, 216480, 217264, 218048, 218816,
-  219600, 220368, 221152, 221920, 222688, 223472, 224240, 225008, 225776,
-  226544, 227296, 228064, 228832, 229600, 230352, 231120, 231872, 232624,
-  233392, 234144, 234896, 235648, 236400, 237152, 237904, 238656, 239408,
-  240160, 240896, 241648, 242384, 243136, 243872, 244624, 245360, 246096,
-  246832, 247584, 248320, 249056, 249792, 250512, 251248, 251984, 252720,
-  253440, 254176, 254912, 255632, 256352, 257088, 257808, 258528, 259264,
-  259984, 260704, 261424, 262144, 262144,
-};
-
-static const int hbd_10_error_measure_lut[257] = {
-  0,     1356,  2200,  2924,  3576,  4180,  4748,  5292,  5808,  6308,  6792,
-  7260,  7716,  8160,  8592,  9020,  9436,  9844,  10248, 10644, 11032, 11416,
-  11792, 12164, 12532, 12896, 13256, 13608, 13960, 14308, 14652, 14992, 15328,
-  15664, 15992, 16320, 16648, 16968, 17288, 17604, 17920, 18232, 18544, 18852,
-  19156, 19460, 19764, 20060, 20360, 20656, 20948, 21244, 21532, 21824, 22108,
-  22396, 22680, 22964, 23244, 23524, 23800, 24080, 24356, 24628, 24900, 25172,
-  25444, 25712, 25980, 26248, 26512, 26780, 27040, 27304, 27564, 27824, 28084,
-  28344, 28600, 28856, 29112, 29364, 29620, 29872, 30124, 30372, 30624, 30872,
-  31120, 31368, 31612, 31860, 32104, 32348, 32592, 32832, 33076, 33316, 33556,
-  33796, 34032, 34272, 34508, 34744, 34980, 35216, 35448, 35684, 35916, 36148,
-  36380, 36612, 36844, 37072, 37304, 37532, 37760, 37988, 38212, 38440, 38664,
-  38892, 39116, 39340, 39564, 39788, 40008, 40232, 40452, 40672, 40896, 41116,
-  41332, 41552, 41772, 41988, 42208, 42424, 42640, 42856, 43072, 43288, 43500,
-  43716, 43928, 44144, 44356, 44568, 44780, 44992, 45204, 45412, 45624, 45832,
-  46044, 46252, 46460, 46668, 46876, 47084, 47292, 47500, 47704, 47912, 48116,
-  48320, 48528, 48732, 48936, 49140, 49340, 49544, 49748, 49948, 50152, 50352,
-  50556, 50756, 50956, 51156, 51356, 51556, 51756, 51952, 52152, 52352, 52548,
-  52748, 52944, 53140, 53336, 53532, 53728, 53924, 54120, 54316, 54512, 54704,
-  54900, 55092, 55288, 55480, 55672, 55868, 56060, 56252, 56444, 56636, 56824,
-  57016, 57208, 57400, 57588, 57780, 57968, 58156, 58348, 58536, 58724, 58912,
-  59100, 59288, 59476, 59664, 59852, 60040, 60224, 60412, 60596, 60784, 60968,
-  61156, 61340, 61524, 61708, 61896, 62080, 62264, 62448, 62628, 62812, 62996,
-  63180, 63360, 63544, 63728, 63908, 64088, 64272, 64452, 64632, 64816, 64996,
-  65176, 65356, 65536, 65536,
-};
-
-static const int hbd_8_error_measure_lut[257] = {
-  0,     339,   550,   731,   894,   1045,  1187,  1323,  1452,  1577,  1698,
-  1815,  1929,  2040,  2148,  2255,  2359,  2461,  2562,  2661,  2758,  2854,
-  2948,  3041,  3133,  3224,  3314,  3402,  3490,  3577,  3663,  3748,  3832,
-  3916,  3998,  4080,  4162,  4242,  4322,  4401,  4480,  4558,  4636,  4713,
-  4789,  4865,  4941,  5015,  5090,  5164,  5237,  5311,  5383,  5456,  5527,
-  5599,  5670,  5741,  5811,  5881,  5950,  6020,  6089,  6157,  6225,  6293,
-  6361,  6428,  6495,  6562,  6628,  6695,  6760,  6826,  6891,  6956,  7021,
-  7086,  7150,  7214,  7278,  7341,  7405,  7468,  7531,  7593,  7656,  7718,
-  7780,  7842,  7903,  7965,  8026,  8087,  8148,  8208,  8269,  8329,  8389,
-  8449,  8508,  8568,  8627,  8686,  8745,  8804,  8862,  8921,  8979,  9037,
-  9095,  9153,  9211,  9268,  9326,  9383,  9440,  9497,  9553,  9610,  9666,
-  9723,  9779,  9835,  9891,  9947,  10002, 10058, 10113, 10168, 10224, 10279,
-  10333, 10388, 10443, 10497, 10552, 10606, 10660, 10714, 10768, 10822, 10875,
-  10929, 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, 11458,
-  11511, 11563, 11615, 11667, 11719, 11771, 11823, 11875, 11926, 11978, 12029,
-  12080, 12132, 12183, 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
-  12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, 13088, 13137,
-  13187, 13236, 13285, 13334, 13383, 13432, 13481, 13530, 13579, 13628, 13676,
-  13725, 13773, 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
-  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, 14634, 14681, 14728,
-  14775, 14822, 14869, 14916, 14963, 15010, 15056, 15103, 15149, 15196, 15242,
-  15289, 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, 15749,
-  15795, 15840, 15886, 15932, 15977, 16022, 16068, 16113, 16158, 16204, 16249,
-  16294, 16339, 16384, 16384,
-};
-
-// Split error into two parts and do an interpolated table lookup.
-// To compute the table index and interpolation value, we want to calculate
-// the quotient and remainder of (dst - ref) / 2^(bd - 8).
-#define HBD_CALC_FRAME_ERROR(bd, offset, mask)                              \
-  static INLINE int highbd_##bd##_error_measure(int q, int r) {             \
-    return (hbd_##bd##_error_measure_lut[q]) +                              \
-           (error_measure_lut_diff[q]) * r;                                 \
-  }                                                                         \
-                                                                            \
-  int64_t av1_calc_highbd_##bd##_frame_error_neon(                          \
-      const uint16_t *const ref, int ref_stride, const uint16_t *const dst, \
-      int dst_stride, int width, int height) {                              \
-    int64_t sum_error[4] = { 0, 0, 0, 0 };                                  \
-    int r = 0;                                                              \
-    int d = 0;                                                              \
-                                                                            \
-    do {                                                                    \
-      int w = width;                                                        \
-      int rr = r;                                                           \
-      int dd = d;                                                           \
-                                                                            \
-      do {                                                                  \
-        uint16x8_t dst_v = vld1q_u16(&dst[dd]);                             \
-        uint16x8_t ref_v = vld1q_u16(&ref[rr]);                             \
-                                                                            \
-        uint64x2_t abs_v = vreinterpretq_u64_u16(vabdq_u16(dst_v, ref_v));  \
-                                                                            \
-        uint64_t abs0 = vgetq_lane_u64(abs_v, 0);                           \
-        uint64_t abs1 = vgetq_lane_u64(abs_v, 1);                           \
-                                                                            \
-        sum_error[0] += highbd_##bd##_error_measure(                        \
-            (abs0 >> (0 + offset)) & 0xFF, (abs0 >> 0) & mask);             \
-        sum_error[1] += highbd_##bd##_error_measure(                        \
-            (abs0 >> (16 + offset)) & 0xFF, (abs0 >> 16) & mask);           \
-        sum_error[2] += highbd_##bd##_error_measure(                        \
-            (abs0 >> (32 + offset)) & 0xFF, (abs0 >> 32) & mask);           \
-        sum_error[3] += highbd_##bd##_error_measure(                        \
-            (abs0 >> (48 + offset)) & 0xFF, (abs0 >> 48) & mask);           \
-                                                                            \
-        sum_error[0] += highbd_##bd##_error_measure(                        \
-            (abs1 >> (0 + offset)) & 0xFF, (abs1 >> 0) & mask);             \
-        sum_error[1] += highbd_##bd##_error_measure(                        \
-            (abs1 >> (16 + offset)) & 0xFF, (abs1 >> 16) & mask);           \
-        sum_error[2] += highbd_##bd##_error_measure(                        \
-            (abs1 >> (32 + offset)) & 0xFF, (abs1 >> 32) & mask);           \
-        sum_error[3] += highbd_##bd##_error_measure(                        \
-            (abs1 >> (48 + offset)) & 0xFF, (abs1 >> 48) & mask);           \
-                                                                            \
-        dd += 8;                                                            \
-        rr += 8;                                                            \
-        w -= 8;                                                             \
-      } while (w >= 8);                                                     \
-                                                                            \
-      while (w-- != 0) {                                                    \
-        uint16_t abs_u16 = abs(dst[dd] - ref[rr]);                          \
-        sum_error[0] +=                                                     \
-            highbd_##bd##_error_measure(abs_u16 >> offset, abs_u16 & mask); \
-        dd++;                                                               \
-        rr++;                                                               \
-      }                                                                     \
-                                                                            \
-      r += ref_stride;                                                      \
-      d += dst_stride;                                                      \
-    } while (--height != 0);                                                \
-                                                                            \
-    return sum_error[0] + sum_error[1] + sum_error[2] + sum_error[3];       \
-  }
-
-// 12 bitdepth
-HBD_CALC_FRAME_ERROR(12, 4, 0xF)
-// 10 bitdepth
-HBD_CALC_FRAME_ERROR(10, 2, 0x3)
-// 8 bitdepth
-HBD_CALC_FRAME_ERROR(8, 0, 0x0)
-
-int64_t av1_calc_highbd_frame_error_neon(const uint16_t *const ref,
-                                         int ref_stride,
-                                         const uint16_t *const dst,
-                                         int dst_stride, int width, int height,
-                                         int bd) {
-  switch (bd) {
-    case 8:
-    default:
-      return av1_calc_highbd_8_frame_error_neon(ref, ref_stride, dst,
-                                                dst_stride, width, height);
-    case 10:
-      return av1_calc_highbd_10_frame_error_neon(ref, ref_stride, dst,
-                                                 dst_stride, width, height);
-    case 12:
-      return av1_calc_highbd_12_frame_error_neon(ref, ref_stride, dst,
-                                                 dst_stride, width, height);
-  }
-}
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index ca9367a..73910de 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -187,23 +187,31 @@
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-int64_t av1_calc_highbd_frame_error_c(const uint16_t *const ref, int ref_stride,
-                                      const uint16_t *const dst, int dst_stride,
-                                      int p_width, int p_height, int bd) {
-  int64_t sum_error = 0;
+static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride,
+                                     const uint16_t *const dst, int dst_stride,
+                                     int p_width, int p_height) {
+  // This function should only be called for patches no larger than
+  // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+  // small enough that we don't need a 64-bit accumulator.
+  assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+  int sad = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
-      sum_error += highbd_error_measure(
-          dst[j + i * dst_stride] - ref[j + i * ref_stride], bd);
+      sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
     }
   }
-  return sum_error;
+  return sad;
 }
 
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_segmented_frame_error"
+#endif  // WARP_ERROR_BLOCK != 32
 static int64_t highbd_segmented_frame_error(
     const uint16_t *const ref, int ref_stride, const uint16_t *const dst,
     int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map,
     int segment_map_stride) {
+  (void)bd;
   int patch_w, patch_h;
   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
@@ -219,14 +227,24 @@
       // avoid computing error into the frame padding
       patch_w = AOMMIN(error_bsize_w, p_width - j);
       patch_h = AOMMIN(error_bsize_h, p_height - i);
-      sum_error += av1_calc_highbd_frame_error(
-          ref + j + i * ref_stride, ref_stride, dst + j + i * dst_stride,
-          dst_stride, patch_w, patch_h, bd);
+
+      if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+        sum_error += aom_highbd_sad32x32(
+            CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride,
+            CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+      } else {
+        sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride,
+                                        dst + j + i * dst_stride, dst_stride,
+                                        patch_w, patch_h);
+      }
     }
   }
   return sum_error;
 }
 
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_warp_error"
+#endif  // WARP_ERROR_BLOCK != 32
 static int64_t highbd_warp_error(WarpedMotionParams *wm,
                                  const uint16_t *const ref, int ref_width,
                                  int ref_height, int ref_stride,
@@ -256,9 +274,17 @@
       highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i,
                         warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x,
                         subsampling_y, bd, &conv_params);
-      gm_sumerr += av1_calc_highbd_frame_error(tmp, WARP_ERROR_BLOCK,
-                                               dst + j + i * dst_stride,
-                                               dst_stride, warp_w, warp_h, bd);
+
+      if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+        gm_sumerr += aom_highbd_sad32x32(
+            CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK,
+            CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+      } else {
+        gm_sumerr +=
+            generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+                               dst_stride, warp_w, warp_h);
+      }
+
       if (gm_sumerr > best_error) return INT64_MAX;
     }
   }
@@ -266,19 +292,26 @@
 }
 #endif
 
-int64_t av1_calc_frame_error_c(const uint8_t *const ref, int ref_stride,
-                               const uint8_t *const dst, int dst_stride,
-                               int p_width, int p_height) {
-  int64_t sum_error = 0;
+static INLINE int generic_sad(const uint8_t *const ref, int ref_stride,
+                              const uint8_t *const dst, int dst_stride,
+                              int p_width, int p_height) {
+  // This function should only be called for patches no larger than
+  // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+  // small enough that we don't need a 64-bit accumulator.
+  assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+  int sad = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
-      sum_error += (int64_t)error_measure(dst[j + i * dst_stride] -
-                                          ref[j + i * ref_stride]);
+      sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
     }
   }
-  return sum_error;
+  return sad;
 }
 
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in segmented_frame_error"
+#endif  // WARP_ERROR_BLOCK != 32
 static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride,
                                      const uint8_t *const dst, int dst_stride,
                                      int p_width, int p_height,
@@ -299,14 +332,23 @@
       // avoid computing error into the frame padding
       patch_w = AOMMIN(error_bsize_w, p_width - j);
       patch_h = AOMMIN(error_bsize_h, p_height - i);
-      sum_error += av1_calc_frame_error(ref + j + i * ref_stride, ref_stride,
-                                        dst + j + i * dst_stride, dst_stride,
-                                        patch_w, patch_h);
+
+      if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+        sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride,
+                                  dst + j + i * dst_stride, dst_stride);
+      } else {
+        sum_error +=
+            generic_sad(ref + j + i * ref_stride, ref_stride,
+                        dst + j + i * dst_stride, dst_stride, patch_w, patch_h);
+      }
     }
   }
   return sum_error;
 }
 
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in warp_error"
+#endif  // WARP_ERROR_BLOCK != 32
 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
                           int ref_width, int ref_height, int ref_stride,
                           const uint8_t *const dst, int dst_stride, int p_col,
@@ -337,31 +379,21 @@
                  warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
                  &conv_params);
 
-      gm_sumerr +=
-          av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
-                               dst_stride, warp_w, warp_h);
+      if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+        gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK,
+                                  dst + j + i * dst_stride, dst_stride);
+      } else {
+        gm_sumerr +=
+            generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+                        dst_stride, warp_w, warp_h);
+      }
+
       if (gm_sumerr > best_error) return INT64_MAX;
     }
   }
   return gm_sumerr;
 }
 
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int ref_stride,
-                        uint8_t *dst, int dst_stride, int p_width,
-                        int p_height) {
-#if CONFIG_AV1_HIGHBITDEPTH
-  if (use_hbd) {
-    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), ref_stride,
-                                       CONVERT_TO_SHORTPTR(dst), dst_stride,
-                                       p_width, p_height, bd);
-  }
-#endif
-  (void)use_hbd;
-  (void)bd;
-  return av1_calc_frame_error(ref, ref_stride, dst, dst_stride, p_width,
-                              p_height);
-}
-
 int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
                                   int ref_stride, uint8_t *dst, int dst_stride,
                                   int p_width, int p_height,
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
index ff2287b..8c9c60f 100644
--- a/av1/encoder/global_motion.h
+++ b/av1/encoder/global_motion.h
@@ -128,11 +128,6 @@
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-// Returns the error between the frame described by 'ref' and the frame
-// described by 'dst'.
-int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
-                        uint8_t *dst, int p_width, int p_height, int p_stride);
-
 int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
                                   int ref_stride, uint8_t *dst, int dst_stride,
                                   int p_width, int p_height,
diff --git a/av1/encoder/x86/frame_error_avx2.c b/av1/encoder/x86/frame_error_avx2.c
deleted file mode 100644
index ceac2b2..0000000
--- a/av1/encoder/x86/frame_error_avx2.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include "config/av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-#include "av1/encoder/global_motion.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#if CONFIG_AV1_HIGHBITDEPTH
-int64_t av1_calc_highbd_frame_error_avx2(const uint16_t *const ref,
-                                         int ref_stride,
-                                         const uint16_t *const dst,
-                                         int dst_stride, int p_width,
-                                         int p_height, int bd) {
-  const int b = bd - 8;
-  const __m128i shift = _mm_cvtsi32_si128(b);
-  const __m256i bmask = _mm256_set1_epi16((1 << b) - 1);
-  const __m256i v = _mm256_set1_epi16(1 << b);
-
-  int64_t sum_error = 0;
-  int i, j;
-  __m256i row_error, col_error;
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i dup_256 = _mm256_set1_epi16(256);
-  const __m256i dup_257 = _mm256_set1_epi16(257);
-  col_error = zero;
-  for (i = 0; i < (p_height / 2); i++) {
-    row_error = _mm256_setzero_si256();
-    for (j = 0; j < (p_width / 16); j++) {
-      const __m256i ref_1_16 = _mm256_load_si256(
-          (__m256i *)(ref + (j * 16) + (((i * 2) + 0) * ref_stride)));
-      const __m256i dst_1_16 = _mm256_load_si256(
-          (__m256i *)(dst + (j * 16) + (((i * 2) + 0) * dst_stride)));
-      const __m256i ref_2_16 = _mm256_load_si256(
-          (__m256i *)(ref + (j * 16) + (((i * 2) + 1) * ref_stride)));
-      const __m256i dst_2_16 = _mm256_load_si256(
-          (__m256i *)(dst + (j * 16) + (((i * 2) + 1) * dst_stride)));
-
-      const __m256i diff_1 = _mm256_sub_epi16(dst_1_16, ref_1_16);
-      const __m256i diff_2 = _mm256_sub_epi16(dst_2_16, ref_2_16);
-
-      const __m256i e1_1 = _mm256_sra_epi16(diff_1, shift);
-      const __m256i e2_1 = _mm256_and_si256(diff_1, bmask);
-      const __m256i e1_2 = _mm256_sra_epi16(diff_2, shift);
-      const __m256i e2_2 = _mm256_and_si256(diff_2, bmask);
-
-      // We need to fetch two 16-bit error values per pixel, so that we can
-      // interpolate, but AVX2 only supports 32-bit gathers. Therefore we
-      // need to expand each register of indices into two 8x32-bit registers,
-      // gather, then re-combine into 16x16-bit registers.
-      const __m256i idx1_1 = _mm256_add_epi16(e1_1, dup_256);
-      const __m256i idx2_1 = _mm256_add_epi16(e1_1, dup_257);
-      const __m256i idx1_2 = _mm256_add_epi16(e1_2, dup_256);
-      const __m256i idx2_2 = _mm256_add_epi16(e1_2, dup_257);
-
-      const __m256i idx1_lo_1 = _mm256_unpacklo_epi16(idx1_1, zero);
-      const __m256i idx1_hi_1 = _mm256_unpackhi_epi16(idx1_1, zero);
-      const __m256i idx2_lo_1 = _mm256_unpacklo_epi16(idx2_1, zero);
-      const __m256i idx2_hi_1 = _mm256_unpackhi_epi16(idx2_1, zero);
-
-      const __m256i idx1_lo_2 = _mm256_unpacklo_epi16(idx1_2, zero);
-      const __m256i idx1_hi_2 = _mm256_unpackhi_epi16(idx1_2, zero);
-      const __m256i idx2_lo_2 = _mm256_unpacklo_epi16(idx2_2, zero);
-      const __m256i idx2_hi_2 = _mm256_unpackhi_epi16(idx2_2, zero);
-
-      const __m256i error_1_lo_1 =
-          _mm256_i32gather_epi32(error_measure_lut, idx1_lo_1, 4);
-      const __m256i error_1_hi_1 =
-          _mm256_i32gather_epi32(error_measure_lut, idx1_hi_1, 4);
-      const __m256i error_2_lo_1 =
-          _mm256_i32gather_epi32(error_measure_lut, idx2_lo_1, 4);
-      const __m256i error_2_hi_1 =
-          _mm256_i32gather_epi32(error_measure_lut, idx2_hi_1, 4);
-      const __m256i error_1_lo_2 =
-          _mm256_i32gather_epi32(error_measure_lut, idx1_lo_2, 4);
-      const __m256i error_1_hi_2 =
-          _mm256_i32gather_epi32(error_measure_lut, idx1_hi_2, 4);
-      const __m256i error_2_lo_2 =
-          _mm256_i32gather_epi32(error_measure_lut, idx2_lo_2, 4);
-      const __m256i error_2_hi_2 =
-          _mm256_i32gather_epi32(error_measure_lut, idx2_hi_2, 4);
-
-      const __m256i error_1_1 = _mm256_packus_epi32(error_1_lo_1, error_1_hi_1);
-      const __m256i error_2_1 = _mm256_packus_epi32(error_2_lo_1, error_2_hi_1);
-      const __m256i error_1_2 = _mm256_packus_epi32(error_1_lo_2, error_1_hi_2);
-      const __m256i error_2_2 = _mm256_packus_epi32(error_2_lo_2, error_2_hi_2);
-
-      // Interleave the error and multiplier arrays
-      // The unpack instructions implicitly reorder the pixels, but the
-      // reordering is consistent between the two arrays being multiplied,
-      // and we sum everything into one value at the end, so this does not
-      // affect the final result.
-      const __m256i e2_inv_1 = _mm256_sub_epi16(v, e2_1);
-      const __m256i e2_inv_2 = _mm256_sub_epi16(v, e2_2);
-
-      const __m256i error_lo_1 = _mm256_unpacklo_epi16(error_1_1, error_2_1);
-      const __m256i error_hi_1 = _mm256_unpackhi_epi16(error_1_1, error_2_1);
-      const __m256i mul_lo_1 = _mm256_unpacklo_epi16(e2_inv_1, e2_1);
-      const __m256i mul_hi_1 = _mm256_unpackhi_epi16(e2_inv_1, e2_1);
-
-      const __m256i error_lo_2 = _mm256_unpacklo_epi16(error_1_2, error_2_2);
-      const __m256i error_hi_2 = _mm256_unpackhi_epi16(error_1_2, error_2_2);
-      const __m256i mul_lo_2 = _mm256_unpacklo_epi16(e2_inv_2, e2_2);
-      const __m256i mul_hi_2 = _mm256_unpackhi_epi16(e2_inv_2, e2_2);
-
-      const __m256i result_lo_1 = _mm256_madd_epi16(error_lo_1, mul_lo_1);
-      const __m256i result_hi_1 = _mm256_madd_epi16(error_hi_1, mul_hi_1);
-      const __m256i result_lo_2 = _mm256_madd_epi16(error_lo_2, mul_lo_2);
-      const __m256i result_hi_2 = _mm256_madd_epi16(error_hi_2, mul_hi_2);
-
-      const __m256i partial_sum =
-          _mm256_add_epi32(_mm256_add_epi32(result_lo_1, result_hi_1),
-                           _mm256_add_epi32(result_lo_2, result_hi_2));
-
-      row_error = _mm256_add_epi32(row_error, partial_sum);
-    }
-    const __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
-    const __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
-    const __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm256_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int k = 0; k < 2; ++k) {
-        for (int l = j * 16; l < p_width; ++l) {
-          sum_error += (int64_t)highbd_error_measure(
-              dst[l + ((i * 2) + k) * dst_stride] -
-                  ref[l + ((i * 2) + k) * ref_stride],
-              bd);
-        }
-      }
-    }
-  }
-  const __m128i sum_error_q =
-      _mm_add_epi64(_mm256_castsi256_si128(col_error),
-                    _mm256_extracti128_si256(col_error, 1));
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, sum_error_q);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  // Error summation for remaining height, which is not multiple of 2
-  if (p_height & 0x1) {
-    for (int k = i * 2; k < p_height; ++k) {
-      for (int l = 0; l < p_width; ++l) {
-        sum_error += (int64_t)highbd_error_measure(
-            dst[l + k * dst_stride] - ref[l + k * ref_stride], bd);
-      }
-    }
-  }
-  return sum_error;
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int dst_stride,
-                                  int p_width, int p_height) {
-  int64_t sum_error = 0;
-  int i, j;
-  __m256i row_error, col_error;
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i dup_256 = _mm256_set1_epi16(256);
-  col_error = zero;
-
-  for (i = 0; i < (p_height / 4); i++) {
-    row_error = _mm256_setzero_si256();
-    for (j = 0; j < (p_width / 16); j++) {
-      const __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride))));
-      const __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride))));
-      const __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride))));
-      const __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride))));
-      const __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride))));
-      const __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride))));
-      const __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride))));
-      const __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128(
-          (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride))));
-
-      const __m256i diff_1 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_256);
-      const __m256i diff_2 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_256);
-      const __m256i diff_3 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_256);
-      const __m256i diff_4 =
-          _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_256);
-
-      const __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero);
-      const __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero);
-      const __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero);
-      const __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero);
-      const __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero);
-      const __m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero);
-      const __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero);
-      const __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero);
-
-      const __m256i error_1_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4);
-      const __m256i error_1_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4);
-      const __m256i error_2_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4);
-      const __m256i error_2_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4);
-      const __m256i error_3_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4);
-      const __m256i error_3_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4);
-      const __m256i error_4_lo =
-          _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4);
-      const __m256i error_4_hi =
-          _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4);
-
-      const __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi);
-      const __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi);
-      const __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi);
-      const __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi);
-
-      const __m256i error_1_2 = _mm256_add_epi32(error_1, error_2);
-      const __m256i error_3_4 = _mm256_add_epi32(error_3, error_4);
-
-      const __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4);
-      row_error = _mm256_add_epi32(row_error, error_1_2_3_4);
-    }
-    const __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero);
-    const __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero);
-    const __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm256_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int k = 0; k < 4; ++k) {
-        for (int l = j * 16; l < p_width; ++l) {
-          sum_error +=
-              (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] -
-                                     ref[l + ((i * 4) + k) * ref_stride]);
-        }
-      }
-    }
-  }
-  const __m128i sum_error_q =
-      _mm_add_epi64(_mm256_castsi256_si128(col_error),
-                    _mm256_extracti128_si256(col_error, 1));
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, sum_error_q);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  // Error summation for remaining height, which is not multiple of 4
-  if (p_height & 0x3) {
-    for (int k = i * 4; k < p_height; ++k) {
-      for (int l = 0; l < p_width; ++l) {
-        sum_error += (int64_t)error_measure(dst[l + k * dst_stride] -
-                                            ref[l + k * ref_stride]);
-      }
-    }
-  }
-  return sum_error;
-}
diff --git a/av1/encoder/x86/frame_error_sse2.c b/av1/encoder/x86/frame_error_sse2.c
deleted file mode 100644
index fbe39fa..0000000
--- a/av1/encoder/x86/frame_error_sse2.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "av1/common/warped_motion.h"
-#include "av1/encoder/global_motion.h"
-#include "config/av1_rtcd.h"
-
-#if CONFIG_AV1_HIGHBITDEPTH
-int64_t av1_calc_highbd_frame_error_sse2(const uint16_t *const ref,
-                                         int ref_stride,
-                                         const uint16_t *const dst,
-                                         int dst_stride, int p_width,
-                                         int p_height, int bd) {
-  const int b = bd - 8;
-  const __m128i shift = _mm_cvtsi32_si128(b);
-  const __m128i bmask = _mm_set1_epi16((1 << b) - 1);
-  const __m128i v = _mm_set1_epi16(1 << b);
-
-  int64_t sum_error = 0;
-  int i, j;
-  __m128i row_error, col_error;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i dup_256 = _mm_set1_epi16(256);
-  const __m128i dup_257 = _mm_set1_epi16(257);
-  col_error = zero;
-  for (i = 0; i < (p_height); i++) {
-    row_error = zero;
-    for (j = 0; j < (p_width / 16); j++) {
-      const __m128i ref_1 =
-          _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
-      const __m128i dst_1 =
-          _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
-      const __m128i ref_2 =
-          _mm_load_si128((__m128i *)(ref + (j * 16 + 8) + (i * ref_stride)));
-      const __m128i dst_2 =
-          _mm_load_si128((__m128i *)(dst + (j * 16 + 8) + (i * dst_stride)));
-
-      const __m128i diff_1 = _mm_sub_epi16(dst_1, ref_1);
-      const __m128i diff_2 = _mm_sub_epi16(dst_2, ref_2);
-
-      const __m128i e1_1 = _mm_sra_epi16(diff_1, shift);
-      const __m128i e2_1 = _mm_and_si128(diff_1, bmask);
-      const __m128i e1_2 = _mm_sra_epi16(diff_2, shift);
-      const __m128i e2_2 = _mm_and_si128(diff_2, bmask);
-
-      // For each 16-bit element in e1 and e2, we need to accumulate
-      // the value:
-      //   error_measure_lut[256 + e1] * (v - e2) +
-      //   error_measure_lut[257 + e1] * e2
-      // To do this, we first synthesize two 16-bit gathers, then
-      // interleave the factors in such a way that we can use _mm_madd_epi16
-      // to do eight 16x16->32 multiplies in one go
-      const __m128i idx1_1 = _mm_add_epi16(e1_1, dup_256);
-      const __m128i error1_1 =
-          _mm_set_epi16(error_measure_lut[_mm_extract_epi16(idx1_1, 7)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 6)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 5)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 4)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 3)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 2)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 1)],
-                        error_measure_lut[_mm_extract_epi16(idx1_1, 0)]);
-
-      const __m128i idx2_1 = _mm_add_epi16(e1_1, dup_257);
-      const __m128i error2_1 =
-          _mm_set_epi16(error_measure_lut[_mm_extract_epi16(idx2_1, 7)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 6)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 5)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 4)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 3)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 2)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 1)],
-                        error_measure_lut[_mm_extract_epi16(idx2_1, 0)]);
-
-      const __m128i error_lo_1 = _mm_unpacklo_epi16(error1_1, error2_1);
-      const __m128i error_hi_1 = _mm_unpackhi_epi16(error1_1, error2_1);
-
-      const __m128i idx1_2 = _mm_add_epi16(e1_2, dup_256);
-      const __m128i error1_2 =
-          _mm_set_epi16(error_measure_lut[_mm_extract_epi16(idx1_2, 7)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 6)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 5)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 4)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 3)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 2)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 1)],
-                        error_measure_lut[_mm_extract_epi16(idx1_2, 0)]);
-
-      const __m128i idx2_2 = _mm_add_epi16(e1_2, dup_257);
-      const __m128i error2_2 =
-          _mm_set_epi16(error_measure_lut[_mm_extract_epi16(idx2_2, 7)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 6)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 5)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 4)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 3)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 2)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 1)],
-                        error_measure_lut[_mm_extract_epi16(idx2_2, 0)]);
-
-      const __m128i error_lo_2 = _mm_unpacklo_epi16(error1_2, error2_2);
-      const __m128i error_hi_2 = _mm_unpackhi_epi16(error1_2, error2_2);
-
-      // Compute multipliers
-      const __m128i e2_inv_1 = _mm_sub_epi16(v, e2_1);
-      const __m128i mul_lo_1 = _mm_unpacklo_epi16(e2_inv_1, e2_1);
-      const __m128i mul_hi_1 = _mm_unpackhi_epi16(e2_inv_1, e2_1);
-
-      const __m128i e2_inv_2 = _mm_sub_epi16(v, e2_2);
-      const __m128i mul_lo_2 = _mm_unpacklo_epi16(e2_inv_2, e2_2);
-      const __m128i mul_hi_2 = _mm_unpackhi_epi16(e2_inv_2, e2_2);
-
-      // Multiply and accumulate
-      const __m128i result1_1 = _mm_madd_epi16(error_lo_1, mul_lo_1);
-      const __m128i result2_1 = _mm_madd_epi16(error_hi_1, mul_hi_1);
-      const __m128i result1_2 = _mm_madd_epi16(error_lo_2, mul_lo_2);
-      const __m128i result2_2 = _mm_madd_epi16(error_hi_2, mul_hi_2);
-
-      const __m128i partial_sum =
-          _mm_add_epi32(_mm_add_epi32(result1_1, result2_1),
-                        _mm_add_epi32(result1_2, result2_2));
-
-      row_error = _mm_add_epi32(row_error, partial_sum);
-    }
-
-    const __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
-    const __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
-    const __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int l = j * 16; l < p_width; ++l) {
-        sum_error += (int64_t)highbd_error_measure(
-            dst[l + i * dst_stride] - ref[l + i * ref_stride], bd);
-      }
-    }
-  }
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, col_error);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  return sum_error;
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride,
-                                  const uint8_t *const dst, int dst_stride,
-                                  int p_width, int p_height) {
-  int64_t sum_error = 0;
-  int i, j;
-  __m128i row_error, col_error;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i dup_256 = _mm_set1_epi16(256);
-  col_error = zero;
-  for (i = 0; i < (p_height); i++) {
-    row_error = zero;
-    for (j = 0; j < (p_width / 16); j++) {
-      const __m128i ref_8 =
-          _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride)));
-      const __m128i dst_8 =
-          _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride)));
-      const __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero);
-      const __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero);
-      const __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero);
-      const __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero);
-
-      const __m128i diff_1 =
-          _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_256);
-      const __m128i diff_2 =
-          _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_256);
-
-      const __m128i error_1_lo =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 2)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 1)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 0)]);
-      const __m128i error_1_hi =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 6)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 5)],
-                        error_measure_lut[_mm_extract_epi16(diff_1, 4)]);
-      const __m128i error_2_lo =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 2)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 1)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 0)]);
-      const __m128i error_2_hi =
-          _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 6)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 5)],
-                        error_measure_lut[_mm_extract_epi16(diff_2, 4)]);
-
-      const __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi);
-      const __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi);
-      const __m128i error_1_2 = _mm_add_epi32(error_1, error_2);
-
-      row_error = _mm_add_epi32(row_error, error_1_2);
-    }
-    const __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero);
-    const __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero);
-    const __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi);
-    col_error = _mm_add_epi64(col_error, col_error_temp);
-    // Error summation for remaining width, which is not multiple of 16
-    if (p_width & 0xf) {
-      for (int l = j * 16; l < p_width; ++l) {
-        sum_error += (int64_t)error_measure(dst[l + i * dst_stride] -
-                                            ref[l + i * ref_stride]);
-      }
-    }
-  }
-  int64_t sum_error_d_0, sum_error_d_1;
-  xx_storel_64(&sum_error_d_0, col_error);
-  xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8));
-  sum_error = (sum_error + sum_error_d_0 + sum_error_d_1);
-  return sum_error;
-}
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
deleted file mode 100644
index 35733d9..0000000
--- a/test/frame_error_test.cc
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <tuple>
-
-#include "config/av1_rtcd.h"
-
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-namespace frame_error_test {
-typedef int64_t (*frame_error_func)(const uint8_t *const ref, int ref_stride,
-                                    const uint8_t *const dst, int dst_stride,
-                                    int p_width, int p_height);
-#if HAVE_AVX2 || HAVE_SSE2 || HAVE_NEON
-const int kBlockWidth[] = {
-  832, 834, 640, 1280, 1920,
-};
-const int kBlockHeight[] = {
-  480, 482, 360, 720, 1080,
-};
-#endif
-typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
-
-class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
- public:
-  ~AV1FrameErrorTest() override = default;
-  void SetUp() override {
-    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  void RandomValues(frame_error_func test_impl, int width, int height);
-  void ExtremeValues(frame_error_func test_impl, int width, int height);
-  void RunSpeedTest(frame_error_func test_impl, int width, int height);
-  libaom_test::ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FrameErrorTest);
-
-void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
-                                     int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = rnd_.Rand8();
-    ref[i] = rnd_.Rand8();
-  }
-  const int64_t ref_error =
-      av1_calc_frame_error_c(ref, stride, dst, stride, width, height);
-  const int64_t test_error = test_impl(ref, stride, dst, stride, width, height);
-  ASSERT_EQ(test_error, ref_error) << width << "x" << height;
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
-                                      int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  for (int r = 0; r < 2; r++) {
-    if (r == 0) {
-      memset(dst, 0, max_blk_size);
-      memset(ref, 255, max_blk_size);
-    } else if (r == 1) {
-      memset(dst, 255, max_blk_size);
-      memset(ref, 0, max_blk_size);
-    }
-    const int64_t ref_error =
-        av1_calc_frame_error_c(ref, stride, dst, stride, width, height);
-    const int64_t test_error =
-        test_impl(ref, stride, dst, stride, width, height);
-    ASSERT_EQ(test_error, ref_error) << width << "x" << height;
-  }
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
-                                     int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
-  uint8_t *const ref =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = ref[i] = rnd_.Rand8();
-  }
-  const int num_loops = 10000000 / (width + height);
-  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
-  double elapsed_time[2] = { 0 };
-  for (int i = 0; i < 2; ++i) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    frame_error_func func = funcs[i];
-    for (int j = 0; j < num_loops; ++j) {
-      func(ref, stride, dst, stride, width, height);
-    }
-    aom_usec_timer_mark(&timer);
-    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    elapsed_time[i] = 1000.0 * time / num_loops;
-  }
-  aom_free(dst);
-  aom_free(ref);
-  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
-         elapsed_time[0], elapsed_time[1]);
-  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1FrameErrorTest, CheckOutput) {
-  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
-}
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1FrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_frame_error_neon),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight)));
-#endif
-}  // namespace frame_error_test
-
-#if CONFIG_AV1_HIGHBITDEPTH
-namespace highbd_frame_error_test {
-typedef int64_t (*highbd_frame_error_func)(const uint16_t *const ref,
-                                           int ref_stride,
-                                           const uint16_t *const dst,
-                                           int dst_stride, int p_width,
-                                           int p_height, int bd);
-const int kBlockWidth[] = {
-  832, 834, 640, 1280, 1920,
-};
-const int kBlockHeight[] = {
-  480, 482, 360, 720, 1080,
-};
-#if HAVE_AVX2 || HAVE_SSE2 || HAVE_NEON
-const int kBitDepths[] = { 8, 10, 12 };
-#endif
-typedef std::tuple<highbd_frame_error_func, int, int, int>
-    HighbdFrameErrorParam;
-
-class AV1HighbdFrameErrorTest
-    : public ::testing::TestWithParam<HighbdFrameErrorParam> {
- public:
-  ~AV1HighbdFrameErrorTest() override = default;
-  void SetUp() override {
-    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  void RandomValues(highbd_frame_error_func test_impl, int width, int height,
-                    int bd);
-  void ExtremeValues(highbd_frame_error_func test_impl, int width, int height,
-                     int bd);
-  void RunSpeedTest(highbd_frame_error_func test_impl, int width, int height,
-                    int bd);
-  libaom_test::ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdFrameErrorTest);
-
-void AV1HighbdFrameErrorTest::RandomValues(highbd_frame_error_func test_impl,
-                                           int width, int height, int bd) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint16_t *const dst =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*dst)));
-  uint16_t *const ref =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  int mask = (1 << bd) - 1;
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = rnd_.Rand16() & mask;
-    ref[i] = rnd_.Rand16() & mask;
-  }
-  const int64_t ref_error = av1_calc_highbd_frame_error_c(
-      ref, stride, dst, stride, width, height, bd);
-  const int64_t test_error =
-      test_impl(ref, stride, dst, stride, width, height, bd);
-  ASSERT_EQ(test_error, ref_error) << width << "x" << height << " bd=" << bd;
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1HighbdFrameErrorTest::ExtremeValues(highbd_frame_error_func test_impl,
-                                            int width, int height, int bd) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint16_t *const dst =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*dst)));
-  uint16_t *const ref =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  int mask = (1 << bd) - 1;
-  for (int r = 0; r < 2; r++) {
-    // Silence static analysis warnings
-    assert(dst);
-    assert(ref);
-    if (r == 0) {
-      aom_memset16(dst, 0, max_blk_size);
-      aom_memset16(ref, mask, max_blk_size);
-    } else if (r == 1) {
-      aom_memset16(dst, mask, max_blk_size);
-      aom_memset16(ref, 0, max_blk_size);
-    }
-    const int64_t ref_error = av1_calc_highbd_frame_error_c(
-        ref, stride, dst, stride, width, height, bd);
-    const int64_t test_error =
-        test_impl(ref, stride, dst, stride, width, height, bd);
-    ASSERT_EQ(test_error, ref_error) << width << "x" << height << " bd=" << bd;
-  }
-  aom_free(dst);
-  aom_free(ref);
-}
-
-void AV1HighbdFrameErrorTest::RunSpeedTest(highbd_frame_error_func test_impl,
-                                           int width, int height, int bd) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint16_t *const dst =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*dst)));
-  uint16_t *const ref =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*ref)));
-  ASSERT_NE(dst, nullptr);
-  ASSERT_NE(ref, nullptr);
-  int mask = (1 << bd) - 1;
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst[i] = ref[i] = rnd_.Rand16() & mask;
-  }
-  const int num_loops = 10000000 / (width + height);
-  highbd_frame_error_func funcs[2] = { av1_calc_highbd_frame_error_c,
-                                       test_impl };
-  double elapsed_time[2] = { 0 };
-  for (int i = 0; i < 2; ++i) {
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    highbd_frame_error_func func = funcs[i];
-    for (int j = 0; j < num_loops; ++j) {
-      func(ref, stride, dst, stride, width, height, bd);
-    }
-    aom_usec_timer_mark(&timer);
-    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
-    elapsed_time[i] = 1000.0 * time / num_loops;
-  }
-  aom_free(dst);
-  aom_free(ref);
-  printf("av1_calc_highbd_frame_error %3dx%-3d bd=%2d: %7.2f/%7.2fns", width,
-         height, bd, elapsed_time[0], elapsed_time[1]);
-  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
-}
-
-TEST_P(AV1HighbdFrameErrorTest, CheckOutput) {
-  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2), GET_PARAM(3));
-  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2), GET_PARAM(3));
-}
-
-TEST_P(AV1HighbdFrameErrorTest, DISABLED_Speed) {
-  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2), GET_PARAM(3));
-}
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(
-    SSE2, AV1HighbdFrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_highbd_frame_error_sse2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight),
-                       ::testing::ValuesIn(kBitDepths)));
-#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_SUITE_P(
-    AVX2, AV1HighbdFrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_highbd_frame_error_avx2),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight),
-                       ::testing::ValuesIn(kBitDepths)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AV1HighbdFrameErrorTest,
-    ::testing::Combine(::testing::Values(&av1_calc_highbd_frame_error_neon),
-                       ::testing::ValuesIn(kBlockWidth),
-                       ::testing::ValuesIn(kBlockHeight),
-                       ::testing::ValuesIn(kBitDepths)));
-#endif
-
-// Check that 8-bit and 16-bit code paths give the same results for
-// 8-bit content
-typedef std::tuple<int, int> HighbdFrameErrorConsistencyParam;
-
-class AV1HighbdFrameErrorConsistencyTest
-    : public ::testing::TestWithParam<HighbdFrameErrorConsistencyParam> {
- public:
-  ~AV1HighbdFrameErrorConsistencyTest() override = default;
-  void SetUp() override {
-    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  void RandomValues(int width, int height);
-  void ExtremeValues(int width, int height);
-  libaom_test::ACMRandom rnd_;
-};
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(
-    AV1HighbdFrameErrorConsistencyTest);
-
-void AV1HighbdFrameErrorConsistencyTest::RandomValues(int width, int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst8 =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst8)));
-  uint8_t *const ref8 =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref8)));
-  uint16_t *const dst16 =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*dst16)));
-  uint16_t *const ref16 =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*ref16)));
-  ASSERT_NE(dst8, nullptr);
-  ASSERT_NE(ref8, nullptr);
-  ASSERT_NE(dst16, nullptr);
-  ASSERT_NE(ref16, nullptr);
-  // Set up parallel 8-bit and 16-bit buffers with the same content
-  for (int i = 0; i < max_blk_size; ++i) {
-    dst16[i] = dst8[i] = rnd_.Rand8();
-    ref16[i] = ref8[i] = rnd_.Rand8();
-  }
-  const int64_t error8 =
-      av1_calc_frame_error_c(ref8, stride, dst8, stride, width, height);
-  const int64_t error16 = av1_calc_highbd_frame_error_c(
-      ref16, stride, dst16, stride, width, height, 8);
-  ASSERT_EQ(error8, error16) << width << "x" << height;
-  aom_free(dst8);
-  aom_free(ref8);
-  aom_free(dst16);
-  aom_free(ref16);
-}
-
-void AV1HighbdFrameErrorConsistencyTest::ExtremeValues(int width, int height) {
-  const int stride = (((width * 3) / 2) + 15) & ~15;
-  const int max_blk_size = stride * height;
-  uint8_t *const dst8 =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst8)));
-  uint8_t *const ref8 =
-      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref8)));
-  uint16_t *const dst16 =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*dst16)));
-  uint16_t *const ref16 =
-      static_cast<uint16_t *>(aom_memalign(32, max_blk_size * sizeof(*ref16)));
-  ASSERT_NE(dst8, nullptr);
-  ASSERT_NE(ref8, nullptr);
-  ASSERT_NE(dst16, nullptr);
-  ASSERT_NE(ref16, nullptr);
-  for (int r = 0; r < 2; r++) {
-    // Silence static analysis warnings
-    assert(dst16);
-    assert(ref16);
-    // Set up parallel 8-bit and 16-bit buffers with the same content
-    if (r == 0) {
-      memset(dst8, 0, max_blk_size);
-      aom_memset16(dst16, 0, max_blk_size);
-      memset(ref8, 255, max_blk_size);
-      aom_memset16(ref16, 255, max_blk_size);
-    } else if (r == 1) {
-      memset(dst8, 255, max_blk_size);
-      aom_memset16(dst16, 255, max_blk_size);
-      memset(ref8, 0, max_blk_size);
-      aom_memset16(ref16, 0, max_blk_size);
-    }
-    const int64_t error8 =
-        av1_calc_frame_error_c(ref8, stride, dst8, stride, width, height);
-    const int64_t error16 = av1_calc_highbd_frame_error_c(
-        ref16, stride, dst16, stride, width, height, 8);
-    ASSERT_EQ(error8, error16) << width << "x" << height;
-  }
-  aom_free(dst8);
-  aom_free(ref8);
-  aom_free(dst16);
-  aom_free(ref16);
-}
-
-TEST_P(AV1HighbdFrameErrorConsistencyTest, CheckOutput) {
-  RandomValues(GET_PARAM(0), GET_PARAM(1));
-  ExtremeValues(GET_PARAM(0), GET_PARAM(1));
-}
-
-INSTANTIATE_TEST_SUITE_P(C, AV1HighbdFrameErrorConsistencyTest,
-                         ::testing::Combine(::testing::ValuesIn(kBlockWidth),
-                                            ::testing::ValuesIn(kBlockHeight)));
-}  // namespace highbd_frame_error_test
-#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/test/test.cmake b/test/test.cmake
index 50b55ac..2ca7e64 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -216,7 +216,6 @@
               "${AOM_ROOT}/test/fdct4x4_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
               "${AOM_ROOT}/test/firstpass_test.cc"
-              "${AOM_ROOT}/test/frame_error_test.cc"
               "${AOM_ROOT}/test/fwht4x4_test.cc"
               "${AOM_ROOT}/test/hadamard_test.cc"
               "${AOM_ROOT}/test/horver_correlation_test.cc"
@@ -381,7 +380,6 @@
                      "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
                      "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
                      "${AOM_ROOT}/test/firstpass_test.cc"
-                     "${AOM_ROOT}/test/frame_error_test.cc"
                      "${AOM_ROOT}/test/motion_vector_test.cc"
                      "${AOM_ROOT}/test/obmc_sad_test.cc"
                      "${AOM_ROOT}/test/obmc_variance_test.cc"