// tensorflow/lite/kernels/internal/tensor_utils_test.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include <gmock/gmock.h>
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/test_util.h"

 #ifdef DOTPROD_BENCHMARKS
 #include "testing/base/public/benchmark.h"
 #endif  // DOTPROD_BENCHMARKS

 namespace tflite {
 namespace tensor_utils {

 // ClipVector with a limit of 2.0: in-range values are expected to pass through
 // unchanged, while larger magnitudes are expected to saturate at +/-2.0.
 TEST(uKernels, ClipTest) {
   constexpr int kVectorSize = 10;
   constexpr float kAbsLimit = 2.0;
   float values[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
                                -2.5, 3.0,  -3.5, 4.0,  -4.5};

   std::vector<float> clipped(kVectorSize);
   ClipVector(values, kVectorSize, kAbsLimit, clipped.data());

   const std::vector<float> expected = {0.0,  -0.5, 1.0,  -1.5, 2.0,
                                        -2.0, 2.0,  -2.0, 2.0,  -2.0};
   EXPECT_THAT(clipped, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // Multiplies the int8 values -14..14 by a scale of 0.1 and expects the float
 // results -1.4..1.4.
 TEST(uKernels, VectorScalarMultiply) {
   constexpr int kVectorSize = 29;
   static int8_t input[kVectorSize];
   // Bound the fill loop by kVectorSize (not a repeated literal) so the array
   // size and the loop cannot drift apart.
   for (int i = 0; i < kVectorSize; ++i) {
     input[i] = static_cast<int8_t>(i - 14);
   }
   const float scale = 0.1f;
   std::vector<float> output(kVectorSize, 0.0f);
   VectorScalarMultiply(input, kVectorSize, scale, output.data());
   EXPECT_THAT(output,
               ElementsAreArray(ArrayFloatNear(
                   {-1.4, -1.3, -1.2, -1.1, -1.0, -0.9, -0.8, -0.7, -0.6, -0.5,
                    -0.4, -0.3, -0.2, -0.1, 0,    0.1,  0.2,  0.3,  0.4,  0.5,
                    0.6,  0.7,  0.8,  0.9,  1.0,  1.1,  1.2,  1.3,  1.4})));
 }

 // IsZeroVector is expected to report true for an all-zero buffer and false
 // for a buffer whose entries are tiny but non-zero.
 TEST(uKernels, IsZeroTest) {
   constexpr int kVectorSize = 21;

   // Only the first element is spelled out; the rest are zero-initialized.
   float all_zero[kVectorSize] = {0.0};
   EXPECT_TRUE(IsZeroVector(all_zero, kVectorSize));

   // Magnitudes run from 1e-6 down to 1e-26.
   float tiny_values[kVectorSize] = {
       1e-6,  1e-7,  1e-8,  1e-9,  1e-10, 1e-11, 1e-12,
       1e-13, 1e-14, 1e-15, 1e-16, 1e-17, 1e-18, 1e-19,
       1e-20, 1e-21, 1e-22, 1e-23, 1e-24, 1e-25, 1e-26};
   EXPECT_FALSE(IsZeroVector(tiny_values, kVectorSize));
 }

 // ZeroVector followed by IsZeroVector: a buffer cleared by ZeroVector must be
 // reported as all zeros.
 TEST(uKernels, GeneratedIsZeroTest) {
   constexpr int kVectorSize = 39;
   // Start from non-zero contents. A value-initialized std::vector<float> is
   // already all zeros, which would let this test pass even if ZeroVector did
   // nothing.
   std::vector<float> input(kVectorSize, 1.0f);
   EXPECT_FALSE(IsZeroVector(input.data(), kVectorSize));
   ZeroVector(input.data(), kVectorSize);
   EXPECT_TRUE(IsZeroVector(input.data(), kVectorSize));
 }

 // Quantizes nine floats spanning [-640, 1000] and checks the reported min,
 // max, scaling factor and the int8 output.
 TEST(uKernels, SymmetricQuantizeFloatsTest) {
   constexpr int kVectorSize = 9;
   float values[kVectorSize] = {-640, -635.0, -630, 10.0,  2.0,
                                -5.0, -10.0,  0.0,  1000.0};

   int8_t quantized[kVectorSize];
   float min_value;
   float max_value;
   float scale;
   SymmetricQuantizeFloats(values, kVectorSize, quantized, &min_value,
                           &max_value, &scale);

   EXPECT_EQ(min_value, -640);
   EXPECT_EQ(max_value, 1000);
   // 1000 / 127 has no exact floating-point representation, so compare with a
   // tolerance instead of EXPECT_EQ.
   EXPECT_NEAR(scale, 1000 / 127.0, 1e-6);
   EXPECT_THAT(quantized, testing::ElementsAreArray(
                              {-81, -81, -80, 1, 0, -1, -1, 0, 127}));
 }

 // All-zero input: expects min == max == 0, a scaling factor of 1 and an
 // all-zero quantized output.
 TEST(uKernels, SymmetricQuantizeFloatsAllZerosTest) {
   constexpr int kVectorSize = 9;
   float values[kVectorSize] = {0, 0, 0, 0, 0, 0, 0, 0, 0};

   int8_t quantized[kVectorSize];
   float min_value;
   float max_value;
   float scale;
   SymmetricQuantizeFloats(values, kVectorSize, quantized, &min_value,
                           &max_value, &scale);

   EXPECT_EQ(min_value, 0);
   EXPECT_EQ(max_value, 0);
   EXPECT_EQ(scale, 1);
   EXPECT_THAT(quantized,
               testing::ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0, 0}));
 }

 // Quantizes values that are all close to zero (largest magnitude 2e-4) and
 // checks the reported min, max, scaling factor and the int8 output.
 TEST(uKernels, SymmetricQuantizeFloatsAllAlmostZeroTest) {
   constexpr int kVectorSize = 9;
   static float input[kVectorSize] = {-1e-5, 3e-5, -7e-6, -9e-5, 1e-6,
                                      4e-5,  9e-6, 2e-4,  0};

   int8_t output[kVectorSize];
   float min, max, scaling_factor;
   SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max,
                           &scaling_factor);

   EXPECT_NEAR(min, -9e-05, 1e-6);
   EXPECT_NEAR(max, 0.0002, 1e-6);
   // The expected factor is the largest magnitude over 127 (about 1.57e-6), the
   // same relation the [-640, 1000] test checks with 1000 / 127.0. The
   // tolerance must be far smaller than the value itself; a tolerance of 1e-6
   // would accept almost any factor of this order of magnitude.
   EXPECT_NEAR(scaling_factor, 2e-4 / 127.0, 1e-8);
   EXPECT_THAT(output,
               testing::ElementsAreArray({-6, 19, -4, -57, 1, 25, 6, 127, 0}));
 }

 // Float matrix (3x4) times two batch vectors, accumulated onto an output that
 // is pre-filled with 3.0. Checked once with result_stride 1 and once with
 // result_stride 2.
 TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) {
   constexpr int kRow = 3;
   constexpr int kCol = 4;
   constexpr int kBatch = 2;
   constexpr float kInitialValue = 3.0;
   float weights[kRow * kCol] = {1.0,  2.0,  3.0,  4.0,   //
                                 -1.0, -2.0, -3.0, -4.0,  //
                                 1.0,  -2.0, 3.0,  -4.0};
   float batches[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0,  //
                                   2.0, -2.0, 2.0, -2.0};

   // Contiguous results: one value per (batch, row).
   std::vector<float> packed(kRow * kBatch, kInitialValue);
   MatrixBatchVectorMultiplyAccumulate(weights, kRow, kCol, batches, kBatch,
                                       packed.data(), /*result_stride=*/1);
   EXPECT_THAT(packed, ElementsAreArray(ArrayFloatNear({1., 5., 13.,  //
                                                        -1., 7., 23.})));

   // With a stride of 2 every other slot is expected to keep its initial 3.0.
   std::vector<float> strided(kRow * kBatch * 2, kInitialValue);
   MatrixBatchVectorMultiplyAccumulate(weights, kRow, kCol, batches, kBatch,
                                       strided.data(), /*result_stride=*/2);
   EXPECT_THAT(strided,
               ElementsAreArray(ArrayFloatNear({1., 3., 5., 3., 13., 3.,  //
                                                -1., 3., 7., 3., 23., 3.})));
 }

 // Bundle of inputs and outputs shared by the dot-product matrix/vector tests.
 // It is populated by SetupMatrixVectorData().
 struct MatrixVectorData {
   // Contains dense parameters.
   std::vector<int8_t> matrix;

   // Like matrix, but with about half of the parameters set to zero.
   // Use this to create golden output for sparse matrix tests.
   std::vector<int8_t> zeroed_matrix;

   // zeroed_matrix described in sparse form.
   // sparse_matrix holds the 16-value chunks that were kept; ledger holds, for
   // each row, a chunk count followed by the indexes of the kept chunks.
   std::vector<int8_t> sparse_matrix;
   std::vector<uint8_t> ledger;

   // Batch input vectors, cols * batch values in total.
   std::vector<int8_t> vectors;
   // Scale factors handed to the kernels together with the batch vectors.
   std::vector<float> scale_factors;
   // Accumulation buffer, rows * batch values, initialized to zero.
   std::vector<float> results;

   // Dimensions the data was generated for.
   int rows;
   int cols;
   int batch;
 };

 // Builds deterministic int8 test data for the dot-product kernels.
 //
 // The dense matrix holds rows * cols values cycling through 0..69 and the
 // batch vectors hold cols * batch values cycling through 0..49. When
 // `negative` is set, every third matrix entry and every fifth vector entry is
 // negated. A sparse variant is derived by keeping alternating 16-value chunks
 // of each row: even chunk indexes in even rows, odd chunk indexes in odd rows.
 // The dropped chunks are zeroed in zeroed_matrix.
 MatrixVectorData SetupMatrixVectorData(int rows, int cols, int batch,
                                        bool negative = false) {
   constexpr int kChunkSize = 16;

   MatrixVectorData data;
   data.rows = rows;
   data.cols = cols;
   data.batch = batch;

   const int matrix_size = rows * cols;
   for (int idx = 0; idx < matrix_size; ++idx) {
     const bool flip = negative && (idx % 3) == 0;
     data.matrix.push_back(flip ? -(idx % 70) : (idx % 70));
   }

   const int vectors_size = cols * batch;
   for (int idx = 0; idx < vectors_size; ++idx) {
     const bool flip = negative && (idx % 5) == 0;
     data.vectors.push_back(flip ? -(idx % 50) : (idx % 50));
   }

   data.scale_factors = {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8};
   data.results.resize(rows * batch, 0);
   data.zeroed_matrix = data.matrix;

   // Make a sparsification ledger.
   const int chunks_per_row = cols / kChunkSize;
   for (int row = 0; row < rows; ++row) {
     // NOTE(review): the count recorded for a row is always chunks_per_row / 2.
     // When chunks_per_row is odd, even rows still append one more chunk than
     // that count (e.g. cols == 16 records a count of 0 but appends chunk 0).
     // The sparse tests' golden values appear to rely on this, so it is kept
     // as is - confirm whether it is intended.
     data.ledger.push_back(chunks_per_row / 2);

     for (int chunk = 0; chunk < chunks_per_row; ++chunk) {
       const int first = row * cols + chunk * kChunkSize;
       const int last = first + kChunkSize;

       if ((chunk % 2) != (row % 2)) {
         // Dropped chunk: zero it out of zeroed_matrix.
         for (int k = first; k < last; ++k) {
           data.zeroed_matrix[k] = 0;
         }
         continue;
       }

       // Kept chunk: record its index and copy its values into the sparse
       // matrix.
       data.ledger.push_back(chunk);
       data.sparse_matrix.insert(data.sparse_matrix.end(),
                                 data.matrix.begin() + first,
                                 data.matrix.begin() + last);
     }
   }
   return data;
 }

 std::vector<float> TestDotprodMatrixBatchVectorMultiply(int rows, int cols,
                                                         int batch,
                                                         bool negative = false) {
   MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative);

   // All partial sums in this computation are small enough to fit in the
   // mantissa of a float, and the scale factors are all integers, so we expect
   // an exact result.
   MatrixBatchVectorMultiplyAccumulate(
       data.matrix.data(), rows, cols, data.vectors.data(),
       data.scale_factors.data(), batch, &data.results[0], 1);
   return data.results;
 }

 std::vector<float> TestSparseDotprodMatrixBatchVectorMultiply(
     int rows, int cols, int batch, bool negative = false) {
   MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative);
   SparseMatrixBatchVectorMultiplyAccumulate(
       data.sparse_matrix.data(), data.ledger.data(), rows, cols,
       data.vectors.data(), data.scale_factors.data(), batch, &data.results[0],
       1);
   return data.results;
 }

 // Golden-value checks for the dense int8 kernel over several
 // (rows, cols, batch) shapes, first with non-negative data and then with
 // mixed-sign data.
 TEST(uKernels, DotprodMatrixBatchVectorMultiplyAccumulateTest) {
   // Non-negative inputs.
   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 16, 1),
               testing::ElementsAreArray({1240, 3160, 5080, 7000}));

   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 32, 2),
               testing::ElementsAreArray({10416, 26288, 8490, 23312, 18276,
                                          70756, 37416, 60916}));

   ASSERT_THAT(
       TestDotprodMatrixBatchVectorMultiply(4, 32, 3),
       testing::ElementsAreArray({10416, 26288, 8490, 23312, 18276, 70756,
                                  37416, 60916, 52080, 142704, 55878, 125712}));

   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(8, 1024, 3),
               testing::ElementsAreArray(
                   {841094,  853168,  866642,  840286,  860760,  862754,
                    843678,  872552,  1724476, 1769072, 1747588, 1738844,
                    1758240, 1742916, 1761612, 1755808, 2506896, 2564262,
                    2629188, 2515824, 2598390, 2569236, 2537352, 2645118}));

   // Mixed-sign inputs.
   constexpr bool kUseNegatives = true;
   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 64, 1, kUseNegatives),
               testing::ElementsAreArray({13696, 6904, 7764, 11806}));
   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(4, 32, 2, kUseNegatives),
               testing::ElementsAreArray(
                   {3436, 3522, 1590, 6972, 2516, 20520, 456, 10628}));
 }

 // Golden-value checks for the dense int8 kernel with batch sizes that are
 // multiples of four, plus a checksum over one large 256x1024x8 case.
 TEST(uKernels, DotprodMatrixBatchFourVectorMultiplyAccumulateDotprodTest) {
   // Non-negative inputs.
   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(2, 16, 4),
               testing::ElementsAreArray(
                   {1240, 3160, 6320, 18352, 15240, 45576, 4200, 16232}));
   ASSERT_THAT(TestDotprodMatrixBatchVectorMultiply(2, 64, 4),
               testing::ElementsAreArray({45794, 38948, 88536, 84252, 157626,
                                          165312, 209864, 246128}));
   ASSERT_THAT(
       TestDotprodMatrixBatchVectorMultiply(2, 64, 8),
       testing::ElementsAreArray({45794, 38948, 88536, 84252, 157626, 165312,
                                  209864, 246128, 219700, 195550, 279684, 278928,
                                  413616, 445662, 374896, 365952}));

   ASSERT_THAT(
       TestDotprodMatrixBatchVectorMultiply(4, 64, 8),
       testing::ElementsAreArray(
           {45794,  38948,  34622,  32816,  88536,  84252,  85008,  90804,
            157626, 165312, 180558, 203364, 209864, 246128, 236472, 208896,
            219700, 195550, 184000, 185050, 279684, 278928, 293292, 322776,
            413616, 445662, 495348, 513674, 374896, 365952, 321168, 296544}));

   ASSERT_THAT(
       TestDotprodMatrixBatchVectorMultiply(16, 1024, 4),
       testing::ElementsAreArray(
           {841094,  853168,  866642,  840286,  860760,  862754,  843678,
            872552,  837586,  851270,  877414,  834188,  863062,  857846,
            841780,  879054,  1724476, 1769072, 1747588, 1738844, 1758240,
            1742916, 1761612, 1755808, 1737684, 1750780, 1747356, 1754152,
            1748348, 1753324, 1743320, 1754316, 2506896, 2564262, 2629188,
            2515824, 2598390, 2569236, 2537352, 2645118, 2508444, 2571480,
            2610576, 2510442, 2618208, 2566584, 2544570, 2614536, 3458904,
            3502688, 3474792, 3505976, 3499360, 3488264, 3485848, 3512832,
            3500616, 3482520, 3489624, 3469008, 3495992, 3524376, 3465680,
            3526264}));

   ASSERT_THAT(
       TestDotprodMatrixBatchVectorMultiply(4, 128, 4),
       testing::ElementsAreArray({87920, 80024, 92288, 103712, 228148, 224820,
                                  233812, 213124, 271284, 271788, 332772, 328236,
                                  419328, 431328, 411968, 417248}));

   ASSERT_THAT(
       TestDotprodMatrixBatchVectorMultiply(4, 128, 8),
       testing::ElementsAreArray(
           {87920,  80024,  92288,  103712, 228148, 224820, 233812, 213124,
            271284, 271788, 332772, 328236, 419328, 431328, 411968, 417248,
            482680, 523840, 560800, 593560, 563940, 609924, 566868, 644772,
            743708, 857780, 818972, 823284, 708384, 695008, 730912, 872096}));

   // Mixed-sign inputs.
   constexpr bool kUseNegatives = true;
   EXPECT_THAT(TestDotprodMatrixBatchVectorMultiply(1, 16, 1, kUseNegatives),
               testing::ElementsAreArray({450}));
   EXPECT_THAT(TestDotprodMatrixBatchVectorMultiply(2, 64, 8, kUseNegatives),
               testing::ElementsAreArray({13696, 6904, 9952, 12368, 22848, 61632,
                                          40424, 46776, 57630, 38670, 62976,
                                          49824, 39032, 71988, 60128, 148992}));

   // Large case: compare the sum of all outputs rather than listing
   // 256 * 8 individual values.
   const std::vector<float> large_results =
       TestDotprodMatrixBatchVectorMultiply(256, 1024, 8);
   int64_t total = 0;
   for (const float value : large_results) {
     total += static_cast<int64_t>(value);
   }
   EXPECT_EQ(7980076336, total);
 }

 // Golden-value checks for the sparse int8 kernel over several
 // (rows, cols, batch) shapes, first with non-negative data and then with
 // mixed-sign data.
 TEST(uKernels, DotprodSparseMatrixBatchVectorMultiplyAccumulate) {
   // Non-negative inputs.
   EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 16, 1),
               testing::ElementsAreArray({0}));
   EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 32, 1),
               testing::ElementsAreArray({1240}));
   EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 64, 1),
               testing::ElementsAreArray({26544}));
   EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(1, 64, 2),
               testing::ElementsAreArray({26544, 24344}));
   EXPECT_THAT(TestSparseDotprodMatrixBatchVectorMultiply(4, 64, 4),
               testing::ElementsAreArray(
                   {26544, 15866, 22140, 11408, 24344, 53248, 42704, 39900,
                    48000, 94146, 101892, 81876, 87712, 105160, 148304, 75936}));

   // Mixed-sign inputs.
   constexpr bool kUseNegatives = true;
   EXPECT_THAT(
       TestSparseDotprodMatrixBatchVectorMultiply(1, 64, 1, kUseNegatives),
       testing::ElementsAreArray({8764}));
   EXPECT_THAT(
       TestSparseDotprodMatrixBatchVectorMultiply(2, 64, 2, kUseNegatives),
       testing::ElementsAreArray({8764, 5196, 7204, 11148}));
 }

 #ifdef __ANDROID__
 // End-to-end check of the quantized matrix * batch-vector path: a 4x29 float
 // matrix and two 29-element float vectors are quantized to int8 with
 // SymmetricQuantizeFloats, the quantized bytes are compared against golden
 // values, and the int8 MatrixBatchVectorMultiplyAccumulate result is compared
 // against expected floats within 0.001.
 TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   // Note we use 29 columns as this exercises all the neon kernel: the
   // 16-block SIMD code, the 8-block postamble, and the leftover postamble.
   const int a_rows = 4, a_cols = 29;
   const int kWeightsPerUint32 = 4;
   /* clang-format off */
   const float a_float_data[] = {
       /* 1st row */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
       14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, 21.21, 22.22, 23.23,
       24.24, 25.25, 26.26, 27.27, 28.28, 0,
       /* 2nd row */
       -1.1, -2.2, -3.3, -4.4, -5.5, -6.6, -7.7, -8.8, -9.9, -10.1, -11.11,
       -12.12, -13.13, -14.14, -15.15, -16.16, -17.17, -18.18, -19.19, -20.2,
       -21.21, -22.22, -23.23, -24.24, -25.25, -26.26, -27.27, -28.28, 0,
       /* 3rd row */
       1.1, -2.2, 3.3, -4.4, 5.5, -6.6, 7.7, -8.8, 9.9, -10.1, 11.11, -12.12,
       13.13, -14.14, 15.15, -16.16, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22,
       23.23, -24.24, 25.25, -26.26, 27.27, -28.28, 0,
       /* 4th row */
       -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
       -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22,
       -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0};

   // Quantized values of A. The buffer comes from aligned_malloc and is
   // released with aligned_free at the end of the test.
   int8_t* a_int8_data = reinterpret_cast<int8_t*>(
       aligned_malloc(a_rows * a_cols, kWeightsPerUint32));
   float a_min, a_max;
   float scaling_factor_a;
   SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min,
                           &a_max, &scaling_factor_a);
   const int8_t expected_a_int8_data[] = {
     /* 1st row */
     5, 10, 15, 20, 25, 30, 35, 40, 44, 45, 50, 54, 59, 64, 68, 73, 77, 82, 86,
     91, 95, 100, 104, 109, 113, 118, 122, 127, 0,
     /* 2nd row */
     -5, -10, -15, -20, -25, -30, -35, -40, -44, -45, -50, -54, -59, -64, -68,
     -73, -77, -82, -86, -91, -95, -100, -104, -109, -113, -118, -122, -127, 0,
     /* 3rd row */
     5, -10, 15, -20, 25, -30, 35, -40, 44, -45, 50, -54, 59, -64, 68, -73, 77,
     -82, 86, -91, 95, -100, 104, -109, 113, -118, 122, -127, 0,
     /* 4th row */
     -5, 10, -15, 20, -25, 30, -35, 40, -44, 45, -50, 54, -59, 64, -68, 73, -77,
     82, -86, 91, -95, 100, -104, 109, -113, 118, -122, 127, 0,
   };
   for (int i = 0; i < a_rows * a_cols; ++i) {
     EXPECT_EQ(expected_a_int8_data[i], a_int8_data[i]);
   }

   const int b_rows = 29, b_cols = 1, batches = 2;
   const float b_float_data[] = {
     /* batch 1 */
     1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
     1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
     1.0,
     /* batch 2 */
     2.5, -2.1, 3.0, -1.3, 1.3, -1.1, 2.0, -1.7, 1.9, -1.5, 0.5, -0.7, 0.8, -0.3,
     2.8, -2.8, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5, 0.7, -1.9,
     0.2,
   };

   // Quantized values of B:
   // Each batch is quantized separately and gets its own scaling factor;
   // b_min and b_max are overwritten by the second call and not checked.
   int8_t b_int8_data[b_rows * b_cols * batches];
   float b_min, b_max;
   float scaling_factor_b[batches];
   SymmetricQuantizeFloats(b_float_data, b_rows * b_cols, b_int8_data, &b_min,
                           &b_max, &scaling_factor_b[0]);
   SymmetricQuantizeFloats(&b_float_data[b_rows * b_cols], b_rows * b_cols,
                           &b_int8_data[b_rows * b_cols], &b_min, &b_max,
                           &scaling_factor_b[1]);

   const int8_t expected_b_int8_data[] = {
     /* batch 1 */
     127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
     127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
     127,
     /* batch 2 */
     106, -89, 127, -55, 55, -47, 85, -72, 80, -64, 21, -30, 34, -13, 119, -119,
     47, -97, 80, -80, 89, -21, 102, -4, 42, -106, 30, -80, 8,
   };
   /* clang-format on */
   for (int i = 0; i < b_rows * b_cols * batches; ++i) {
     EXPECT_EQ(expected_b_int8_data[i], b_int8_data[i]);
   }

   // Full float operation results in:
   // -13.69, 13.69, 414.11, -414.11
   // -6.325, 6.325, 631.263, -631.263
   // The kernel accumulates into its output, so start from zero.
   float c_float_data[a_rows * b_cols * batches];
   for (int i = 0; i < a_rows * b_cols * batches; ++i) {
     c_float_data[i] = 0.0;
   }

   // Testing product.
   // The per-batch result scale is the matrix scale times that batch's vector
   // scale.
   const float scaling_factor_c[2] = {
       scaling_factor_a * scaling_factor_b[0],
       scaling_factor_a * scaling_factor_b[1],
   };
   MatrixBatchVectorMultiplyAccumulate(a_int8_data, a_rows, a_cols, b_int8_data,
                                       scaling_factor_c, batches, c_float_data,
                                       /*result_stride=*/1);

   // Assert we obtain the expected recovered float values.
   const float expected_c_float_data[] = {
       -14.474, 14.474, 414.402, -414.402, -6.92228, 6.92228, 632.042, -632.042,
   };
   for (int i = 0; i < a_rows * b_cols * batches; ++i) {
     EXPECT_NEAR(expected_c_float_data[i], c_float_data[i], 0.001);
   }

   aligned_free(a_int8_data);
 }
 #endif  // __ANDROID__

 // Float sparse kernel check: the same 4x48 matrix is given once in dense form
 // and once as kept 16-value blocks plus a ledger. The dense result is compared
 // against golden values, and the sparse result is compared against the dense
 // result (both within 1e-4).
 TEST(uKernels, SparseMatrixBatchVectorMultiplyAccumulateTest) {
   const int kRow = 4;
   const int kCol = 48;
   const int kBatch = 2;
   /* clang-format off */
   float matrix[kRow * kCol] = {
       /* 1st row */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
       14.14, 15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38,
       39.39, 40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
       /* 2nd row */
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24,
       -25.25, -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
       /* 3rd row */
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25,
       -26.26, 27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
       /* 4th row */
       -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
       -13.13, 14.14, -15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -33.33, 34.34, -35.35, 36.36, -37.37,
       38.38, -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};

   // BCSR format of the above matrix.
   // Only the 16-value blocks listed in the ledger are stored, row by row.
   float matrix_values[] = {
       /* 1st row */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
       14.14, 15.15, 16.16, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38, 39.39,
       40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
       /* 2nd row */
       -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24, -25.25,
       -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0,
       /* 3rd row */
       17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25, -26.26,
       27.27, -28.28, 0, 0.0, 0.0, 0.0,
       /* 4th row */
       -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
       -13.13, 14.14, -15.15, 16.16, -33.33, 34.34, -35.35, 36.36, -37.37, 38.38,
       -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
   // Per row: the number of stored blocks, followed by each stored block's
   // index within the row.
   uint8_t ledger[] = {
       2, 0,  2,  // 1st row
       1, 1,      // 2nd row
       1, 1,      // 3rd row
       2, 0,  2   // 4th row
   };

   float vector[kBatch * kCol] = {
     /* 1st batch */
     1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
     1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
     1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
     1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
     /* 2nd batch */
     2.5, 0.0, -2.1, 0.0, 3.0, 0.0, -1.3, 0.0, 1.3, 0.0, -1.1, 0.0, 2.0, 0.0,
     -1.7, 0.0, 1.9, 0.0, -1.5, 0.0, 0.5, 0.0, -0.7, 0.0, 0.8, 0.0, -0.3, 0.0,
     2.8, 0.0, -2.8, 0.0, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5,
     0.7, -1.9, 0.2, 0.0, 0.1, 0.2,
   };
   /* clang-format on */

   // Reference result from the dense kernel.
   std::vector<float> dense_output(kRow * kBatch, 0.0);
   MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
                                       dense_output.data(), /*result_stride=*/1);

   EXPECT_THAT(dense_output, ElementsAreArray(ArrayFloatNear(
                                 {-13.69, 6.06001, 272.7, -608.03, -9.66602,
                                  -10.201, 10.201, -713.897949},
                                 1e-4)));

   // The sparse kernel must reproduce the dense result.
   std::vector<float> sparse_output(kRow * kBatch, 0.0);
   SparseMatrixBatchVectorMultiplyAccumulate(
       matrix_values, ledger, kRow, kCol, vector, kBatch, sparse_output.data(),
       /*result_stride=*/1);

   EXPECT_THAT(sparse_output,
               ElementsAreArray(ArrayFloatNear(dense_output, 1e-4)));
 }

 #ifdef __ANDROID__
 // Int8 sparse kernel check: a pre-quantized 4x48 matrix is given once in dense
 // form and once as kept 16-value blocks plus a ledger. Both the dense and the
 // sparse kernel results are compared against the same golden float values.
 TEST(uKernels,
      SparseMatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   const int kRow = 4;
   const int kCol = 48;
   const int kBatch = 2;
   /* clang-format off */
   const int8_t quantized_matrix[] = {
       /* 1st row */
       3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 98, 101, 104, 107, 110, 113, 115,
       118, 121, 124, 127, 0, 0, 0, 0,
       /* 2nd row */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -49, -52, -55, -58, -61,
       -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0,
       /* 3rd row */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, -52, 55, -58, 61, -64,
       66, -69, 72, -75, 78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0,
       /* 4th row */
       -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -95, 98, -101, 104, -107, 110,
       -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
   };
   // Only the 16-value blocks listed in the ledger are stored, row by row.
   const int8_t quantized_matrix_values[] = {
       /* 1st row */
       3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 95, 98, 101,
       104, 107, 110, 113, 115, 118, 121, 124, 127, 0, 0, 0, 0,
       /* 2nd row */
       -49, -52, -55, -58, -61, -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0,
       /* 3rd row */
       49, -52, 55, -58, 61, -64, 66, -69, 72, -75, 78, -81, 0, 0, 0, 0,
       /* 4th row */
       -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, -95,
       98, -101, 104, -107, 110, -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
   };
   // Per row: the number of stored blocks, followed by each stored block's
   // index within the row.
   uint8_t ledger[] = {
       2, 0,  2,  // 1st row
       1, 1,      // 2nd row
       1, 1,      // 3rd row
       2, 0,  2   // 4th row
   };

   float matrix_scaling_factor = 0.349921;

   const int8_t quantized_vector[] = {
       /* 1st batch */
       127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
       -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
       127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
       -127, 127, -127, 127, -127, 127, -127, 127, -127,
       /* 2nd batch */
       106, 0, -89, 0, 127, 0, -55, 0, 55, 0, -47, 0, 85, 0, -72, 0, 80, 0,
       -64, 0, 21, 0, -30, 0, 34, 0, -13, 0, 119, 0, -119, 0, 47, -97, 80, -80,
       89, -21, 102, -4, 42, -106, 30, -80, 8, 1, 2, 3,
   };
   float vector_scaling_factor[2] = {0.00787402, 0.023622};

   /* clang-format on */
   // The per-batch result scale is the matrix scale times that batch's vector
   // scale.
   float result_scaling_factor[2] = {
       matrix_scaling_factor * vector_scaling_factor[0],
       matrix_scaling_factor * vector_scaling_factor[1],
   };
   // Dense int8 kernel.
   std::vector<float> dense_output(kRow * kBatch, 0.0);
   MatrixBatchVectorMultiplyAccumulate(quantized_matrix, kRow, kCol,
                                       quantized_vector, result_scaling_factor,
                                       kBatch, dense_output.data(),
                                       /*result_stride=*/1);

   EXPECT_THAT(dense_output,
               ElementsAreArray(ArrayFloatNear(
                   {-13.646927, 6.298582, 272.938538, -607.813110, -6.637464,
                    -9.381721, 9.381721, -713.845642})));

   // Sparse int8 kernel; expected to match the same golden values.
   std::vector<float> sparse_output(kRow * kBatch, 0.0);
   SparseMatrixBatchVectorMultiplyAccumulate(
       quantized_matrix_values, ledger, kRow, kCol, quantized_vector,
       result_scaling_factor, kBatch, sparse_output.data(),
       /*result_stride=*/1);

   EXPECT_THAT(sparse_output,
               ElementsAreArray(ArrayFloatNear(
                   {-13.646927, 6.298582, 272.938538, -607.813110, -6.637464,
                    -9.381721, 9.381721, -713.845642})));
 }
 #endif  // __ANDROID__

 // Element-wise product of two 10-element vectors; the signs of the operands
 // match position by position, so every expected product is non-negative.
 TEST(uKernels, VectorVectorCwiseProductTest) {
   constexpr int kVectorSize = 10;
   float lhs[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
                             -2.5, 3.0,  -3.5, 4.0,  -4.5};
   float rhs[kVectorSize] = {0.1,  -0.1, 0.1,  -0.1, 0.1,
                             -0.1, 0.1,  -0.1, 0.1,  -0.1};

   std::vector<float> product(kVectorSize);
   VectorVectorCwiseProduct(lhs, rhs, kVectorSize, product.data());

   const std::vector<float> expected = {0.0,  0.05, 0.1,  0.15, 0.2,
                                        0.25, 0.3,  0.35, 0.4,  0.45};
   EXPECT_THAT(product, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // Element-wise product accumulated onto an output pre-filled with 1.0, so each
 // expected value is 1.0 plus the corresponding product.
 TEST(uKernels, VectorVectorCwiseProductAccumulateTest) {
   constexpr int kVectorSize = 10;
   float lhs[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
                             -2.5, 3.0,  -3.5, 4.0,  -4.5};
   float rhs[kVectorSize] = {0.1,  -0.1, 0.1,  -0.1, 0.1,
                             -0.1, 0.1,  -0.1, 0.1,  -0.1};

   std::vector<float> accumulator(kVectorSize, 1.0f);
   VectorVectorCwiseProductAccumulate(lhs, rhs, kVectorSize,
                                      accumulator.data());

   const std::vector<float> expected = {1.0,  1.05, 1.1,  1.15, 1.2,
                                        1.25, 1.3,  1.35, 1.4,  1.45};
   EXPECT_THAT(accumulator, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // Adds one 3-element vector to each of two batches, updating them in place.
 TEST(uKernels, VectorBatchVectorAddTest) {
   constexpr int kVectorSize = 3;
   constexpr int kBatchSize = 2;
   float addend[kVectorSize] = {0.0, -0.5, 1.0};

   // Two batches of three values each.
   std::vector<float> batches = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
   VectorBatchVectorAdd(addend, kVectorSize, kBatchSize, batches.data());

   EXPECT_THAT(batches,
               testing::ElementsAreArray({1.0, 1.5, 4.0, 4.0, 4.5, 7.0}));
 }

 // Expects the 5-element source vector to be replicated into each of the three
 // batches of the destination.
 TEST(uKernels, VectorBatchVectorAssignTest) {
   constexpr int kVectorSize = 5;
   constexpr int kBatchSize = 3;
   float source[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};

   std::vector<float> replicated(kVectorSize * kBatchSize);
   VectorBatchVectorAssign(source, kVectorSize, kBatchSize, replicated.data());

   const std::vector<float> expected = {0.0, -0.5, 1.0, -1.5, 2.0,  //
                                        0.0, -0.5, 1.0, -1.5, 2.0,  //
                                        0.0, -0.5, 1.0, -1.5, 2.0};
   EXPECT_THAT(replicated, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // Compares ApplySigmoidToVector against precomputed sigmoid values.
 TEST(uKernels, ApplySigmoidToVectorTest) {
   constexpr int kVectorSize = 5;
   float values[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};

   std::vector<float> activated(kVectorSize);
   ApplySigmoidToVector(values, kVectorSize, activated.data());

   const std::vector<float> expected = {0.5, 0.377541, 0.731059, 0.182426,
                                        0.880797};
   EXPECT_THAT(activated, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // Runs ApplyActivationToVector with ReLU and then with tanh on the same input,
 // reusing one output buffer for both.
 TEST(uKernels, ApplyActivationToVectorTest) {
   constexpr int kVectorSize = 5;
   float values[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
   std::vector<float> activated(kVectorSize);

   // ReLU: negative inputs are expected to become zero.
   ApplyActivationToVector(values, kVectorSize, kTfLiteActRelu,
                           activated.data());
   const std::vector<float> expected_relu = {0.0, 0.0, 1.0, 0.0, 2.0};
   EXPECT_THAT(activated, ElementsAreArray(ArrayFloatNear(expected_relu)));

   // Tanh: compared against precomputed values.
   ApplyActivationToVector(values, kVectorSize, kTfLiteActTanh,
                           activated.data());
   const std::vector<float> expected_tanh = {0.0, -0.462117, 0.761594,
                                             -0.905148, 0.964028};
   EXPECT_THAT(activated, ElementsAreArray(ArrayFloatNear(expected_tanh)));
 }

 // Expects CopyVector to reproduce the source values in the destination.
 TEST(uKernels, CopyVectorTest) {
   constexpr int kVectorSize = 5;
   float source[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};

   std::vector<float> copy(kVectorSize);
   CopyVector(source, kVectorSize, copy.data());

   const std::vector<float> expected = {0.0, -0.5, 1.0, -1.5, 2.0};
   EXPECT_THAT(copy, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // Expects Sub1Vector to produce 1 - x for every input element x.
 TEST(uKernels, Sub1VectorTest) {
   constexpr int kVectorSize = 5;
   float values[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};

   std::vector<float> complement(kVectorSize);
   Sub1Vector(values, kVectorSize, complement.data());

   const std::vector<float> expected = {1.0, 1.5, 0.0, 2.5, -1.0};
   EXPECT_THAT(complement, ElementsAreArray(ArrayFloatNear(expected)));
 }

 // ZeroVector must overwrite every element of the buffer with 0.
 TEST(uKernels, ZeroVectorTest) {
   constexpr int kVectorSize = 5;
   // Start from non-zero contents. A value-initialized std::vector<float> is
   // already all zeros, which would let this test pass even if ZeroVector did
   // nothing.
   std::vector<float> output(kVectorSize, 1.0f);
   ZeroVector(output.data(), kVectorSize);
   EXPECT_THAT(output,
               ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0})));
 }

 // Exercises VectorBatchVectorCwiseProductAccumulate with a 29-element vector
 // and 4 batches. The `output` buffer is passed twice to the call (third and
 // fifth argument), so the operation runs in place on `output`.
 TEST(uKernels, VectorBatchVectorCwiseProductAccumulate) {
   constexpr int kVectorSize = 29;
   constexpr int kBatchSize = 4;
   static float input[kVectorSize] = {
       1.1,   2.2,   3.3,   4.4,   5.5,   6.6,   7.7,   8.8,   9.9,   10.1,
       11.11, 12.12, 13.13, 14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2,
       21.21, 22.22, 23.23, 24.24, 25.25, 26.26, 27.27, 28.28, 0};
   // Four batches built from the magnitudes of `input`: all positive, all
   // negative, alternating starting positive, alternating starting negative.
   std::vector<float> output = {
       /* batch 0 */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
       14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, 21.21, 22.22, 23.23,
       24.24, 25.25, 26.26, 27.27, 28.28, 0,
       /* batch 1 */
       -1.1, -2.2, -3.3, -4.4, -5.5, -6.6, -7.7, -8.8, -9.9, -10.1, -11.11,
       -12.12, -13.13, -14.14, -15.15, -16.16, -17.17, -18.18, -19.19, -20.2,
       -21.21, -22.22, -23.23, -24.24, -25.25, -26.26, -27.27, -28.28, 0,
       /* batch 2 */
       1.1, -2.2, 3.3, -4.4, 5.5, -6.6, 7.7, -8.8, 9.9, -10.1, 11.11, -12.12,
       13.13, -14.14, 15.15, -16.16, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22,
       23.23, -24.24, 25.25, -26.26, 27.27, -28.28, 0,
       /* batch 3 */
       -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
       -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22,
       -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0};
   VectorBatchVectorCwiseProductAccumulate(input, kVectorSize, output.data(),
                                           kBatchSize, output.data());

   // Expect output = input * output + output.
   // NOTE(review): the comparison below is bit-exact on floats, unlike the
   // ArrayFloatNear-based tests in this file -- confirm this is intentional.
   const std::vector<float> expected_output = {
       /* batch 0 */
       2.310000, 7.040000, 14.190000, 23.760000, 35.750000, 50.159996, 66.989998,
       86.240005, 107.909996, 112.110008, 134.542084, 159.014389, 185.526901,
       214.079605, 244.672485, 277.305603, 311.978912, 348.692413, 387.446136,
       428.240051, 471.074066, 515.948364, 562.862854, 611.817566, 662.812500,
       715.847595, 770.922974, 828.038452, 0.000000,
       /* batch 1 */
       -2.310000, -7.040000, -14.190000, -23.760000, -35.750000, -50.159996,
       -66.989998, -86.240005, -107.909996, -112.110008, -134.542084,
       -159.014389, -185.526901, -214.079605, -244.672485, -277.305603,
       -311.978912, -348.692413, -387.446136, -428.240051, -471.074066,
       -515.948364, -562.862854, -611.817566, -662.812500, -715.847595,
       -770.922974, -828.038452, 0.000000,
       /* batch 2 */
       2.310000, -7.040000, 14.190000, -23.760000, 35.750000, -50.159996,
       66.989998, -86.240005, 107.909996, -112.110008, 134.542084, -159.014389,
       185.526901, -214.079605, 244.672485, -277.305603, 311.978912, -348.692413,
       387.446136, -428.240051, 471.074066, -515.948364, 562.862854, -611.817566,
       662.812500, -715.847595, 770.922974, -828.038452, 0.000000,
       /* batch 3 */
       -2.310000, 7.040000, -14.190000, 23.760000, -35.750000, 50.159996,
       -66.989998, 86.240005, -107.909996, 112.110008, -134.542084, 159.014389,
       -185.526901, 214.079605, -244.672485, 277.305603, -311.978912, 348.692413,
       -387.446136, 428.240051, -471.074066, 515.948364, -562.862854, 611.817566,
       -662.812500, 715.847595, -770.922974, 828.038452, 0.000000};
   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }

 // Exercises VectorBatchVectorCwiseProduct (the non-accumulating variant) with
 // a 29-element vector and 4 batches. The `output` buffer is passed twice to
 // the call (third and fifth argument), so the operation runs in place.
 TEST(uKernels, VectorBatchVectorCwiseProductNoAccumulate) {
   constexpr int kVectorSize = 29;
   constexpr int kBatchSize = 4;
   static float input[kVectorSize] = {
       1.1,   2.2,   3.3,   4.4,   5.5,   6.6,   7.7,   8.8,   9.9,   10.1,
       11.11, 12.12, 13.13, 14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2,
       21.21, 22.22, 23.23, 24.24, 25.25, 26.26, 27.27, 28.28, 0};
   // Four batches built from the magnitudes of `input`: all positive, all
   // negative, alternating starting positive, alternating starting negative.
   std::vector<float> output = {
       /* batch 0 */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
       14.14, 15.15, 16.16, 17.17, 18.18, 19.19, 20.2, 21.21, 22.22, 23.23,
       24.24, 25.25, 26.26, 27.27, 28.28, 0,
       /* batch 1 */
       -1.1, -2.2, -3.3, -4.4, -5.5, -6.6, -7.7, -8.8, -9.9, -10.1, -11.11,
       -12.12, -13.13, -14.14, -15.15, -16.16, -17.17, -18.18, -19.19, -20.2,
       -21.21, -22.22, -23.23, -24.24, -25.25, -26.26, -27.27, -28.28, 0,
       /* batch 2 */
       1.1, -2.2, 3.3, -4.4, 5.5, -6.6, 7.7, -8.8, 9.9, -10.1, 11.11, -12.12,
       13.13, -14.14, 15.15, -16.16, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22,
       23.23, -24.24, 25.25, -26.26, 27.27, -28.28, 0,
       /* batch 3 */
       -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
       -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22,
       -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0};
   VectorBatchVectorCwiseProduct(input, kVectorSize, output.data(), kBatchSize,
                                 output.data());

   // Expect output = input * output (element-wise product only; nothing is
   // accumulated, e.g. 1.1 * 1.1 = 1.21 for the first element).
   // NOTE(review): the comparison below is bit-exact on floats, unlike the
   // ArrayFloatNear-based tests in this file -- confirm this is intentional.
   const std::vector<float> expected_output = {
       /* batch 0 */
       1.210000, 4.840000, 10.889999, 19.360001, 30.250000, 43.559998, 59.289997,
       77.440002, 98.009995, 102.010010, 123.432091, 146.894394, 172.396896,
       199.939606, 229.522491, 261.145599, 294.808899, 330.512421, 368.256134,
       408.040039, 449.864075, 493.728363, 539.632874, 587.577576, 637.562500,
       689.587585, 743.652954, 799.758423, 0.000000,
       /* batch 1 */
       -1.210000, -4.840000, -10.889999, -19.360001, -30.250000, -43.559998,
       -59.289997, -77.440002, -98.009995, -102.010010, -123.432091, -146.894394,
       -172.396896, -199.939606, -229.522491, -261.145599, -294.808899,
       -330.512421, -368.256134, -408.040039, -449.864075, -493.728363,
       -539.632874, -587.577576, -637.562500, -689.587585, -743.652954,
       -799.758423, 0.000000,
       /* batch 2 */
       1.210000, -4.840000, 10.889999, -19.360001, 30.250000, -43.559998,
       59.289997, -77.440002, 98.009995, -102.010010, 123.432091, -146.894394,
       172.396896, -199.939606, 229.522491, -261.145599, 294.808899, -330.512421,
       368.256134, -408.040039, 449.864075, -493.728363, 539.632874, -587.577576,
       637.562500, -689.587585, 743.652954, -799.758423, 0.000000,
       /* batch 3 */
       -1.210000, 4.840000, -10.889999, 19.360001, -30.250000, 43.559998,
       -59.289997, 77.440002, -98.009995, 102.010010, -123.432091, 146.894394,
       -172.396896, 199.939606, -229.522491, 261.145599, -294.808899, 330.512421,
       -368.256134, 408.040039, -449.864075, 493.728363, -539.632874, 587.577576,
       -637.562500, 689.587585, -743.652954, 799.758423, 0.000000};
   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }

 // Checks BatchVectorBatchVectorDotProduct on two batches of five elements:
 // one dot product is expected per batch.
 TEST(uKernels, BatchVectorBatchVectorDotProductTest) {
   constexpr int kVectorSize = 5;
   constexpr int kBatch = 2;
   constexpr int kResultStride = 1;
   static float input1[kVectorSize * kBatch] = {
       0.0,  -0.5, 1.0,  -1.5, 2.0,    // batch 0
       -2.5, 3.0,  -3.5, 4.0,  -4.5};  // batch 1
   static float input2[kVectorSize * kBatch] = {
       0.1,  -0.1, 0.1,  -0.1, 0.1,    // batch 0
       -0.1, 0.1,  -0.1, 0.1,  -0.1};  // batch 1
   std::vector<float> output(kBatch);

   BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch,
                                    output.data(), kResultStride);

   // batch 0: 0 + 0.05 + 0.1 + 0.15 + 0.2 = 0.5
   // batch 1: 0.25 + 0.3 + 0.35 + 0.4 + 0.45 = 1.75
   const auto expected_dots = ArrayFloatNear({0.5, 1.75});
   EXPECT_THAT(output, ElementsAreArray(expected_dots));
 }

 // Checks VectorShiftLeft: after the call each element is expected to hold the
 // value of its right-hand neighbour, and the last element the shift value.
 TEST(uKernels, VectorShiftLeftTest) {
   constexpr int kVectorSize = 5;
   // Deliberately a plain local rather than `static`: VectorShiftLeft rewrites
   // the array in place, so a static array would keep the shifted contents and
   // make any later run in the same process (e.g. --gtest_repeat) fail.
   float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};

   VectorShiftLeft(input, kVectorSize, 3.0f);

   // Copy the in-place result into a vector so it can be matched element-wise.
   const std::vector<float> result(input, input + kVectorSize);
   EXPECT_THAT(result,
               ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0})));
 }

 // Checks ReductionSumVector on one 10-element input with two groupings:
 // five sums of two consecutive elements, then two sums of five.
 TEST(uKernels, ReductionSumVectorTest) {
   constexpr int kInputVectorSize = 10;
   static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
                                           0.0, -0.5, 1.0, 1.0,  2.0};

   // Case 1: reduce pairs of elements into five outputs.
   constexpr int kOutputVectorSize1 = 5;
   constexpr int kReductionSize1 = 2;
   std::vector<float> result1(kOutputVectorSize1);
   ReductionSumVector(input, result1.data(), kOutputVectorSize1,
                      kReductionSize1);
   const auto expected_pair_sums = ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0});
   EXPECT_THAT(result1, ElementsAreArray(expected_pair_sums));

   // Case 2: reduce groups of five elements into two outputs.
   constexpr int kOutputVectorSize2 = 2;
   constexpr int kReductionSize2 = 5;
   std::vector<float> result2(kOutputVectorSize2);
   ReductionSumVector(input, result2.data(), kOutputVectorSize2,
                      kReductionSize2);
   const auto expected_group_sums = ArrayFloatNear({1.0, 3.5});
   EXPECT_THAT(result2, ElementsAreArray(expected_group_sums));
 }

 // Checks MeanStddevNormalization on two batches of four entries where every
 // input value is non-zero.
 TEST(uKernels, MeanStddevNormalizationNoneZeroInput) {
   constexpr int kVectorSize = 4;
   constexpr int kBatchSize = 2;
   constexpr float kNormalizationEpsilon = 1e-8;

   // Non-zero input.
   static float input[kVectorSize * kBatchSize] = {
       0.1, 0.2, 0.3, 0.4,  // batch 0
       0.9, 1.0, 1.1, 1.2,  // batch 1
   };
   std::vector<float> output(kVectorSize * kBatchSize);
   MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize,
                           kNormalizationEpsilon);
   const std::vector<float> expected_output = {
       -1.34164071, -0.447213531, 0.44721365,  1.34164071,  // batch 0
       -1.34163153, -0.447210163, 0.447211236, 1.3416326,   // batch 1
   };
   // Compare with ArrayFloatNear's default tolerance, like the other float
   // tests in this file: the expected values are computed floats, and
   // bit-exact equality can break across compilers or kernel back ends.
   EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(expected_output)));
 }

 // Checks MeanStddevNormalization when every input element is zero: the
 // output is expected to be exactly zero in both batches.
 TEST(uKernels, MeanStddevNormalizationAllZeroInput) {
   constexpr int kVectorSize = 4;
   constexpr int kBatchSize = 2;
   constexpr float kNormalizationEpsilon = 1e-8;

   // All-zero input covering both batches.
   static float input[kVectorSize * kBatchSize] = {
       0.0, 0.0, 0.0, 0.0,  // batch 0
       0.0, 0.0, 0.0, 0.0,  // batch 1
   };
   // All-zero expectation with the same layout as the input.
   const std::vector<float> expected_output = {
       0.0, 0.0, 0.0, 0.0,  // batch 0
       0.0, 0.0, 0.0, 0.0,  // batch 1
   };

   std::vector<float> output(kVectorSize * kBatchSize);
   MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize,
                           kNormalizationEpsilon);

   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }

 // Checks MeanStddevNormalization when one batch is all zeros and the other
 // holds non-zero values.
 TEST(uKernels, MeanStddevNormalizationMixed) {
   constexpr int kVectorSize = 4;
   constexpr int kBatchSize = 2;
   constexpr float kNormalizationEpsilon = 1e-8;

   // Mix of zero and non-zero input.
   static float input[kVectorSize * kBatchSize] = {
       0.0, 0.0, 0.0, 0.0,  // batch 0
       0.1, 0.2, 0.3, 0.4,  // batch 1
   };
   std::vector<float> output(kVectorSize * kBatchSize);
   MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize,
                           kNormalizationEpsilon);
   const std::vector<float> expected_output = {
       0.0,         0.0,          0.0,        0.0,         // batch 0
       -1.34164071, -0.447213531, 0.44721365, 1.34164071,  // batch 1
   };
   // Compare with ArrayFloatNear's default tolerance, like the other float
   // tests in this file: the expected values are computed floats, and
   // bit-exact equality can break across compilers or kernel back ends.
   EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(expected_output)));
 }

 // Checks MeanStddevNormalization on inputs of very small magnitude (at most
 // 2e-4 in absolute value), with one exact zero in batch 1.
 TEST(uKernels, MeanStddevNormalizationSmallValue) {
   constexpr int kVectorSize = 4;
   constexpr int kBatchSize = 2;
   constexpr float kNormalizationEpsilon = 1e-8;

   // Small-magnitude input of mixed sign.
   static float input[kVectorSize * kBatchSize] = {
       3e-5, -7e-6, -9e-5, 1e-6,  // batch 0
       4e-5, 9e-6,  2e-4,  0.0,   // batch 1
   };
   std::vector<float> output(kVectorSize * kBatchSize);
   MeanStddevNormalization(input, output.data(), kVectorSize, kBatchSize,
                           kNormalizationEpsilon);
   const std::vector<float> expected_output = {
       1.04231524,   0.212946132,  -1.64753067, 0.392269224,   // batch 0
       -0.275023013, -0.658201098, 1.70267045,  -0.769446373,  // batch 1
   };
   // Compare with ArrayFloatNear's default tolerance, like the other float
   // tests in this file: the expected values are computed floats, and
   // bit-exact equality can break across compilers or kernel back ends.
   EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(expected_output)));
 }

 }  // namespace tensor_utils
 }  // namespace tflite

 #ifdef DOTPROD_BENCHMARKS

 // Compile with --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" and
 // --copt="-DDOTPROD_BENCHMARKS"
 // Run with --benchmarks=all
 // Benchmarks MatrixBatchVectorMultiplyAccumulate when it is invoked once per
 // batch entry. The three benchmark range arguments are read as
 // {rows, cols, batch}.
 void BM_DotprodBatchOneMultiply(benchmark::State& state) {
   const int rows = state.range(0);
   const int cols = state.range(1);
   const int batch = state.range(2);

   // Test data is built once, outside the timed loop.
   tflite::tensor_utils::MatrixVectorData data =
       tflite::tensor_utils::SetupMatrixVectorData(rows, cols, batch);
   for (auto _ : state) {
     // One call per batch entry: the vector pointer advances by data.cols for
     // each i, while the scale-factor and result pointers are the same for
     // every call. The literal 1 after the scale factors sits where
     // BM_DotprodBatchFourMultiply passes data.batch.
     // NOTE(review): the trailing 1 is presumably the result stride -- confirm
     // against the MatrixBatchVectorMultiplyAccumulate declaration.
     for (int i = 0; i < batch; i++) {
       tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
           data.matrix.data(), data.rows, data.cols,
           data.vectors.data() + (data.cols * i), data.scale_factors.data(), 1,
           &data.results[0], 1);
       // Keeps the compiler from discarding the computation being timed.
       testing::DoNotOptimize(data.results[2]);
     }
   }
 }
 // Registered configurations, each given as {rows, cols, batch}.
 BENCHMARK(BM_DotprodBatchOneMultiply)
     ->Args({16, 16, 1})
     ->Args({16, 16, 4})
     ->Args({32, 32, 1})
     ->Args({32, 32, 4})
     ->Args({64, 64, 1})
     ->Args({64, 64, 4})
     ->Args({128, 128, 1})
     ->Args({128, 128, 4})
     ->Args({992, 992, 1})
     ->Args({992, 992, 8})
     ->Args({1024, 1024, 1})
     ->Args({1024, 1024, 4})
     ->Args({1024, 1024, 8})
     ->Args({640, 2048, 1})
     ->Args({640, 2048, 4})
     ->Args({640, 2048, 8})
     ->Args({2048, 2048, 1})
     ->Args({2048, 2048, 8});

 // Benchmarks MatrixBatchVectorMultiplyAccumulate when all batch entries are
 // handed over in a single call (data.batch is passed as the batch count).
 // The three benchmark range arguments are read as {rows, cols, batch}.
 void BM_DotprodBatchFourMultiply(benchmark::State& state) {
   const int rows = state.range(0);
   const int cols = state.range(1);
   const int batch = state.range(2);

   // Test data is built once, outside the timed loop.
   tflite::tensor_utils::MatrixVectorData data =
       tflite::tensor_utils::SetupMatrixVectorData(rows, cols, batch);
   for (auto _ : state) {
     // NOTE(review): the trailing 1 is presumably the result stride -- confirm
     // against the MatrixBatchVectorMultiplyAccumulate declaration.
     tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
         data.matrix.data(), data.rows, data.cols, data.vectors.data(),
         data.scale_factors.data(), data.batch, &data.results[0], 1);
     // Keeps the compiler from discarding the computation being timed.
     testing::DoNotOptimize(data.results[2]);
   }
 }
 // Registered configurations, each given as {rows, cols, batch}.
 BENCHMARK(BM_DotprodBatchFourMultiply)
     ->Args({16, 16, 4})
     ->Args({32, 32, 4})
     ->Args({64, 64, 4})
     ->Args({64, 256, 64})
     ->Args({64, 256, 256})
     ->Args({64, 256, 1024})
     ->Args({64, 256, 12544})
     ->Args({128, 128, 4})
     ->Args({640, 640, 4})
     ->Args({992, 992, 8})
     ->Args({1024, 1024, 4})
     ->Args({1024, 1024, 8})
     ->Args({1024, 1024, 256})
     ->Args({640, 2048, 4})
     ->Args({640, 2048, 8})
     ->Args({2048, 2048, 4})
     ->Args({2048, 2048, 8});

 // Benchmarks SparseMatrixBatchVectorMultiplyAccumulate using the sparse
 // matrix and ledger prepared by SetupMatrixVectorData, with all batch entries
 // passed in a single call. The three benchmark range arguments are read as
 // {rows, cols, batch}.
 void BM_DotprodSparseMultiply(benchmark::State& state) {
   const int rows = state.range(0);
   const int cols = state.range(1);
   const int batch = state.range(2);

   // Test data is built once, outside the timed loop.
   tflite::tensor_utils::MatrixVectorData data =
       tflite::tensor_utils::SetupMatrixVectorData(rows, cols, batch);
   for (auto _ : state) {
     // NOTE(review): the trailing 1 is presumably the result stride -- confirm
     // against the SparseMatrixBatchVectorMultiplyAccumulate declaration.
     tflite::tensor_utils::SparseMatrixBatchVectorMultiplyAccumulate(
         data.sparse_matrix.data(), data.ledger.data(), data.rows, data.cols,
         data.vectors.data(), data.scale_factors.data(), data.batch,
         &data.results[0], 1);
     // Keeps the compiler from discarding the computation being timed.
     testing::DoNotOptimize(data.results[2]);
   }
 }
 // Registered configurations, each given as {rows, cols, batch}.
 BENCHMARK(BM_DotprodSparseMultiply)
     ->Args({128, 128, 1})
     ->Args({128, 128, 4})
     ->Args({640, 640, 4})
     ->Args({992, 992, 8})
     ->Args({1024, 1024, 1})
     ->Args({1024, 1024, 4})
     ->Args({1024, 1024, 8})
     ->Args({640, 2048, 1})
     ->Args({640, 2048, 4})
     ->Args({640, 2048, 8})
     ->Args({2048, 2048, 1})
     ->Args({2048, 2048, 8});

 #endif  // DOTPROD_BENCHMARKS