/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <vector>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/experimental/ruy/detect_arm.h"
#include "tensorflow/lite/kernels/activation_functor.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/round.h"
#ifdef USE_NEON
#define kFloatWeightsPerNeonLane 4
// aligned_alloc is available (via cstdlib/stdlib.h) with C++17/C11.
#if __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L
#if !defined(__ANDROID__) || __ANDROID_API__ >= 28
#if !defined(__APPLE__) // Apple does not provide aligned_alloc.
#define TFLITE_USE_STD_ALIGNED_ALLOC
#endif
#endif
#endif
namespace tflite {
namespace tensor_utils {
namespace {
// Allocates at least size bytes of uninitialized storage whose alignment is
// specified by alignment. The requested size is rounded up to an integral
// multiple of alignment, as required by std::aligned_alloc.
// The caller is responsible for freeing the allocated memory by calling free
// on the pointer returned through freeing_buffer.
inline void* aligned_alloc(size_t alignment, size_t size,
void** freeing_buffer) {
#ifdef TFLITE_USE_STD_ALIGNED_ALLOC
*freeing_buffer = ::aligned_alloc(
alignment, (size + alignment - 1) / alignment * alignment);
return *freeing_buffer;
#else
*freeing_buffer = malloc(size + alignment);
const size_t offset = ((uintptr_t)*freeing_buffer) % alignment; // NOLINT
return offset == 0
? *freeing_buffer
: ((char*)*freeing_buffer + (alignment - offset)); // NOLINT
#endif
}
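// Illustrative usage of the helper above (a sketch, not code in this file):
// the aligned pointer is what the kernel reads and writes, while the pointer
// returned through freeing_buffer is what must eventually be passed to free:
//
//   void* aligned_free = nullptr;
//   int8_t* buf = reinterpret_cast<int8_t*>(
//       aligned_alloc(/*alignment=*/4, /*size=*/m_cols, &aligned_free));
//   // ... use buf ...
//   free(aligned_free);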
bool HasSdotInstruction() {
static const bool has_dotprod = ruy::DetectDotprod();
return has_dotprod;
}
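// Note: ruy::DetectDotprod() detects at runtime whether the CPU supports the
// ARMv8.2 dot-product (SDOT) instructions; caching the result in a
// function-local static means the detection cost is paid only once.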
inline float AccumulateNeonLane(const float32x4_t lane) {
#ifdef __aarch64__
return vaddvq_f32(lane);
#else
return vgetq_lane_f32(lane, 0) + vgetq_lane_f32(lane, 1) +
vgetq_lane_f32(lane, 2) + vgetq_lane_f32(lane, 3);
#endif
}
inline int32_t AccumulateNeonLane(const int32x4_t lane) {
#ifdef __aarch64__
return vaddvq_s32(lane);
#else
int64x2_t pairwiseAdded = vpaddlq_s32(lane);
return vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
#endif
}
} // namespace
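// Computes, for every batch b in [0, n_batch) and row r in [0, m_rows):
//   result[(b * m_rows + r) * result_stride] +=
//       dot(matrix[r * m_cols .. (r + 1) * m_cols),
//           vector[b * m_cols .. (b + 1) * m_cols))
// i.e. a float matrix times a batch of vectors, accumulated into result.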
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
int m_cols, const float* vector,
int n_batch, float* result,
int result_stride) {
// If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remaining elements
// sequentially. postamble_start shows the start index where this happens.
const int postamble_start =
m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
for (int b = 0; b < n_batch; b++) {
float* result_in_batch = result + b * m_rows * result_stride;
const float* vector_in_batch = vector + b * m_cols;
const float* matrix_row = matrix;
// Main matrix by vector multiplication loop
for (int r = 0; r < m_rows; r++) {
float32x4_t acc_32x4 = vmovq_n_f32(0.0);
for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
// Load 4 float values from vector and matrix row.
float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
// Multiply the vector and matrix row and add to accumulator.
acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
}
// Add the 4 intermediate sum values to get the final dot-prod value for
// this row.
*result_in_batch += AccumulateNeonLane(acc_32x4);
for (int c = postamble_start; c < m_cols; c++) {
*result_in_batch += matrix_row[c] * vector_in_batch[c];
}
matrix_row += m_cols;
result_in_batch += result_stride;
}
}
}
#ifdef __aarch64__
// We interleave vector data to make the dot product logic more efficient.
// Suppose that the vectors are:
// a0 a1 a2 a3 a4 a5 ...
// b0 b1 b2 b3 b4 b5 ...
// c0 c1 c2 c3 c4 c5 ...
// d0 d1 d2 d3 d4 d5 ...
// e0 e1 e2 e3 e4 e5 ...
// This code interleaves them like this:
// a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a4 a5 a6 a7 b4 ...
// e0 e1 e2 e3 f0 f1 f2 f3 ...
// Once the data is interleaved, each 16-byte read from the vectors pointer
// contains 4 bytes from each of 4 vectors.
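// With this layout, one 16-byte load of shuffled data places 4 consecutive
// bytes of each of 4 batch vectors into the 4 lanes of a single register, so
// a single sdot instruction against a 4-byte group of a matrix row
// accumulates partial dot products for all 4 vectors at once (see the sdot
// kernel below).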
const int8_t* ShuffleVectors(const int8_t* vectors, const int n_batch,
const int m_cols, void** shuffled_vectors_free) {
const int kWeightsPerUint32 = 4;
int8* shuffled_vectors = reinterpret_cast<int8*>(aligned_alloc(
kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free));
for (int i = 0; i < n_batch; i += 4) {
int8* shuffled_vectors_ptr = shuffled_vectors + (i * m_cols);
const int8* unshuffled_vec0_ptr =
reinterpret_cast<const int8*>(vectors) + (i * m_cols);
const int8* unshuffled_vec1_ptr =
reinterpret_cast<const int8*>(vectors) + ((i + 1) * m_cols);
const int8* unshuffled_vec2_ptr =
reinterpret_cast<const int8*>(vectors) + ((i + 2) * m_cols);
const int8* unshuffled_vec3_ptr =
reinterpret_cast<const int8*>(vectors) + ((i + 3) * m_cols);
const int8* const end_vec0_ptr = unshuffled_vec1_ptr;
while (unshuffled_vec0_ptr != end_vec0_ptr) {
asm volatile(
// This code path requires that (m_cols % 16) == 0 so we can safely
// read in 16-byte chunks from each row.
"ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
"ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
"ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
"ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
"st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
"st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
"st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
"st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
: [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr),
[ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr),
[ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr),
[ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr),
[ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr)
:
: "v0", "v1", "v2", "v3", "cc", "memory");
}
}
return reinterpret_cast<const int8_t*>(shuffled_vectors);
}
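// Computes result[batch * m_rows + row] +=
//     scaling_factors[batch] * dot(matrix row, vectors[batch])
// using the SDOT instruction, processing two rows and four batches per inner
// kernel. The caller (NeonMatrixBatchVectorMultiplyAccumulate below)
// guarantees m_cols % 16 == 0, m_rows % 2 == 0 and n_batch % 4 == 0, which
// the 16-byte loads and the 2x4 tiling rely on.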
static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* vectors, const float* scaling_factors, int n_batch,
float* __restrict__ result) {
void* shuffled_vectors_free;
const int8_t* shuffled_vectors =
ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
for (int row = 0; row < m_rows; row += 2) {
for (int batch = 0; batch < n_batch; batch += 4) {
float* result_ptr = result + (batch * m_rows) + row;
const int8* mat_ptr0 = matrix + (row * m_cols);
const int8* mat_ptr1 = matrix + ((row + 1) * m_cols);
const int8* mat_ptr0_end = mat_ptr1;
const int8* vec_ptr = shuffled_vectors + (batch * m_cols);
const float* scaling_factors_ptr = scaling_factors + batch;
const uint64_t wide_rows = m_rows * sizeof(float);
asm volatile(
// Zero out the accumulator registers.
"dup v0.4s, wzr\n"
"dup v1.4s, wzr\n"
"dup v2.4s, wzr\n"
"dup v3.4s, wzr\n"
"1:\n" // batch_cols_loop
// Read 16 more bytes from a pair of matrix rows.
"ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
// Read from input vectors 4 times; 64 bytes total.
// Each 16-byte register contains parts of 4 vectors; see the
// shuffle logic above.
// From Benoit, places to look in the future:
// - Move load instructions further from sdot
// - Switch loop use-then-reload
// - Do partial unrolling to use register space better
"ld1 {v8.16b}, [%[vec_ptr]], #16\n"
".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
"ld1 {v9.16b}, [%[vec_ptr]], #16\n"
".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
"ld1 {v10.16b}, [%[vec_ptr]], #16\n"
".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
"ld1 {v11.16b}, [%[vec_ptr]], #16\n"
".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
// Re-use those vectors for the next row as well.
"ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
// If we're not done with these rows, continue.
"cmp %[mat_ptr0], %[mat_ptr0_end]\n"
"bne 1b\n" // batch_cols_loop
// Done with the rows, sum the results.
"add v0.4s, v0.4s, v1.4s\n"
"add v2.4s, v2.4s, v3.4s\n"
// Convert the per-vector sums to floating point.
"scvtf v0.4s, v0.4s\n"
"scvtf v1.4s, v2.4s\n"
// Fetch scale factors.
"ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
// Multiply scale factors times sums.
"fmul v0.4s, v4.4s, v0.4s\n"
"fmul v1.4s, v4.4s, v1.4s\n"
// Load previous result values.
// The result position is:
// result[batch * m_rows + row]
// Here that is factored into:
// result_ptr = result + row
// *result_ptr = res[0]
// (uint8*)result_ptr += (m_rows * sizeof(float))
// *result_ptr = res[1]
// ...
// Since we're reading two rows at a time, though, we read both
// result[batch * m_rows + row]
// and
// result[batch * m_rows + row + 1]
"ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
"ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
"ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
"ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
// Go back to the starting position (subtract wide_rows * 4).
"sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
// Add previous result values.
"fadd v9.4s, v9.4s, v0.4s\n"
"fadd v10.4s, v10.4s, v1.4s\n"
// Store results.
"st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
: [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1),
[ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr)
: [ mat_ptr0_end ] "r"(mat_ptr0_end),
[ scaling_factors_ptr ] "r"(scaling_factors_ptr),
[ wide_rows ] "r"(wide_rows)
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "cc", "memory");
}
}
free(shuffled_vectors_free);
}
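// Sparse version of the kernel above. The ledger encodes, for each row, one
// byte holding the number of non-zero 16-byte blocks in that row, followed by
// that many byte-sized block indices; block index k means the block covers
// columns [16 * k, 16 * k + 16) of the row and of the vector.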
static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result,
int result_stride) {
const uint8_t* ledger_ptr = ledger;
const int8* mat_ptr = matrix;
for (int row = 0; row < m_rows; row++) {
int num_nonzero_chunks = *ledger_ptr;
ledger_ptr++;
const uint8* ledger_start = ledger_ptr;
const uint8* ledger_end = ledger_ptr + num_nonzero_chunks;
const int8* mat_start = mat_ptr;
for (int batch = 0; batch < n_batch; batch++) {
const int8* vec_ptr = vectors + (batch * m_cols);
int64_t row_sum = 0;
mat_ptr = mat_start;
ledger_ptr = ledger_start;
if (ledger_ptr != ledger_end) {
asm volatile(
"dup v0.4s, wzr\n"
"dup v1.4s, wzr\n"
"dup v8.4s, wzr\n"
"mov x7, 0\n"
"1:\n" // chunks_loop
// Single matrix chunk, 16 bytes
"ld1 {v8.16b}, [%[mat_ptr]], #16\n"
// Read the next ledger index and increment.
"ldrb w7, [%[ledger_ptr]], #1\n"
// Read 16 bytes of vector data from (vec_ptr + (ledger_index * 16))
"add x8, %[vec_ptr], x7, lsl #4\n"
"ld1 {v9.16b}, [x8]\n"
// Dot product of matrix row and vector.
".word 0x4e889520 // sdot v0.4s, v9.16b, v8.16b\n"
"cmp %[ledger_ptr], %[ledger_end]\n"
"blt 1b\n" // chunks_loop
// Sum the 4 vector components into a 32-bit value.
"addv s1, v0.4s\n"
// row_sum is 64-bit, so we copy 64 bits of v1 into it.
// We have to be careful to cast this value to 32 bits in order
// to interpret the sign bit properly.
"mov %[row_sum], v1.d[0]\n"
: [ row_sum ] "=r"(row_sum), [ ledger_ptr ] "+r"(ledger_ptr),
[ mat_ptr ] "+r"(mat_ptr), [ vec_ptr ] "+r"(vec_ptr)
: [ ledger_end ] "r"(ledger_end)
: "x0", "x1", "x7", "x8", "v0", "v1", "v8", "v9", "cc", "memory");
}
result[(batch * m_rows + row) * result_stride] +=
static_cast<int32>(row_sum) * scaling_factors[batch];
}
}
}
#endif // __aarch64__
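// Quantized (int8) matrix-times-batched-vector product. For every batch b and
// row r:
//   result[(b * m_rows + r) * result_stride] +=
//       scaling_factors[b] * dot(matrix[r, :], vectors[b, :])
// On AArch64 CPUs with the dot-product extension, the specialized kernels
// above are used when the shape constraints allow it.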
void NeonMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
const int8_t* __restrict__ vectors, const float* scaling_factors,
int n_batch, float* __restrict__ result, int result_stride) {
#ifdef __aarch64__
if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 &&
m_rows >= n_batch) {
if (n_batch % 4 == 0 && result_stride == 1) {
// Benchmarks suggest that it's always better to use the batch code
// when we can, even on small matrices.
DotprodMatrixBatchFourVectorMultiplyAccumulate(
matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
return;
}
}
#endif // __aarch64__
static const int kWeightsPerUint32 = 4;
static const int kWeightsPerNeonLane = 16;
// Assuming *matrix is kWeightsPerUint32-byte aligned, every row of the matrix
// is also kWeightsPerUint32-byte aligned as long as cols is a multiple of
// kWeightsPerUint32. The assumption is currently satisfied by TFLite's
// 16-byte memory alignment scheme.
//
// Otherwise, we allocate an aligned memory block and set a flag to later copy
// rows from matrix to the block for aligned multiplication.
bool unaligned = false;
int8_t* aligned_row = nullptr;
void* aligned_row_free = nullptr;
if ((m_cols & (kWeightsPerUint32 - 1)) != 0) {
unaligned = true;
aligned_row = (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
&aligned_row_free);
}
void* aligned_vec_free = nullptr;
int8_t* aligned_vec =
(int8_t*)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
&aligned_vec_free);
// If m_cols is not a multiple of kWeightsPerNeonLane, we cannot use the main
// vectorized loop for the tail, and we need to process it sequentially.
// postamble_half_start marks the end of the full 16-element vectorized loop;
// between postamble_half_start and postamble_start we can still process
// kWeightsPerNeonLane >> 1 (i.e. 8) elements in vectorized form.
const int postamble_half_start = m_cols & ~(kWeightsPerNeonLane - 1);
const int postamble_start = m_cols & ~((kWeightsPerNeonLane >> 1) - 1);
for (int batch = 0; batch < n_batch; ++batch) {
const float batch_scaling_factor = scaling_factors[batch];
// Copy the vector data to an aligned vector.
memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
// Compute dot-product for every column.
for (int row = 0; row < m_rows; ++row, result += result_stride) {
// Get the address of the first element of the row.
int8_t* row_ptr = (int8_t*)matrix + row * m_cols; // NOLINT
if (unaligned) {
memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
row_ptr = aligned_row;
}
// Initialize the dot product sum for the row to 0.
int32x4_t dotprod_32x4 = vmovq_n_s32(0);
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
// For every block of 16 8-bit elements.
int col = 0;
for (; col < postamble_half_start; col += kWeightsPerNeonLane) {
// Load 16 8-bit values from the row and vector, each, to operate on.
// Here the assumption is that each buffer is 4-byte aligned. Otherwise,
// performance may suffer significantly.
TFLITE_DCHECK_EQ( // NOLINT
(uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0);
const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col));
const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col));
// Multiply the low bits (i.e. the lower 8 8bit numbers in the
// registers).
int16x8_t prod_16x8 =
vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
// Multiply the high bits (i.e. the higher 8 8bit numbers in the
// registers), and accumulate with the result of the low bits product.
// The assumption here is that overflow will not happen as we quantize
// our values to be in the range [-127, 127]. As such the sum of the 2
// products is always strictly smaller than 15-bits (32767 in absolute
// value).
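// Concretely: each int8 product is at most 127 * 127 = 16129 in magnitude,
// so the running int16 lane holds at most 2 * 16129 = 32258 < 32767 before
// it is widened into the int32 accumulator by vpadalq_s16 below.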
prod_16x8 =
vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
} // for col
// Half iteration dealing with only 8 elements.
// TODO(raziel): if (ABSL_PREDICT_FALSE(col < postamble_start))
if (col < postamble_start) {
// Load 8 8-bit values from the row and vector, each, to operate on.
// Here the assumption is that each buffer is 4-byte aligned.
// Otherwise, performance may suffer significantly.
TFLITE_DCHECK_EQ( // NOLINT
(uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0);
const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col));
const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col));
const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
col += (kWeightsPerNeonLane >> 1);
}
// Add the 4 intermediate sum values to get the final dot-prod value for
// this row.
int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
// Postamble loop.
// TODO(raziel): if (ABSL_PREDICT_FALSE(col < m_cols))
for (; col < m_cols; ++col) {
dotprod += row_ptr[col] * aligned_vec[col];
} // for col
*result += dotprod * batch_scaling_factor;
} // for row
} // for batch
if (unaligned) {
free(aligned_row_free);
}
free(aligned_vec_free);
}
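// Sparse float matrix-times-batched-vector product. The ledger uses the same
// per-row layout as in the quantized sparse kernel: a count of non-zero
// 16-element blocks followed by the block indices within the row.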
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
float* __restrict__ result, int result_stride) {
const int kBlockSize = 16;
const int kNeonLanesPerBlock = 4;
TFLITE_DCHECK_EQ( // NOLINT
m_cols % kBlockSize, 0);
float* result_in_batch = result;
for (int b = 0; b < n_batch; b++) {
const float* matrix_ptr = matrix;
const uint8_t* ledger_ptr = ledger;
for (int r = 0; r < m_rows; r++) {
int num_nonzero_blocks = *ledger_ptr++;
if (num_nonzero_blocks > 0) {
float32x4_t acc_32x4 = vmovq_n_f32(0.0);
const float* vector_in_batch = vector + b * m_cols;
for (int i = 0; i < num_nonzero_blocks; i++) {
const int block_start_index = *ledger_ptr++ * kBlockSize;
const float* vector_block_in_batch_ptr =
vector_in_batch + block_start_index;
for (int c = 0; c < kNeonLanesPerBlock; c++) {
// Load 4 float values from the vector and matrix row.
float32x4_t vector_f32x4 = vld1q_f32(vector_block_in_batch_ptr +
c * kFloatWeightsPerNeonLane);
float32x4_t matrix_f32x4 =
vld1q_f32(matrix_ptr + c * kFloatWeightsPerNeonLane);
// Multiply the vector and matrix row and add to accumulator.
acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
}
matrix_ptr += kBlockSize;
}
*result_in_batch += AccumulateNeonLane(acc_32x4);
}
result_in_batch += result_stride;
}
}
}
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
const int m_cols, const int8_t* __restrict__ vectors,
const float* scaling_factors, int n_batch, float* __restrict__ result,
int result_stride) {
#ifdef __aarch64__
if (HasSdotInstruction() && m_cols % 16 == 0) {
DotprodSparseMatrixBatchVectorMultiplyAccumulate(
matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch,
result, result_stride);
return;
}
#endif // __aarch64__
const int kWeightsPerUint32 = 4;
const int kWeightsPerNeonLane = 16;
const int kBlockSize = kWeightsPerNeonLane;
TFLITE_DCHECK_EQ( // NOLINT
m_cols % kBlockSize, 0);
void* aligned_vec_free = nullptr;
int8_t* aligned_vec =
(int8_t*)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
&aligned_vec_free);
for (int batch = 0; batch < n_batch; ++batch) {
const float batch_scaling_factor = scaling_factors[batch];
// Copy the vector data to an aligned vector.
memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
const uint8_t* ledger_ptr = ledger;
const int8_t* row_ptr = matrix;
for (int row = 0; row < m_rows; ++row, result += result_stride) {
// Initialize the dot product sum for the row to 0.
int32x4_t dotprod_32x4 = vmovq_n_s32(0);
int num_nonzero_blocks = *ledger_ptr++;
if (num_nonzero_blocks > 0) {
// Prefetch the row to cache.
__builtin_prefetch(row_ptr, 0 /* prefetch for read */,
3 /* temporal locality */);
for (int i = 0; i < num_nonzero_blocks; i++) {
const int col_index = *ledger_ptr++ * kBlockSize;
// Load 16 8-bit values from the row and vector, each, to operate on.
// Here the assumption is that each buffer is 4-byte aligned.
// Otherwise, performance may suffer significantly.
TFLITE_DCHECK_EQ( // NOLINT
(uintptr_t)(row_ptr) & (kWeightsPerUint32 - 1), 0);
const int8x16_t s1_8x16 =
vld1q_s8((const int8_t*)(aligned_vec + col_index));
const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr));
// Multiply the low bits (i.e. the lower 8 8bit numbers in the
// registers).
int16x8_t prod_16x8 =
vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
// Multiply the high bits (i.e. the higher 8 8bit numbers in the
// registers), and accumulate with the result of the low bits product.
// The assumption here is that overflow will not happen as we quantize
// our values to be in the range [-127, 127]. As such the sum of the 2
// products is always strictly smaller than 15-bits (32767 in absolute
// value).
prod_16x8 =
vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
row_ptr += kBlockSize;
}
// Add the 4 intermediate sum values to get the final dot-prod value for
// this row.
int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
*result += dotprod * batch_scaling_factor;
}
} // for row
} // for batch
free(aligned_vec_free);
}
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load 4 float values from vector1 and vector2.
float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
// Vector multiply 4 float
float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
// Save to result array.
vst1q_f32(&result[v], mul_32x4);
}
for (int v = postamble_start; v < v_size; v++) {
result[v] = vector1[v] * vector2[v];
}
}
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load 4 float values from vector1 and vector2 and accumulator.
float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
float32x4_t acc_32x4 = vld1q_f32(result + v);
// Vector multiply-accumulate 4 float
acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
// Save to result array.
vst1q_f32(&result[v], acc_32x4);
}
for (int v = postamble_start; v < v_size; v++) {
result[v] += vector1[v] * vector2[v];
}
}
void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
const float* batch_vector, int n_batch,
float* result) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
for (int b = 0; b < n_batch; b++) {
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load from memory to vectors.
float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector + v);
float32x4_t vector_f32x4 = vld1q_f32(vector + v);
// Multiply.
float32x4_t result_f32x4 = vmulq_f32(batch_vector_f32x4, vector_f32x4);
// Store.
vst1q_f32(result + v, result_f32x4);
}
// Postamble loop
for (int v = postamble_start; v < v_size; v++) {
result[v] = vector[v] * batch_vector[v];
}
// Update the pointers.
result += v_size;
batch_vector += v_size;
}
}
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
int v_size,
const float* batch_vector,
int n_batch, float* result) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
float* result_ptr = result;
const float* batch_vector_ptr = batch_vector;
for (int b = 0; b < n_batch; b++) {
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load from memory to vectors.
float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
float32x4_t vector_f32x4 = vld1q_f32(vector + v);
// Multiply-accumulate.
result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4, vector_f32x4);
// Store.
vst1q_f32(result_ptr + v, result_f32x4);
}
// Postamble loop
for (int v = postamble_start; v < v_size; v++) {
result_ptr[v] += vector[v] * batch_vector_ptr[v];
}
// Update the pointers.
result_ptr += v_size;
batch_vector_ptr += v_size;
}
}
void NeonSub1Vector(const float* vector, int v_size, float* result) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
float32x4_t one_f32x4 = vmovq_n_f32(1.0);
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load 4 float values from the current pointers of the input column and
// subtract from 1.
float32x4_t v_f32x4 = vld1q_f32(vector + v);
float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
// Save to output.
vst1q_f32(result + v, result_f32x4);
}
for (int v = postamble_start; v < v_size; v++) {
result[v] = 1.0f - vector[v];
}
}
bool NeonIsZeroVector(const float* vector, int v_size) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
// use the main vectorized loop, and we need to process sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
const float32x4_t i_x4_float = vld1q_f32(vector + v);
uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
if (vgetq_lane_u32(cmp_result, 0) == 0) return false;
if (vgetq_lane_u32(cmp_result, 1) == 0) return false;
if (vgetq_lane_u32(cmp_result, 2) == 0) return false;
if (vgetq_lane_u32(cmp_result, 3) == 0) return false;
}
// Postamble loop
for (int v = postamble_start; v < v_size; ++v) {
if (vector[v] != 0.0) return false;
}
return true;
}
void NeonClipVector(const float* vector, int v_size, float abs_limit,
float* result) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
// Replicate abs_limit and -abs_limit in two vectors.
const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load from memory to vector.
float32x4_t v_f32x4 = vld1q_f32(vector + v);
// Clip between abs_limit and -abs_limit.
float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
// Save to output.
vst1q_f32(result + v, result_f32x4);
}
// Postamble loop.
for (int v = postamble_start; v < v_size; v++) {
result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
}
}
void NeonVectorScalarMultiply(const int8_t* vector, const int v_size,
const float scale, float* result) {
// Here the assumption is that each buffer is 4-byte aligned.
const int kWeightsPerUint32 = 4;
TFLITE_CHECK_EQ((intptr_t)(&vector[0]) & (kWeightsPerUint32 - 1), 0);
// If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
// vectorized loop, and we need to process sequentially. postamble_start shows
// the start index where this should happen.
const int kWeightsPerNeonLane = 16;
const int postamble_start = v_size - (v_size & (kWeightsPerNeonLane - 1));
// Create a vector of 4 floats with the scale value.
const float32x4_t scale_f32x4 = vdupq_n_f32(scale);
int v = 0;
for (; v < postamble_start; v += kWeightsPerNeonLane) {
// Load int8 values, sixteen at a time.
const int8x16_t v_i8x16 = vld1q_s8(vector + v);
// Split it into two components of size eight.
const int8x8_t v0_i8x8 = vget_low_s8(v_i8x16);
const int8x8_t v1_i8x8 = vget_high_s8(v_i8x16);
// Convert both components to int16 first.
const int16x8_t v0_i16x8 = vmovl_s8(v0_i8x8);
const int16x8_t v1_i16x8 = vmovl_s8(v1_i8x8);
// Split each of them into two components each.
const int16x4_t v0_i16x4 = vget_low_s16(v0_i16x8);
const int16x4_t v1_i16x4 = vget_high_s16(v0_i16x8);
const int16x4_t v2_i16x4 = vget_low_s16(v1_i16x8);
const int16x4_t v3_i16x4 = vget_high_s16(v1_i16x8);
// Convert these to int32 and then to float.
float32x4_t v0_f32x4 = vcvtq_f32_s32(vmovl_s16(v0_i16x4));
float32x4_t v1_f32x4 = vcvtq_f32_s32(vmovl_s16(v1_i16x4));
float32x4_t v2_f32x4 = vcvtq_f32_s32(vmovl_s16(v2_i16x4));
float32x4_t v3_f32x4 = vcvtq_f32_s32(vmovl_s16(v3_i16x4));
// Vector multiply four floats at a time.
v0_f32x4 = vmulq_f32(v0_f32x4, scale_f32x4);
v1_f32x4 = vmulq_f32(v1_f32x4, scale_f32x4);
v2_f32x4 = vmulq_f32(v2_f32x4, scale_f32x4);
v3_f32x4 = vmulq_f32(v3_f32x4, scale_f32x4);
// Store the results.
vst1q_f32(result + v, v0_f32x4);
vst1q_f32(result + v + 4, v1_f32x4);
vst1q_f32(result + v + 8, v2_f32x4);
vst1q_f32(result + v + 12, v3_f32x4);
}
if (v_size - postamble_start >= (kWeightsPerNeonLane >> 1)) {
// Load eight int8 values, if there is at least eight remaining.
const int8x8_t v_i8x8 = vld1_s8(vector + v);
// Convert them to int16 first.
const int16x8_t v_i16x8 = vmovl_s8(v_i8x8);
// Split it into two components.
const int16x4_t v0_i16x4 = vget_low_s16(v_i16x8);
const int16x4_t v1_i16x4 = vget_high_s16(v_i16x8);
// Convert the components to floats.
float32x4_t v0_f32x4 = vcvtq_f32_s32(vmovl_s16(v0_i16x4));
float32x4_t v1_f32x4 = vcvtq_f32_s32(vmovl_s16(v1_i16x4));
// Vector multiply four floats at a time.
v0_f32x4 = vmulq_f32(v0_f32x4, scale_f32x4);
v1_f32x4 = vmulq_f32(v1_f32x4, scale_f32x4);
// Store the results.
vst1q_f32(result + v, v0_f32x4);
vst1q_f32(result + v + 4, v1_f32x4);
v += (kWeightsPerNeonLane >> 1);
}
// Postamble loop.
for (; v < v_size; v++) {
result[v] = scale * vector[v];
}
}
// TODO(renjieliu): Avoid duplicating the logic.
// Also consider changing the rounding strategy from "ties to away" to
// "ties to even" since vcvtnq_s32_f32 is generally more available.
inline int32x4_t RoundToNearest(const float32x4_t input) {
#if defined(__aarch64__)
return vcvtaq_s32_f32(input);
#else
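// Emulates round-to-nearest with ties away from zero using truncation:
// mask is all-ones (-1) for negative lanes and 0 otherwise, so after the
// float conversion `round` is -0.5 for negative inputs and +0.5 for
// non-negative ones. Adding it and truncating toward zero (vcvtq_s32_f32)
// therefore rounds halfway cases away from zero, matching TfLiteRound in
// the scalar postamble of NeonSymmetricQuantizeFloats.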
static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f);
static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
const int32x4_t mask = vreinterpretq_s32_u32(vcltq_f32(input, zero_val_dup));
const float32x4_t casted_mask = vcvtq_f32_s32(mask);
const float32x4_t round = vaddq_f32(casted_mask, point5_val_dup);
return vcvtq_s32_f32(vaddq_f32(input, round));
#endif
}
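// Symmetric per-tensor quantization: finds min/max of the input, derives
//   scaling_factor = max(|min|, |max|) / 127
// and writes quantized_values[i] = clamp(round(values[i] / scaling_factor),
// -127, 127). If the input is all zeros, the output is zeroed and the scale
// is set to 1.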
void NeonSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values, float* min,
float* max, float* scaling_factor) {
// TODO(raziel): vectorize min/max calculation.
auto minmax = std::minmax_element(values, values + size);
*min = *minmax.first;
*max = *minmax.second;
const int kScale = 127;
const float range = std::max(std::abs(*min), std::abs(*max));
if (range == 0) {
memset(quantized_values, 0, size * sizeof(int8_t));
*scaling_factor = 1;
return;
}
*scaling_factor = range / kScale;
const float scaling_factor_inv = kScale / range;
const int postamble_start =
size - (size & (2 * kFloatWeightsPerNeonLane - 1));
// Vectorized constants.
const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane) {
// Implements the vectorized version of the following:
// const int32 quantized_value = static_cast<int32>(
// std::round(*scaling_factor * values[i]));
float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
const int32x4_t f2i0_i32x4 = RoundToNearest(mul0_f32x4);
const int32x4_t f2i1_i32x4 = RoundToNearest(mul1_f32x4);
// Implements the vectorized version of the following block:
// quantized_values[i] = std::min(kScale, std::max(-kScale,
// quantized_value));
int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
vst1_s8(&quantized_values[i], min_s8x8);
}
for (int i = postamble_start; i < size; ++i) {
const int32 quantized_value =
static_cast<int32>(TfLiteRound(scaling_factor_inv * values[i]));
quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
}
}
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size) {
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
// main vectorized loop, and we need to process the remainder sequentially.
// postamble_start shows the start index where this should happen.
const int postamble_start =
v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
float32x4_t acc_32x4 = vmovq_n_f32(0.0);
for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
// Load 4 float values from vector1 and vector2 and accumulator.
float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
// Vector multiply-accumulate 4 float
acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
}
float result = AccumulateNeonLane(acc_32x4);
// Postamble loop.
for (int v = postamble_start; v < v_size; v++) {
result += vector1[v] * vector2[v];
}
return result;
}
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
const float* vector2, int v_size,
int n_batch, float* result,
int result_stride) {
float* result_ptr = result;
const float* vector1_ptr = vector1;
const float* vector2_ptr = vector2;
for (int b = 0; b < n_batch; b++) {
*result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
vector1_ptr += v_size;
vector2_ptr += v_size;
result_ptr += result_stride;
}
}
void NeonReductionSumVector(const float* input_vector, float* output_vector,
int output_size, int reduction_size) {
const float* input_vector_ptr = input_vector;
for (int o = 0; o < output_size; o++) {
// If reduction_size is not divisible by kFloatWeightsPerNeonLane, we cannot
// use the main vectorized loop, and we need to process the remainder
// sequentially. postamble_start shows the start index where this happens.
const int postamble_start =
reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
}
output_vector[o] += AccumulateNeonLane(sum_f32x4);
input_vector_ptr += postamble_start;
// Postamble loop.
for (int r = postamble_start; r < reduction_size; r++) {
output_vector[o] += *input_vector_ptr++;
}
}
}
} // namespace tensor_utils
} // namespace tflite
#endif // USE_NEON