libgav1/src/dsp/arm/intrapred_cfl_neon.cc - platform/external/libgav1 - Git at Google

 // Copyright 2019 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/intrapred.h"
 #include "src/utils/cpu.h"

 #if LIBGAV1_ENABLE_NEON

 #include <arm_neon.h>

 #include <cassert>
 #include <cstddef>
 #include <cstdint>

 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"

 namespace libgav1 {
 namespace dsp {
 namespace low_bitdepth {
 namespace {

 uint8x16_t Set2ValuesQ(const uint8_t* a) {
   uint16_t combined_values = a[0] | a[1] << 8;
   return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
 }

 uint32_t SumVector(uint32x2_t a) {
 #if defined(__aarch64__)
   return vaddv_u32(a);
 #else
   const uint64x1_t b = vpaddl_u32(a);
   return vget_lane_u32(vreinterpret_u32_u64(b), 0);
 #endif  // defined(__aarch64__)
 }

 uint32_t SumVector(uint32x4_t a) {
 #if defined(__aarch64__)
   return vaddvq_u32(a);
 #else
   const uint64x2_t b = vpaddlq_u32(a);
   const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
   return vget_lane_u32(vreinterpret_u32_u64(c), 0);
 #endif  // defined(__aarch64__)
 }

 // Divide by the number of elements.
 uint32_t Average(const uint32_t sum, const int width, const int height) {
   return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
 }

 // Subtract |val| from every element in |a|.
 void BlockSubtract(const uint32_t val,
                    int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
                    const int width, const int height) {
   assert(val <= INT16_MAX);
   const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));

   for (int y = 0; y < height; ++y) {
     if (width == 4) {
       const int16x4_t b = vld1_s16(a[y]);
       vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
     } else if (width == 8) {
       const int16x8_t b = vld1q_s16(a[y]);
       vst1q_s16(a[y], vsubq_s16(b, val_v));
     } else if (width == 16) {
       const int16x8_t b = vld1q_s16(a[y]);
       const int16x8_t c = vld1q_s16(a[y] + 8);
       vst1q_s16(a[y], vsubq_s16(b, val_v));
       vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
     } else /* block_width == 32 */ {
       const int16x8_t b = vld1q_s16(a[y]);
       const int16x8_t c = vld1q_s16(a[y] + 8);
       const int16x8_t d = vld1q_s16(a[y] + 16);
       const int16x8_t e = vld1q_s16(a[y] + 24);
       vst1q_s16(a[y], vsubq_s16(b, val_v));
       vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
       vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
       vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
     }
   }
 }

 template <int block_width, int block_height>
 void CflSubsampler420_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
     const void* const source, const ptrdiff_t stride) {
   const auto* src = static_cast<const uint8_t*>(source);
   uint32_t sum;
   if (block_width == 4) {
     assert(max_luma_width >= 8);
     uint32x2_t running_sum = vdup_n_u32(0);

     for (int y = 0; y < block_height; ++y) {
       const uint8x8_t row0 = vld1_u8(src);
       const uint8x8_t row1 = vld1_u8(src + stride);

       uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
       sum_row = vshl_n_u16(sum_row, 1);
       running_sum = vpadal_u16(running_sum, sum_row);
       vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));

       if (y << 1 < max_luma_height - 2) {
         // Once this threshold is reached the loop could be simplified.
         src += stride << 1;
       }
     }

     sum = SumVector(running_sum);
   } else if (block_width == 8) {
     const uint8x16_t x_index = {0, 0, 2,  2,  4,  4,  6,  6,
                                 8, 8, 10, 10, 12, 12, 14, 14};
     const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
     const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);

     uint32x4_t running_sum = vdupq_n_u32(0);

     for (int y = 0; y < block_height; ++y) {
       const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
       const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);

       uint8x16_t row0 = vld1q_u8(src);
       row0 = vbslq_u8(x_mask, row0, x_max0);
       uint8x16_t row1 = vld1q_u8(src + stride);
       row1 = vbslq_u8(x_mask, row1, x_max1);

       uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
       sum_row = vshlq_n_u16(sum_row, 1);
       running_sum = vpadalq_u16(running_sum, sum_row);
       vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));

       if (y << 1 < max_luma_height - 2) {
         src += stride << 1;
       }
     }

     sum = SumVector(running_sum);
   } else /* block_width >= 16 */ {
     const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
     uint32x4_t running_sum = vdupq_n_u32(0);

     for (int y = 0; y < block_height; ++y) {
       uint8x16_t x_index = {0,  2,  4,  6,  8,  10, 12, 14,
                             16, 18, 20, 22, 24, 26, 28, 30};
       const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
       const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
       const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
       const uint8x16_t x_max11 =
           vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
       for (int x = 0; x < block_width; x += 16) {
         const ptrdiff_t src_x_offset = x << 1;
         const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
         const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
         const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
         const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
         const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
         const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
         const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);

         uint16x8_t sum_row_lo =
             vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
         sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
         sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
         sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
         running_sum = vpadalq_u16(running_sum, sum_row_lo);
         vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));

         uint16x8_t sum_row_hi =
             vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
         sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
         sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
         sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
         running_sum = vpadalq_u16(running_sum, sum_row_hi);
         vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));

         x_index = vaddq_u8(x_index, vdupq_n_u8(32));
       }
       if (y << 1 < max_luma_height - 2) {
         src += stride << 1;
       }
     }
     sum = SumVector(running_sum);
   }

   const uint32_t average = Average(sum, block_width, block_height);
   BlockSubtract(average, luma, block_width, block_height);
 }

 template <int block_width, int block_height>
 void CflSubsampler444_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int max_luma_width, const int max_luma_height,
     const void* const source, const ptrdiff_t stride) {
   const auto* src = static_cast<const uint8_t*>(source);
   uint32_t sum;
   if (block_width == 4) {
     assert(max_luma_width >= 4);
     uint32x4_t running_sum = vdupq_n_u32(0);
     uint8x8_t row = vdup_n_u8(0);

     for (int y = 0; y < block_height; y += 2) {
       row = Load4<0>(src, row);
       row = Load4<1>(src + stride, row);
       if (y < (max_luma_height - 1)) {
         src += stride << 1;
       }

       const uint16x8_t row_shifted = vshll_n_u8(row, 3);
       running_sum = vpadalq_u16(running_sum, row_shifted);
       vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
       vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
     }

     sum = SumVector(running_sum);
   } else if (block_width == 8) {
     const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
     const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
     const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);

     uint32x4_t running_sum = vdupq_n_u32(0);

     for (int y = 0; y < block_height; ++y) {
       const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
       const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);

       const uint16x8_t row_shifted = vshll_n_u8(row, 3);
       running_sum = vpadalq_u16(running_sum, row_shifted);
       vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));

       if (y < max_luma_height - 1) {
         src += stride;
       }
     }

     sum = SumVector(running_sum);
   } else /* block_width >= 16 */ {
     const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
     uint32x4_t running_sum = vdupq_n_u32(0);

     for (int y = 0; y < block_height; ++y) {
       uint8x16_t x_index = {0, 1, 2,  3,  4,  5,  6,  7,
                             8, 9, 10, 11, 12, 13, 14, 15};
       const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
       for (int x = 0; x < block_width; x += 16) {
         const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
         const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);

         const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
         const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
         running_sum = vpadalq_u16(running_sum, row_shifted_low);
         running_sum = vpadalq_u16(running_sum, row_shifted_high);
         vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
         vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));

         x_index = vaddq_u8(x_index, vdupq_n_u8(16));
       }
       if (y < max_luma_height - 1) {
         src += stride;
       }
     }
     sum = SumVector(running_sum);
   }

   const uint32_t average = Average(sum, block_width, block_height);
   BlockSubtract(average, luma, block_width, block_height);
 }

 // Saturate |dc + ((alpha * luma) >> 6))| to uint8_t.
 inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
                           const int16x8_t dc) {
   const int16x8_t la = vmulq_n_s16(luma, alpha);
   // Subtract the sign bit to round towards zero.
   const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
   // Shift and accumulate.
   const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
   return vqmovun_s16(result);
 }

 // The range of luma/alpha is not really important because it gets saturated to
 // uint8_t. Saturated int16_t >> 6 outranges uint8_t.
 template <int block_height>
 inline void CflIntraPredictor4xN_NEON(
     void* const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
   const int16x8_t dc = vdupq_n_s16(dst[0]);
   for (int y = 0; y < block_height; y += 2) {
     const int16x4_t luma_row0 = vld1_s16(luma[y]);
     const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
     const uint8x8_t sum =
         Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
     StoreLo4(dst, sum);
     dst += stride;
     StoreHi4(dst, sum);
     dst += stride;
   }
 }

 template <int block_height>
 inline void CflIntraPredictor8xN_NEON(
     void* const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
   const int16x8_t dc = vdupq_n_s16(dst[0]);
   for (int y = 0; y < block_height; ++y) {
     const int16x8_t luma_row = vld1q_s16(luma[y]);
     const uint8x8_t sum = Combine8(luma_row, alpha, dc);
     vst1_u8(dst, sum);
     dst += stride;
   }
 }

 template <int block_height>
 inline void CflIntraPredictor16xN_NEON(
     void* const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
   const int16x8_t dc = vdupq_n_s16(dst[0]);
   for (int y = 0; y < block_height; ++y) {
     const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
     const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
     const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
     const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
     vst1_u8(dst, sum_0);
     vst1_u8(dst + 8, sum_1);
     dst += stride;
   }
 }

 template <int block_height>
 inline void CflIntraPredictor32xN_NEON(
     void* const dest, const ptrdiff_t stride,
     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
     const int alpha) {
   auto* dst = static_cast<uint8_t*>(dest);
   const int16x8_t dc = vdupq_n_s16(dst[0]);
   for (int y = 0; y < block_height; ++y) {
     const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
     const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
     const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
     const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
     const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
     const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
     const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
     const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
     vst1_u8(dst, sum_0);
     vst1_u8(dst + 8, sum_1);
     vst1_u8(dst + 16, sum_2);
     vst1_u8(dst + 24, sum_3);
     dst += stride;
   }
 }

 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);

   dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
       CflSubsampler420_NEON<4, 4>;
   dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
       CflSubsampler420_NEON<4, 8>;
   dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
       CflSubsampler420_NEON<4, 16>;

   dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
       CflSubsampler420_NEON<8, 4>;
   dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
       CflSubsampler420_NEON<8, 8>;
   dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
       CflSubsampler420_NEON<8, 16>;
   dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
       CflSubsampler420_NEON<8, 32>;

   dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
       CflSubsampler420_NEON<16, 4>;
   dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
       CflSubsampler420_NEON<16, 8>;
   dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
       CflSubsampler420_NEON<16, 16>;
   dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
       CflSubsampler420_NEON<16, 32>;

   dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
       CflSubsampler420_NEON<32, 8>;
   dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
       CflSubsampler420_NEON<32, 16>;
   dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
       CflSubsampler420_NEON<32, 32>;

   dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
       CflSubsampler444_NEON<4, 4>;
   dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
       CflSubsampler444_NEON<4, 8>;
   dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
       CflSubsampler444_NEON<4, 16>;

   dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
       CflSubsampler444_NEON<8, 4>;
   dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
       CflSubsampler444_NEON<8, 8>;
   dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
       CflSubsampler444_NEON<8, 16>;
   dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
       CflSubsampler444_NEON<8, 32>;

   dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
       CflSubsampler444_NEON<16, 4>;
   dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
       CflSubsampler444_NEON<16, 8>;
   dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
       CflSubsampler444_NEON<16, 16>;
   dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
       CflSubsampler444_NEON<16, 32>;

   dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
       CflSubsampler444_NEON<32, 8>;
   dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
       CflSubsampler444_NEON<32, 16>;
   dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
       CflSubsampler444_NEON<32, 32>;

   dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
   dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
   dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;

   dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
   dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
   dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
   dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;

   dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
   dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
   dsp->cfl_intra_predictors[kTransformSize16x16] =
       CflIntraPredictor16xN_NEON<16>;
   dsp->cfl_intra_predictors[kTransformSize16x32] =
       CflIntraPredictor16xN_NEON<32>;

   dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
   dsp->cfl_intra_predictors[kTransformSize32x16] =
       CflIntraPredictor32xN_NEON<16>;
   dsp->cfl_intra_predictors[kTransformSize32x32] =
       CflIntraPredictor32xN_NEON<32>;
   // Max Cfl predictor size is 32x32.
 }

 }  // namespace
 }  // namespace low_bitdepth

 void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }

 }  // namespace dsp
 }  // namespace libgav1

 #else  // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {

 void IntraPredCflInit_NEON() {}

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON
	// Copyright 2019 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/intrapred.h"
	#include "src/utils/cpu.h"

	#if LIBGAV1_ENABLE_NEON

	#include <arm_neon.h>

	#include <cassert>
	#include <cstddef>
	#include <cstdint>

	#include "src/dsp/arm/common_neon.h"
	#include "src/dsp/constants.h"
	#include "src/dsp/dsp.h"
	#include "src/utils/common.h"

	namespace libgav1 {
	namespace dsp {
	namespace low_bitdepth {
	namespace {

	uint8x16_t Set2ValuesQ(const uint8_t* a) {
	uint16_t combined_values = a[0] \| a[1] << 8;
	return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
	}

	uint32_t SumVector(uint32x2_t a) {
	#if defined(__aarch64__)
	return vaddv_u32(a);
	#else
	const uint64x1_t b = vpaddl_u32(a);
	return vget_lane_u32(vreinterpret_u32_u64(b), 0);
	#endif // defined(__aarch64__)
	}

	uint32_t SumVector(uint32x4_t a) {
	#if defined(__aarch64__)
	return vaddvq_u32(a);
	#else
	const uint64x2_t b = vpaddlq_u32(a);
	const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
	return vget_lane_u32(vreinterpret_u32_u64(c), 0);
	#endif // defined(__aarch64__)
	}

	// Divide by the number of elements.
	uint32_t Average(const uint32_t sum, const int width, const int height) {
	return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
	}

	// Subtract \|val\| from every element in \|a\|.
	void BlockSubtract(const uint32_t val,
	int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
	const int width, const int height) {
	assert(val <= INT16_MAX);
	const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));

	for (int y = 0; y < height; ++y) {
	if (width == 4) {
	const int16x4_t b = vld1_s16(a[y]);
	vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
	} else if (width == 8) {
	const int16x8_t b = vld1q_s16(a[y]);
	vst1q_s16(a[y], vsubq_s16(b, val_v));
	} else if (width == 16) {
	const int16x8_t b = vld1q_s16(a[y]);
	const int16x8_t c = vld1q_s16(a[y] + 8);
	vst1q_s16(a[y], vsubq_s16(b, val_v));
	vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
	} else /* block_width == 32 */ {
	const int16x8_t b = vld1q_s16(a[y]);
	const int16x8_t c = vld1q_s16(a[y] + 8);
	const int16x8_t d = vld1q_s16(a[y] + 16);
	const int16x8_t e = vld1q_s16(a[y] + 24);
	vst1q_s16(a[y], vsubq_s16(b, val_v));
	vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
	vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
	vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
	}
	}
	}

	template <int block_width, int block_height>
	void CflSubsampler420_NEON(
	int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
	const int max_luma_width, const int max_luma_height,
	const void* const source, const ptrdiff_t stride) {
	const auto* src = static_cast<const uint8_t*>(source);
	uint32_t sum;
	if (block_width == 4) {
	assert(max_luma_width >= 8);
	uint32x2_t running_sum = vdup_n_u32(0);

	for (int y = 0; y < block_height; ++y) {
	const uint8x8_t row0 = vld1_u8(src);
	const uint8x8_t row1 = vld1_u8(src + stride);

	uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
	sum_row = vshl_n_u16(sum_row, 1);
	running_sum = vpadal_u16(running_sum, sum_row);
	vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));

	if (y << 1 < max_luma_height - 2) {
	// Once this threshold is reached the loop could be simplified.
	src += stride << 1;
	}
	}

	sum = SumVector(running_sum);
	} else if (block_width == 8) {
	const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
	8, 8, 10, 10, 12, 12, 14, 14};
	const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
	const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);

	uint32x4_t running_sum = vdupq_n_u32(0);

	for (int y = 0; y < block_height; ++y) {
	const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
	const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);

	uint8x16_t row0 = vld1q_u8(src);
	row0 = vbslq_u8(x_mask, row0, x_max0);
	uint8x16_t row1 = vld1q_u8(src + stride);
	row1 = vbslq_u8(x_mask, row1, x_max1);

	uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
	sum_row = vshlq_n_u16(sum_row, 1);
	running_sum = vpadalq_u16(running_sum, sum_row);
	vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));

	if (y << 1 < max_luma_height - 2) {
	src += stride << 1;
	}
	}

	sum = SumVector(running_sum);
	} else /* block_width >= 16 */ {
	const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
	uint32x4_t running_sum = vdupq_n_u32(0);

	for (int y = 0; y < block_height; ++y) {
	uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
	16, 18, 20, 22, 24, 26, 28, 30};
	const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
	const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
	const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
	const uint8x16_t x_max11 =
	vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
	for (int x = 0; x < block_width; x += 16) {
	const ptrdiff_t src_x_offset = x << 1;
	const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
	const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
	const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
	const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
	const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
	const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
	const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);

	uint16x8_t sum_row_lo =
	vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
	sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
	sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
	sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
	running_sum = vpadalq_u16(running_sum, sum_row_lo);
	vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));

	uint16x8_t sum_row_hi =
	vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
	sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
	sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
	sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
	running_sum = vpadalq_u16(running_sum, sum_row_hi);
	vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));

	x_index = vaddq_u8(x_index, vdupq_n_u8(32));
	}
	if (y << 1 < max_luma_height - 2) {
	src += stride << 1;
	}
	}
	sum = SumVector(running_sum);
	}

	const uint32_t average = Average(sum, block_width, block_height);
	BlockSubtract(average, luma, block_width, block_height);
	}

	template <int block_width, int block_height>
	void CflSubsampler444_NEON(
	int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
	const int max_luma_width, const int max_luma_height,
	const void* const source, const ptrdiff_t stride) {
	const auto* src = static_cast<const uint8_t*>(source);
	uint32_t sum;
	if (block_width == 4) {
	assert(max_luma_width >= 4);
	uint32x4_t running_sum = vdupq_n_u32(0);
	uint8x8_t row = vdup_n_u8(0);

	for (int y = 0; y < block_height; y += 2) {
	row = Load4<0>(src, row);
	row = Load4<1>(src + stride, row);
	if (y < (max_luma_height - 1)) {
	src += stride << 1;
	}

	const uint16x8_t row_shifted = vshll_n_u8(row, 3);
	running_sum = vpadalq_u16(running_sum, row_shifted);
	vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
	vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
	}

	sum = SumVector(running_sum);
	} else if (block_width == 8) {
	const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
	const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
	const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);

	uint32x4_t running_sum = vdupq_n_u32(0);

	for (int y = 0; y < block_height; ++y) {
	const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
	const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);

	const uint16x8_t row_shifted = vshll_n_u8(row, 3);
	running_sum = vpadalq_u16(running_sum, row_shifted);
	vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));

	if (y < max_luma_height - 1) {
	src += stride;
	}
	}

	sum = SumVector(running_sum);
	} else /* block_width >= 16 */ {
	const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
	uint32x4_t running_sum = vdupq_n_u32(0);

	for (int y = 0; y < block_height; ++y) {
	uint8x16_t x_index = {0, 1, 2, 3, 4, 5, 6, 7,
	8, 9, 10, 11, 12, 13, 14, 15};
	const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
	for (int x = 0; x < block_width; x += 16) {
	const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
	const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);

	const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
	const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
	running_sum = vpadalq_u16(running_sum, row_shifted_low);
	running_sum = vpadalq_u16(running_sum, row_shifted_high);
	vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
	vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));

	x_index = vaddq_u8(x_index, vdupq_n_u8(16));
	}
	if (y < max_luma_height - 1) {
	src += stride;
	}
	}
	sum = SumVector(running_sum);
	}

	const uint32_t average = Average(sum, block_width, block_height);
	BlockSubtract(average, luma, block_width, block_height);
	}

	// Saturate \|dc + ((alpha * luma) >> 6))\| to uint8_t.
	inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
	const int16x8_t dc) {
	const int16x8_t la = vmulq_n_s16(luma, alpha);
	// Subtract the sign bit to round towards zero.
	const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
	// Shift and accumulate.
	const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
	return vqmovun_s16(result);
	}

	// The range of luma/alpha is not really important because it gets saturated to
	// uint8_t. Saturated int16_t >> 6 outranges uint8_t.
	template <int block_height>
	inline void CflIntraPredictor4xN_NEON(
	void* const dest, const ptrdiff_t stride,
	const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
	const int alpha) {
	auto* dst = static_cast<uint8_t*>(dest);
	const int16x8_t dc = vdupq_n_s16(dst[0]);
	for (int y = 0; y < block_height; y += 2) {
	const int16x4_t luma_row0 = vld1_s16(luma[y]);
	const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
	const uint8x8_t sum =
	Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
	StoreLo4(dst, sum);
	dst += stride;
	StoreHi4(dst, sum);
	dst += stride;
	}
	}

	template <int block_height>
	inline void CflIntraPredictor8xN_NEON(
	void* const dest, const ptrdiff_t stride,
	const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
	const int alpha) {
	auto* dst = static_cast<uint8_t*>(dest);
	const int16x8_t dc = vdupq_n_s16(dst[0]);
	for (int y = 0; y < block_height; ++y) {
	const int16x8_t luma_row = vld1q_s16(luma[y]);
	const uint8x8_t sum = Combine8(luma_row, alpha, dc);
	vst1_u8(dst, sum);
	dst += stride;
	}
	}

	template <int block_height>
	inline void CflIntraPredictor16xN_NEON(
	void* const dest, const ptrdiff_t stride,
	const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
	const int alpha) {
	auto* dst = static_cast<uint8_t*>(dest);
	const int16x8_t dc = vdupq_n_s16(dst[0]);
	for (int y = 0; y < block_height; ++y) {
	const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
	const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
	const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
	const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
	vst1_u8(dst, sum_0);
	vst1_u8(dst + 8, sum_1);
	dst += stride;
	}
	}

	template <int block_height>
	inline void CflIntraPredictor32xN_NEON(
	void* const dest, const ptrdiff_t stride,
	const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
	const int alpha) {
	auto* dst = static_cast<uint8_t*>(dest);
	const int16x8_t dc = vdupq_n_s16(dst[0]);
	for (int y = 0; y < block_height; ++y) {
	const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
	const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
	const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
	const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
	const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
	const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
	const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
	const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
	vst1_u8(dst, sum_0);
	vst1_u8(dst + 8, sum_1);
	vst1_u8(dst + 16, sum_2);
	vst1_u8(dst + 24, sum_3);
	dst += stride;
	}
	}

	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
	assert(dsp != nullptr);

	dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
	CflSubsampler420_NEON<4, 4>;
	dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
	CflSubsampler420_NEON<4, 8>;
	dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
	CflSubsampler420_NEON<4, 16>;

	dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
	CflSubsampler420_NEON<8, 4>;
	dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
	CflSubsampler420_NEON<8, 8>;
	dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
	CflSubsampler420_NEON<8, 16>;
	dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
	CflSubsampler420_NEON<8, 32>;

	dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
	CflSubsampler420_NEON<16, 4>;
	dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
	CflSubsampler420_NEON<16, 8>;
	dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
	CflSubsampler420_NEON<16, 16>;
	dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
	CflSubsampler420_NEON<16, 32>;

	dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
	CflSubsampler420_NEON<32, 8>;
	dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
	CflSubsampler420_NEON<32, 16>;
	dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
	CflSubsampler420_NEON<32, 32>;

	dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
	CflSubsampler444_NEON<4, 4>;
	dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
	CflSubsampler444_NEON<4, 8>;
	dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
	CflSubsampler444_NEON<4, 16>;

	dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
	CflSubsampler444_NEON<8, 4>;
	dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
	CflSubsampler444_NEON<8, 8>;
	dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
	CflSubsampler444_NEON<8, 16>;
	dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
	CflSubsampler444_NEON<8, 32>;

	dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
	CflSubsampler444_NEON<16, 4>;
	dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
	CflSubsampler444_NEON<16, 8>;
	dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
	CflSubsampler444_NEON<16, 16>;
	dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
	CflSubsampler444_NEON<16, 32>;

	dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
	CflSubsampler444_NEON<32, 8>;
	dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
	CflSubsampler444_NEON<32, 16>;
	dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
	CflSubsampler444_NEON<32, 32>;

	dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
	dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
	dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;

	dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
	dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
	dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
	dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;

	dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
	dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
	dsp->cfl_intra_predictors[kTransformSize16x16] =
	CflIntraPredictor16xN_NEON<16>;
	dsp->cfl_intra_predictors[kTransformSize16x32] =
	CflIntraPredictor16xN_NEON<32>;

	dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
	dsp->cfl_intra_predictors[kTransformSize32x16] =
	CflIntraPredictor32xN_NEON<16>;
	dsp->cfl_intra_predictors[kTransformSize32x32] =
	CflIntraPredictor32xN_NEON<32>;
	// Max Cfl predictor size is 32x32.
	}

	} // namespace
	} // namespace low_bitdepth

	void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }

	} // namespace dsp
	} // namespace libgav1

	#else // !LIBGAV1_ENABLE_NEON
	namespace libgav1 {
	namespace dsp {

	void IntraPredCflInit_NEON() {}

	} // namespace dsp
	} // namespace libgav1
	#endif // LIBGAV1_ENABLE_NEON