libgav1/src/dsp/arm/motion_field_projection_neon.cc - platform/external/libgav1 - Git at Google

 // Copyright 2020 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/motion_field_projection.h"
 #include "src/utils/cpu.h"

 #if LIBGAV1_ENABLE_NEON

 #include <arm_neon.h>

 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>

 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"
 #include "src/utils/constants.h"
 #include "src/utils/types.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 inline int16x8_t LoadDivision(const int8x8x2_t division_table,
                               const int8x8_t reference_offset) {
   const int8x8_t kOne = vcreate_s8(0x0100010001000100);
   const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
   const int8x8_t t = vadd_s8(reference_offset, reference_offset);
   const int8x8x2_t tt = vzip_s8(t, t);
   const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
   const int8x16_t idx = vaddq_s8(t1, kOneQ);
   const int8x8_t idx_low = vget_low_s8(idx);
   const int8x8_t idx_high = vget_high_s8(idx);
   const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
   const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
   return vcombine_s16(d0, d1);
 }

 inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
                               const int numerator) {
   const int32x4_t m0 = vmull_s16(mv, denominator);
   const int32x4_t m = vmulq_n_s32(m0, numerator);
   // Add the sign (0 or -1) to round towards zero.
   const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
   return vqrshrn_n_s32(add_sign, 14);
 }

 inline int16x8_t MvProjectionClip(const int16x8_t mv,
                                   const int16x8_t denominator,
                                   const int numerator) {
   const int16x4_t mv0 = vget_low_s16(mv);
   const int16x4_t mv1 = vget_high_s16(mv);
   const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
   const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
   const int16x8_t projection = vcombine_s16(s0, s1);
   const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
   const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
   return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
 }

 inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
   // Add 63 to negative delta so that it shifts towards zero.
   const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
   const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
   const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
   const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
   const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
   const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
   const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
   const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
   return vqmovn_s16(offset2);
 }

 inline void GetPosition(
     const int8x8x2_t division_table, const MotionVector* const mv,
     const int numerator, const int x8_start, const int x8_end, const int x8,
     const int8x8_t r_offsets, const int8x8_t source_reference_type8,
     const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
     const int16x8_t d_sign, const int delta, int8x8_t* const r,
     int8x8_t* const position_y8, int8x8_t* const position_x8,
     int64_t* const skip_64, int32x4_t mvs[2]) {
   const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
   *r = vtbl1_s8(r_offsets, source_reference_type8);
   const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
   int16x8_t projection_mv[2];
   mvs[0] = vld1q_s32(mv_int + 0);
   mvs[1] = vld1q_s32(mv_int + 4);
   // Deinterlace x and y components
   const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
   const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
   const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
   // numerator could be 0.
   projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
   projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
   // Do not update the motion vector if the block position is not valid or
   // if position_x8 is outside the current range of x8_start and x8_end.
   // Note that position_y8 will always be within the range of y8_start and
   // y8_end.
   // After subtracting the base, valid projections are within 8-bit.
   *position_y8 = Project_NEON(projection_mv[0], d_sign);
   const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
   const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
   *position_x8 = vqadd_s8(position_x, k01234567);
   const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
   const int x8_floor = std::max(
       x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
   const int x8_ceiling = std::min(
       x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset);  // [0, 16]
   const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
   const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
   const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
   const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
   const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
   const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
   const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
   const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
   const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
   *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
 }

 template <int idx>
 inline void Store(const int16x8_t position, const int8x8_t reference_offset,
                   const int32x4_t mv, int8_t* dst_reference_offset,
                   MotionVector* dst_mv) {
   const ptrdiff_t offset = vgetq_lane_s16(position, idx);
   auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
   vst1q_lane_s32(d_mv, mv, idx & 3);
   vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
 }

 template <int idx>
 inline void CheckStore(const int8_t* skips, const int16x8_t position,
                        const int8x8_t reference_offset, const int32x4_t mv,
                        int8_t* dst_reference_offset, MotionVector* dst_mv) {
   if (skips[idx] == 0) {
     Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
   }
 }

 // 7.9.2.
 void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
                                       const int reference_to_current_with_sign,
                                       const int dst_sign, const int y8_start,
                                       const int y8_end, const int x8_start,
                                       const int x8_end,
                                       TemporalMotionField* const motion_field) {
   const ptrdiff_t stride = motion_field->mv.columns();
   // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
   // coordinates in that range could end up being position_x8 because of
   // projection.
   const int adjusted_x8_start =
       std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
   const int adjusted_x8_end = std::min(
       x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
   const int adjusted_x8_end8 = adjusted_x8_end & ~7;
   const int leftover = adjusted_x8_end - adjusted_x8_end8;
   const int8_t* const reference_offsets =
       reference_info.relative_distance_to.data();
   const bool* const skip_references = reference_info.skip_references.data();
   const int16_t* const projection_divisions =
       reference_info.projection_divisions.data();
   const ReferenceFrameType* source_reference_types =
       &reference_info.motion_field_reference_frame[y8_start][0];
   const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
   int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
   MotionVector* dst_mv = motion_field->mv[y8_start];
   const int16x8_t d_sign = vdupq_n_s16(dst_sign);

   static_assert(sizeof(int8_t) == sizeof(bool), "");
   static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
   static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
   assert(dst_sign == 0 || dst_sign == -1);
   assert(stride == motion_field->reference_offset.columns());
   assert((y8_start & 7) == 0);
   assert((adjusted_x8_start & 7) == 0);
   // The final position calculation is represented with int16_t. Valid
   // position_y8 from its base is at most 7. After considering the horizontal
   // offset which is at most |stride - 1|, we have the following assertion,
   // which means this optimization works for frame width up to 32K (each
   // position is a 8x8 block).
   assert(8 * stride <= 32768);
   const int8x8_t skip_reference =
       vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
   const int8x8_t r_offsets = vld1_s8(reference_offsets);
   const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
   int8x8x2_t division_table;
   division_table.val[0] = vget_low_s8(table);
   division_table.val[1] = vget_high_s8(table);

   int y8 = y8_start;
   do {
     const int y8_floor = (y8 & ~7) - y8;                         // [-7, 0]
     const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);  // [1, 8]
     const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
     const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
     int x8;

     for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
       const int8x8_t source_reference_type8 =
           vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
       const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
       const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
       // Early termination #1 if all are skips. Chance is typically ~30-40%.
       if (early_skip == -1) continue;
       int64_t skip_64;
       int8x8_t r, position_x8, position_y8;
       int32x4_t mvs[2];
       GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
                   x8_end, x8, r_offsets, source_reference_type8, skip_r,
                   y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
                   &position_x8, &skip_64, mvs);
       // Early termination #2 if all are skips.
       // Chance is typically ~15-25% after Early termination #1.
       if (skip_64 == -1) continue;
       const int16x8_t p_y = vmovl_s8(position_y8);
       const int16x8_t p_x = vmovl_s8(position_x8);
       const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
       const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
       if (skip_64 == 0) {
         // Store all. Chance is typically ~70-85% after Early termination #2.
         Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
         Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
         Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
         Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
         Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
         Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
         Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
         Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
       } else {
         // Check and store each.
         // Chance is typically ~15-30% after Early termination #2.
         // The compiler is smart enough to not create the local buffer skips[].
         int8_t skips[8];
         memcpy(skips, &skip_64, sizeof(skips));
         CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
         CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
         CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
         CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
         CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
         CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
         CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
         CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
       }
     }

     // The following leftover processing cannot be moved out of the do...while
     // loop. Doing so may change the result storing orders of the same position.
     if (leftover > 0) {
       // Use SIMD only when leftover is at least 4, and there are at least 8
       // elements in a row.
       if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
         // Process the last 8 elements to avoid loading invalid memory. Some
         // elements may have been processed in the above loop, which is OK.
         const int delta = 8 - leftover;
         x8 = adjusted_x8_end - 8;
         const int8x8_t source_reference_type8 = vld1_s8(
             reinterpret_cast<const int8_t*>(source_reference_types + x8));
         const int8x8_t skip_r =
             vtbl1_s8(skip_reference, source_reference_type8);
         const int64_t early_skip =
             vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
         // Early termination #1 if all are skips.
         if (early_skip != -1) {
           int64_t skip_64;
           int8x8_t r, position_x8, position_y8;
           int32x4_t mvs[2];
           GetPosition(division_table, mv, reference_to_current_with_sign,
                       x8_start, x8_end, x8, r_offsets, source_reference_type8,
                       skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
                       &position_y8, &position_x8, &skip_64, mvs);
           // Early termination #2 if all are skips.
           if (skip_64 != -1) {
             const int16x8_t p_y = vmovl_s8(position_y8);
             const int16x8_t p_x = vmovl_s8(position_x8);
             const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
             const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
             // Store up to 7 elements since leftover is at most 7.
             if (skip_64 == 0) {
               // Store all.
               Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
               Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
               Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
               Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
               Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
               Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
               Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
             } else {
               // Check and store each.
               // The compiler is smart enough to not create the local buffer
               // skips[].
               int8_t skips[8];
               memcpy(skips, &skip_64, sizeof(skips));
               CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
                             dst_mv);
               CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
                             dst_mv);
               CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
                             dst_mv);
               CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
                             dst_mv);
               CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
                             dst_mv);
               CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
                             dst_mv);
               CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
                             dst_mv);
             }
           }
         }
       } else {
         for (; x8 < adjusted_x8_end; ++x8) {
           const int source_reference_type = source_reference_types[x8];
           if (skip_references[source_reference_type]) continue;
           MotionVector projection_mv;
           // reference_to_current_with_sign could be 0.
           GetMvProjection(mv[x8], reference_to_current_with_sign,
                           projection_divisions[source_reference_type],
                           &projection_mv);
           // Do not update the motion vector if the block position is not valid
           // or if position_x8 is outside the current range of x8_start and
           // x8_end. Note that position_y8 will always be within the range of
           // y8_start and y8_end.
           const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
           if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
           const int x8_base = x8 & ~7;
           const int x8_floor =
               std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
           const int x8_ceiling =
               std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
           const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
           if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
           dst_mv[position_y8 * stride + position_x8] = mv[x8];
           dst_reference_offset[position_y8 * stride + position_x8] =
               reference_offsets[source_reference_type];
         }
       }
     }

     source_reference_types += stride;
     mv += stride;
     dst_reference_offset += stride;
     dst_mv += stride;
   } while (++y8 < y8_end);
 }

 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
 }

 #if LIBGAV1_MAX_BITDEPTH >= 10
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
   dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
 }
 #endif

 }  // namespace

 void MotionFieldProjectionInit_NEON() {
   Init8bpp();
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
 }

 }  // namespace dsp
 }  // namespace libgav1

 #else  // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {

 void MotionFieldProjectionInit_NEON() {}

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON
	// Copyright 2020 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/motion_field_projection.h"
	#include "src/utils/cpu.h"

	#if LIBGAV1_ENABLE_NEON

	#include <arm_neon.h>

	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>

	#include "src/dsp/constants.h"
	#include "src/dsp/dsp.h"
	#include "src/utils/common.h"
	#include "src/utils/constants.h"
	#include "src/utils/types.h"

	namespace libgav1 {
	namespace dsp {
	namespace {

	inline int16x8_t LoadDivision(const int8x8x2_t division_table,
	const int8x8_t reference_offset) {
	const int8x8_t kOne = vcreate_s8(0x0100010001000100);
	const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
	const int8x8_t t = vadd_s8(reference_offset, reference_offset);
	const int8x8x2_t tt = vzip_s8(t, t);
	const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
	const int8x16_t idx = vaddq_s8(t1, kOneQ);
	const int8x8_t idx_low = vget_low_s8(idx);
	const int8x8_t idx_high = vget_high_s8(idx);
	const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
	const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
	return vcombine_s16(d0, d1);
	}

	inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
	const int numerator) {
	const int32x4_t m0 = vmull_s16(mv, denominator);
	const int32x4_t m = vmulq_n_s32(m0, numerator);
	// Add the sign (0 or -1) to round towards zero.
	const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
	return vqrshrn_n_s32(add_sign, 14);
	}

	inline int16x8_t MvProjectionClip(const int16x8_t mv,
	const int16x8_t denominator,
	const int numerator) {
	const int16x4_t mv0 = vget_low_s16(mv);
	const int16x4_t mv1 = vget_high_s16(mv);
	const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
	const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
	const int16x8_t projection = vcombine_s16(s0, s1);
	const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
	const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
	return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
	}

	inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
	// Add 63 to negative delta so that it shifts towards zero.
	const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
	const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
	const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
	const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
	const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
	const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
	const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
	const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
	return vqmovn_s16(offset2);
	}

	inline void GetPosition(
	const int8x8x2_t division_table, const MotionVector* const mv,
	const int numerator, const int x8_start, const int x8_end, const int x8,
	const int8x8_t r_offsets, const int8x8_t source_reference_type8,
	const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
	const int16x8_t d_sign, const int delta, int8x8_t* const r,
	int8x8_t* const position_y8, int8x8_t* const position_x8,
	int64_t* const skip_64, int32x4_t mvs[2]) {
	const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
	*r = vtbl1_s8(r_offsets, source_reference_type8);
	const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
	int16x8_t projection_mv[2];
	mvs[0] = vld1q_s32(mv_int + 0);
	mvs[1] = vld1q_s32(mv_int + 4);
	// Deinterlace x and y components
	const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
	const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
	const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
	// numerator could be 0.
	projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
	projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
	// Do not update the motion vector if the block position is not valid or
	// if position_x8 is outside the current range of x8_start and x8_end.
	// Note that position_y8 will always be within the range of y8_start and
	// y8_end.
	// After subtracting the base, valid projections are within 8-bit.
	*position_y8 = Project_NEON(projection_mv[0], d_sign);
	const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
	const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
	*position_x8 = vqadd_s8(position_x, k01234567);
	const int8x16_t position_xy = vcombine_s8(position_x8, position_y8);
	const int x8_floor = std::max(
	x8_start - x8, delta - kProjectionMvMaxHorizontalOffset); // [-8, 8]
	const int x8_ceiling = std::min(
	x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset); // [0, 16]
	const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
	const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
	const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
	const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
	const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
	const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
	const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
	const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
	const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
	*skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
	}

	template <int idx>
	inline void Store(const int16x8_t position, const int8x8_t reference_offset,
	const int32x4_t mv, int8_t* dst_reference_offset,
	MotionVector* dst_mv) {
	const ptrdiff_t offset = vgetq_lane_s16(position, idx);
	auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
	vst1q_lane_s32(d_mv, mv, idx & 3);
	vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
	}

	template <int idx>
	inline void CheckStore(const int8_t* skips, const int16x8_t position,
	const int8x8_t reference_offset, const int32x4_t mv,
	int8_t* dst_reference_offset, MotionVector* dst_mv) {
	if (skips[idx] == 0) {
	Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
	}
	}

	// 7.9.2.
	void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
	const int reference_to_current_with_sign,
	const int dst_sign, const int y8_start,
	const int y8_end, const int x8_start,
	const int x8_end,
	TemporalMotionField* const motion_field) {
	const ptrdiff_t stride = motion_field->mv.columns();
	// The column range has to be offset by kProjectionMvMaxHorizontalOffset since
	// coordinates in that range could end up being position_x8 because of
	// projection.
	const int adjusted_x8_start =
	std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
	const int adjusted_x8_end = std::min(
	x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
	const int adjusted_x8_end8 = adjusted_x8_end & ~7;
	const int leftover = adjusted_x8_end - adjusted_x8_end8;
	const int8_t* const reference_offsets =
	reference_info.relative_distance_to.data();
	const bool* const skip_references = reference_info.skip_references.data();
	const int16_t* const projection_divisions =
	reference_info.projection_divisions.data();
	const ReferenceFrameType* source_reference_types =
	&reference_info.motion_field_reference_frame[y8_start][0];
	const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
	int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
	MotionVector* dst_mv = motion_field->mv[y8_start];
	const int16x8_t d_sign = vdupq_n_s16(dst_sign);

	static_assert(sizeof(int8_t) == sizeof(bool), "");
	static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
	static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
	assert(dst_sign == 0 \|\| dst_sign == -1);
	assert(stride == motion_field->reference_offset.columns());
	assert((y8_start & 7) == 0);
	assert((adjusted_x8_start & 7) == 0);
	// The final position calculation is represented with int16_t. Valid
	// position_y8 from its base is at most 7. After considering the horizontal
	// offset which is at most \|stride - 1\|, we have the following assertion,
	// which means this optimization works for frame width up to 32K (each
	// position is a 8x8 block).
	assert(8 * stride <= 32768);
	const int8x8_t skip_reference =
	vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
	const int8x8_t r_offsets = vld1_s8(reference_offsets);
	const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
	int8x8x2_t division_table;
	division_table.val[0] = vget_low_s8(table);
	division_table.val[1] = vget_high_s8(table);

	int y8 = y8_start;
	do {
	const int y8_floor = (y8 & ~7) - y8; // [-7, 0]
	const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); // [1, 8]
	const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
	const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
	int x8;

	for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
	const int8x8_t source_reference_type8 =
	vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
	const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
	const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
	// Early termination #1 if all are skips. Chance is typically ~30-40%.
	if (early_skip == -1) continue;
	int64_t skip_64;
	int8x8_t r, position_x8, position_y8;
	int32x4_t mvs[2];
	GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
	x8_end, x8, r_offsets, source_reference_type8, skip_r,
	y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
	&position_x8, &skip_64, mvs);
	// Early termination #2 if all are skips.
	// Chance is typically ~15-25% after Early termination #1.
	if (skip_64 == -1) continue;
	const int16x8_t p_y = vmovl_s8(position_y8);
	const int16x8_t p_x = vmovl_s8(position_x8);
	const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
	const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
	if (skip_64 == 0) {
	// Store all. Chance is typically ~70-85% after Early termination #2.
	Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
	Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
	Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
	Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
	} else {
	// Check and store each.
	// Chance is typically ~15-30% after Early termination #2.
	// The compiler is smart enough to not create the local buffer skips[].
	int8_t skips[8];
	memcpy(skips, &skip_64, sizeof(skips));
	CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
	CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
	CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
	CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
	CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
	CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
	CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
	CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
	}
	}

	// The following leftover processing cannot be moved out of the do...while
	// loop. Doing so may change the result storing orders of the same position.
	if (leftover > 0) {
	// Use SIMD only when leftover is at least 4, and there are at least 8
	// elements in a row.
	if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
	// Process the last 8 elements to avoid loading invalid memory. Some
	// elements may have been processed in the above loop, which is OK.
	const int delta = 8 - leftover;
	x8 = adjusted_x8_end - 8;
	const int8x8_t source_reference_type8 = vld1_s8(
	reinterpret_cast<const int8_t*>(source_reference_types + x8));
	const int8x8_t skip_r =
	vtbl1_s8(skip_reference, source_reference_type8);
	const int64_t early_skip =
	vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
	// Early termination #1 if all are skips.
	if (early_skip != -1) {
	int64_t skip_64;
	int8x8_t r, position_x8, position_y8;
	int32x4_t mvs[2];
	GetPosition(division_table, mv, reference_to_current_with_sign,
	x8_start, x8_end, x8, r_offsets, source_reference_type8,
	skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
	&position_y8, &position_x8, &skip_64, mvs);
	// Early termination #2 if all are skips.
	if (skip_64 != -1) {
	const int16x8_t p_y = vmovl_s8(position_y8);
	const int16x8_t p_x = vmovl_s8(position_x8);
	const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
	const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
	// Store up to 7 elements since leftover is at most 7.
	if (skip_64 == 0) {
	// Store all.
	Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
	Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
	Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
	Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
	Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
	} else {
	// Check and store each.
	// The compiler is smart enough to not create the local buffer
	// skips[].
	int8_t skips[8];
	memcpy(skips, &skip_64, sizeof(skips));
	CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset,
	dst_mv);
	CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset,
	dst_mv);
	CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset,
	dst_mv);
	CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset,
	dst_mv);
	CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset,
	dst_mv);
	CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset,
	dst_mv);
	CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset,
	dst_mv);
	}
	}
	}
	} else {
	for (; x8 < adjusted_x8_end; ++x8) {
	const int source_reference_type = source_reference_types[x8];
	if (skip_references[source_reference_type]) continue;
	MotionVector projection_mv;
	// reference_to_current_with_sign could be 0.
	GetMvProjection(mv[x8], reference_to_current_with_sign,
	projection_divisions[source_reference_type],
	&projection_mv);
	// Do not update the motion vector if the block position is not valid
	// or if position_x8 is outside the current range of x8_start and
	// x8_end. Note that position_y8 will always be within the range of
	// y8_start and y8_end.
	const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
	if (position_y8 < y8_floor \|\| position_y8 >= y8_ceiling) continue;
	const int x8_base = x8 & ~7;
	const int x8_floor =
	std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
	const int x8_ceiling =
	std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
	const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
	if (position_x8 < x8_floor \|\| position_x8 >= x8_ceiling) continue;
	dst_mv[position_y8 * stride + position_x8] = mv[x8];
	dst_reference_offset[position_y8 * stride + position_x8] =
	reference_offsets[source_reference_type];
	}
	}
	}

	source_reference_types += stride;
	mv += stride;
	dst_reference_offset += stride;
	dst_mv += stride;
	} while (++y8 < y8_end);
	}

	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
	assert(dsp != nullptr);
	dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
	}

	#if LIBGAV1_MAX_BITDEPTH >= 10
	void Init10bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
	assert(dsp != nullptr);
	dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON;
	}
	#endif

	} // namespace

	void MotionFieldProjectionInit_NEON() {
	Init8bpp();
	#if LIBGAV1_MAX_BITDEPTH >= 10
	Init10bpp();
	#endif
	}

	} // namespace dsp
	} // namespace libgav1

	#else // !LIBGAV1_ENABLE_NEON
	namespace libgav1 {
	namespace dsp {

	void MotionFieldProjectionInit_NEON() {}

	} // namespace dsp
	} // namespace libgav1
	#endif // LIBGAV1_ENABLE_NEON