libgav1/src/dsp/arm/warp_neon.cc - platform/external/libgav1 - Git at Google

 // Copyright 2019 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/dsp.h"
 #include "src/dsp/warp.h"

 #if LIBGAV1_ENABLE_NEON

 #include <arm_neon.h>

 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>

 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/utils/common.h"
 #include "src/utils/constants.h"

 namespace libgav1 {
 namespace dsp {
 namespace low_bitdepth {
 namespace {

 // Number of extra bits of precision in warped filtering.
 constexpr int kWarpedDiffPrecisionBits = 10;

 void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
                const int source_width, const int source_height,
                const int* const warp_params, const int subsampling_x,
                const int subsampling_y, const int inter_round_bits_vertical,
                const int block_start_x, const int block_start_y,
                const int block_width, const int block_height,
                const int16_t alpha, const int16_t beta, const int16_t gamma,
                const int16_t delta, uint16_t* dest,
                const ptrdiff_t dest_stride) {
   constexpr int bitdepth = 8;
   // Intermediate_result is the output of the horizontal filtering and rounding.
   // The range is within 13 (= bitdepth + kFilterBits + 1 -
   // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t type
   // so that we can multiply it by kWarpedFilters (which has signed values)
   // using vmlal_s16().
   int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
   const int horizontal_offset = 1 << (bitdepth + kFilterBits - 1);
   const int vertical_offset =
       1 << (bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal);
   const auto* const src = static_cast<const uint8_t*>(source);

   assert(block_width >= 8);
   assert(block_height >= 8);

   // Warp process applies for each 8x8 block (or smaller).
   int start_y = block_start_y;
   do {
     int start_x = block_start_x;
     do {
       const int src_x = (start_x + 4) << subsampling_x;
       const int src_y = (start_y + 4) << subsampling_y;
       const int dst_x =
           src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
       const int dst_y =
           src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
       const int x4 = dst_x >> subsampling_x;
       const int y4 = dst_y >> subsampling_y;
       const int ix4 = x4 >> kWarpedModelPrecisionBits;
       const int iy4 = y4 >> kWarpedModelPrecisionBits;

       // Horizontal filter.
       int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
       for (int y = -7; y < 8; ++y) {
         // TODO(chengchen):
         // Because of warping, the index could be out of frame boundary. Thus
         // clip is needed. However, can we remove or reduce usage of clip?
         // Besides, special cases exist, for example,
         // if iy4 - 7 >= source_height or iy4 + 7 < 0, there's no need to do the
         // filtering.
         const int row = Clip3(iy4 + y, 0, source_height - 1);
         const uint8_t* const src_row = src + row * source_stride;
         // Check for two simple special cases.
         if (ix4 - 7 >= source_width - 1) {
           // Every sample is equal to src_row[source_width - 1]. Since the sum
           // of the warped filter coefficients is 128 (= 2^7), the filtering is
           // equivalent to multiplying src_row[source_width - 1] by 128.
           const int16_t s =
               (horizontal_offset >> kInterRoundBitsHorizontal) +
               (src_row[source_width - 1] << (7 - kInterRoundBitsHorizontal));
           const int16x8_t sum = vdupq_n_s16(s);
           vst1q_s16(intermediate_result[y + 7], sum);
           sx4 += beta;
           continue;
         }
         if (ix4 + 7 <= 0) {
           // Every sample is equal to src_row[0]. Since the sum of the warped
           // filter coefficients is 128 (= 2^7), the filtering is equivalent to
           // multiplying src_row[0] by 128.
           const int16_t s = (horizontal_offset >> kInterRoundBitsHorizontal) +
                             (src_row[0] << (7 - kInterRoundBitsHorizontal));
           const int16x8_t sum = vdupq_n_s16(s);
           vst1q_s16(intermediate_result[y + 7], sum);
           sx4 += beta;
           continue;
         }
         // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
         // read but is ignored.
         //
         // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
         // bytes after src_row[source_width - 1]. We assume the source frame
         // has left and right borders of at least 13 bytes that extend the
         // frame boundary pixels. We also assume there is at least one extra
         // padding byte after the right border of the last source row.
         const uint8x16_t src_row_u8 = vld1q_u8(&src_row[ix4 - 7]);
         const int16x8_t src_row_low_s16 =
             vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src_row_u8)));
         const int16x8_t src_row_high_s16 =
             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src_row_u8)));
         int sx = sx4 - MultiplyBy4(alpha);
         int16x8_t filter[8];
         for (int x = 0; x < 8; ++x) {
           const int offset =
               RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
               kWarpedPixelPrecisionShifts;
           filter[x] = vld1q_s16(kWarpedFilters[offset]);
           sx += alpha;
         }
         Transpose8x8(&filter[0], &filter[1], &filter[2], &filter[3], &filter[4],
                      &filter[5], &filter[6], &filter[7]);
         // For 8 bit, the range of sum is within uint16_t, if we add a
         // horizontal offset. horizontal_offset guarantees sum is nonnegative.
         //
         // Proof:
         // Given that the minimum (most negative) sum of the negative filter
         // coefficients is -47 and the maximum sum of the positive filter
         // coefficients is 175, the range of the horizontal filter output is
         //   -47 * 255 <= output <= 175 * 255
         // Since -2^14 < -47 * 255, adding -2^14 (= horizontal_offset) to the
         // horizontal filter output produces a positive value:
         //   0 < output + 2^14 <= 175 * 255 + 2^14
         // The final rounding right shift by 3 (= kInterRoundBitsHorizontal)
         // bits adds 2^2 to the sum:
         //   0 < output + 2^14 + 2^2 <= 175 * 255 + 2^14 + 2^2 = 61013
         // Since 61013 < 2^16, the final sum (right before the right shift by 3
         // bits) will not overflow uint16_t. In addition, the value after the
         // right shift by 3 bits is in the following range:
         //   0 <= intermediate_result[y][x] < 2^13
         // This property is used in determining the range of the vertical
         // filtering output. [End of proof.]
         //
         // We can do signed int16_t arithmetic and just treat the final result
         // as uint16_t when we shift it right.
         int16x8_t sum = vdupq_n_s16(horizontal_offset);
         // Unrolled k = 0..7 loop. We need to manually unroll the loop because
         // the third argument (an index value) to vextq_s16() must be a
         // constant (immediate).
         // k = 0.
         int16x8_t src_row_v_s16 = src_row_low_s16;
         sum = vmlaq_s16(sum, filter[0], src_row_v_s16);
         // k = 1.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 1);
         sum = vmlaq_s16(sum, filter[1], src_row_v_s16);
         // k = 2.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 2);
         sum = vmlaq_s16(sum, filter[2], src_row_v_s16);
         // k = 3.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 3);
         sum = vmlaq_s16(sum, filter[3], src_row_v_s16);
         // k = 4.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 4);
         sum = vmlaq_s16(sum, filter[4], src_row_v_s16);
         // k = 5.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 5);
         sum = vmlaq_s16(sum, filter[5], src_row_v_s16);
         // k = 6.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 6);
         sum = vmlaq_s16(sum, filter[6], src_row_v_s16);
         // k = 7.
         src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 7);
         sum = vmlaq_s16(sum, filter[7], src_row_v_s16);
         // End of unrolled k = 0..7 loop.
         // Treat sum as unsigned for the right shift.
         sum = vreinterpretq_s16_u16(vrshrq_n_u16(vreinterpretq_u16_s16(sum),
                                                  kInterRoundBitsHorizontal));
         vst1q_s16(intermediate_result[y + 7], sum);
         sx4 += beta;
       }

       // Vertical filter.
       uint16_t* dst_row = dest + start_x - block_start_x;
       int sy4 =
           (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
       for (int y = -4; y < 4; ++y) {
         int sy = sy4 - MultiplyBy4(gamma);
         int16x8_t filter[8];
         for (int x = 0; x < 8; ++x) {
           const int offset =
               RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
               kWarpedPixelPrecisionShifts;
           filter[x] = vld1q_s16(kWarpedFilters[offset]);
           sy += gamma;
         }
         Transpose8x8(&filter[0], &filter[1], &filter[2], &filter[3], &filter[4],
                      &filter[5], &filter[6], &filter[7]);
         // Similar to horizontal_offset, vertical_offset guarantees sum before
         // shifting is nonnegative.
         //
         // Proof:
         // The range of an entry in intermediate_result is
         //   0 <= intermediate_result[y][x] < 2^13
         // The range of the vertical filter output is
         //   -47 * 2^13 < output < 175 * 2^13
         // Since -2^19 < -47 * 2^13, adding -2^19 (= vertical_offset) to the
         // vertical filter output produces a positive value:
         //   0 < output + 2^19 < 175 * 2^13 + 2^19
         // The final rounding right shift by either 7 or 11 bits adds at most
         // 2^10 to the sum:
         //   0 < output + 2^19 + rounding < 175 * 2^13 + 2^19 + 2^10 = 1958912
         // Since 1958912 = 0x1DE400 < 2^22, shifting it right by 7 or 11 bits
         // brings the value under 2^15, which fits in uint16_t.
         int32x4_t sum_low = vdupq_n_s32(vertical_offset);
         int32x4_t sum_high = sum_low;
         for (int k = 0; k < 8; ++k) {
           const int16x8_t intermediate =
               vld1q_s16(intermediate_result[y + 4 + k]);
           sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
                               vget_low_s16(intermediate));
           sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
                                vget_high_s16(intermediate));
         }
         assert(inter_round_bits_vertical == 7 ||
                inter_round_bits_vertical == 11);
         // Since inter_round_bits_vertical can be 7 or 11, and all the narrowing
         // shift intrinsics require the shift argument to be a constant
         // (immediate), we have two options:
         // 1. Call a non-narrowing shift, followed by a narrowing extract.
         // 2. Call a narrowing shift (with a constant shift of 7 or 11) in an
         //    if-else statement.
 #if defined(__aarch64__)
         // This version is slightly faster for arm64 (1106 ms vs 1112 ms).
         // This version is slower for 32-bit arm (1235 ms vs 1149 ms).
         const int32x4_t shift = vdupq_n_s32(-inter_round_bits_vertical);
         const uint32x4_t sum_low_shifted =
             vrshlq_u32(vreinterpretq_u32_s32(sum_low), shift);
         const uint32x4_t sum_high_shifted =
             vrshlq_u32(vreinterpretq_u32_s32(sum_high), shift);
         const uint16x4_t sum_low_16 = vmovn_u32(sum_low_shifted);
         const uint16x4_t sum_high_16 = vmovn_u32(sum_high_shifted);
 #else   // !defined(__aarch64__)
         // This version is faster for 32-bit arm.
         // This version is slightly slower for arm64.
         uint16x4_t sum_low_16;
         uint16x4_t sum_high_16;
         if (inter_round_bits_vertical == 7) {
           sum_low_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_low), 7);
           sum_high_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_high), 7);
         } else {
           sum_low_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_low), 11);
           sum_high_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_high), 11);
         }
 #endif  // defined(__aarch64__)
         // vst1q_u16 can also be used:
         //   vst1q_u16(dst_row, vcombine_u16(sum_low_16, sum_high_16));
         // But it is slightly slower for arm64 (the same speed for 32-bit arm).
         //
         // vst1_u16_x2 could be used, but it is also slightly slower for arm64
         // and causes a bus error for 32-bit arm. Also, it is not supported by
         // gcc 7.2.0.
         vst1_u16(dst_row, sum_low_16);
         vst1_u16(dst_row + 4, sum_high_16);
         dst_row += dest_stride;
         sy4 += delta;
       }
       start_x += 8;
     } while (start_x < block_start_x + block_width);
     dest += 8 * dest_stride;
     start_y += 8;
   } while (start_y < block_start_y + block_height);
 }

 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
   dsp->warp = Warp_NEON;
 }

 }  // namespace
 }  // namespace low_bitdepth

 void WarpInit_NEON() { low_bitdepth::Init8bpp(); }

 }  // namespace dsp
 }  // namespace libgav1
 #else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {

 void WarpInit_NEON() {}

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON
	// Copyright 2019 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/dsp.h"
	#include "src/dsp/warp.h"

	#if LIBGAV1_ENABLE_NEON

	#include <arm_neon.h>

	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstdlib>

	#include "src/dsp/arm/common_neon.h"
	#include "src/dsp/constants.h"
	#include "src/utils/common.h"
	#include "src/utils/constants.h"

	namespace libgav1 {
	namespace dsp {
	namespace low_bitdepth {
	namespace {

	// Number of extra bits of precision in warped filtering.
	constexpr int kWarpedDiffPrecisionBits = 10;

	void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
	const int source_width, const int source_height,
	const int* const warp_params, const int subsampling_x,
	const int subsampling_y, const int inter_round_bits_vertical,
	const int block_start_x, const int block_start_y,
	const int block_width, const int block_height,
	const int16_t alpha, const int16_t beta, const int16_t gamma,
	const int16_t delta, uint16_t* dest,
	const ptrdiff_t dest_stride) {
	constexpr int bitdepth = 8;
	// Intermediate_result is the output of the horizontal filtering and rounding.
	// The range is within 13 (= bitdepth + kFilterBits + 1 -
	// kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t type
	// so that we can multiply it by kWarpedFilters (which has signed values)
	// using vmlal_s16().
	int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
	const int horizontal_offset = 1 << (bitdepth + kFilterBits - 1);
	const int vertical_offset =
	1 << (bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal);
	const auto* const src = static_cast<const uint8_t*>(source);

	assert(block_width >= 8);
	assert(block_height >= 8);

	// Warp process applies for each 8x8 block (or smaller).
	int start_y = block_start_y;
	do {
	int start_x = block_start_x;
	do {
	const int src_x = (start_x + 4) << subsampling_x;
	const int src_y = (start_y + 4) << subsampling_y;
	const int dst_x =
	src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
	const int dst_y =
	src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
	const int x4 = dst_x >> subsampling_x;
	const int y4 = dst_y >> subsampling_y;
	const int ix4 = x4 >> kWarpedModelPrecisionBits;
	const int iy4 = y4 >> kWarpedModelPrecisionBits;

	// Horizontal filter.
	int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
	for (int y = -7; y < 8; ++y) {
	// TODO(chengchen):
	// Because of warping, the index could be out of frame boundary. Thus
	// clip is needed. However, can we remove or reduce usage of clip?
	// Besides, special cases exist, for example,
	// if iy4 - 7 >= source_height or iy4 + 7 < 0, there's no need to do the
	// filtering.
	const int row = Clip3(iy4 + y, 0, source_height - 1);
	const uint8_t* const src_row = src + row * source_stride;
	// Check for two simple special cases.
	if (ix4 - 7 >= source_width - 1) {
	// Every sample is equal to src_row[source_width - 1]. Since the sum
	// of the warped filter coefficients is 128 (= 2^7), the filtering is
	// equivalent to multiplying src_row[source_width - 1] by 128.
	const int16_t s =
	(horizontal_offset >> kInterRoundBitsHorizontal) +
	(src_row[source_width - 1] << (7 - kInterRoundBitsHorizontal));
	const int16x8_t sum = vdupq_n_s16(s);
	vst1q_s16(intermediate_result[y + 7], sum);
	sx4 += beta;
	continue;
	}
	if (ix4 + 7 <= 0) {
	// Every sample is equal to src_row[0]. Since the sum of the warped
	// filter coefficients is 128 (= 2^7), the filtering is equivalent to
	// multiplying src_row[0] by 128.
	const int16_t s = (horizontal_offset >> kInterRoundBitsHorizontal) +
	(src_row[0] << (7 - kInterRoundBitsHorizontal));
	const int16x8_t sum = vdupq_n_s16(s);
	vst1q_s16(intermediate_result[y + 7], sum);
	sx4 += beta;
	continue;
	}
	// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
	// read but is ignored.
	//
	// NOTE: This may read up to 13 bytes before src_row[0] or up to 14
	// bytes after src_row[source_width - 1]. We assume the source frame
	// has left and right borders of at least 13 bytes that extend the
	// frame boundary pixels. We also assume there is at least one extra
	// padding byte after the right border of the last source row.
	const uint8x16_t src_row_u8 = vld1q_u8(&src_row[ix4 - 7]);
	const int16x8_t src_row_low_s16 =
	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src_row_u8)));
	const int16x8_t src_row_high_s16 =
	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src_row_u8)));
	int sx = sx4 - MultiplyBy4(alpha);
	int16x8_t filter[8];
	for (int x = 0; x < 8; ++x) {
	const int offset =
	RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
	kWarpedPixelPrecisionShifts;
	filter[x] = vld1q_s16(kWarpedFilters[offset]);
	sx += alpha;
	}
	Transpose8x8(&filter[0], &filter[1], &filter[2], &filter[3], &filter[4],
	&filter[5], &filter[6], &filter[7]);
	// For 8 bit, the range of sum is within uint16_t, if we add a
	// horizontal offset. horizontal_offset guarantees sum is nonnegative.
	//
	// Proof:
	// Given that the minimum (most negative) sum of the negative filter
	// coefficients is -47 and the maximum sum of the positive filter
	// coefficients is 175, the range of the horizontal filter output is
	// -47 * 255 <= output <= 175 * 255
	// Since -2^14 < -47 * 255, adding -2^14 (= horizontal_offset) to the
	// horizontal filter output produces a positive value:
	// 0 < output + 2^14 <= 175 * 255 + 2^14
	// The final rounding right shift by 3 (= kInterRoundBitsHorizontal)
	// bits adds 2^2 to the sum:
	// 0 < output + 2^14 + 2^2 <= 175 * 255 + 2^14 + 2^2 = 61013
	// Since 61013 < 2^16, the final sum (right before the right shift by 3
	// bits) will not overflow uint16_t. In addition, the value after the
	// right shift by 3 bits is in the following range:
	// 0 <= intermediate_result[y][x] < 2^13
	// This property is used in determining the range of the vertical
	// filtering output. [End of proof.]
	//
	// We can do signed int16_t arithmetic and just treat the final result
	// as uint16_t when we shift it right.
	int16x8_t sum = vdupq_n_s16(horizontal_offset);
	// Unrolled k = 0..7 loop. We need to manually unroll the loop because
	// the third argument (an index value) to vextq_s16() must be a
	// constant (immediate).
	// k = 0.
	int16x8_t src_row_v_s16 = src_row_low_s16;
	sum = vmlaq_s16(sum, filter[0], src_row_v_s16);
	// k = 1.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 1);
	sum = vmlaq_s16(sum, filter[1], src_row_v_s16);
	// k = 2.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 2);
	sum = vmlaq_s16(sum, filter[2], src_row_v_s16);
	// k = 3.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 3);
	sum = vmlaq_s16(sum, filter[3], src_row_v_s16);
	// k = 4.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 4);
	sum = vmlaq_s16(sum, filter[4], src_row_v_s16);
	// k = 5.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 5);
	sum = vmlaq_s16(sum, filter[5], src_row_v_s16);
	// k = 6.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 6);
	sum = vmlaq_s16(sum, filter[6], src_row_v_s16);
	// k = 7.
	src_row_v_s16 = vextq_s16(src_row_low_s16, src_row_high_s16, 7);
	sum = vmlaq_s16(sum, filter[7], src_row_v_s16);
	// End of unrolled k = 0..7 loop.
	// Treat sum as unsigned for the right shift.
	sum = vreinterpretq_s16_u16(vrshrq_n_u16(vreinterpretq_u16_s16(sum),
	kInterRoundBitsHorizontal));
	vst1q_s16(intermediate_result[y + 7], sum);
	sx4 += beta;
	}

	// Vertical filter.
	uint16_t* dst_row = dest + start_x - block_start_x;
	int sy4 =
	(y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
	for (int y = -4; y < 4; ++y) {
	int sy = sy4 - MultiplyBy4(gamma);
	int16x8_t filter[8];
	for (int x = 0; x < 8; ++x) {
	const int offset =
	RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
	kWarpedPixelPrecisionShifts;
	filter[x] = vld1q_s16(kWarpedFilters[offset]);
	sy += gamma;
	}
	Transpose8x8(&filter[0], &filter[1], &filter[2], &filter[3], &filter[4],
	&filter[5], &filter[6], &filter[7]);
	// Similar to horizontal_offset, vertical_offset guarantees sum before
	// shifting is nonnegative.
	//
	// Proof:
	// The range of an entry in intermediate_result is
	// 0 <= intermediate_result[y][x] < 2^13
	// The range of the vertical filter output is
	// -47 * 2^13 < output < 175 * 2^13
	// Since -2^19 < -47 * 2^13, adding -2^19 (= vertical_offset) to the
	// vertical filter output produces a positive value:
	// 0 < output + 2^19 < 175 * 2^13 + 2^19
	// The final rounding right shift by either 7 or 11 bits adds at most
	// 2^10 to the sum:
	// 0 < output + 2^19 + rounding < 175 * 2^13 + 2^19 + 2^10 = 1958912
	// Since 1958912 = 0x1DE400 < 2^22, shifting it right by 7 or 11 bits
	// brings the value under 2^15, which fits in uint16_t.
	int32x4_t sum_low = vdupq_n_s32(vertical_offset);
	int32x4_t sum_high = sum_low;
	for (int k = 0; k < 8; ++k) {
	const int16x8_t intermediate =
	vld1q_s16(intermediate_result[y + 4 + k]);
	sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
	vget_low_s16(intermediate));
	sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
	vget_high_s16(intermediate));
	}
	assert(inter_round_bits_vertical == 7 \|\|
	inter_round_bits_vertical == 11);
	// Since inter_round_bits_vertical can be 7 or 11, and all the narrowing
	// shift intrinsics require the shift argument to be a constant
	// (immediate), we have two options:
	// 1. Call a non-narrowing shift, followed by a narrowing extract.
	// 2. Call a narrowing shift (with a constant shift of 7 or 11) in an
	// if-else statement.
	#if defined(__aarch64__)
	// This version is slightly faster for arm64 (1106 ms vs 1112 ms).
	// This version is slower for 32-bit arm (1235 ms vs 1149 ms).
	const int32x4_t shift = vdupq_n_s32(-inter_round_bits_vertical);
	const uint32x4_t sum_low_shifted =
	vrshlq_u32(vreinterpretq_u32_s32(sum_low), shift);
	const uint32x4_t sum_high_shifted =
	vrshlq_u32(vreinterpretq_u32_s32(sum_high), shift);
	const uint16x4_t sum_low_16 = vmovn_u32(sum_low_shifted);
	const uint16x4_t sum_high_16 = vmovn_u32(sum_high_shifted);
	#else // !defined(__aarch64__)
	// This version is faster for 32-bit arm.
	// This version is slightly slower for arm64.
	uint16x4_t sum_low_16;
	uint16x4_t sum_high_16;
	if (inter_round_bits_vertical == 7) {
	sum_low_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_low), 7);
	sum_high_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_high), 7);
	} else {
	sum_low_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_low), 11);
	sum_high_16 = vrshrn_n_u32(vreinterpretq_u32_s32(sum_high), 11);
	}
	#endif // defined(__aarch64__)
	// vst1q_u16 can also be used:
	// vst1q_u16(dst_row, vcombine_u16(sum_low_16, sum_high_16));
	// But it is slightly slower for arm64 (the same speed for 32-bit arm).
	//
	// vst1_u16_x2 could be used, but it is also slightly slower for arm64
	// and causes a bus error for 32-bit arm. Also, it is not supported by
	// gcc 7.2.0.
	vst1_u16(dst_row, sum_low_16);
	vst1_u16(dst_row + 4, sum_high_16);
	dst_row += dest_stride;
	sy4 += delta;
	}
	start_x += 8;
	} while (start_x < block_start_x + block_width);
	dest += 8 * dest_stride;
	start_y += 8;
	} while (start_y < block_start_y + block_height);
	}

	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
	assert(dsp != nullptr);
	dsp->warp = Warp_NEON;
	}

	} // namespace
	} // namespace low_bitdepth

	void WarpInit_NEON() { low_bitdepth::Init8bpp(); }

	} // namespace dsp
	} // namespace libgav1
	#else // !LIBGAV1_ENABLE_NEON
	namespace libgav1 {
	namespace dsp {

	void WarpInit_NEON() {}

	} // namespace dsp
	} // namespace libgav1
	#endif // LIBGAV1_ENABLE_NEON