src/dsp/arm/film_grain_neon.cc - platform/external/libgav1 - Git at Google

 // Copyright 2019 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/film_grain.h"
 #include "src/utils/cpu.h"

 #if LIBGAV1_ENABLE_NEON
 #include <arm_neon.h>

 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
 #include <new>

 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/arm/film_grain_neon.h"
 #include "src/dsp/common.h"
 #include "src/utils/common.h"
 #include "src/utils/logging.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 // Section 7.18.3.1.
 template <int bitdepth>
 bool FilmGrainSynthesis_NEON(
     const void* source_plane_y, ptrdiff_t source_stride_y,
     const void* source_plane_u, ptrdiff_t source_stride_u,
     const void* source_plane_v, ptrdiff_t source_stride_v,
     const FilmGrainParams& film_grain_params, const bool is_monochrome,
     const bool color_matrix_is_identity, const int width, const int height,
     const int subsampling_x, const int subsampling_y, void* dest_plane_y,
     ptrdiff_t dest_stride_y, void* dest_plane_u, ptrdiff_t dest_stride_u,
     void* dest_plane_v, ptrdiff_t dest_stride_v) {
   FilmGrain<bitdepth> film_grain(film_grain_params, is_monochrome,
                                  color_matrix_is_identity, subsampling_x,
                                  subsampling_y, width, height);
   return film_grain.AddNoise_NEON(
       source_plane_y, source_stride_y, source_plane_u, source_stride_u,
       source_plane_v, source_stride_v, dest_plane_y, dest_stride_y,
       dest_plane_u, dest_stride_u, dest_plane_v, dest_stride_v);
 }

 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
   dsp->film_grain_synthesis = FilmGrainSynthesis_NEON<8>;
 }

 #if LIBGAV1_MAX_BITDEPTH >= 10
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
   assert(dsp != nullptr);
   dsp->film_grain_synthesis = FilmGrainSynthesis_NEON<10>;
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10

 inline int16x8_t GetSource8(const int8_t* src) {
   return vmovl_s8(vld1_s8(src));
 }

 #if LIBGAV1_MAX_BITDEPTH >= 10
 inline int16x8_t GetSource8(const int16_t* src) { return vld1q_s16(src); }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10

 template <int position_offset>
 inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
                                            const int16x8_t grain_hi,
                                            int16_t coeff, int32x4x2_t sum) {
   const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
   sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
   sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
   return sum;
 }

 template <int bitdepth, int auto_regression_coeff_lag, int lane>
 inline void WriteFinalAutoRegression(int8_t* luma_grain_cursor, int32x4x2_t sum,
                                      const int8_t* coeffs, int pos, int shift) {
   int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

   for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
     result += luma_grain_cursor[lane + delta_col] * coeffs[pos];
     ++pos;
   }
   luma_grain_cursor[lane] =
       Clip3(luma_grain_cursor[lane] + RightShiftWithRounding(result, shift),
             FilmGrain<bitdepth>::kGrainMin, FilmGrain<bitdepth>::kGrainMax);
 }

 #if LIBGAV1_MAX_BITDEPTH >= 10
 template <int bitdepth, int auto_regression_coeff_lag, int lane>
 inline void WriteFinalAutoRegression(int16_t* luma_grain_cursor,
                                      int32x4x2_t sum, const int8_t* coeffs,
                                      int pos, int shift) {
   int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

   for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
     result += luma_grain_cursor[lane + delta_col] * coeffs[pos];
     ++pos;
   }
   luma_grain_cursor[lane] =
       Clip3(luma_grain_cursor[lane] + RightShiftWithRounding(result, shift),
             FilmGrain<bitdepth>::kGrainMin, FilmGrain<bitdepth>::kGrainMax);
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10

 inline void SetZero(int32x4x2_t* v) {
   v->val[0] = vdupq_n_s32(0);
   v->val[1] = vdupq_n_s32(0);
 }

 constexpr int kAutoRegressionBorder = 3;
 }  // namespace

 void FilmGrainInit_NEON() {
   Init8bpp();
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 }

 // Applies an auto-regressive filter to the white noise in luma_grain.
 template <int bitdepth>
 template <int auto_regression_coeff_lag>
 void FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON(
     const FilmGrainParams& params, int /*grain_min*/, int /*grain_max*/,
     GrainType* luma_grain) {
   static_assert(auto_regression_coeff_lag > 0, "");
   const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
   const uint8_t auto_regression_shift = params.auto_regression_shift;

   int y = 3;
   luma_grain += kLumaWidth * y;
   do {
     // Each row is computed 8 values at a time in the following loop. At the
     // end of the loop, 4 values remain to write. They are given a special
     // reduced iteration at the end.
     int x = 3;
     do {
       int pos = 0;
       int32x4x2_t sum;
       SetZero(&sum);
       for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
            ++delta_row) {
         const int16x8_t src_grain_lo =
             GetSource8(luma_grain + x + delta_row * kLumaWidth -
                        auto_regression_coeff_lag);
         const int16x8_t src_grain_hi =
             GetSource8(luma_grain + x + delta_row * kLumaWidth -
                        auto_regression_coeff_lag + 8);

 #define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
   sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
                                         auto_regression_coeff_y[pos++], sum);

         // A pictorial representation of the auto-regressive filter for
         // various values of params.auto_regression_coeff_lag. The letter 'O'
         // represents the current sample. (The filter always operates on the
         // current sample with filter coefficient 1.) The letters 'X'
         // represent the neighboring samples that the filter operates on, below
         // their corresponding "offset" number.
         //
         // params.auto_regression_coeff_lag == 3:
         //   0 1 2 3 4 5 6
         //   X X X X X X X
         //   X X X X X X X
         //   X X X X X X X
         //   X X X O
         // params.auto_regression_coeff_lag == 2:
         //     0 1 2 3 4
         //     X X X X X
         //     X X X X X
         //     X X O
         // params.auto_regression_coeff_lag == 1:
         //       0 1 2
         //       X X X
         //       X O
         // params.auto_regression_coeff_lag == 0:
         //         O
         // The function relies on the caller to skip the call in the 0 lag
         // case.
         ACCUMULATE_WEIGHTED_GRAIN(0);
         ACCUMULATE_WEIGHTED_GRAIN(1);
         ACCUMULATE_WEIGHTED_GRAIN(2);
         // The horizontal |auto_regression_coeff_lag| loop is replaced with
         // if-statements to give vextq_s16 an immediate param.
         if (auto_regression_coeff_lag > 1) {
           ACCUMULATE_WEIGHTED_GRAIN(3);
           ACCUMULATE_WEIGHTED_GRAIN(4);
         }
         if (auto_regression_coeff_lag > 2) {
           assert(auto_regression_coeff_lag == 3);
           ACCUMULATE_WEIGHTED_GRAIN(5);
           ACCUMULATE_WEIGHTED_GRAIN(6);
         }
       }
       // delta_row == 0
       // At this point in the filter, the source addresses and destination
       // addresses overlap. Because this is a causal filter, the higher lanes
       // cannot be computed without the results of the lower lanes.
       // Each call to WriteFinalAutoRegression incorporates preceding values
       // on the final row, and writes a single sample. This allows the next
       // pixel's value to be computed in the next call.
 #define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
   WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
       luma_grain + x, sum, auto_regression_coeff_y, pos,               \
       auto_regression_shift);

       WRITE_AUTO_REGRESSION_RESULT(0);
       WRITE_AUTO_REGRESSION_RESULT(1);
       WRITE_AUTO_REGRESSION_RESULT(2);
       WRITE_AUTO_REGRESSION_RESULT(3);
       WRITE_AUTO_REGRESSION_RESULT(4);
       WRITE_AUTO_REGRESSION_RESULT(5);
       WRITE_AUTO_REGRESSION_RESULT(6);
       WRITE_AUTO_REGRESSION_RESULT(7);
       x += 8;
       // Leave the final four pixels for the special iteration below.
     } while (x < kLumaWidth - kAutoRegressionBorder - 4);

     // Final 4 pixels in the row.
     int pos = 0;
     int32x4x2_t sum;
     SetZero(&sum);
     for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
          ++delta_row) {
       const int16x8_t src_grain_lo = GetSource8(
           luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
       const int16x8_t src_grain_hi =
           GetSource8(luma_grain + x + delta_row * kLumaWidth -
                      auto_regression_coeff_lag + 8);

       ACCUMULATE_WEIGHTED_GRAIN(0);
       ACCUMULATE_WEIGHTED_GRAIN(1);
       ACCUMULATE_WEIGHTED_GRAIN(2);
       // The horizontal |auto_regression_coeff_lag| loop is replaced with
       // if-statements to give vextq_s16 an immediate param.
       if (auto_regression_coeff_lag > 1) {
         ACCUMULATE_WEIGHTED_GRAIN(3);
         ACCUMULATE_WEIGHTED_GRAIN(4);
       }
       if (auto_regression_coeff_lag > 2) {
         assert(auto_regression_coeff_lag == 3);
         ACCUMULATE_WEIGHTED_GRAIN(5);
         ACCUMULATE_WEIGHTED_GRAIN(6);
       }
     }
     // delta_row == 0
     WRITE_AUTO_REGRESSION_RESULT(0);
     WRITE_AUTO_REGRESSION_RESULT(1);
     WRITE_AUTO_REGRESSION_RESULT(2);
     WRITE_AUTO_REGRESSION_RESULT(3);
     luma_grain += kLumaWidth;
   } while (++y < kLumaHeight);

 #undef WRITE_AUTO_REGRESSION_RESULT
 #undef ACCUMULATE_WEIGHTED_GRAIN
 }

 template <int bitdepth>
 bool FilmGrain<bitdepth>::Init_NEON() {
   // Section 7.18.3.3. Generate grain process.
   GenerateLumaGrain(params_, luma_grain_);
   using AutoRegressionFn =
       void (*)(const FilmGrainParams& params, int grain_min, int grain_max,
                GrainType* luma_grain);
   static const AutoRegressionFn kAutoRegressionFnTableLuma[3] = {
       FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<1>,
       FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<2>,
       FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<3>};
   if (params_.auto_regression_coeff_lag > 0 && params_.num_y_points > 0) {
     kAutoRegressionFnTableLuma[params_.auto_regression_coeff_lag - 1](
         params_, kGrainMin, kGrainMax, luma_grain_);
   }
   if (!is_monochrome_) {
     GenerateChromaGrains(params_, chroma_width_, chroma_height_, u_grain_,
                          v_grain_);
     ApplyAutoRegressiveFilterToChromaGrains(
         params_, kGrainMin, kGrainMax, luma_grain_, subsampling_x_,
         subsampling_y_, chroma_width_, chroma_height_, u_grain_, v_grain_);
   }

   // Section 7.18.3.4. Scaling lookup initialization process.
   InitializeScalingLookupTable(params_.num_y_points, params_.point_y_value,
                                params_.point_y_scaling, scaling_lut_y_);
   if (!is_monochrome_) {
     if (params_.chroma_scaling_from_luma) {
       scaling_lut_u_ = scaling_lut_y_;
       scaling_lut_v_ = scaling_lut_y_;
     } else {
       scaling_lut_chroma_buffer_.reset(new (std::nothrow) uint8_t[256 * 2]);
       if (scaling_lut_chroma_buffer_ == nullptr) return false;
       scaling_lut_u_ = &scaling_lut_chroma_buffer_[0];
       scaling_lut_v_ = &scaling_lut_chroma_buffer_[256];
       InitializeScalingLookupTable(params_.num_u_points, params_.point_u_value,
                                    params_.point_u_scaling, scaling_lut_u_);
       InitializeScalingLookupTable(params_.num_v_points, params_.point_v_value,
                                    params_.point_v_scaling, scaling_lut_v_);
     }
   }
   return true;
 }

 template <int bitdepth>
 bool FilmGrain<bitdepth>::AddNoise_NEON(
     const void* source_plane_y, ptrdiff_t source_stride_y,
     const void* source_plane_u, ptrdiff_t source_stride_u,
     const void* source_plane_v, ptrdiff_t source_stride_v, void* dest_plane_y,
     ptrdiff_t dest_stride_y, void* dest_plane_u, ptrdiff_t dest_stride_u,
     void* dest_plane_v, ptrdiff_t dest_stride_v) {
   if (!Init_NEON()) {
     LIBGAV1_DLOG(ERROR, "Init() failed.");
     return false;
   }
   if (!AllocateNoiseStripes()) {
     LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
     return false;
   }
   ConstructNoiseStripes();

   if (!AllocateNoiseImage()) {
     LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
     return false;
   }
   ConstructNoiseImage();

   BlendNoiseWithImage(source_plane_y, source_stride_y, source_plane_u,
                       source_stride_u, source_plane_v, source_stride_v,
                       dest_plane_y, dest_stride_y, dest_plane_u, dest_stride_u,
                       dest_plane_v, dest_stride_v);

   return true;
 }

 // Explicit instantiations.
 template class FilmGrain<8>;
 #if LIBGAV1_MAX_BITDEPTH >= 10
 template class FilmGrain<10>;
 #endif

 }  // namespace dsp
 }  // namespace libgav1

 #else   // !LIBGAV1_ENABLE_NEON

 namespace libgav1 {
 namespace dsp {

 void FilmGrainInit_NEON() {}

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON
	// Copyright 2019 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/film_grain.h"
	#include "src/utils/cpu.h"

	#if LIBGAV1_ENABLE_NEON
	#include <arm_neon.h>

	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <new>

	#include "src/dsp/arm/common_neon.h"
	#include "src/dsp/arm/film_grain_neon.h"
	#include "src/dsp/common.h"
	#include "src/utils/common.h"
	#include "src/utils/logging.h"

	namespace libgav1 {
	namespace dsp {
	namespace {

	// Section 7.18.3.1.
	template <int bitdepth>
	bool FilmGrainSynthesis_NEON(
	const void* source_plane_y, ptrdiff_t source_stride_y,
	const void* source_plane_u, ptrdiff_t source_stride_u,
	const void* source_plane_v, ptrdiff_t source_stride_v,
	const FilmGrainParams& film_grain_params, const bool is_monochrome,
	const bool color_matrix_is_identity, const int width, const int height,
	const int subsampling_x, const int subsampling_y, void* dest_plane_y,
	ptrdiff_t dest_stride_y, void* dest_plane_u, ptrdiff_t dest_stride_u,
	void* dest_plane_v, ptrdiff_t dest_stride_v) {
	FilmGrain<bitdepth> film_grain(film_grain_params, is_monochrome,
	color_matrix_is_identity, subsampling_x,
	subsampling_y, width, height);
	return film_grain.AddNoise_NEON(
	source_plane_y, source_stride_y, source_plane_u, source_stride_u,
	source_plane_v, source_stride_v, dest_plane_y, dest_stride_y,
	dest_plane_u, dest_stride_u, dest_plane_v, dest_stride_v);
	}

	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
	assert(dsp != nullptr);
	dsp->film_grain_synthesis = FilmGrainSynthesis_NEON<8>;
	}

	#if LIBGAV1_MAX_BITDEPTH >= 10
	void Init10bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
	assert(dsp != nullptr);
	dsp->film_grain_synthesis = FilmGrainSynthesis_NEON<10>;
	}
	#endif // LIBGAV1_MAX_BITDEPTH >= 10

	inline int16x8_t GetSource8(const int8_t* src) {
	return vmovl_s8(vld1_s8(src));
	}

	#if LIBGAV1_MAX_BITDEPTH >= 10
	inline int16x8_t GetSource8(const int16_t* src) { return vld1q_s16(src); }
	#endif // LIBGAV1_MAX_BITDEPTH >= 10

	template <int position_offset>
	inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
	const int16x8_t grain_hi,
	int16_t coeff, int32x4x2_t sum) {
	const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
	sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
	sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
	return sum;
	}

	template <int bitdepth, int auto_regression_coeff_lag, int lane>
	inline void WriteFinalAutoRegression(int8_t* luma_grain_cursor, int32x4x2_t sum,
	const int8_t* coeffs, int pos, int shift) {
	int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

	for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
	result += luma_grain_cursor[lane + delta_col] * coeffs[pos];
	++pos;
	}
	luma_grain_cursor[lane] =
	Clip3(luma_grain_cursor[lane] + RightShiftWithRounding(result, shift),
	FilmGrain<bitdepth>::kGrainMin, FilmGrain<bitdepth>::kGrainMax);
	}

	#if LIBGAV1_MAX_BITDEPTH >= 10
	template <int bitdepth, int auto_regression_coeff_lag, int lane>
	inline void WriteFinalAutoRegression(int16_t* luma_grain_cursor,
	int32x4x2_t sum, const int8_t* coeffs,
	int pos, int shift) {
	int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

	for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
	result += luma_grain_cursor[lane + delta_col] * coeffs[pos];
	++pos;
	}
	luma_grain_cursor[lane] =
	Clip3(luma_grain_cursor[lane] + RightShiftWithRounding(result, shift),
	FilmGrain<bitdepth>::kGrainMin, FilmGrain<bitdepth>::kGrainMax);
	}
	#endif // LIBGAV1_MAX_BITDEPTH >= 10

	inline void SetZero(int32x4x2_t* v) {
	v->val[0] = vdupq_n_s32(0);
	v->val[1] = vdupq_n_s32(0);
	}

	constexpr int kAutoRegressionBorder = 3;
	} // namespace

	void FilmGrainInit_NEON() {
	Init8bpp();
	#if LIBGAV1_MAX_BITDEPTH >= 10
	Init10bpp();
	#endif // LIBGAV1_MAX_BITDEPTH >= 10
	}

	// Applies an auto-regressive filter to the white noise in luma_grain.
	template <int bitdepth>
	template <int auto_regression_coeff_lag>
	void FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON(
	const FilmGrainParams& params, int /grain_min/, int /grain_max/,
	GrainType* luma_grain) {
	static_assert(auto_regression_coeff_lag > 0, "");
	const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
	const uint8_t auto_regression_shift = params.auto_regression_shift;

	int y = 3;
	luma_grain += kLumaWidth * y;
	do {
	// Each row is computed 8 values at a time in the following loop. At the
	// end of the loop, 4 values remain to write. They are given a special
	// reduced iteration at the end.
	int x = 3;
	do {
	int pos = 0;
	int32x4x2_t sum;
	SetZero(&sum);
	for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
	++delta_row) {
	const int16x8_t src_grain_lo =
	GetSource8(luma_grain + x + delta_row * kLumaWidth -
	auto_regression_coeff_lag);
	const int16x8_t src_grain_hi =
	GetSource8(luma_grain + x + delta_row * kLumaWidth -
	auto_regression_coeff_lag + 8);

	#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
	sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
	auto_regression_coeff_y[pos++], sum);

	// A pictorial representation of the auto-regressive filter for
	// various values of params.auto_regression_coeff_lag. The letter 'O'
	// represents the current sample. (The filter always operates on the
	// current sample with filter coefficient 1.) The letters 'X'
	// represent the neighboring samples that the filter operates on, below
	// their corresponding "offset" number.
	//
	// params.auto_regression_coeff_lag == 3:
	// 0 1 2 3 4 5 6
	// X X X X X X X
	// X X X X X X X
	// X X X X X X X
	// X X X O
	// params.auto_regression_coeff_lag == 2:
	// 0 1 2 3 4
	// X X X X X
	// X X X X X
	// X X O
	// params.auto_regression_coeff_lag == 1:
	// 0 1 2
	// X X X
	// X O
	// params.auto_regression_coeff_lag == 0:
	// O
	// The function relies on the caller to skip the call in the 0 lag
	// case.
	ACCUMULATE_WEIGHTED_GRAIN(0);
	ACCUMULATE_WEIGHTED_GRAIN(1);
	ACCUMULATE_WEIGHTED_GRAIN(2);
	// The horizontal \|auto_regression_coeff_lag\| loop is replaced with
	// if-statements to give vextq_s16 an immediate param.
	if (auto_regression_coeff_lag > 1) {
	ACCUMULATE_WEIGHTED_GRAIN(3);
	ACCUMULATE_WEIGHTED_GRAIN(4);
	}
	if (auto_regression_coeff_lag > 2) {
	assert(auto_regression_coeff_lag == 3);
	ACCUMULATE_WEIGHTED_GRAIN(5);
	ACCUMULATE_WEIGHTED_GRAIN(6);
	}
	}
	// delta_row == 0
	// At this point in the filter, the source addresses and destination
	// addresses overlap. Because this is a causal filter, the higher lanes
	// cannot be computed without the results of the lower lanes.
	// Each call to WriteFinalAutoRegression incorporates preceding values
	// on the final row, and writes a single sample. This allows the next
	// pixel's value to be computed in the next call.
	#define WRITE_AUTO_REGRESSION_RESULT(lane) \
	WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
	luma_grain + x, sum, auto_regression_coeff_y, pos, \
	auto_regression_shift);

	WRITE_AUTO_REGRESSION_RESULT(0);
	WRITE_AUTO_REGRESSION_RESULT(1);
	WRITE_AUTO_REGRESSION_RESULT(2);
	WRITE_AUTO_REGRESSION_RESULT(3);
	WRITE_AUTO_REGRESSION_RESULT(4);
	WRITE_AUTO_REGRESSION_RESULT(5);
	WRITE_AUTO_REGRESSION_RESULT(6);
	WRITE_AUTO_REGRESSION_RESULT(7);
	x += 8;
	// Leave the final four pixels for the special iteration below.
	} while (x < kLumaWidth - kAutoRegressionBorder - 4);

	// Final 4 pixels in the row.
	int pos = 0;
	int32x4x2_t sum;
	SetZero(&sum);
	for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
	++delta_row) {
	const int16x8_t src_grain_lo = GetSource8(
	luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
	const int16x8_t src_grain_hi =
	GetSource8(luma_grain + x + delta_row * kLumaWidth -
	auto_regression_coeff_lag + 8);

	ACCUMULATE_WEIGHTED_GRAIN(0);
	ACCUMULATE_WEIGHTED_GRAIN(1);
	ACCUMULATE_WEIGHTED_GRAIN(2);
	// The horizontal \|auto_regression_coeff_lag\| loop is replaced with
	// if-statements to give vextq_s16 an immediate param.
	if (auto_regression_coeff_lag > 1) {
	ACCUMULATE_WEIGHTED_GRAIN(3);
	ACCUMULATE_WEIGHTED_GRAIN(4);
	}
	if (auto_regression_coeff_lag > 2) {
	assert(auto_regression_coeff_lag == 3);
	ACCUMULATE_WEIGHTED_GRAIN(5);
	ACCUMULATE_WEIGHTED_GRAIN(6);
	}
	}
	// delta_row == 0
	WRITE_AUTO_REGRESSION_RESULT(0);
	WRITE_AUTO_REGRESSION_RESULT(1);
	WRITE_AUTO_REGRESSION_RESULT(2);
	WRITE_AUTO_REGRESSION_RESULT(3);
	luma_grain += kLumaWidth;
	} while (++y < kLumaHeight);

	#undef WRITE_AUTO_REGRESSION_RESULT
	#undef ACCUMULATE_WEIGHTED_GRAIN
	}

	template <int bitdepth>
	bool FilmGrain<bitdepth>::Init_NEON() {
	// Section 7.18.3.3. Generate grain process.
	GenerateLumaGrain(params_, luma_grain_);
	using AutoRegressionFn =
	void (*)(const FilmGrainParams& params, int grain_min, int grain_max,
	GrainType* luma_grain);
	static const AutoRegressionFn kAutoRegressionFnTableLuma[3] = {
	FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<1>,
	FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<2>,
	FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<3>};
	if (params_.auto_regression_coeff_lag > 0 && params_.num_y_points > 0) {
	kAutoRegressionFnTableLuma[params_.auto_regression_coeff_lag - 1](
	params_, kGrainMin, kGrainMax, luma_grain_);
	}
	if (!is_monochrome_) {
	GenerateChromaGrains(params_, chroma_width_, chroma_height_, u_grain_,
	v_grain_);
	ApplyAutoRegressiveFilterToChromaGrains(
	params_, kGrainMin, kGrainMax, luma_grain_, subsampling_x_,
	subsampling_y_, chroma_width_, chroma_height_, u_grain_, v_grain_);
	}

	// Section 7.18.3.4. Scaling lookup initialization process.
	InitializeScalingLookupTable(params_.num_y_points, params_.point_y_value,
	params_.point_y_scaling, scaling_lut_y_);
	if (!is_monochrome_) {
	if (params_.chroma_scaling_from_luma) {
	scaling_lut_u_ = scaling_lut_y_;
	scaling_lut_v_ = scaling_lut_y_;
	} else {
	scaling_lut_chroma_buffer_.reset(new (std::nothrow) uint8_t[256 * 2]);
	if (scaling_lut_chroma_buffer_ == nullptr) return false;
	scaling_lut_u_ = &scaling_lut_chroma_buffer_[0];
	scaling_lut_v_ = &scaling_lut_chroma_buffer_[256];
	InitializeScalingLookupTable(params_.num_u_points, params_.point_u_value,
	params_.point_u_scaling, scaling_lut_u_);
	InitializeScalingLookupTable(params_.num_v_points, params_.point_v_value,
	params_.point_v_scaling, scaling_lut_v_);
	}
	}
	return true;
	}

	template <int bitdepth>
	bool FilmGrain<bitdepth>::AddNoise_NEON(
	const void* source_plane_y, ptrdiff_t source_stride_y,
	const void* source_plane_u, ptrdiff_t source_stride_u,
	const void* source_plane_v, ptrdiff_t source_stride_v, void* dest_plane_y,
	ptrdiff_t dest_stride_y, void* dest_plane_u, ptrdiff_t dest_stride_u,
	void* dest_plane_v, ptrdiff_t dest_stride_v) {
	if (!Init_NEON()) {
	LIBGAV1_DLOG(ERROR, "Init() failed.");
	return false;
	}
	if (!AllocateNoiseStripes()) {
	LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
	return false;
	}
	ConstructNoiseStripes();

	if (!AllocateNoiseImage()) {
	LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
	return false;
	}
	ConstructNoiseImage();

	BlendNoiseWithImage(source_plane_y, source_stride_y, source_plane_u,
	source_stride_u, source_plane_v, source_stride_v,
	dest_plane_y, dest_stride_y, dest_plane_u, dest_stride_u,
	dest_plane_v, dest_stride_v);

	return true;
	}

	// Explicit instantiations.
	template class FilmGrain<8>;
	#if LIBGAV1_MAX_BITDEPTH >= 10
	template class FilmGrain<10>;
	#endif

	} // namespace dsp
	} // namespace libgav1

	#else // !LIBGAV1_ENABLE_NEON

	namespace libgav1 {
	namespace dsp {

	void FilmGrainInit_NEON() {}

	} // namespace dsp
	} // namespace libgav1
	#endif // LIBGAV1_ENABLE_NEON