| // Copyright 2019 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/dsp/film_grain.h" |
| #include "src/utils/cpu.h" |
| |
| #if LIBGAV1_ENABLE_NEON |
| #include <arm_neon.h> |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstring> |
| #include <new> |
| |
| #include "src/dsp/arm/common_neon.h" |
| #include "src/dsp/arm/film_grain_neon.h" |
| #include "src/dsp/common.h" |
| #include "src/dsp/dsp.h" |
| #include "src/dsp/film_grain_impl.h" |
| #include "src/utils/common.h" |
| #include "src/utils/compiler_attributes.h" |
| #include "src/utils/logging.h" |
| |
| namespace libgav1 { |
| namespace dsp { |
| namespace film_grain { |
| namespace { |
| |
// Loads 8 consecutive grain samples and returns them as 16-bit lanes. The
// function is overloaded on the sample type so that templated callers can load
// either grain representation with the same call; this overload sign-extends
// 8-bit samples.
inline int16x8_t GetSource8(const int8_t* src) {
  const int8x8_t packed_samples = vld1_s8(src);
  return vmovl_s8(packed_samples);
}
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
// 16-bit grain samples already match the lane width, so the 8 samples are
// loaded without any conversion.
inline int16x8_t GetSource8(const int16_t* src) {
  const int16x8_t samples = vld1q_s16(src);
  return samples;
}
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| |
// Adds one filter tap to the running auto-regression sums of 8 adjacent
// destination samples and returns the updated sums.
//
// |grain_lo| and |grain_hi| hold 16 consecutive source samples. The 8-sample
// window starting |position_offset| samples into that run is multiplied by
// |coeff| and accumulated into |sum| (lanes 0-3 in val[0], lanes 4-7 in
// val[1]). Keeping the two source vectors fixed and varying only
// |position_offset| gives a sliding window across successive calls.
template <int position_offset>
inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
                                           const int16x8_t grain_hi,
                                           int16_t coeff, int32x4x2_t sum) {
  const int16x8_t window = vextq_s16(grain_lo, grain_hi, position_offset);
  int32x4x2_t updated;
  updated.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(window), coeff);
  updated.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(window), coeff);
  return updated;
}
| |
// Finishes the auto-regression for the single sample |grain_cursor[lane]| and
// stores the clamped result in place.
//
// |sum| already contains the weighted contributions that do not depend on the
// current row. This function adds the |auto_regression_coeff_lag| samples
// immediately to the left of the target, weighted by consecutive coefficients
// starting at |coeffs[pos]|. Those left neighbors are themselves filter
// outputs, so the samples of a row have to be completed one at a time, in
// increasing |lane| order.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
                                     const int8_t* coeffs, int pos, int shift) {
  // Lanes 0-3 are held in sum.val[0] and lanes 4-7 in sum.val[1].
  int32_t weighted_sum = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

  // |left_neighbors[0]| is the sample farthest to the left of the target.
  const int8_t* const left_neighbors =
      grain_cursor + lane - auto_regression_coeff_lag;
  const int8_t* const taps = coeffs + pos;
  for (int i = 0; i < auto_regression_coeff_lag; ++i) {
    weighted_sum += left_neighbors[i] * taps[i];
  }

  const auto filtered =
      grain_cursor[lane] + RightShiftWithRounding(weighted_sum, shift);
  grain_cursor[lane] =
      Clip3(filtered, GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
}
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
// 16-bit grain overload. Finishes the auto-regression for the single sample
// |grain_cursor[lane]| and stores the clamped result in place.
//
// |sum| already contains the weighted contributions that do not depend on the
// current row. This function adds the |auto_regression_coeff_lag| samples
// immediately to the left of the target, weighted by consecutive coefficients
// starting at |coeffs[pos]|. Those left neighbors are themselves filter
// outputs, so the samples of a row have to be completed one at a time, in
// increasing |lane| order.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
                                     const int8_t* coeffs, int pos, int shift) {
  // Lanes 0-3 are held in sum.val[0] and lanes 4-7 in sum.val[1].
  int32_t weighted_sum = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);

  // |left_neighbors[0]| is the sample farthest to the left of the target.
  const int16_t* const left_neighbors =
      grain_cursor + lane - auto_regression_coeff_lag;
  const int8_t* const taps = coeffs + pos;
  for (int i = 0; i < auto_regression_coeff_lag; ++i) {
    weighted_sum += left_neighbors[i] * taps[i];
  }

  const auto filtered =
      grain_cursor[lane] + RightShiftWithRounding(weighted_sum, shift);
  grain_cursor[lane] =
      Clip3(filtered, GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
}
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| |
// Completes one sample, selected by |lane|, in each of the two chroma grain
// planes. Both planes use the same coefficient position |pos| and the same
// |shift|, but each has its own cursor, partial sums and coefficient array.
// As with WriteFinalAutoRegression, samples in a row must be finished one at a
// time because each output feeds the samples to its right.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
                                           int8_t* v_grain_cursor,
                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
                                           const int8_t* coeffs_u,
                                           const int8_t* coeffs_v, int pos,
                                           int shift) {
  // U plane.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      u_grain_cursor, sum_u, coeffs_u, pos, shift);

  // V plane.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      v_grain_cursor, sum_v, coeffs_v, pos, shift);
}
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
// 16-bit grain overload. Completes one sample, selected by |lane|, in each of
// the two chroma grain planes. Both planes use the same coefficient position
// |pos| and the same |shift|, but each has its own cursor, partial sums and
// coefficient array.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
                                           int16_t* v_grain_cursor,
                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
                                           const int8_t* coeffs_u,
                                           const int8_t* coeffs_v, int pos,
                                           int shift) {
  // U plane.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      u_grain_cursor, sum_u, coeffs_u, pos, shift);

  // V plane.
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
      v_grain_cursor, sum_v, coeffs_v, pos, shift);
}
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| |
// Clears all 8 accumulator lanes (both 4-lane halves) of |v|.
inline void SetZero(int32x4x2_t* v) {
  const int32x4_t zero = vdupq_n_s32(0);
  v->val[0] = zero;
  v->val[1] = zero;
}
| |
// Produces 8 luma grain values aligned with 8 chroma positions, as 16-bit
// lanes.
//   - No subsampling: 8 luma samples are widened as-is.
//   - Horizontal only: each output is the rounded average of 2 adjacent
//     samples from one row (16 samples are read).
//   - Horizontal and vertical: each output is the rounded average of a 2x2
//     block, taken from the row at |luma| and the row at |luma| + |stride|
//     (16 samples are read from each row).
// Vertical subsampling without horizontal subsampling is not supported.
int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
                            int subsampling_y, ptrdiff_t stride) {
  if (subsampling_x == 0 && subsampling_y == 0) {
    return vmovl_s8(vld1_s8(luma));
  }
  if (subsampling_y == 0) {
    // Only horizontal subsampling: sum each adjacent pair, then halve with
    // rounding.
    const int16x8_t pair_sums = vpaddlq_s8(vld1q_s8(luma));
    return vrshrq_n_s16(pair_sums, 1);
  }
  assert(subsampling_x != 0);
  // Sum adjacent pairs in both rows, add the rows, then divide by 4 with
  // rounding.
  const int16x8_t top_pair_sums = vpaddlq_s8(vld1q_s8(luma));
  const int16x8_t bottom_pair_sums = vpaddlq_s8(vld1q_s8(luma + stride));
  return vrshrq_n_s16(vaddq_s16(top_pair_sums, bottom_pair_sums), 2);
}
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
// 16-bit grain overload. Produces 8 luma grain values aligned with 8 chroma
// positions.
//   - No subsampling: 8 luma samples are loaded as-is.
//   - Horizontal only: each output is the rounded average of 2 adjacent
//     samples from one row (16 samples are read).
//   - Horizontal and vertical: each output is the rounded average of a 2x2
//     block, taken from the row at |luma| and the row at |luma| + |stride|
//     (16 samples are read from each row).
// Vertical subsampling without horizontal subsampling is not supported.
int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
                            int subsampling_y, ptrdiff_t stride) {
  if (subsampling_x == 0 && subsampling_y == 0) {
    return vld1q_s16(luma);
  }

  // Reduces 16 consecutive samples (|first| then |second|) to 8 sums of
  // adjacent pairs.
  const auto sum_adjacent_pairs = [](const int16x8_t first,
                                     const int16x8_t second) {
    return vcombine_s16(
        vpadd_s16(vget_low_s16(first), vget_high_s16(first)),
        vpadd_s16(vget_low_s16(second), vget_high_s16(second)));
  };

  if (subsampling_y == 0) {
    // Only horizontal subsampling: halve the pair sums with rounding.
    const int16x8_t row_lo = vld1q_s16(luma);
    const int16x8_t row_hi = vld1q_s16(luma + 8);
    return vrshrq_n_s16(sum_adjacent_pairs(row_lo, row_hi), 1);
  }

  assert(subsampling_x != 0);
  const int16x8_t top_lo = vld1q_s16(luma);
  const int16x8_t top_hi = vld1q_s16(luma + 8);
  const int16x8_t bottom_lo = vld1q_s16(luma + stride);
  const int16x8_t bottom_hi = vld1q_s16(luma + stride + 8);
  const int16x8_t top_pair_sums = sum_adjacent_pairs(top_lo, top_hi);
  const int16x8_t bottom_pair_sums = sum_adjacent_pairs(bottom_lo, bottom_hi);
  // Four samples contribute to each output, so divide by 4 with rounding.
  return vrshrq_n_s16(vaddq_s16(top_pair_sums, bottom_pair_sums), 2);
}
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| |
// Applies the auto-regressive filter to the U and V grain blocks in place.
//
// Template parameters:
//   bitdepth: selects the clamping range (GetGrainMin/GetGrainMax) applied to
//     every written sample.
//   GrainType: element type of all three grain buffers (int8_t or int16_t in
//     the instantiations in this file).
//   auto_regression_coeff_lag: filter lag, at most 3. Each output uses
//     (2 * lag + 1) samples from each of the |lag| rows above and |lag|
//     samples to its left on the current row.
//   use_luma: when true, the (optionally subsampled) luma grain is added with
//     its own coefficient.
//
// |params| supplies auto_regression_coeff_u, auto_regression_coeff_v and
// auto_regression_shift. |subsampling_x| and |subsampling_y| select the chroma
// block dimensions and how luma is averaged. Filtering starts at row and
// column kAutoRegressionBorder.
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
          bool use_luma>
void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params,
                                                  const void* luma_grain_buffer,
                                                  int subsampling_x,
                                                  int subsampling_y,
                                                  void* u_grain_buffer,
                                                  void* v_grain_buffer) {
  static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
  const int auto_regression_shift = params.auto_regression_shift;
  const int chroma_width =
      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
  const int chroma_height =
      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
  // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
  // leaving [35, 40] to write at the end.
  // The tail code below handles a remainder of 4, or of 6.
  const int chroma_width_remainder =
      (chroma_width - 2 * kAutoRegressionBorder) & 7;

  // Skip the top border rows; the row pointers are advanced once per
  // iteration of the outer loop.
  int y = kAutoRegressionBorder;
  luma_grain += kLumaWidth * y;
  u_grain += chroma_width * y;
  v_grain += chroma_width * y;
  do {
    // Each row is computed 8 values at a time in the following loop. At the
    // end of the loop, |chroma_width_remainder| values remain to write. They
    // are given a special reduced iteration at the end.
    int x = kAutoRegressionBorder;
    int luma_x = kAutoRegressionBorder;
    do {
      // |pos| indexes the coefficient arrays; it advances once per filter tap.
      int pos = 0;
      int32x4x2_t sum_u;
      int32x4x2_t sum_v;
      SetZero(&sum_u);
      SetZero(&sum_v);

      if (auto_regression_coeff_lag > 0) {
        for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
             ++delta_row) {
          // These loads may overflow to the next row, but they are never called
          // on the final row of a grain block. Therefore, they will never
          // exceed the block boundaries.
          const int16x8_t u_grain_lo =
              GetSource8(u_grain + x + delta_row * chroma_width -
                         auto_regression_coeff_lag);
          const int16x8_t u_grain_hi =
              GetSource8(u_grain + x + delta_row * chroma_width -
                         auto_regression_coeff_lag + 8);
          const int16x8_t v_grain_lo =
              GetSource8(v_grain + x + delta_row * chroma_width -
                         auto_regression_coeff_lag);
          const int16x8_t v_grain_hi =
              GetSource8(v_grain + x + delta_row * chroma_width -
                         auto_regression_coeff_lag + 8);
// Accumulates one tap for both planes. U and V share |pos|, which is
// incremented only after both planes have used it. The macro is reused by the
// reduced iteration after this loop and undefined at the end of the function.
#define ACCUMULATE_WEIGHTED_GRAIN(offset)                                  \
  sum_u = AccumulateWeightedGrain<offset>(                                 \
      u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
  sum_v = AccumulateWeightedGrain<offset>(                                 \
      v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)

          ACCUMULATE_WEIGHTED_GRAIN(0);
          ACCUMULATE_WEIGHTED_GRAIN(1);
          ACCUMULATE_WEIGHTED_GRAIN(2);
          // The horizontal |auto_regression_coeff_lag| loop is replaced with
          // if-statements to give vextq_s16 an immediate param.
          if (auto_regression_coeff_lag > 1) {
            ACCUMULATE_WEIGHTED_GRAIN(3);
            ACCUMULATE_WEIGHTED_GRAIN(4);
          }
          if (auto_regression_coeff_lag > 2) {
            assert(auto_regression_coeff_lag == 3);
            ACCUMULATE_WEIGHTED_GRAIN(5);
            ACCUMULATE_WEIGHTED_GRAIN(6);
          }
        }
      }

      if (use_luma) {
        const int16x8_t luma = GetSubsampledLuma(
            luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);

        // Luma samples get the final coefficient in the formula, but are best
        // computed all at once before the final row.
        // At this point |pos| is the index of the first current-row
        // coefficient, so the luma coefficient sits
        // |auto_regression_coeff_lag| entries later.
        const int coeff_u =
            params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
        const int coeff_v =
            params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];

        sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
        sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
        sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
        sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
      }
      // At this point in the filter, the source addresses and destination
      // addresses overlap. Because this is an auto-regressive filter, the
      // higher lanes cannot be computed without the results of the lower lanes.
      // Each call to WriteFinalAutoRegression incorporates preceding values
      // on the final row, and writes a single sample. This allows the next
      // pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane)                                    \
  WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>(  \
      u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
      params.auto_regression_coeff_v, pos, auto_regression_shift)

      WRITE_AUTO_REGRESSION_RESULT(0);
      WRITE_AUTO_REGRESSION_RESULT(1);
      WRITE_AUTO_REGRESSION_RESULT(2);
      WRITE_AUTO_REGRESSION_RESULT(3);
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
      WRITE_AUTO_REGRESSION_RESULT(6);
      WRITE_AUTO_REGRESSION_RESULT(7);

      x += 8;
      // 8 chroma samples span 16 luma samples when subsampled horizontally.
      luma_x += 8 << subsampling_x;
    } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);

    // This is the "final iteration" of the above loop over width. We fill in
    // the remainder of the width, which is less than 8.
    // The sums are still computed for 8 lanes; only the first 4 or 6 results
    // are written.
    int pos = 0;
    int32x4x2_t sum_u;
    int32x4x2_t sum_v;
    SetZero(&sum_u);
    SetZero(&sum_v);

    // With a lag of 0 this loop does not execute, so no guard is needed here.
    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
         ++delta_row) {
      // These loads may overflow to the next row, but they are never called on
      // the final row of a grain block. Therefore, they will never exceed the
      // block boundaries.
      const int16x8_t u_grain_lo = GetSource8(
          u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
      const int16x8_t u_grain_hi =
          GetSource8(u_grain + x + delta_row * chroma_width -
                     auto_regression_coeff_lag + 8);
      const int16x8_t v_grain_lo = GetSource8(
          v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
      const int16x8_t v_grain_hi =
          GetSource8(v_grain + x + delta_row * chroma_width -
                     auto_regression_coeff_lag + 8);

      ACCUMULATE_WEIGHTED_GRAIN(0);
      ACCUMULATE_WEIGHTED_GRAIN(1);
      ACCUMULATE_WEIGHTED_GRAIN(2);
      // The horizontal |auto_regression_coeff_lag| loop is replaced with
      // if-statements to give vextq_s16 an immediate param.
      if (auto_regression_coeff_lag > 1) {
        ACCUMULATE_WEIGHTED_GRAIN(3);
        ACCUMULATE_WEIGHTED_GRAIN(4);
      }
      if (auto_regression_coeff_lag > 2) {
        assert(auto_regression_coeff_lag == 3);
        ACCUMULATE_WEIGHTED_GRAIN(5);
        ACCUMULATE_WEIGHTED_GRAIN(6);
      }
    }

    if (use_luma) {
      // NOTE(review): this still loads a full vector's worth of luma even
      // though fewer than 8 results are used; presumably the luma buffer is
      // large enough for that read -- confirm against the allocation.
      const int16x8_t luma = GetSubsampledLuma(
          luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);

      // Luma samples get the final coefficient in the formula, but are best
      // computed all at once before the final row.
      const int coeff_u =
          params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
      const int coeff_v =
          params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];

      sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
      sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
      sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
      sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
    }

    // Four samples are always written; two more when the remainder is 6.
    WRITE_AUTO_REGRESSION_RESULT(0);
    WRITE_AUTO_REGRESSION_RESULT(1);
    WRITE_AUTO_REGRESSION_RESULT(2);
    WRITE_AUTO_REGRESSION_RESULT(3);
    if (chroma_width_remainder == 6) {
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
    }

    // One chroma row corresponds to two luma rows when subsampled vertically.
    luma_grain += kLumaWidth << subsampling_y;
    u_grain += chroma_width;
    v_grain += chroma_width;
  } while (++y < chroma_height);
#undef ACCUMULATE_WEIGHTED_GRAIN
#undef WRITE_AUTO_REGRESSION_RESULT
}
| |
// Applies an auto-regressive filter to the white noise in luma_grain.
//
// Template parameters:
//   bitdepth: selects the clamping range (GetGrainMin/GetGrainMax) applied to
//     every written sample.
//   GrainType: element type of |luma_grain_buffer| (int8_t or int16_t in the
//     instantiations in this file).
//   auto_regression_coeff_lag: filter lag; must be greater than 0.
//
// |params| supplies auto_regression_coeff_y and auto_regression_shift. The
// buffer is filtered in place, starting at row and column
// kAutoRegressionBorder.
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
                                               void* luma_grain_buffer) {
  static_assert(auto_regression_coeff_lag > 0, "");
  const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
  const uint8_t auto_regression_shift = params.auto_regression_shift;

  // Skip the top border rows; |luma_grain| always points at the start of the
  // row being written.
  int y = kAutoRegressionBorder;
  auto* luma_grain =
      static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
  do {
    // Each row is computed 8 values at a time in the following loop. At the
    // end of the loop, 4 values remain to write. They are given a special
    // reduced iteration at the end.
    int x = kAutoRegressionBorder;
    do {
      // |pos| indexes |auto_regression_coeff_y|; it advances once per tap.
      int pos = 0;
      int32x4x2_t sum;
      SetZero(&sum);
      for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
           ++delta_row) {
        // These loads may overflow to the next row, but they are never called
        // on the final row of a grain block. Therefore, they will never exceed
        // the block boundaries.
        const int16x8_t src_grain_lo =
            GetSource8(luma_grain + x + delta_row * kLumaWidth -
                       auto_regression_coeff_lag);
        const int16x8_t src_grain_hi =
            GetSource8(luma_grain + x + delta_row * kLumaWidth -
                       auto_regression_coeff_lag + 8);

        // A pictorial representation of the auto-regressive filter for
        // various values of params.auto_regression_coeff_lag. The letter 'O'
        // represents the current sample. (The filter always operates on the
        // current sample with filter coefficient 1.) The letters 'X'
        // represent the neighboring samples that the filter operates on, below
        // their corresponding "offset" number.
        //
        // params.auto_regression_coeff_lag == 3:
        //   0 1 2 3 4 5 6
        //   X X X X X X X
        //   X X X X X X X
        //   X X X X X X X
        //   X X X O
        // params.auto_regression_coeff_lag == 2:
        //     0 1 2 3 4
        //     X X X X X
        //     X X X X X
        //     X X O
        // params.auto_regression_coeff_lag == 1:
        //       0 1 2
        //       X X X
        //       X O
        // params.auto_regression_coeff_lag == 0:
        //         O
        // The function relies on the caller to skip the call in the 0 lag
        // case.

// Accumulates one tap and advances |pos|. The macro is reused by the reduced
// iteration after this loop and undefined at the end of the function.
#define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
  sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
                                        auto_regression_coeff_y[pos++], sum)
        ACCUMULATE_WEIGHTED_GRAIN(0);
        ACCUMULATE_WEIGHTED_GRAIN(1);
        ACCUMULATE_WEIGHTED_GRAIN(2);
        // The horizontal |auto_regression_coeff_lag| loop is replaced with
        // if-statements to give vextq_s16 an immediate param.
        if (auto_regression_coeff_lag > 1) {
          ACCUMULATE_WEIGHTED_GRAIN(3);
          ACCUMULATE_WEIGHTED_GRAIN(4);
        }
        if (auto_regression_coeff_lag > 2) {
          assert(auto_regression_coeff_lag == 3);
          ACCUMULATE_WEIGHTED_GRAIN(5);
          ACCUMULATE_WEIGHTED_GRAIN(6);
        }
      }
      // At this point in the filter, the source addresses and destination
      // addresses overlap. Because this is an auto-regressive filter, the
      // higher lanes cannot be computed without the results of the lower lanes.
      // Each call to WriteFinalAutoRegression incorporates preceding values
      // on the final row, and writes a single sample. This allows the next
      // pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
      luma_grain + x, sum, auto_regression_coeff_y, pos,               \
      auto_regression_shift)

      WRITE_AUTO_REGRESSION_RESULT(0);
      WRITE_AUTO_REGRESSION_RESULT(1);
      WRITE_AUTO_REGRESSION_RESULT(2);
      WRITE_AUTO_REGRESSION_RESULT(3);
      WRITE_AUTO_REGRESSION_RESULT(4);
      WRITE_AUTO_REGRESSION_RESULT(5);
      WRITE_AUTO_REGRESSION_RESULT(6);
      WRITE_AUTO_REGRESSION_RESULT(7);
      x += 8;
      // Leave the final four pixels for the special iteration below.
    } while (x < kLumaWidth - kAutoRegressionBorder - 4);

    // Final 4 pixels in the row.
    // The sums are still computed for 8 lanes; only lanes 0-3 are written.
    int pos = 0;
    int32x4x2_t sum;
    SetZero(&sum);
    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
         ++delta_row) {
      const int16x8_t src_grain_lo = GetSource8(
          luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
      const int16x8_t src_grain_hi =
          GetSource8(luma_grain + x + delta_row * kLumaWidth -
                     auto_regression_coeff_lag + 8);

      ACCUMULATE_WEIGHTED_GRAIN(0);
      ACCUMULATE_WEIGHTED_GRAIN(1);
      ACCUMULATE_WEIGHTED_GRAIN(2);
      // The horizontal |auto_regression_coeff_lag| loop is replaced with
      // if-statements to give vextq_s16 an immediate param.
      if (auto_regression_coeff_lag > 1) {
        ACCUMULATE_WEIGHTED_GRAIN(3);
        ACCUMULATE_WEIGHTED_GRAIN(4);
      }
      if (auto_regression_coeff_lag > 2) {
        assert(auto_regression_coeff_lag == 3);
        ACCUMULATE_WEIGHTED_GRAIN(5);
        ACCUMULATE_WEIGHTED_GRAIN(6);
      }
    }
    // delta_row == 0
    WRITE_AUTO_REGRESSION_RESULT(0);
    WRITE_AUTO_REGRESSION_RESULT(1);
    WRITE_AUTO_REGRESSION_RESULT(2);
    WRITE_AUTO_REGRESSION_RESULT(3);
    luma_grain += kLumaWidth;
  } while (++y < kLumaHeight);

#undef WRITE_AUTO_REGRESSION_RESULT
#undef ACCUMULATE_WEIGHTED_GRAIN
}
| |
// Registers the 8bpp NEON auto-regression functions (int8_t grain) in the
// writable Dsp table.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
  assert(dsp != nullptr);

  // Luma table, indexed by auto_regression_coeff_lag. There is no entry for a
  // lag of 0 because luma autoregression should never be called in that case.
  auto& luma_funcs = dsp->film_grain.luma_auto_regression;
  luma_funcs[0] = nullptr;
  luma_funcs[1] = ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 1>;
  luma_funcs[2] = ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 2>;
  luma_funcs[3] = ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 3>;

  // Chroma table, indexed by [use_luma][auto_regression_coeff_lag]. The only
  // missing entry is lag 0 without luma, which should never be called.
  auto& chroma_funcs = dsp->film_grain.chroma_auto_regression;
  auto& without_luma = chroma_funcs[0];
  without_luma[0] = nullptr;
  without_luma[1] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, false>;
  without_luma[2] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, false>;
  without_luma[3] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, false>;

  auto& with_luma = chroma_funcs[1];
  with_luma[0] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 0, true>;
  with_luma[1] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, true>;
  with_luma[2] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, true>;
  with_luma[3] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, true>;
}
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
// Registers the 10bpp NEON auto-regression functions (int16_t grain) in the
// writable Dsp table.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
  assert(dsp != nullptr);

  // Luma table, indexed by auto_regression_coeff_lag. There is no entry for a
  // lag of 0 because luma autoregression should never be called in that case.
  auto& luma_funcs = dsp->film_grain.luma_auto_regression;
  luma_funcs[0] = nullptr;
  luma_funcs[1] = ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 1>;
  luma_funcs[2] = ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 2>;
  luma_funcs[3] = ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 3>;

  // Chroma table, indexed by [use_luma][auto_regression_coeff_lag]. The only
  // missing entry is lag 0 without luma, which should never be called.
  auto& chroma_funcs = dsp->film_grain.chroma_auto_regression;
  auto& without_luma = chroma_funcs[0];
  without_luma[0] = nullptr;
  without_luma[1] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, false>;
  without_luma[2] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, false>;
  without_luma[3] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, false>;

  auto& with_luma = chroma_funcs[1];
  with_luma[0] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 0, true>;
  with_luma[1] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, true>;
  with_luma[2] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, true>;
  with_luma[3] =
      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, true>;
}
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| |
| } // namespace |
| } // namespace film_grain |
| |
| void FilmGrainInit_NEON() { |
| film_grain::Init8bpp(); |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| film_grain::Init10bpp(); |
| #endif // LIBGAV1_MAX_BITDEPTH >= 10 |
| } |
| |
| } // namespace dsp |
| } // namespace libgav1 |
| |
| #else // !LIBGAV1_ENABLE_NEON |
| |
| namespace libgav1 { |
| namespace dsp { |
| |
// Stand-in used when LIBGAV1_ENABLE_NEON is off: the function stays defined but
// installs nothing.
void FilmGrainInit_NEON() {
  // Intentionally empty.
}
| |
| } // namespace dsp |
| } // namespace libgav1 |
| #endif // LIBGAV1_ENABLE_NEON |