blob: 77121e0316f3dd5193db9ea0bf7215977381c500 [file] [log] [blame]
// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/film_grain.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <new>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/arm/film_grain_neon.h"
#include "src/dsp/common.h"
#include "src/utils/common.h"
#include "src/utils/logging.h"
namespace libgav1 {
namespace dsp {
namespace {
// Section 7.18.3.1.
template <int bitdepth>
bool FilmGrainSynthesis_NEON(
const void* source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_u, ptrdiff_t source_stride_u,
const void* source_plane_v, ptrdiff_t source_stride_v,
const FilmGrainParams& film_grain_params, const bool is_monochrome,
const bool color_matrix_is_identity, const int width, const int height,
const int subsampling_x, const int subsampling_y, void* dest_plane_y,
ptrdiff_t dest_stride_y, void* dest_plane_u, ptrdiff_t dest_stride_u,
void* dest_plane_v, ptrdiff_t dest_stride_v) {
FilmGrain<bitdepth> film_grain(film_grain_params, is_monochrome,
color_matrix_is_identity, subsampling_x,
subsampling_y, width, height);
return film_grain.AddNoise_NEON(
source_plane_y, source_stride_y, source_plane_u, source_stride_u,
source_plane_v, source_stride_v, dest_plane_y, dest_stride_y,
dest_plane_u, dest_stride_u, dest_plane_v, dest_stride_v);
}
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
dsp->film_grain_synthesis = FilmGrainSynthesis_NEON<8>;
}
#if LIBGAV1_MAX_BITDEPTH >= 10
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
assert(dsp != nullptr);
dsp->film_grain_synthesis = FilmGrainSynthesis_NEON<10>;
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
inline int16x8_t GetSource8(const int8_t* src) {
return vmovl_s8(vld1_s8(src));
}
#if LIBGAV1_MAX_BITDEPTH >= 10
inline int16x8_t GetSource8(const int16_t* src) { return vld1q_s16(src); }
#endif // LIBGAV1_MAX_BITDEPTH >= 10
template <int position_offset>
inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
const int16x8_t grain_hi,
int16_t coeff, int32x4x2_t sum) {
const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
return sum;
}
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int8_t* luma_grain_cursor, int32x4x2_t sum,
const int8_t* coeffs, int pos, int shift) {
int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
result += luma_grain_cursor[lane + delta_col] * coeffs[pos];
++pos;
}
luma_grain_cursor[lane] =
Clip3(luma_grain_cursor[lane] + RightShiftWithRounding(result, shift),
FilmGrain<bitdepth>::kGrainMin, FilmGrain<bitdepth>::kGrainMax);
}
#if LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, int auto_regression_coeff_lag, int lane>
inline void WriteFinalAutoRegression(int16_t* luma_grain_cursor,
int32x4x2_t sum, const int8_t* coeffs,
int pos, int shift) {
int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
result += luma_grain_cursor[lane + delta_col] * coeffs[pos];
++pos;
}
luma_grain_cursor[lane] =
Clip3(luma_grain_cursor[lane] + RightShiftWithRounding(result, shift),
FilmGrain<bitdepth>::kGrainMin, FilmGrain<bitdepth>::kGrainMax);
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
inline void SetZero(int32x4x2_t* v) {
v->val[0] = vdupq_n_s32(0);
v->val[1] = vdupq_n_s32(0);
}
constexpr int kAutoRegressionBorder = 3;
} // namespace
void FilmGrainInit_NEON() {
Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif // LIBGAV1_MAX_BITDEPTH >= 10
}
// Applies an auto-regressive filter to the white noise in luma_grain.
template <int bitdepth>
template <int auto_regression_coeff_lag>
void FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON(
const FilmGrainParams& params, int /*grain_min*/, int /*grain_max*/,
GrainType* luma_grain) {
static_assert(auto_regression_coeff_lag > 0, "");
const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
const uint8_t auto_regression_shift = params.auto_regression_shift;
int y = 3;
luma_grain += kLumaWidth * y;
do {
// Each row is computed 8 values at a time in the following loop. At the
// end of the loop, 4 values remain to write. They are given a special
// reduced iteration at the end.
int x = 3;
do {
int pos = 0;
int32x4x2_t sum;
SetZero(&sum);
for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
++delta_row) {
const int16x8_t src_grain_lo =
GetSource8(luma_grain + x + delta_row * kLumaWidth -
auto_regression_coeff_lag);
const int16x8_t src_grain_hi =
GetSource8(luma_grain + x + delta_row * kLumaWidth -
auto_regression_coeff_lag + 8);
#define ACCUMULATE_WEIGHTED_GRAIN(offset) \
sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
auto_regression_coeff_y[pos++], sum);
// A pictorial representation of the auto-regressive filter for
// various values of params.auto_regression_coeff_lag. The letter 'O'
// represents the current sample. (The filter always operates on the
// current sample with filter coefficient 1.) The letters 'X'
// represent the neighboring samples that the filter operates on, below
// their corresponding "offset" number.
//
// params.auto_regression_coeff_lag == 3:
// 0 1 2 3 4 5 6
// X X X X X X X
// X X X X X X X
// X X X X X X X
// X X X O
// params.auto_regression_coeff_lag == 2:
// 0 1 2 3 4
// X X X X X
// X X X X X
// X X O
// params.auto_regression_coeff_lag == 1:
// 0 1 2
// X X X
// X O
// params.auto_regression_coeff_lag == 0:
// O
// The function relies on the caller to skip the call in the 0 lag
// case.
ACCUMULATE_WEIGHTED_GRAIN(0);
ACCUMULATE_WEIGHTED_GRAIN(1);
ACCUMULATE_WEIGHTED_GRAIN(2);
// The horizontal |auto_regression_coeff_lag| loop is replaced with
// if-statements to give vextq_s16 an immediate param.
if (auto_regression_coeff_lag > 1) {
ACCUMULATE_WEIGHTED_GRAIN(3);
ACCUMULATE_WEIGHTED_GRAIN(4);
}
if (auto_regression_coeff_lag > 2) {
assert(auto_regression_coeff_lag == 3);
ACCUMULATE_WEIGHTED_GRAIN(5);
ACCUMULATE_WEIGHTED_GRAIN(6);
}
}
// delta_row == 0
// At this point in the filter, the source addresses and destination
// addresses overlap. Because this is a causal filter, the higher lanes
// cannot be computed without the results of the lower lanes.
// Each call to WriteFinalAutoRegression incorporates preceding values
// on the final row, and writes a single sample. This allows the next
// pixel's value to be computed in the next call.
#define WRITE_AUTO_REGRESSION_RESULT(lane) \
WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
luma_grain + x, sum, auto_regression_coeff_y, pos, \
auto_regression_shift);
WRITE_AUTO_REGRESSION_RESULT(0);
WRITE_AUTO_REGRESSION_RESULT(1);
WRITE_AUTO_REGRESSION_RESULT(2);
WRITE_AUTO_REGRESSION_RESULT(3);
WRITE_AUTO_REGRESSION_RESULT(4);
WRITE_AUTO_REGRESSION_RESULT(5);
WRITE_AUTO_REGRESSION_RESULT(6);
WRITE_AUTO_REGRESSION_RESULT(7);
x += 8;
// Leave the final four pixels for the special iteration below.
} while (x < kLumaWidth - kAutoRegressionBorder - 4);
// Final 4 pixels in the row.
int pos = 0;
int32x4x2_t sum;
SetZero(&sum);
for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
++delta_row) {
const int16x8_t src_grain_lo = GetSource8(
luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
const int16x8_t src_grain_hi =
GetSource8(luma_grain + x + delta_row * kLumaWidth -
auto_regression_coeff_lag + 8);
ACCUMULATE_WEIGHTED_GRAIN(0);
ACCUMULATE_WEIGHTED_GRAIN(1);
ACCUMULATE_WEIGHTED_GRAIN(2);
// The horizontal |auto_regression_coeff_lag| loop is replaced with
// if-statements to give vextq_s16 an immediate param.
if (auto_regression_coeff_lag > 1) {
ACCUMULATE_WEIGHTED_GRAIN(3);
ACCUMULATE_WEIGHTED_GRAIN(4);
}
if (auto_regression_coeff_lag > 2) {
assert(auto_regression_coeff_lag == 3);
ACCUMULATE_WEIGHTED_GRAIN(5);
ACCUMULATE_WEIGHTED_GRAIN(6);
}
}
// delta_row == 0
WRITE_AUTO_REGRESSION_RESULT(0);
WRITE_AUTO_REGRESSION_RESULT(1);
WRITE_AUTO_REGRESSION_RESULT(2);
WRITE_AUTO_REGRESSION_RESULT(3);
luma_grain += kLumaWidth;
} while (++y < kLumaHeight);
#undef WRITE_AUTO_REGRESSION_RESULT
#undef ACCUMULATE_WEIGHTED_GRAIN
}
template <int bitdepth>
bool FilmGrain<bitdepth>::Init_NEON() {
// Section 7.18.3.3. Generate grain process.
GenerateLumaGrain(params_, luma_grain_);
using AutoRegressionFn =
void (*)(const FilmGrainParams& params, int grain_min, int grain_max,
GrainType* luma_grain);
static const AutoRegressionFn kAutoRegressionFnTableLuma[3] = {
FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<1>,
FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<2>,
FilmGrain<bitdepth>::ApplyAutoRegressiveFilterToLumaGrain_NEON<3>};
if (params_.auto_regression_coeff_lag > 0 && params_.num_y_points > 0) {
kAutoRegressionFnTableLuma[params_.auto_regression_coeff_lag - 1](
params_, kGrainMin, kGrainMax, luma_grain_);
}
if (!is_monochrome_) {
GenerateChromaGrains(params_, chroma_width_, chroma_height_, u_grain_,
v_grain_);
ApplyAutoRegressiveFilterToChromaGrains(
params_, kGrainMin, kGrainMax, luma_grain_, subsampling_x_,
subsampling_y_, chroma_width_, chroma_height_, u_grain_, v_grain_);
}
// Section 7.18.3.4. Scaling lookup initialization process.
InitializeScalingLookupTable(params_.num_y_points, params_.point_y_value,
params_.point_y_scaling, scaling_lut_y_);
if (!is_monochrome_) {
if (params_.chroma_scaling_from_luma) {
scaling_lut_u_ = scaling_lut_y_;
scaling_lut_v_ = scaling_lut_y_;
} else {
scaling_lut_chroma_buffer_.reset(new (std::nothrow) uint8_t[256 * 2]);
if (scaling_lut_chroma_buffer_ == nullptr) return false;
scaling_lut_u_ = &scaling_lut_chroma_buffer_[0];
scaling_lut_v_ = &scaling_lut_chroma_buffer_[256];
InitializeScalingLookupTable(params_.num_u_points, params_.point_u_value,
params_.point_u_scaling, scaling_lut_u_);
InitializeScalingLookupTable(params_.num_v_points, params_.point_v_value,
params_.point_v_scaling, scaling_lut_v_);
}
}
return true;
}
template <int bitdepth>
bool FilmGrain<bitdepth>::AddNoise_NEON(
const void* source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_u, ptrdiff_t source_stride_u,
const void* source_plane_v, ptrdiff_t source_stride_v, void* dest_plane_y,
ptrdiff_t dest_stride_y, void* dest_plane_u, ptrdiff_t dest_stride_u,
void* dest_plane_v, ptrdiff_t dest_stride_v) {
if (!Init_NEON()) {
LIBGAV1_DLOG(ERROR, "Init() failed.");
return false;
}
if (!AllocateNoiseStripes()) {
LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
return false;
}
ConstructNoiseStripes();
if (!AllocateNoiseImage()) {
LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
return false;
}
ConstructNoiseImage();
BlendNoiseWithImage(source_plane_y, source_stride_y, source_plane_u,
source_stride_u, source_plane_v, source_stride_v,
dest_plane_y, dest_stride_y, dest_plane_u, dest_stride_u,
dest_plane_v, dest_stride_v);
return true;
}
// Explicit instantiations.
template class FilmGrain<8>;
#if LIBGAV1_MAX_BITDEPTH >= 10
template class FilmGrain<10>;
#endif
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
void FilmGrainInit_NEON() {}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON