// blob: d77b9c76ced0cda86aff328d64639b13bfe4a2c0 [file] [log] [blame]
// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {
void ComputeSuperRes_NEON(const void* source, const int upscaled_width,
const int initial_subpixel_x, const int step,
void* const dest) {
const auto* src = static_cast<const uint8_t*>(source);
auto* dst = static_cast<uint8_t*>(dest);
src -= kSuperResFilterTaps >> 1;
int p = initial_subpixel_x;
uint16x8_t weighted_src[8];
for (int x = 0; x < upscaled_width; x += 8) {
for (int i = 0; i < kSuperResFilterTaps; ++i, p += step) {
const uint8x8_t src_x = vld1_u8(&src[p >> kSuperResScaleBits]);
const int remainder = p & kSuperResScaleMask;
const uint8x8_t filter =
vld1_u8(kUpscaleFilterUnsigned[remainder >> kSuperResExtraBits]);
weighted_src[i] = vmull_u8(src_x, filter);
}
Transpose8x8(weighted_src);
// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
// Maximum sum: 255*171 == 0xAA55
// The sum is clipped to [0, 255], so adding all positive and then
// subtracting all negative with saturation is sufficient.
// 0 1 2 3 4 5 6 7
// tap sign: - + - + + - + -
uint16x8_t res = weighted_src[1];
res = vaddq_u16(res, weighted_src[3]);
res = vaddq_u16(res, weighted_src[4]);
res = vaddq_u16(res, weighted_src[6]);
res = vqsubq_u16(res, weighted_src[0]);
res = vqsubq_u16(res, weighted_src[2]);
res = vqsubq_u16(res, weighted_src[5]);
res = vqsubq_u16(res, weighted_src[7]);
vst1_u8(&dst[x], vqrshrn_n_u16(res, kFilterBits));
}
}
void Init8bpp() {
Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
dsp->super_res_row = ComputeSuperRes_NEON;
}
} // namespace
} // namespace low_bitdepth
// Registers the NEON super resolution code by running the low bitdepth
// (8bpp) initializer, which is the only initializer invoked here.
void SuperResInit_NEON() {
  low_bitdepth::Init8bpp();
}
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
// NEON is disabled in this build, so there is nothing to register; the
// function is kept as a no-op so callers can invoke it unconditionally.
void SuperResInit_NEON() {
}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON