// blob: cad9f7bb873956245341f32c9130e37efb14e81e [file] [log] [blame]
#include "src/dsp/arm/intrapred_neon.h"
#include "src/dsp/dsp.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {
// Computes |dc + ((alpha * luma) >> 6)| per lane and saturates it to uint8_t.
//
// The product is formed in 16-bit lanes. The shift by 6 rounds to nearest with
// ties away from zero: negative products are first decremented by one so that
// the rounding shift (which adds 32 before shifting) is symmetric around zero.
// NOTE(review): presumably callers keep alpha * luma within int16_t; confirm
// against the caller.
inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
                          const int16x8_t dc) {
  const int16x8_t product = vmulq_n_s16(luma, alpha);
  // |product| >> 15 (arithmetic) is -1 for negative lanes and 0 otherwise, so
  // accumulating it subtracts one from every negative lane only.
  const int16x8_t sign_adjusted = vsraq_n_s16(product, product, 15);
  // Rounding shift right by 6, accumulated onto the DC value.
  const int16x8_t prediction = vrsraq_n_s16(dc, sign_adjusted, 6);
  // Narrow to 8 bits with unsigned saturation (clamps to [0, 255]).
  return vqmovun_s16(prediction);
}
// The exact range of luma/alpha is not critical for the predictors below
// because Combine8() saturates its result to uint8_t. Even a saturated int16_t
// shifted right by 6 exceeds the uint8_t range.

// CfL predictor for blocks that are 4 pixels wide and |block_height| rows
// tall. Two rows are processed per iteration: the first 4 values of two
// consecutive |luma| rows are packed into a single 8-lane vector.
//
// The DC value is read from the first byte of |dest| and broadcast to all
// lanes. NOTE(review): presumably the caller has already written the DC
// prediction into |dest|; confirm against the caller.
template <int block_height>
inline void CflIntraPredictor4xN_NEON(
    void* const dest, const ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  auto* dst = static_cast<uint8_t*>(dest);
  const int16x8_t dc = vdupq_n_s16(dst[0]);
  int y = 0;
  while (y < block_height) {
    // Low half: row |y|, high half: row |y + 1|.
    const int16x8_t luma_pair =
        vcombine_s16(vld1_s16(luma[y]), vld1_s16(luma[y + 1]));
    const uint8x8_t pixels = Combine8(luma_pair, alpha, dc);
    StoreLo4(dst, pixels);
    dst += stride;
    StoreHi4(dst, pixels);
    dst += stride;
    y += 2;
  }
}
// CfL predictor for blocks that are 8 pixels wide and |block_height| rows
// tall. Each row is exactly one 8-lane vector.
//
// The DC value is read from the first byte of |dest| and broadcast to all
// lanes. NOTE(review): presumably the caller has already written the DC
// prediction into |dest|; confirm against the caller.
template <int block_height>
inline void CflIntraPredictor8xN_NEON(
    void* const dest, const ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  auto* dst = static_cast<uint8_t*>(dest);
  const int16x8_t dc = vdupq_n_s16(dst[0]);
  for (int y = 0; y < block_height; ++y, dst += stride) {
    const uint8x8_t pixels = Combine8(vld1q_s16(luma[y]), alpha, dc);
    vst1_u8(dst, pixels);
  }
}
// CfL predictor for blocks that are 16 pixels wide and |block_height| rows
// tall. Each row is handled as two 8-lane halves.
//
// The DC value is read from the first byte of |dest| and broadcast to all
// lanes. NOTE(review): presumably the caller has already written the DC
// prediction into |dest|; confirm against the caller.
template <int block_height>
inline void CflIntraPredictor16xN_NEON(
    void* const dest, const ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  auto* dst = static_cast<uint8_t*>(dest);
  const int16x8_t dc = vdupq_n_s16(dst[0]);
  for (int y = 0; y < block_height; ++y, dst += stride) {
    const int16_t* const src = luma[y];
    // Both halves are computed before anything is written back.
    const uint8x8_t left = Combine8(vld1q_s16(src), alpha, dc);
    const uint8x8_t right = Combine8(vld1q_s16(src + 8), alpha, dc);
    vst1_u8(dst, left);
    vst1_u8(dst + 8, right);
  }
}
// CfL predictor for blocks that are 32 pixels wide and |block_height| rows
// tall. Each row is handled as four 8-lane vectors.
//
// The DC value is read from the first byte of |dest| and broadcast to all
// lanes. NOTE(review): presumably the caller has already written the DC
// prediction into |dest|; confirm against the caller.
template <int block_height>
inline void CflIntraPredictor32xN_NEON(
    void* const dest, const ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  constexpr int kLanesPerVector = 8;
  constexpr int kVectorsPerRow = 32 / kLanesPerVector;
  auto* dst = static_cast<uint8_t*>(dest);
  const int16x8_t dc = vdupq_n_s16(dst[0]);
  for (int y = 0; y < block_height; ++y, dst += stride) {
    const int16_t* const src = luma[y];
    // Compute the whole row first, then write it back.
    uint8x8_t pixels[kVectorsPerRow];
    for (int i = 0; i < kVectorsPerRow; ++i) {
      pixels[i] = Combine8(vld1q_s16(src + i * kLanesPerVector), alpha, dc);
    }
    for (int i = 0; i < kVectorsPerRow; ++i) {
      vst1_u8(dst + i * kLanesPerVector, pixels[i]);
    }
  }
}
// Registers the NEON CfL intra predictors in the writable Dsp table requested
// for bitdepth 8. One entry is installed per supported transform size; the
// template argument of each predictor is the block height.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
  assert(dsp != nullptr);
  auto& predictors = dsp->cfl_intra_predictors;

  // 4 pixels wide.
  predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
  predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
  predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;

  // 8 pixels wide.
  predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
  predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
  predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
  predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;

  // 16 pixels wide.
  predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
  predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
  predictors[kTransformSize16x16] = CflIntraPredictor16xN_NEON<16>;
  predictors[kTransformSize16x32] = CflIntraPredictor16xN_NEON<32>;

  // 32 pixels wide. Max Cfl predictor size is 32x32.
  predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
  predictors[kTransformSize32x16] = CflIntraPredictor32xN_NEON<16>;
  predictors[kTransformSize32x32] = CflIntraPredictor32xN_NEON<32>;
}
} // namespace
} // namespace low_bitdepth
// Entry point for the NEON CfL predictors: forwards to the low bitdepth
// (8bpp) initializer.
void IntraPredCflInit_NEON() {
  low_bitdepth::Init8bpp();
}
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
// Stub used when LIBGAV1_ENABLE_NEON is off: there is nothing to register.
void IntraPredCflInit_NEON() {
  // Intentionally empty.
}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON