// blob: 8a403a6b31248864807e499173198f9a8a705565 [file] [log] [blame]
// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/motion_vector_search.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
namespace {
// Multiplies every 16-bit lane of |mv| by the matching lanes of |denominator|
// and |numerator|, then narrows the 32-bit products back to 16 bits with a
// rounding, saturating right shift by 14.
inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
                              const int32x4_t numerator) {
  // The widening multiply keeps the full 32-bit mv * denominator product.
  const int32x4_t scaled_mv = vmull_s16(mv, denominator);
  const int32x4_t product = vmulq_s32(scaled_mv, numerator);
  // (product >> 31) is the sign (0 or -1). Accumulating it into the product
  // makes the rounding shift below round towards zero.
  const int32x4_t sign_adjusted = vsraq_n_s32(product, product, 31);
  return vqrshrn_n_s32(sign_adjusted, 14);
}
// Projects |mv| with both entries of |reference_offsets| at once.
// The numerator lanes are laid out as {r0, r0, r1, r1}, so the two low result
// lanes use reference_offsets[0] and the two high lanes use
// reference_offsets[1]. The same division factor, looked up with
// |temporal_reference_offsets|, is applied to all four lanes.
inline int16x4_t MvProjectionCompound(const int16x4_t mv,
                                      const int temporal_reference_offsets,
                                      const int reference_offsets[2]) {
  // Load {r0, r1} and interleave it with itself to get {r0, r0} and {r1, r1}.
  const int32x2_t offset_pair = vld1_s32(reference_offsets);
  const int32x2x2_t duplicated = vzip_s32(offset_pair, offset_pair);
  const int32x4_t numerator =
      vcombine_s32(duplicated.val[0], duplicated.val[1]);
  // Broadcast the single division factor to every lane.
  const int16x4_t denominator =
      vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
  return MvProjection(mv, denominator, numerator);
}
// Concatenates |mv0| (low half) and |mv1| (high half) and clamps every lane to
// the range [-kProjectionMvClamp, kProjectionMvClamp].
inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
  const int16x8_t upper_bound = vdupq_n_s16(kProjectionMvClamp);
  const int16x8_t lower_bound = vnegq_s16(upper_bound);
  const int16x8_t combined = vcombine_s16(mv0, mv1);
  // Apply the upper limit first, then the lower one.
  const int16x8_t capped = vminq_s16(combined, upper_bound);
  return vmaxq_s16(capped, lower_bound);
}
// Projects the two entries starting at |temporal_mvs| with both values of
// |reference_offsets| and returns the clamped results: the low four lanes
// come from the first entry, the high four lanes from the second.
inline int16x8_t MvProjectionCompoundClip(
    const MotionVector* const temporal_mvs,
    const int8_t* const temporal_reference_offsets,
    const int reference_offsets[2]) {
  // Read 64 bits as two 32-bit lanes.
  // NOTE(review): this assumes each MotionVector occupies exactly 32 bits
  // (two 16-bit components) - confirm against the type definition.
  const int32x2_t mv_pair =
      vld1_s32(reinterpret_cast<const int32_t*>(temporal_mvs));
  // Broadcast each 32-bit entry so it fills a whole 64-bit vector; this lets
  // one entry be projected with both reference offsets in a single call.
  const int16x4_t first_mv = vreinterpret_s16_s32(vdup_lane_s32(mv_pair, 0));
  const int16x4_t second_mv = vreinterpret_s16_s32(vdup_lane_s32(mv_pair, 1));
  const int16x4_t first_projection = MvProjectionCompound(
      first_mv, temporal_reference_offsets[0], reference_offsets);
  const int16x4_t second_projection = MvProjectionCompound(
      second_mv, temporal_reference_offsets[1], reference_offsets);
  return ProjectionClip(first_projection, second_projection);
}
// Projects eight 16-bit values read from |temporal_mvs| with a single
// |reference_offset| and returns the clamped results.
// |lookup| is an in/out scratch vector: its four lanes are overwritten with
// the division factors selected by temporal_reference_offsets[0..3] and the
// updated vector is stored back through the pointer.
inline int16x8_t MvProjectionSingleClip(
    const MotionVector* const temporal_mvs,
    const int8_t* const temporal_reference_offsets, const int reference_offset,
    int16x4_t* const lookup) {
  // NOTE(review): eight int16 values are presumably four MotionVectors of two
  // components each - confirm against the type definition.
  const int16x8_t mvs =
      vld1q_s16(reinterpret_cast<const int16_t*>(temporal_mvs));
  // Gather the four division factors one lane at a time.
  int16x4_t factors = *lookup;
  factors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], factors, 0);
  factors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], factors, 1);
  factors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], factors, 2);
  factors = vld1_lane_s16(
      &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], factors, 3);
  *lookup = factors;
  // Interleaving the vector with itself yields {f0, f0, f1, f1} and
  // {f2, f2, f3, f3}, i.e. each factor is repeated for two adjacent lanes.
  const int16x4x2_t denominators = vzip_s16(factors, factors);
  const int32x4_t numerator = vdupq_n_s32(reference_offset);
  const int16x4_t low_projection =
      MvProjection(vget_low_s16(mvs), denominators.val[0], numerator);
  const int16x4_t high_projection =
      MvProjection(vget_high_s16(mvs), denominators.val[1], numerator);
  return ProjectionClip(low_projection, high_projection);
}
// Stores the eight lanes of |mv| to |candidate_mvs| after adding 1 to every
// negative lane and clearing bit 0 of each lane.
inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
  const uint16x8_t raw = vreinterpretq_u16_s16(mv);
  // On unsigned lanes the shift by 15 is logical, so (raw >> 15) is 1 for
  // negative inputs and 0 otherwise.
  const uint16x8_t sign_biased = vsraq_n_u16(raw, raw, 15);
  // Drop the least significant bit.
  const int16x8_t result =
      vbicq_s16(vreinterpretq_s16_u16(sign_biased), vdupq_n_s16(1));
  vst1q_s16(static_cast<int16_t*>(candidate_mvs), result);
}
// Stores the eight lanes of |mv| to |candidate_mvs| after adding 1 to every
// negative lane, adding 3 to every lane and clearing the three low bits.
inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
  const uint16x8_t raw = vreinterpretq_u16_s16(mv);
  // On unsigned lanes the shift by 15 is logical, so (raw >> 15) is 1 for
  // negative inputs and 0 otherwise.
  const int16x8_t sign_biased =
      vreinterpretq_s16_u16(vsraq_n_u16(raw, raw, 15));
  const int16x8_t rounded = vaddq_s16(sign_biased, vdupq_n_s16(3));
  // Clear bits 0..2 so the stored value is a multiple of 8.
  const int16x8_t result = vbicq_s16(rounded, vdupq_n_s16(7));
  vst1q_s16(static_cast<int16_t*>(candidate_mvs), result);
}
// Projects |count| temporal motion vectors with both |reference_offsets| and
// stores the clamped, low-precision results to |candidate_mvs|.
// Two entries are processed per iteration, so when |count| is odd one extra
// element of |temporal_mvs| and |temporal_reference_offsets| is read and one
// extra element of |candidate_mvs| is written.
// NOTE(review): callers presumably pad those buffers accordingly - confirm.
// |count| must be greater than zero.
void MvProjectionCompoundLowPrecision_NEON(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offsets[2], const int count,
    CompoundMotionVector* candidate_mvs) {
  // The do-while loop below terminates only when its counter reaches zero
  // after a decrement. A non-positive |count| would start the counter at zero
  // and run past the end of every buffer.
  assert(count > 0);
  // |reference_offsets| non-zero check usually equals true and is ignored.
  // To facilitate the compilers, make a local copy of |reference_offsets|.
  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
  // One more element could be calculated.
  int loop_count = (count + 1) >> 1;
  do {
    const int16x8_t mv = MvProjectionCompoundClip(
        temporal_mvs, temporal_reference_offsets, offsets);
    LowPrecision(mv, candidate_mvs);
    temporal_mvs += 2;
    temporal_reference_offsets += 2;
    candidate_mvs += 2;
  } while (--loop_count != 0);
}
// Projects |count| temporal motion vectors with both |reference_offsets| and
// stores the clamped results, rounded by ForceInteger(), to |candidate_mvs|.
// Two entries are processed per iteration, so when |count| is odd one extra
// element of |temporal_mvs| and |temporal_reference_offsets| is read and one
// extra element of |candidate_mvs| is written.
// NOTE(review): callers presumably pad those buffers accordingly - confirm.
// |count| must be greater than zero.
void MvProjectionCompoundForceInteger_NEON(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offsets[2], const int count,
    CompoundMotionVector* candidate_mvs) {
  // The do-while loop below terminates only when its counter reaches zero
  // after a decrement. A non-positive |count| would start the counter at zero
  // and run past the end of every buffer.
  assert(count > 0);
  // |reference_offsets| non-zero check usually equals true and is ignored.
  // To facilitate the compilers, make a local copy of |reference_offsets|.
  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
  // One more element could be calculated.
  int loop_count = (count + 1) >> 1;
  do {
    const int16x8_t mv = MvProjectionCompoundClip(
        temporal_mvs, temporal_reference_offsets, offsets);
    ForceInteger(mv, candidate_mvs);
    temporal_mvs += 2;
    temporal_reference_offsets += 2;
    candidate_mvs += 2;
  } while (--loop_count != 0);
}
// Projects |count| temporal motion vectors with both |reference_offsets| and
// stores the clamped results to |candidate_mvs| without further rounding.
// Two entries are processed per iteration, so when |count| is odd one extra
// element of |temporal_mvs| and |temporal_reference_offsets| is read and one
// extra element of |candidate_mvs| is written.
// NOTE(review): callers presumably pad those buffers accordingly - confirm.
// |count| must be greater than zero.
void MvProjectionCompoundHighPrecision_NEON(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offsets[2], const int count,
    CompoundMotionVector* candidate_mvs) {
  // The do-while loop below terminates only when its counter reaches zero
  // after a decrement. A non-positive |count| would start the counter at zero
  // and run past the end of every buffer.
  assert(count > 0);
  // |reference_offsets| non-zero check usually equals true and is ignored.
  // To facilitate the compilers, make a local copy of |reference_offsets|.
  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
  // One more element could be calculated.
  int loop_count = (count + 1) >> 1;
  do {
    const int16x8_t mv = MvProjectionCompoundClip(
        temporal_mvs, temporal_reference_offsets, offsets);
    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
    temporal_mvs += 2;
    temporal_reference_offsets += 2;
    candidate_mvs += 2;
  } while (--loop_count != 0);
}
// Projects |count| temporal motion vectors with |reference_offset| and stores
// the clamped, low-precision results to |candidate_mvs|.
// Four entries are processed per iteration, so up to three extra elements of
// |temporal_mvs| and |temporal_reference_offsets| are read and up to three
// extra elements of |candidate_mvs| are written.
// NOTE(review): callers presumably pad those buffers accordingly - confirm.
// |count| must be greater than zero.
void MvProjectionSingleLowPrecision_NEON(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offset, const int count, MotionVector* candidate_mvs) {
  // The do-while loop below terminates only when its counter reaches zero
  // after a decrement. A non-positive |count| would start the counter at zero
  // and run past the end of every buffer.
  assert(count > 0);
  // Up to three more elements could be calculated.
  int loop_count = (count + 3) >> 2;
  // Scratch vector for the division factors; every lane is overwritten on
  // each iteration, the zero fill only gives it a defined starting value.
  int16x4_t lookup = vdup_n_s16(0);
  do {
    const int16x8_t mv = MvProjectionSingleClip(
        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
    LowPrecision(mv, candidate_mvs);
    temporal_mvs += 4;
    temporal_reference_offsets += 4;
    candidate_mvs += 4;
  } while (--loop_count != 0);
}
// Projects |count| temporal motion vectors with |reference_offset| and stores
// the clamped results, rounded by ForceInteger(), to |candidate_mvs|.
// Four entries are processed per iteration, so up to three extra elements of
// |temporal_mvs| and |temporal_reference_offsets| are read and up to three
// extra elements of |candidate_mvs| are written.
// NOTE(review): callers presumably pad those buffers accordingly - confirm.
// |count| must be greater than zero.
void MvProjectionSingleForceInteger_NEON(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offset, const int count, MotionVector* candidate_mvs) {
  // The do-while loop below terminates only when its counter reaches zero
  // after a decrement. A non-positive |count| would start the counter at zero
  // and run past the end of every buffer.
  assert(count > 0);
  // Up to three more elements could be calculated.
  int loop_count = (count + 3) >> 2;
  // Scratch vector for the division factors; every lane is overwritten on
  // each iteration, the zero fill only gives it a defined starting value.
  int16x4_t lookup = vdup_n_s16(0);
  do {
    const int16x8_t mv = MvProjectionSingleClip(
        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
    ForceInteger(mv, candidate_mvs);
    temporal_mvs += 4;
    temporal_reference_offsets += 4;
    candidate_mvs += 4;
  } while (--loop_count != 0);
}
// Projects |count| temporal motion vectors with |reference_offset| and stores
// the clamped results to |candidate_mvs| without further rounding.
// Four entries are processed per iteration, so up to three extra elements of
// |temporal_mvs| and |temporal_reference_offsets| are read and up to three
// extra elements of |candidate_mvs| are written.
// NOTE(review): callers presumably pad those buffers accordingly - confirm.
// |count| must be greater than zero.
void MvProjectionSingleHighPrecision_NEON(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offset, const int count, MotionVector* candidate_mvs) {
  // The do-while loop below terminates only when its counter reaches zero
  // after a decrement. A non-positive |count| would start the counter at zero
  // and run past the end of every buffer.
  assert(count > 0);
  // Up to three more elements could be calculated.
  int loop_count = (count + 3) >> 2;
  // Scratch vector for the division factors; every lane is overwritten on
  // each iteration, the zero fill only gives it a defined starting value.
  int16x4_t lookup = vdup_n_s16(0);
  do {
    const int16x8_t mv = MvProjectionSingleClip(
        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
    temporal_mvs += 4;
    temporal_reference_offsets += 4;
    candidate_mvs += 4;
  } while (--loop_count != 0);
}
// Installs the NEON motion vector projection functions into the writable DSP
// table for kBitdepth8.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  // Slot 0: low precision, slot 1: force integer, slot 2: high precision.
  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;

  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;

  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
}
#if LIBGAV1_MAX_BITDEPTH >= 10
// Installs the NEON motion vector projection functions into the writable DSP
// table for kBitdepth10. The same functions as for kBitdepth8 are registered.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  // Slot 0: low precision, slot 1: force integer, slot 2: high precision.
  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;

  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;

  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
}
#endif
} // namespace
// Registers the NEON motion vector projection functions: always for the 8-bit
// DSP table and, when LIBGAV1_MAX_BITDEPTH is at least 10, for the 10-bit
// table as well.
void MotionVectorSearchInit_NEON() {
Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
}
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
// NEON is disabled in this build (LIBGAV1_ENABLE_NEON is false), so there is
// nothing to register; the empty definition keeps the symbol available.
void MotionVectorSearchInit_NEON() {}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_NEON