// blob: 1dd904ff9beeba395f936da6d42558023bfa302a [file] [log] [blame]
#include "src/dsp/warp.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/memory.h"
namespace libgav1 {
namespace dsp {
namespace {
// Number of extra bits of precision in warped filtering. In Warp_C the running
// filter positions (sx and sy) are right-shifted with rounding by this amount
// before being used as the row index into kWarpedFilters.
constexpr int kWarpedDiffPrecisionBits = 10;
// Portable C++ implementation of the warp prediction function.
//
// The |block_width| x |block_height| block whose top-left corner is
// (|block_start_x|, |block_start_y|) is processed in 8x8 tiles. For each tile,
// the position (start + 4) is mapped through |warp_params| to a position in
// the source. A window of 15 rows around that position is filtered
// horizontally into |intermediate_result|, which is then filtered vertically
// into |dest|.
//
// |source| points to samples of type Pixel. |source_stride| is given in bytes
// (it is converted to Pixel units below). |dest| receives unclipped uint16_t
// predictor values and |dest_stride| is used in uint16_t units.
// |warp_params|[0..5] are the model coefficients read below. |alpha| and
// |beta| are the per-column and per-row increments of the horizontal filter
// position; |gamma| and |delta| are the per-column and per-row increments of
// the vertical filter position. |inter_round_bits_vertical| is the rounding
// shift applied to the output of the vertical filter.
//
// Preconditions: |block_width| and |block_height| are at least 8, and every
// source row may be read up to 13 pixels beyond either end (see the comment in
// the horizontal filter).
template <int bitdepth, typename Pixel>
void Warp_C(const void* const source, ptrdiff_t source_stride,
            const int source_width, const int source_height,
            const int* const warp_params, const int subsampling_x,
            const int subsampling_y, const int inter_round_bits_vertical,
            const int block_start_x, const int block_start_y,
            const int block_width, const int block_height, const int16_t alpha,
            const int16_t beta, const int16_t gamma, const int16_t delta,
            uint16_t* dest, const ptrdiff_t dest_stride) {
  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                           ? kInterRoundBitsHorizontal12bpp
                                           : kInterRoundBitsHorizontal;
  // Intermediate_result is the output of the horizontal filtering and rounding.
  // The range is within 16 bits (unsigned).
  uint16_t intermediate_result[15][8];  // 15 rows, 8 columns.
  const int horizontal_offset = 1 << (bitdepth + kFilterBits - 1);
  const int vertical_offset =
      1 << (bitdepth + 2 * kFilterBits - kRoundBitsHorizontal);
  const auto* const src = static_cast<const Pixel*>(source);
  // Convert the stride from bytes to Pixel units. Divide by a signed value so
  // that the stride is not implicitly converted to an unsigned type.
  source_stride /= static_cast<ptrdiff_t>(sizeof(Pixel));

  assert(block_width >= 8);
  assert(block_height >= 8);

  // Warp process applies for each 8x8 block (or smaller).
  for (int start_y = block_start_y; start_y < block_start_y + block_height;
       start_y += 8) {
    for (int start_x = block_start_x; start_x < block_start_x + block_width;
         start_x += 8) {
      const int src_x = (start_x + 4) << subsampling_x;
      const int src_y = (start_y + 4) << subsampling_y;
      const int dst_x =
          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
      const int dst_y =
          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
      const int x4 = dst_x >> subsampling_x;
      const int y4 = dst_y >> subsampling_y;
      // Integer parts of the projected position.
      const int ix4 = x4 >> kWarpedModelPrecisionBits;
      const int iy4 = y4 >> kWarpedModelPrecisionBits;

      // Horizontal filter.
      // sx4 starts from the fractional part of x4, moved back by 7 rows' worth
      // of beta because the row loop below starts at y = -7.
      int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
      for (int y = -7; y < 8; ++y) {
        // TODO(chengchen):
        // Because of warping, the index could be out of frame boundary. Thus
        // clip is needed. However, can we remove or reduce usage of clip?
        // Besides, special cases exist, for example,
        // if iy4 - 7 >= source_height or iy4 + 7 < 0, there's no need to do the
        // filtering.
        const int row = Clip3(iy4 + y, 0, source_height - 1);
        const Pixel* const src_row = src + row * source_stride;
        // Check for two simple special cases.
        //
        // Both shortcuts shift by kRoundBitsHorizontal, the same bitdepth
        // dependent amount used by the general path below, so that the values
        // stored in intermediate_result have the same scale on every path and
        // for every bitdepth.
        if (ix4 - 7 >= source_width - 1) {
          // Every sample is equal to src_row[source_width - 1]. Since the sum
          // of the warped filter coefficients is 128 (= 2^7), the filtering is
          // equivalent to multiplying src_row[source_width - 1] by 128.
          const int s =
              (horizontal_offset >> kRoundBitsHorizontal) +
              (src_row[source_width - 1] << (7 - kRoundBitsHorizontal));
          Memset(intermediate_result[y + 7], s, 8);
          sx4 += beta;
          continue;
        }
        if (ix4 + 7 <= 0) {
          // Every sample is equal to src_row[0]. Since the sum of the warped
          // filter coefficients is 128 (= 2^7), the filtering is equivalent to
          // multiplying src_row[0] by 128.
          const int s = (horizontal_offset >> kRoundBitsHorizontal) +
                        (src_row[0] << (7 - kRoundBitsHorizontal));
          Memset(intermediate_result[y + 7], s, 8);
          sx4 += beta;
          continue;
        }
        // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
        // It follows that -6 <= ix4 <= source_width + 5. This inequality is
        // used below.
        int sx = sx4 - MultiplyBy4(alpha);
        for (int x = -4; x < 4; ++x) {
          const int offset =
              RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
              kWarpedPixelPrecisionShifts;
          // Since alpha and beta have been validated by SetupShear(), one can
          // prove that 0 <= offset <= 3 * 2^6.
          assert(offset >= 0);
          assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
          // For SIMD optimization:
          // For 8 bit, the range of sum is within uint16_t, if we add an
          // horizontal offset:
          int sum = horizontal_offset;
          // Horizontal_offset guarantees sum is non negative.
          // If horizontal_offset is used, intermediate_result needs to be
          // uint16_t.
          // For 10/12 bit, the range of sum is within 32 bits.
          for (int k = 0; k < 8; ++k) {
            // We assume the source frame has left and right borders of at
            // least 13 pixels that extend the frame boundary pixels.
            //
            // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on ix4
            // above, we have -13 <= ix4 + x + k - 3 <= source_width + 12, or
            // -13 <= column <= (source_width - 1) + 13. Therefore we may
            // over-read up to 13 pixels before the source row, or up to 13
            // pixels after the source row.
            const int column = ix4 + x + k - 3;
            sum += kWarpedFilters[offset][k] * src_row[column];
          }
          assert(sum >= 0 && sum < (horizontal_offset << 2));
          intermediate_result[y + 7][x + 4] = static_cast<uint16_t>(
              RightShiftWithRounding(sum, kRoundBitsHorizontal));
          sx += alpha;
        }
        sx4 += beta;
      }

      // Vertical filter.
      uint16_t* dst_row = dest + start_x - block_start_x;
      int sy4 =
          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
      // The spec says we should use the following loop condition:
      //   y < std::min(4, block_start_y + block_height - start_y - 4);
      // We can prove that block_start_y + block_height - start_y >= 8, which
      // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
      // So the loop condition is simply y < 4.
      //
      //   Proof:
      //      start_y < block_start_y + block_height
      //   => block_start_y + block_height - start_y > 0
      //   => block_height - (start_y - block_start_y) > 0
      //
      //   Since block_height >= 8 and is a power of 2, it follows that
      //   block_height is a multiple of 8. start_y - block_start_y is also a
      //   multiple of 8. Therefore their difference is a multiple of 8. Since
      //   their difference is > 0, their difference must be >= 8.
      for (int y = -4; y < 4; ++y) {
        int sy = sy4 - MultiplyBy4(gamma);
        // The spec says we should use the following loop condition:
        //   x < std::min(4, block_start_x + block_width - start_x - 4);
        // Similar to the above, we can prove that the loop condition can be
        // simplified to x < 4.
        for (int x = -4; x < 4; ++x) {
          const int offset =
              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
              kWarpedPixelPrecisionShifts;
          // Since gamma and delta have been validated by SetupShear(), one can
          // prove that 0 <= offset <= 3 * 2^6.
          assert(offset >= 0);
          assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
          // Similar to horizontal_offset, vertical_offset guarantees sum
          // before shifting is non negative:
          int sum = vertical_offset;
          for (int k = 0; k < 8; ++k) {
            sum += kWarpedFilters[offset][k] *
                   intermediate_result[y + 4 + k][x + 4];
          }
          assert(sum >= 0 && sum < (vertical_offset << 2));
          sum = RightShiftWithRounding(sum, inter_round_bits_vertical);
          // Warp output is a predictor, whose type is uint16_t.
          // Do not clip it here. The clipping is applied at the stage of
          // final pixel value output.
          dst_row[x + 4] = static_cast<uint16_t>(sum);
          sy += gamma;
        }
        dst_row += dest_stride;
        sy4 += delta;
      }
    }
    // Advance to the next row of 8x8 tiles in the destination.
    dest += 8 * dest_stride;
  }
}
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
dsp->warp = Warp_C<8, uint8_t>;
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
static_cast<void>(dsp);
#ifndef LIBGAV1_Dsp8bpp_Warp
dsp->warp = Warp_C<8, uint8_t>;
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
#if LIBGAV1_MAX_BITDEPTH >= 10
// Installs Warp_C<10, uint16_t> as the warp function of the 10bpp Dsp table.
// The assignment is made when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is nonzero, or
// otherwise when LIBGAV1_Dsp10bpp_Warp is not defined.
void Init10bpp() {
  Dsp* const table = dsp_internal::GetWritableDspTable(10);
  assert(table != nullptr);
  // Keeps |table| referenced when asserts are compiled out and the assignment
  // below is preprocessed away.
  static_cast<void>(table);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp10bpp_Warp)
  table->warp = Warp_C<10, uint16_t>;
#endif
}
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
} // namespace
// Runs the C warp initializers: Init8bpp() unconditionally, and Init10bpp()
// only when the library is built with LIBGAV1_MAX_BITDEPTH >= 10.
void WarpInit_C() {
Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
// Init10bpp() is only declared under the same preprocessor condition.
Init10bpp();
#endif
}
} // namespace dsp
} // namespace libgav1