blob: fbde65addb010f9b48d56ca78aed192fc0ef052f [file] [log] [blame]
// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/dsp/warp.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <type_traits>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/memory.h"
namespace libgav1 {
namespace dsp {
namespace {
// Number of extra bits of precision in warped filtering.
constexpr int kWarpedDiffPrecisionBits = 10;
// Warp prediction output ranges from WarpTest.ShowRange.
// Bitdepth: 8 Input range: [ 0, 255]
// 8bpp intermediate offset: 16384.
// intermediate range: [ 4399, 61009]
// first pass output range: [ 550, 7626]
// 8bpp intermediate offset removal: 262144.
// intermediate range: [ -620566, 1072406]
// second pass output range: [ 0, 255]
// compound second pass output range: [ -4848, 8378]
//
// Bitdepth: 10 Input range: [ 0, 1023]
// intermediate range: [ -48081, 179025]
// first pass output range: [ -6010, 22378]
// intermediate range: [-2103516, 4198620]
// second pass output range: [ 0, 1023]
// compound second pass output range: [ 8142, 57378]
//
// Bitdepth: 12 Input range: [ 0, 4095]
// intermediate range: [ -192465, 716625]
// first pass output range: [ -6015, 22395]
// intermediate range: [-2105190, 4201830]
// second pass output range: [ 0, 4095]
// compound second pass output range: [ 8129, 57403]
template <bool is_compound, int bitdepth, typename Pixel>
void Warp_C(const void* const source, ptrdiff_t source_stride,
const int source_width, const int source_height,
const int* const warp_params, const int subsampling_x,
const int subsampling_y, const int block_start_x,
const int block_start_y, const int block_width,
const int block_height, const int16_t alpha, const int16_t beta,
const int16_t gamma, const int16_t delta, void* dest,
ptrdiff_t dest_stride) {
assert(block_width >= 8 && block_height >= 8);
if (is_compound) {
assert(dest_stride == block_width);
}
constexpr int kRoundBitsHorizontal = (bitdepth == 12)
? kInterRoundBitsHorizontal12bpp
: kInterRoundBitsHorizontal;
constexpr int kRoundBitsVertical =
is_compound ? kInterRoundBitsCompoundVertical
: (bitdepth == 12) ? kInterRoundBitsVertical12bpp
: kInterRoundBitsVertical;
// Only used for 8bpp. Allows for keeping the first pass intermediates within
// uint16_t. With 10/12bpp the intermediate value will always require int32_t.
constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
constexpr int offset_removal =
(first_pass_offset >> kRoundBitsHorizontal) * 128;
constexpr int kMaxPixel = (1 << bitdepth) - 1;
union {
// |intermediate_result| is the output of the horizontal filtering and
// rounding. The range is within int16_t.
int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
// In the simple special cases where the samples in each row are all the
// same, store one sample per row in a column vector.
int16_t intermediate_result_column[15];
};
const auto* const src = static_cast<const Pixel*>(source);
source_stride /= sizeof(Pixel);
using DestType =
typename std::conditional<is_compound, uint16_t, Pixel>::type;
auto* dst = static_cast<DestType*>(dest);
if (!is_compound) dest_stride /= sizeof(dst[0]);
assert(block_width >= 8);
assert(block_height >= 8);
// Warp process applies for each 8x8 block (or smaller).
for (int start_y = block_start_y; start_y < block_start_y + block_height;
start_y += 8) {
for (int start_x = block_start_x; start_x < block_start_x + block_width;
start_x += 8) {
const int src_x = (start_x + 4) << subsampling_x;
const int src_y = (start_y + 4) << subsampling_y;
const int dst_x =
src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
const int dst_y =
src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
const int x4 = dst_x >> subsampling_x;
const int y4 = dst_y >> subsampling_y;
const int ix4 = x4 >> kWarpedModelPrecisionBits;
const int iy4 = y4 >> kWarpedModelPrecisionBits;
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
// into several regions and handle them differently.
//
// | |
// 1 | 3 | 1
// | |
// -------+-----------+-------
// |***********|
// 2 |*****4*****| 2
// |***********|
// -------+-----------+-------
// | |
// 1 | 3 | 1
// | |
//
// At the center, region 4 represents the frame and is the general case.
//
// In regions 1 and 2, the prediction block is outside the frame's
// boundary horizontally. Therefore the horizontal filtering can be
// simplified. Furthermore, in the region 1 (at the four corners), the
// prediction is outside the frame's boundary both horizontally and
// vertically, so we get a constant prediction block.
//
// In region 3, the prediction block is outside the frame's boundary
// vertically. Unfortunately because we apply the horizontal filters
// first, by the time we apply the vertical filters, they no longer see
// simple inputs. So the only simplification is that all the rows are
// the same, but we still need to apply all the horizontal and vertical
// filters.
// Check for two simple special cases, where the horizontal filter can
// be significantly simplified.
//
// In general, for each row, the horizontal filter is calculated as
// follows:
// for (int x = -4; x < 4; ++x) {
// const int offset = ...;
// int sum = first_pass_offset;
// for (int k = 0; k < 8; ++k) {
// const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
// sum += kWarpedFilters[offset][k] * src_row[column];
// }
// ...
// }
// The column index before clipping, ix4 + x + k - 3, varies in the range
// ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
// or ix4 + 7 <= 0, then all the column indexes are clipped to the same
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
// Regions 1 and 2.
// Points to the left or right border of the first row of |src|.
const Pixel* first_row_border =
(ix4 + 7 <= 0) ? src : src + source_width - 1;
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
// Region 1.
// Every sample used to calculate the prediction block has the same
// value. So the whole prediction block has the same value.
const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
const Pixel row_border_pixel = first_row_border[row * source_stride];
DestType* dst_row = dst + start_x - block_start_x;
if (is_compound) {
int sum = row_border_pixel
<< ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
sum += (bitdepth == 8) ? 0 : kCompoundOffset;
Memset(dst_row, sum, 8);
} else {
Memset(dst_row, row_border_pixel, 8);
}
const DestType* const first_dst_row = dst_row;
dst_row += dest_stride;
for (int y = 1; y < 8; ++y) {
memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
dst_row += dest_stride;
}
// End of region 1. Continue the |start_x| for loop.
continue;
}
// Region 2.
// Horizontal filter.
// The input values in this region are generated by extending the border
// which makes them identical in the horizontal direction. This
// computation could be inlined in the vertical pass but most
// implementations will need a transpose of some sort.
// It is not necessary to use the offset values here because the
// horizontal pass is a simple shift and the vertical pass will always
// require using 32 bits.
for (int y = -7; y < 8; ++y) {
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved below.
const int row = iy4 + y;
int sum = first_row_border[row * source_stride];
sum <<= kFilterBits - kRoundBitsHorizontal;
intermediate_result_column[y + 7] = sum;
}
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
int sy4 =
(y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
for (int x = 0; x < 8; ++x) {
const int offset =
RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
kWarpedPixelPrecisionShifts;
assert(offset >= 0);
assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
int sum = 0;
for (int k = 0; k < 8; ++k) {
sum +=
kWarpedFilters[offset][k] * intermediate_result_column[y + k];
}
sum = RightShiftWithRounding(sum, kRoundBitsVertical);
if (is_compound) {
sum += (bitdepth == 8) ? 0 : kCompoundOffset;
dst_row[x] = static_cast<DestType>(sum);
} else {
dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
}
sy += gamma;
}
dst_row += dest_stride;
sy4 += delta;
}
// End of region 2. Continue the |start_x| for loop.
continue;
}
// Regions 3 and 4.
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
// It follows that -6 <= ix4 <= source_width + 5. This inequality is
// used below.
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
// Region 3.
// Horizontal filter.
const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
const Pixel* const src_row = src + row * source_stride;
int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
for (int y = -7; y < 8; ++y) {
int sx = sx4 - MultiplyBy4(alpha);
for (int x = -4; x < 4; ++x) {
const int offset =
RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
kWarpedPixelPrecisionShifts;
// Since alpha and beta have been validated by SetupShear(), one
// can prove that 0 <= offset <= 3 * 2^6.
assert(offset >= 0);
assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
// For SIMD optimization:
// |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
// For 10/12 bit, the range of sum requires 32 bits.
int sum = first_pass_offset;
for (int k = 0; k < 8; ++k) {
// We assume the source frame has left and right borders of at
// least 13 pixels that extend the frame boundary pixels.
//
// Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
// ix4 above, we have
// -13 <= ix4 + x + k - 3 <= source_width + 12,
// or
// -13 <= column <= (source_width - 1) + 13.
// Therefore we may over-read up to 13 pixels before the source
// row, or up to 13 pixels after the source row.
const int column = ix4 + x + k - 3;
sum += kWarpedFilters[offset][k] * src_row[column];
}
intermediate_result[y + 7][x + 4] =
RightShiftWithRounding(sum, kRoundBitsHorizontal);
sx += alpha;
}
sx4 += beta;
}
} else {
// Region 4.
// Horizontal filter.
// At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
// It follows that -6 <= iy4 <= source_height + 5. This inequality is
// used below.
int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
for (int y = -7; y < 8; ++y) {
// We assume the source frame has top and bottom borders of at least
// 13 pixels that extend the frame boundary pixels.
//
// Since -7 <= y <= 7, using the inequality on iy4 above, we have
// -13 <= iy4 + y <= source_height + 12,
// or
// -13 <= row <= (source_height - 1) + 13.
// Therefore we may over-read up to 13 pixels above the top source
// row, or up to 13 pixels below the bottom source row.
const int row = iy4 + y;
const Pixel* const src_row = src + row * source_stride;
int sx = sx4 - MultiplyBy4(alpha);
for (int x = -4; x < 4; ++x) {
const int offset =
RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
kWarpedPixelPrecisionShifts;
// Since alpha and beta have been validated by SetupShear(), one
// can prove that 0 <= offset <= 3 * 2^6.
assert(offset >= 0);
assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
// For SIMD optimization:
// |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
// For 10/12 bit, the range of sum requires 32 bits.
int sum = first_pass_offset;
for (int k = 0; k < 8; ++k) {
// We assume the source frame has left and right borders of at
// least 13 pixels that extend the frame boundary pixels.
//
// Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
// ix4 above, we have
// -13 <= ix4 + x + k - 3 <= source_width + 12,
// or
// -13 <= column <= (source_width - 1) + 13.
// Therefore we may over-read up to 13 pixels before the source
// row, or up to 13 pixels after the source row.
const int column = ix4 + x + k - 3;
sum += kWarpedFilters[offset][k] * src_row[column];
}
intermediate_result[y + 7][x + 4] =
RightShiftWithRounding(sum, kRoundBitsHorizontal) -
offset_removal;
sx += alpha;
}
sx4 += beta;
}
}
// Regions 3 and 4.
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
int sy4 =
(y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
// The spec says we should use the following loop condition:
// y < std::min(4, block_start_y + block_height - start_y - 4);
// We can prove that block_start_y + block_height - start_y >= 8, which
// implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
// So the loop condition is simply y < 4.
//
// Proof:
// start_y < block_start_y + block_height
// => block_start_y + block_height - start_y > 0
// => block_height - (start_y - block_start_y) > 0
//
// Since block_height >= 8 and is a power of 2, it follows that
// block_height is a multiple of 8. start_y - block_start_y is also a
// multiple of 8. Therefore their difference is a multiple of 8. Since
// their difference is > 0, their difference must be >= 8.
//
// We then add an offset of 4 to y so that the loop starts with y = 0
// and continues if y < 8.
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
// The spec says we should use the following loop condition:
// x < std::min(4, block_start_x + block_width - start_x - 4);
// Similar to the above, we can prove that the loop condition can be
// simplified to x < 4.
//
// We then add an offset of 4 to x so that the loop starts with x = 0
// and continues if x < 8.
for (int x = 0; x < 8; ++x) {
const int offset =
RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
kWarpedPixelPrecisionShifts;
// Since gamma and delta have been validated by SetupShear(), one can
// prove that 0 <= offset <= 3 * 2^6.
assert(offset >= 0);
assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
int sum = 0;
for (int k = 0; k < 8; ++k) {
sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
}
sum -= offset_removal;
sum = RightShiftWithRounding(sum, kRoundBitsVertical);
if (is_compound) {
sum += (bitdepth == 8) ? 0 : kCompoundOffset;
dst_row[x] = static_cast<DestType>(sum);
} else {
dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
}
sy += gamma;
}
dst_row += dest_stride;
sy4 += delta;
}
}
dst += 8 * dest_stride;
}
}
// Registers the C warp functions in the 8bpp Dsp table.
//
// Each entry is written when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is nonzero, or
// when the matching LIBGAV1_Dsp8bpp_* macro is not defined (presumably that
// macro is defined elsewhere when another implementation claims the entry;
// confirm against the platform headers).
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
  assert(dsp != nullptr);
  // Keeps |dsp| referenced when assert() compiles away and both assignments
  // below are preprocessed out.
  static_cast<void>(dsp);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp8bpp_Warp)
  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp8bpp_WarpCompound)
  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
#endif
}
#if LIBGAV1_MAX_BITDEPTH >= 10
// Registers the C warp functions in the 10bpp Dsp table.
//
// Each entry is written when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is nonzero, or
// when the matching LIBGAV1_Dsp10bpp_* macro is not defined (presumably that
// macro is defined elsewhere when another implementation claims the entry;
// confirm against the platform headers).
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
  assert(dsp != nullptr);
  // Keeps |dsp| referenced when assert() compiles away and both assignments
  // below are preprocessed out.
  static_cast<void>(dsp);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp10bpp_Warp)
  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp10bpp_WarpCompound)
  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
#endif
}
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
} // namespace
// Registers the C warp implementations for every bitdepth enabled at compile
// time. Declared outside the anonymous namespace so it is visible to other
// translation units.
void WarpInit_C() {
// The 8bpp table is always initialized.
Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
// Init10bpp() only exists when LIBGAV1_MAX_BITDEPTH is at least 10.
Init10bpp();
#endif
}
} // namespace dsp
} // namespace libgav1