blob: 9b6bb009e67c388f15c59aabc63d02491011212f [file] [log] [blame]
// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/post_filter.h"
#include "src/utils/blocking_counter.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
namespace libgav1 {
namespace {
constexpr int kStep64x64 = 16; // =64/4.
constexpr int kCdefSkip = 8;
constexpr uint8_t kCdefUvDirection[2][2][8] = {
{{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
{{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
template <typename Pixel>
void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
bool is_frame_left, bool is_frame_right,
uint16_t* const dst) {
if (sizeof(src[0]) == sizeof(dst[0])) {
if (is_frame_left) {
Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
} else {
memcpy(dst - kCdefBorder, src - kCdefBorder,
kCdefBorder * sizeof(dst[0]));
}
memcpy(dst, src, block_width * sizeof(dst[0]));
if (is_frame_right) {
Memset(dst + block_width, kCdefLargeValue,
unit_width + kCdefBorder - block_width);
} else {
memcpy(dst + block_width, src + block_width,
(unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
}
return;
}
for (int x = -kCdefBorder; x < 0; ++x) {
dst[x] = is_frame_left ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
}
for (int x = 0; x < block_width; ++x) {
dst[x] = src[x];
}
for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
}
}
// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
// |dst|.
void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width, int height, size_t pixel_size) {
int y = height;
do {
memcpy(dst, src, width * pixel_size);
src += src_stride;
dst += dst_stride;
} while (--y != 0);
}
} // namespace
uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x,
const int start_y, const int plane,
const int window_buffer_plane_size,
int* cdef_stride) const {
if (thread_pool_ != nullptr) {
// write output to threaded_window_buffer.
*cdef_stride = window_buffer_width_ * pixel_size_;
const int column_window =
start_x % (window_buffer_width_ >> subsampling_x_[plane]);
const int row_window =
start_y % (window_buffer_height_ >> subsampling_y_[plane]);
return threaded_window_buffer_ + plane * window_buffer_plane_size +
row_window * (*cdef_stride) + column_window * pixel_size_;
}
// write output to |cdef_buffer_|.
*cdef_stride = frame_buffer_.stride(plane);
return cdef_buffer_[plane] + start_y * (*cdef_stride) + start_x * pixel_size_;
}
template <typename Pixel>
void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
int row4x4, int column4x4,
uint16_t* cdef_source, ptrdiff_t cdef_stride,
const bool y_plane) {
assert(y_plane || planes_ == kMaxPlanes);
const int max_planes = y_plane ? 1 : kMaxPlanes;
const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
const int plane_width = RightShiftWithRounding(width_, subsampling_x);
const int plane_height = RightShiftWithRounding(height_, subsampling_y);
const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
// unit_width, unit_height are the same as block_width, block_height unless
// it reaches the frame boundary, where block_width < 64 or
// block_height < 64. unit_width, unit_height guarantee we build blocks on
// a multiple of 8.
const int unit_width = Align(block_width, 8 >> subsampling_x);
const int unit_height = Align(block_height, 8 >> subsampling_y);
const bool is_frame_left = column4x4 == 0;
const bool is_frame_right = start_x + block_width >= plane_width;
const bool is_frame_top = row4x4 == 0;
const bool is_frame_bottom = start_y + block_height >= plane_height;
const int y_offset = is_frame_top ? 0 : kCdefBorder;
for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
uint16_t* cdef_src = cdef_source + plane * kCdefUnitSizeWithBorders *
kCdefUnitSizeWithBorders;
const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
const Pixel* src_buffer =
reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
(start_y - y_offset) * src_stride + start_x;
// All the copying code will use negative indices for populating the left
// border. So the starting point is set to kCdefBorder.
cdef_src += kCdefBorder;
// Copy the top 2 rows.
if (is_frame_top) {
for (int y = 0; y < kCdefBorder; ++y) {
Memset(cdef_src - kCdefBorder, kCdefLargeValue,
unit_width + 2 * kCdefBorder);
cdef_src += cdef_stride;
}
} else {
for (int y = 0; y < kCdefBorder; ++y) {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
src_buffer += src_stride;
cdef_src += cdef_stride;
}
}
// Copy the body.
int y = block_height;
do {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
cdef_src += cdef_stride;
src_buffer += src_stride;
} while (--y != 0);
// Copy the bottom 2 rows.
if (is_frame_bottom) {
do {
Memset(cdef_src - kCdefBorder, kCdefLargeValue,
unit_width + 2 * kCdefBorder);
cdef_src += cdef_stride;
} while (++y < kCdefBorder + unit_height - block_height);
} else {
do {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
src_buffer += src_stride;
cdef_src += cdef_stride;
} while (++y < kCdefBorder + unit_height - block_height);
}
}
}
template <typename Pixel>
void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
const int block_width4x4,
const int block_height4x4,
const int row4x4_start,
const int column4x4_start) {
// Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
static constexpr int kStep = 8;
static constexpr int kStep4x4 = 2;
const int window_buffer_plane_size =
window_buffer_width_ * window_buffer_height_ * sizeof(Pixel);
int cdef_buffer_row_base_stride[kMaxPlanes];
int cdef_buffer_stride[kMaxPlanes];
uint8_t* cdef_buffer_row_base[kMaxPlanes];
int src_buffer_row_base_stride[kMaxPlanes];
const uint8_t* src_buffer_row_base[kMaxPlanes];
int column_step[kMaxPlanes];
assert(planes_ >= 1);
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane];
const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane];
cdef_buffer_row_base[plane] = GetCdefBufferAndStride(
start_x, start_y, plane, window_buffer_plane_size,
&cdef_buffer_stride[plane]);
cdef_buffer_row_base_stride[plane] =
cdef_buffer_stride[plane] * (kStep >> subsampling_y_[plane]);
src_buffer_row_base[plane] = source_buffer_[plane] +
start_y * frame_buffer_.stride(plane) +
start_x * sizeof(Pixel);
src_buffer_row_base_stride[plane] =
frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
}
if (index == -1) {
for (int plane = kPlaneY; plane < planes_; ++plane) {
CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
sizeof(Pixel));
}
return;
}
PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
true);
// Stored direction used during the u/v pass. If bit 3 is set, then block is
// a skip.
int direction_y[8 * 8];
int y_index = 0;
const uint8_t y_primary_strength =
frame_header_.cdef.y_primary_strength[index];
const uint8_t y_secondary_strength =
frame_header_.cdef.y_secondary_strength[index];
// y_strength_index is 0 for both primary and secondary strengths being
// non-zero, 1 for primary only, 2 for secondary only. This will be updated
// with y_primary_strength after variance is applied.
int y_strength_index = static_cast<int>(y_secondary_strength == 0);
const bool compute_direction_and_variance =
(y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
BlockParameters* const* bp_row0_base =
block_parameters_.Address(row4x4_start, column4x4_start);
BlockParameters* const* bp_row1_base =
bp_row0_base + block_parameters_.columns4x4();
const int bp_stride = MultiplyBy2(block_parameters_.columns4x4());
int row4x4 = row4x4_start;
do {
uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
BlockParameters* const* bp0 = bp_row0_base;
BlockParameters* const* bp1 = bp_row1_base;
int column4x4 = column4x4_start;
do {
const int block_width = kStep;
const int block_height = kStep;
const int cdef_stride = cdef_buffer_stride[kPlaneY];
uint8_t* const cdef_buffer = cdef_buffer_base;
const int src_stride = frame_buffer_.stride(kPlaneY);
const uint8_t* const src_buffer = src_buffer_base;
const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip &&
(*(bp1 + 1))->skip;
if (skip) { // No cdef filtering.
direction_y[y_index] = kCdefSkip;
CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
block_width, block_height, sizeof(Pixel));
} else {
// Zero out residual skip flag.
direction_y[y_index] = 0;
int variance = 0;
if (compute_direction_and_variance) {
dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
&variance);
}
const int direction =
(y_primary_strength == 0) ? 0 : direction_y[y_index];
const int variance_strength =
((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12) : 0;
const uint8_t primary_strength =
(variance != 0)
? (y_primary_strength * (4 + variance_strength) + 8) >> 4
: 0;
if ((primary_strength | y_secondary_strength) == 0) {
CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
block_width, block_height, sizeof(Pixel));
} else {
uint16_t* cdef_src =
cdef_block + kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
cdef_src +=
(MultiplyBy4(row4x4 - row4x4_start)) * kCdefUnitSizeWithBorders +
(MultiplyBy4(column4x4 - column4x4_start));
const int strength_index =
y_strength_index | (static_cast<int>(primary_strength == 0) << 1);
dsp_.cdef_filters[1][strength_index](
cdef_src, kCdefUnitSizeWithBorders, block_height,
primary_strength, y_secondary_strength,
frame_header_.cdef.damping, direction, cdef_buffer, cdef_stride);
}
}
cdef_buffer_base += column_step[kPlaneY];
src_buffer_base += column_step[kPlaneY];
bp0 += kStep4x4;
bp1 += kStep4x4;
column4x4 += kStep4x4;
y_index++;
} while (column4x4 < column4x4_start + block_width4x4);
cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
bp_row0_base += bp_stride;
bp_row1_base += bp_stride;
row4x4 += kStep4x4;
} while (row4x4 < row4x4_start + block_height4x4);
if (planes_ == kMaxPlanesMonochrome) {
return;
}
const uint8_t uv_primary_strength =
frame_header_.cdef.uv_primary_strength[index];
const uint8_t uv_secondary_strength =
frame_header_.cdef.uv_secondary_strength[index];
if ((uv_primary_strength | uv_secondary_strength) == 0) {
for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
sizeof(Pixel));
}
return;
}
PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
false);
// uv_strength_index is 0 for both primary and secondary strengths being
// non-zero, 1 for primary only, 2 for secondary only.
const int uv_strength_index =
(static_cast<int>(uv_primary_strength == 0) << 1) |
static_cast<int>(uv_secondary_strength == 0);
for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
const int8_t subsampling_x = subsampling_x_[plane];
const int8_t subsampling_y = subsampling_y_[plane];
const int block_width = kStep >> subsampling_x;
const int block_height = kStep >> subsampling_y;
int row4x4 = row4x4_start;
y_index = 0;
do {
uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
const uint8_t* src_buffer_base = src_buffer_row_base[plane];
int column4x4 = column4x4_start;
do {
const int cdef_stride = cdef_buffer_stride[plane];
uint8_t* const cdef_buffer = cdef_buffer_base;
const int src_stride = frame_buffer_.stride(plane);
const uint8_t* const src_buffer = src_buffer_base;
const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
int dual_cdef = 0;
if (skip) { // No cdef filtering.
CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
block_width, block_height, sizeof(Pixel));
} else {
// Make sure block pair is not out of bounds.
if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
// Enable dual processing if subsampling_x is 1.
dual_cdef = subsampling_x;
}
int direction = (uv_primary_strength == 0)
? 0
: kCdefUvDirection[subsampling_x][subsampling_y]
[direction_y[y_index]];
if (dual_cdef != 0) {
if (uv_primary_strength &&
direction_y[y_index] != direction_y[y_index + 1]) {
// Disable dual processing if the second block of the pair does
// not have the same direction.
dual_cdef = 0;
}
// Disable dual processing if the second block of the pair is a
// skip.
if (direction_y[y_index + 1] == kCdefSkip) {
dual_cdef = 0;
}
}
uint16_t* cdef_src = cdef_block + plane * kCdefUnitSizeWithBorders *
kCdefUnitSizeWithBorders;
cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
cdef_src +=
(MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) *
kCdefUnitSizeWithBorders +
(MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x);
// Block width is 8 if either dual_cdef is true or subsampling_x == 0.
const int width_index = dual_cdef | (subsampling_x ^ 1);
dsp_.cdef_filters[width_index][uv_strength_index](
cdef_src, kCdefUnitSizeWithBorders, block_height,
uv_primary_strength, uv_secondary_strength,
frame_header_.cdef.damping - 1, direction, cdef_buffer,
cdef_stride);
}
// When dual_cdef is set, the above cdef_filter() will process 2 blocks,
// so adjust the pointers and indexes for 2 blocks.
cdef_buffer_base += column_step[plane] << dual_cdef;
src_buffer_base += column_step[plane] << dual_cdef;
column4x4 += kStep4x4 << dual_cdef;
y_index += 1 << dual_cdef;
} while (column4x4 < column4x4_start + block_width4x4);
cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
row4x4 += kStep4x4;
} while (row4x4 < row4x4_start + block_height4x4);
}
}
void PostFilter::ApplyCdefForOneSuperBlockRowHelper(int row4x4,
int block_height4x4) {
for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
column4x4 += kStep64x64) {
const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
const int block_width4x4 =
std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
ApplyCdefForOneUnit<uint16_t>(cdef_block_, index, block_width4x4,
block_height4x4, row4x4, column4x4);
continue;
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
ApplyCdefForOneUnit<uint8_t>(cdef_block_, index, block_width4x4,
block_height4x4, row4x4, column4x4);
}
}
void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
bool is_last_row) {
assert(row4x4_start >= 0);
assert(DoCdef());
for (int y = 0; y < sb4x4; y += kStep64x64) {
const int row4x4 = row4x4_start + y;
if (row4x4 >= frame_header_.rows4x4) return;
// Apply cdef for the last 8 rows of the previous superblock row.
// One exception: If the superblock size is 128x128 and is_last_row is true,
// then we simply apply cdef for the entire superblock row without any lag.
// In that case, apply cdef for the previous superblock row only during the
// first iteration (y == 0).
if (row4x4 > 0 && (!is_last_row || y == 0)) {
assert(row4x4 >= 16);
ApplyCdefForOneSuperBlockRowHelper(row4x4 - 2, 2);
}
// Apply cdef for the current superblock row. If this is the last superblock
// row we apply cdef for all the rows, otherwise we leave out the last 8
// rows.
const int block_height4x4 =
std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
if (height4x4 > 0) {
ApplyCdefForOneSuperBlockRowHelper(row4x4, height4x4);
}
}
}
template <typename Pixel>
void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4,
const int column4x4_start) {
uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
for (int column4x4_64x64 = 0;
column4x4_64x64 < std::min(DivideBy4(window_buffer_width_),
frame_header_.columns4x4 - column4x4_start);
column4x4_64x64 += kStep64x64) {
const int column4x4 = column4x4_start + column4x4_64x64;
const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
const int block_width4x4 =
std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
const int block_height4x4 =
std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
ApplyCdefForOneUnit<Pixel>(cdef_block, index, block_width4x4,
block_height4x4, row4x4, column4x4);
}
}
// Each thread processes one row inside the window.
// Y, U, V planes are processed together inside one thread.
template <typename Pixel>
void PostFilter::ApplyCdefThreaded() {
assert((window_buffer_height_ & 63) == 0);
const int num_workers = thread_pool_->num_threads();
const int window_buffer_plane_size =
window_buffer_width_ * window_buffer_height_;
const int window_buffer_height4x4 = DivideBy4(window_buffer_height_);
for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
row4x4 += window_buffer_height4x4) {
const int actual_window_height4x4 =
std::min(window_buffer_height4x4, frame_header_.rows4x4 - row4x4);
const int vertical_units_per_window =
DivideBy16(actual_window_height4x4 + 15);
for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
column4x4 += DivideBy4(window_buffer_width_)) {
const int jobs_for_threadpool =
vertical_units_per_window * num_workers / (num_workers + 1);
BlockingCounter pending_jobs(jobs_for_threadpool);
int job_count = 0;
for (int row64x64 = 0; row64x64 < actual_window_height4x4;
row64x64 += kStep64x64) {
if (job_count < jobs_for_threadpool) {
thread_pool_->Schedule(
[this, row4x4, column4x4, row64x64, &pending_jobs]() {
ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
pending_jobs.Decrement();
});
} else {
ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
}
++job_count;
}
pending_jobs.Wait();
// Copy |threaded_window_buffer_| to |cdef_buffer_|.
for (int plane = kPlaneY; plane < planes_; ++plane) {
const ptrdiff_t src_stride =
frame_buffer_.stride(plane) / sizeof(Pixel);
const int plane_row = MultiplyBy4(row4x4) >> subsampling_y_[plane];
const int plane_column =
MultiplyBy4(column4x4) >> subsampling_x_[plane];
int copy_width = std::min(frame_header_.columns4x4 - column4x4,
DivideBy4(window_buffer_width_));
copy_width = MultiplyBy4(copy_width) >> subsampling_x_[plane];
int copy_height =
std::min(frame_header_.rows4x4 - row4x4, window_buffer_height4x4);
copy_height = MultiplyBy4(copy_height) >> subsampling_y_[plane];
CopyPlane<Pixel>(
reinterpret_cast<const Pixel*>(threaded_window_buffer_) +
plane * window_buffer_plane_size,
window_buffer_width_, copy_width, copy_height,
reinterpret_cast<Pixel*>(cdef_buffer_[plane]) +
plane_row * src_stride + plane_column,
src_stride);
}
}
}
}
void PostFilter::ApplyCdef() {
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
ApplyCdefThreaded<uint16_t>();
return;
}
#endif
ApplyCdefThreaded<uint8_t>();
}
} // namespace libgav1