// blob: b8106d6c0291c0d6d294e5844c50ce07ebd99530 [file] [log] [blame]
// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/post_filter.h"
#include "src/utils/blocking_counter.h"
namespace libgav1 {
// Filters (or, when the unit's restoration type is "none", simply copies) one
// loop restoration processing unit.
//
// The unit starts at (|x| + |column|, |y| + |row|) in |cdef_buffer| and its
// output is written at [row][column] of |loop_restored_window|. The unit is
// |current_process_unit_height| rows tall and at most
// |plane_process_unit_width| pixels wide; the width is clipped so that the
// unit does not extend past |plane_width|. |unit_row| together with the
// horizontal position selects which restoration unit's parameters are used.
template <typename Pixel>
void PostFilter::ApplyLoopRestorationForOneUnit(
    uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride,
    const Plane plane, const int plane_height, const int x, const int y,
    const int row, const int column, const int unit_row,
    const int current_process_unit_height, const int plane_process_unit_width,
    const int plane_unit_size, const int num_horizontal_units,
    const int plane_width, Array2DView<Pixel>* const loop_restored_window) {
  // Top-left corner of this processing unit inside the plane.
  const int unit_y = y + row;
  const int unit_x = x + column;
  // Clip the unit against the right edge of the plane.
  const int current_process_unit_width =
      std::min(plane_process_unit_width, plane_width - unit_x);
  uint8_t* cdef_unit_buffer =
      cdef_buffer + unit_y * cdef_buffer_stride + unit_x * pixel_size_;

  // Locate the restoration unit that covers this processing unit. The column
  // is clamped to the last restoration unit of the row.
  const int unit_column =
      std::min(unit_x / plane_unit_size, num_horizontal_units - 1);
  const int unit_id = unit_row * num_horizontal_units + unit_column;
  const LoopRestorationType type =
      restoration_info_->loop_restoration_info(plane, unit_id).type;

  if (type == kLoopRestorationTypeNone) {
    // Nothing to filter: copy the source rows into the output window as is.
    Pixel* dest = &(*loop_restored_window)[row][column];
    for (int rows_left = current_process_unit_height; rows_left > 0;
         --rows_left) {
      memcpy(dest, cdef_unit_buffer, current_process_unit_width * pixel_size_);
      cdef_unit_buffer += cdef_buffer_stride;
      dest += loop_restored_window->columns();
    }
    return;
  }

  // The SIMD implementation of wiener filter (currently WienerFilter_SSE4_1())
  // over-reads 6 bytes, so add 6 extra bytes at the end of block_buffer for 8
  // bit.
  alignas(alignof(uint16_t)) uint8_t
      block_buffer[kRestorationUnitHeightWithBorders *
                       kRestorationUnitWidthWithBorders * sizeof(Pixel) +
                   ((sizeof(Pixel) == 1) ? 6 : 0)];
  const ptrdiff_t block_buffer_stride =
      kRestorationUnitWidthWithBorders * pixel_size_;
  IntermediateBuffers intermediate_buffers;
  RestorationBuffer restoration_buffer = {
      {intermediate_buffers.box_filter.output[0],
       intermediate_buffers.box_filter.output[1]},
      plane_process_unit_width,
      {intermediate_buffers.box_filter.intermediate_a,
       intermediate_buffers.box_filter.intermediate_b},
      kRestorationUnitWidthWithBorders + kRestorationPadding,
      intermediate_buffers.wiener,
      kRestorationUnitWidth};

  // Find the matching rows in the deblock buffer, if there is one.
  uint8_t* const deblock_buffer = deblock_buffer_.data(plane);
  const int deblock_buffer_stride = deblock_buffer_.stride(plane);
  const int deblock_buffer_units = 64 >> subsampling_y_[plane];
  const int deblock_unit_y =
      std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0);
  uint8_t* deblock_unit_buffer = nullptr;
  if (deblock_buffer != nullptr) {
    deblock_unit_buffer = deblock_buffer +
                          deblock_unit_y * deblock_buffer_stride +
                          unit_x * pixel_size_;
  }

  assert(type == kLoopRestorationTypeSgrProj ||
         type == kLoopRestorationTypeWiener);
  // NOTE(review): the |type - 2| offset presumably maps the two filter types
  // accepted by the assert above onto the entries of |loop_restorations| --
  // confirm against the LoopRestorationType enum.
  const dsp::LoopRestorationFunc restoration_func =
      dsp_.loop_restorations[type - 2];

  // Assemble the unit plus its borders into |block_buffer|.
  const bool is_top_unit = unit_y == 0;
  const bool is_bottom_unit =
      unit_y + current_process_unit_height >= plane_height;
  PrepareLoopRestorationBlock<Pixel>(
      DoCdef(), cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer,
      deblock_buffer_stride, block_buffer, block_buffer_stride,
      current_process_unit_width, current_process_unit_height, is_top_unit,
      is_bottom_unit);

  // The filter input starts past the top and left borders of |block_buffer|.
  const uint8_t* const filter_source =
      block_buffer + kRestorationBorder * block_buffer_stride +
      kRestorationBorder * pixel_size_;
  restoration_func(filter_source, &(*loop_restored_window)[row][column],
                   restoration_info_->loop_restoration_info(plane, unit_id),
                   kRestorationUnitWidthWithBorders,
                   loop_restored_window->columns(), current_process_unit_width,
                   current_process_unit_height, &restoration_buffer);
}
template <typename Pixel>
void PostFilter::ApplyLoopRestorationForSuperBlock(
const Plane plane, const int x, const int y, const int unit_row,
const int current_process_unit_height, const int process_unit_width) {
const int stride = frame_buffer_.stride(plane);
const int plane_unit_size = loop_restoration_.unit_size[plane];
const int num_horizontal_units =
restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
const int plane_width =
RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
const int plane_height =
RightShiftWithRounding(height_, subsampling_y_[plane]);
Array2DView<Pixel> loop_restored_window(
current_process_unit_height, stride / sizeof(Pixel),
reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane] + y * stride +
x * pixel_size_));
ApplyLoopRestorationForOneUnit<Pixel>(
superres_buffer_[plane], stride, plane, plane_height, x, y, 0, 0,
unit_row, current_process_unit_height, process_unit_width,
plane_unit_size, num_horizontal_units, plane_width,
&loop_restored_window);
}
void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start,
int sb4x4) {
assert(row4x4_start >= 0);
assert(DoRestoration());
const int plane_process_unit_width[kMaxPlanes] = {
kRestorationUnitWidth, kRestorationUnitWidth >> subsampling_x_[kPlaneU],
kRestorationUnitWidth >> subsampling_x_[kPlaneV]};
const int plane_process_unit_height[kMaxPlanes] = {
kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU],
kRestorationUnitHeight >> subsampling_y_[kPlaneV]};
for (int plane = 0; plane < planes_; ++plane) {
if (frame_header_.loop_restoration.type[plane] ==
kLoopRestorationTypeNone) {
continue;
}
const int unit_height_offset =
kRestorationUnitOffset >> subsampling_y_[plane];
const int plane_height =
RightShiftWithRounding(frame_header_.height, subsampling_y_[plane]);
const int plane_width = RightShiftWithRounding(frame_header_.upscaled_width,
subsampling_x_[plane]);
const int num_vertical_units =
restoration_info_->num_vertical_units(static_cast<Plane>(plane));
const int process_unit_width = plane_process_unit_width[plane];
for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
const int row4x4 = row4x4_start + sb_y;
const int y = (MultiplyBy4(row4x4) - (row4x4 == 0 ? 0 : 8)) >>
subsampling_y_[plane];
if (y >= plane_height) break;
const int plane_unit_size =
frame_header_.loop_restoration.unit_size[plane];
const int unit_row = std::min((y + unit_height_offset) / plane_unit_size,
num_vertical_units - 1);
const int expected_height = plane_process_unit_height[plane] +
((y == 0) ? -unit_height_offset : 0);
const int current_process_unit_height =
(y + expected_height <= plane_height) ? expected_height
: plane_height - y;
for (int column4x4 = 0;; column4x4 += 16) {
const int x = MultiplyBy4(column4x4) >> subsampling_x_[plane];
if (x >= plane_width) break;
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
ApplyLoopRestorationForSuperBlock<uint16_t>(
static_cast<Plane>(plane), x, y, unit_row,
current_process_unit_height, process_unit_width);
continue;
}
#endif
ApplyLoopRestorationForSuperBlock<uint8_t>(
static_cast<Plane>(plane), x, y, unit_row,
current_process_unit_height, process_unit_width);
}
}
}
}
// Filters one row of processing units inside the current window. The units
// are visited left to right in steps of |process_unit_width| until
// |window_width| is covered, and the output goes to row |row| of
// |threaded_window_buffer_|. (|x|, |y|) is the position of the window inside
// the plane.
template <typename Pixel>
void PostFilter::ApplyLoopRestorationForOneRowInWindow(
    uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride,
    const Plane plane, const int plane_height, const int plane_width,
    const int x, const int y, const int row, const int unit_row,
    const int current_process_unit_height, const int process_unit_width,
    const int window_width, const int plane_unit_size,
    const int num_horizontal_units) {
  // Interpret the window buffer as a 2D array of pixels.
  Pixel* const window_pixels =
      reinterpret_cast<Pixel*>(threaded_window_buffer_);
  Array2DView<Pixel> loop_restored_window(window_buffer_height_,
                                          window_buffer_width_, window_pixels);

  int column = 0;
  while (column < window_width) {
    ApplyLoopRestorationForOneUnit<Pixel>(
        cdef_buffer, cdef_buffer_stride, plane, plane_height, x, y, row, column,
        unit_row, current_process_unit_height, process_unit_width,
        plane_unit_size, num_horizontal_units, plane_width,
        &loop_restored_window);
    column += process_unit_width;
  }
}
// Multi-thread version of loop restoration, based on a moving window of size
// |window_buffer_width_|x|window_buffer_height_|. Inside the moving window, we
// create a filtering job for each row and each filtering job is submitted to
// the thread pool. Each free thread takes one job from the thread pool and
// completes filtering until all jobs are finished. This approach requires an
// extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose
// size is the size of the window. It also needs block buffers (i.e.,
// |block_buffer| and |intermediate_buffers| in
// ApplyLoopRestorationForOneUnit()) to store intermediate results in loop
// restoration for each thread. After all units inside the window are filtered,
// the output is written to the frame buffer.
//
// |Pixel| is the pixel storage type; ApplyLoopRestoration() instantiates this
// function with uint8_t or uint16_t depending on |bitdepth_|.
template <typename Pixel>
void PostFilter::ApplyLoopRestorationThreaded() {
// Per-plane processing unit dimensions. The U and V entries are the luma
// dimensions shifted right by the plane's subsampling.
const int plane_process_unit_width[kMaxPlanes] = {
kRestorationUnitWidth, kRestorationUnitWidth >> subsampling_x_[kPlaneU],
kRestorationUnitWidth >> subsampling_x_[kPlaneV]};
const int plane_process_unit_height[kMaxPlanes] = {
kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU],
kRestorationUnitHeight >> subsampling_y_[kPlaneV]};
for (int plane = kPlaneY; plane < planes_; ++plane) {
// Planes without loop restoration are skipped entirely.
if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
continue;
}
const int unit_height_offset =
kRestorationUnitOffset >> subsampling_y_[plane];
uint8_t* const src_buffer = superres_buffer_[plane];
const int src_stride = frame_buffer_.stride(plane);
const int plane_unit_size = loop_restoration_.unit_size[plane];
const int num_vertical_units =
restoration_info_->num_vertical_units(static_cast<Plane>(plane));
const int num_horizontal_units =
restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
const int plane_width =
RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
const int plane_height =
RightShiftWithRounding(height_, subsampling_y_[plane]);
// NOTE(review): presumably pads the source plane by |kRestorationBorder|
// pixels on each of its four sides -- confirm against ExtendFrameBoundary().
ExtendFrameBoundary(src_buffer, plane_width, plane_height, src_stride,
kRestorationBorder, kRestorationBorder,
kRestorationBorder, kRestorationBorder);
const int num_workers = thread_pool_->num_threads();
// Move the window down the plane. |y| is adjusted at the end of the first
// iteration (see below), so later windows do not start at multiples of
// |window_buffer_height_|.
for (int y = 0; y < plane_height; y += window_buffer_height_) {
// The first window is |unit_height_offset| rows shorter than the others, and
// the last window is clipped to the rows that remain in the plane.
const int actual_window_height =
std::min(window_buffer_height_ - ((y == 0) ? unit_height_offset : 0),
plane_height - y);
// Number of rows of processing units in this window, rounded up.
int vertical_units_per_window =
(actual_window_height + plane_process_unit_height[plane] - 1) /
plane_process_unit_height[plane];
if (y == 0) {
// The first row of loop restoration processing units is not 64x64, but
// 64x56 (|unit_height_offset| = 8 rows less than other restoration
// processing units). For u/v with subsampling, the size is halved. To
// compute the number of vertical units per window, we need to take a
// special handling for it.
const int height_without_first_unit =
actual_window_height -
std::min(actual_window_height,
plane_process_unit_height[plane] - unit_height_offset);
vertical_units_per_window =
(height_without_first_unit + plane_process_unit_height[plane] - 1) /
plane_process_unit_height[plane] +
1;
}
// Move the window across the plane.
for (int x = 0; x < plane_width; x += window_buffer_width_) {
const int actual_window_width =
std::min(window_buffer_width_, plane_width - x);
// Only the first |jobs_for_threadpool| rows are handed to the thread pool;
// the remaining rows are filtered on the calling thread (the else branch
// below).
const int jobs_for_threadpool =
vertical_units_per_window * num_workers / (num_workers + 1);
assert(jobs_for_threadpool < vertical_units_per_window);
BlockingCounter pending_jobs(jobs_for_threadpool);
int job_count = 0;
// Assigned at the top of every iteration of the loop below, i.e. before the
// loop increment reads it.
int current_process_unit_height;
for (int row = 0; row < actual_window_height;
row += current_process_unit_height) {
const int unit_y = y + row;
// The topmost unit of the plane is shorter by |unit_height_offset| rows and
// the bottommost one is clipped to the plane height.
const int expected_height = plane_process_unit_height[plane] +
((unit_y == 0) ? -unit_height_offset : 0);
current_process_unit_height =
(unit_y + expected_height <= plane_height)
? expected_height
: plane_height - unit_y;
// Row of restoration units that this row of processing units belongs to,
// clamped to the last row.
const int unit_row =
std::min((unit_y + unit_height_offset) / plane_unit_size,
num_vertical_units - 1);
const int process_unit_width = plane_process_unit_width[plane];
if (job_count < jobs_for_threadpool) {
// Everything that changes per row is captured by value. |pending_jobs| is
// captured by reference; it is waited on below, before it goes out of scope.
thread_pool_->Schedule(
[this, src_buffer, src_stride, process_unit_width,
current_process_unit_height, actual_window_width,
plane_unit_size, num_horizontal_units, x, y, row, unit_row,
plane_height, plane_width, plane, &pending_jobs]() {
ApplyLoopRestorationForOneRowInWindow<Pixel>(
src_buffer, src_stride, static_cast<Plane>(plane),
plane_height, plane_width, x, y, row, unit_row,
current_process_unit_height, process_unit_width,
actual_window_width, plane_unit_size,
num_horizontal_units);
pending_jobs.Decrement();
});
} else {
// Run this row synchronously on the current thread.
ApplyLoopRestorationForOneRowInWindow<Pixel>(
src_buffer, src_stride, static_cast<Plane>(plane), plane_height,
plane_width, x, y, row, unit_row, current_process_unit_height,
process_unit_width, actual_window_width, plane_unit_size,
num_horizontal_units);
}
++job_count;
}
// Wait for all jobs of current window to finish.
pending_jobs.Wait();
// Copy |threaded_window_buffer_| to output frame.
CopyPlane<Pixel>(
threaded_window_buffer_, window_buffer_width_ * pixel_size_,
actual_window_width, actual_window_height,
loop_restoration_buffer_[plane] + y * src_stride + x * pixel_size_,
src_stride);
}
// The first window covered |unit_height_offset| fewer rows than a full one.
// Shift |y| back by that amount so that the loop increment places the second
// window directly after the rows already processed.
if (y == 0) y -= unit_height_offset;
}
}
}
// Entry point of the threaded loop restoration. Selects the pixel type from
// |bitdepth_|: uint16_t for 10 bits and above (only when the library is built
// with LIBGAV1_MAX_BITDEPTH >= 10), uint8_t otherwise. Requires
// |threaded_window_buffer_| to be non-null.
void PostFilter::ApplyLoopRestoration() {
  assert(threaded_window_buffer_ != nullptr);
#if LIBGAV1_MAX_BITDEPTH >= 10
  const bool use_16bit_pixels = bitdepth_ >= 10;
  if (use_16bit_pixels) {
    return ApplyLoopRestorationThreaded<uint16_t>();
  }
#endif
  ApplyLoopRestorationThreaded<uint8_t>();
}
} // namespace libgav1