// blob: 761f1b650dba699f317f05bc53f29782c1f65841 [file] [log] [blame]
#include "src/post_filter.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <new>
#include "src/dsp/constants.h"
#include "src/utils/array_2d.h"
#include "src/utils/blocking_counter.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
#include "src/utils/memory.h"
#include "src/utils/types.h"
namespace libgav1 {
namespace {
// Lookup table that remaps the direction found on the luma plane to the
// direction used when filtering a chroma plane. It is indexed as
// [subsampling_x][subsampling_y][luma direction]. The [0][0] and [1][1]
// entries are the identity mapping; the two mixed-subsampling entries remap
// the eight directions.
constexpr uint8_t kCdefUvDirection[2][2][8] = {
{{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
{{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
// Replicates the edge pixels of a |width| x |height| plane into its borders.
//
// |frame_start| points at the top-left pixel of the visible area. |stride| is
// divided by sizeof(Pixel) on entry, after which all offsets are in Pixel
// units. |left|, |right|, |top| and |bottom| are the border sizes in pixels
// (columns for left/right, rows for top/bottom).
template <typename Pixel>
void ExtendFrame(uint8_t* const frame_start, const int width, const int height,
                 ptrdiff_t stride, const int left, const int right,
                 const int top, const int bottom) {
  stride /= sizeof(Pixel);
  auto* const first_pixel = reinterpret_cast<Pixel*>(frame_start);
  // Start of the first visible row, including its left border.
  Pixel* const first_row = first_pixel - left;
  const size_t row_bytes = sizeof(Pixel) * stride;

  // Left and right borders: smear the first/last pixel of every row outwards.
  Pixel* row = first_pixel;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    Memset(row - left, row[0], left);
    Memset(row + width, row[width - 1], right);
    row += stride;
  }

  // Top border: every border row is a copy of the (already extended) first
  // row.
  Pixel* top_row = first_row - top * stride;
  for (int y = 0; y < top; ++y) {
    memcpy(top_row, first_row, row_bytes);
    top_row += stride;
  }

  // Bottom border: every border row is a copy of the (already extended) last
  // row.
  Pixel* bottom_row = first_row + height * stride;
  const Pixel* const last_row = bottom_row - stride;
  for (int y = 0; y < bottom; ++y) {
    memcpy(bottom_row, last_row, row_bytes);
    bottom_row += stride;
  }
}
// Copies a |width| x |height| rectangle of Pixels from |source| to |dest|.
// Both strides are divided by sizeof(Pixel) before being used to step the
// Pixel pointers from one row to the next. Only |width| pixels of each row
// are copied.
template <typename Pixel>
void CopyPlane(const uint8_t* source, int source_stride, const int width,
               const int height, uint8_t* dest, int dest_stride) {
  const int in_stride = static_cast<int>(source_stride / sizeof(Pixel));
  const int out_stride = static_cast<int>(dest_stride / sizeof(Pixel));
  const size_t row_bytes = width * sizeof(Pixel);
  const Pixel* in_row = reinterpret_cast<const Pixel*>(source);
  Pixel* out_row = reinterpret_cast<Pixel*>(dest);
  for (int rows_left = height; rows_left > 0; --rows_left) {
    memcpy(out_row, in_row, row_bytes);
    in_row += in_stride;
    out_row += out_stride;
  }
}
// Horizontally upscales |height| rows from |source| into |dest|, producing
// |upscaled_width| output pixels per row.
//
// For every output pixel a fixed-point source position is tracked; it starts
// at |initial_subpixel_x| and advances by |step|. The integer part of the
// position selects the first of kSuperResFilterTaps input pixels and the
// fractional part selects a row of kUpscaleFilter. The filtered sum is
// rounded, shifted by kFilterBits and clipped to [0, (1 << bitdepth) - 1].
// Both strides are divided by sizeof(Pixel) before use.
template <int bitdepth, typename Pixel>
void ComputeSuperRes(const uint8_t* source, uint32_t source_stride,
                     const int upscaled_width, const int height,
                     const int initial_subpixel_x, const int step,
                     uint8_t* dest, uint32_t dest_stride) {
  source_stride /= sizeof(Pixel);
  dest_stride /= sizeof(Pixel);
  constexpr int kMaxPixelValue = (1 << bitdepth) - 1;
  // The filter window starts half the tap count to the left of the sample
  // position.
  const Pixel* in_row = reinterpret_cast<const Pixel*>(source) -
                        DivideBy2(kSuperResFilterTaps);
  Pixel* out_row = reinterpret_cast<Pixel*>(dest);
  for (int y = 0; y < height;
       ++y, in_row += source_stride, out_row += dest_stride) {
    int position = initial_subpixel_x;
    for (int x = 0; x < upscaled_width; ++x, position += step) {
      const Pixel* const window = in_row + (position >> kSuperResScaleBits);
      const int filter_index =
          (position & kSuperResScaleMask) >> kSuperResExtraBits;
      int sum = 0;
      for (int tap = 0; tap < kSuperResFilterTaps; ++tap) {
        sum += window[tap] * kUpscaleFilter[filter_index][tap];
      }
      out_row[x] =
          Clip3(RightShiftWithRounding(sum, kFilterBits), 0, kMaxPixelValue);
    }
  }
}
} // namespace
// Static data member definitions.
// Out-of-class definition of a constexpr static data member. Before C++17 such
// a definition is required whenever the member is odr-used (for example bound
// to a const reference).
constexpr int PostFilter::kCdefLargeValue;
// Runs every enabled post filter in order (deblock, cdef, super resolution,
// loop restoration) and then extends the borders of each plane of
// |source_buffer_|. Returns false as soon as one of the filters fails.
bool PostFilter::ApplyFiltering() {
  if (DoDeblock()) {
    if (!ApplyDeblockFilter()) return false;
  }
  if (DoCdef()) {
    if (!ApplyCdef()) return false;
  }
  if (DoSuperRes()) {
    if (!ApplySuperRes()) return false;
  }
  if (DoRestoration()) {
    if (!ApplyLoopRestoration()) return false;
  }

  // Extend frame boundary for inter frame convolution, referencing.
  for (int plane = kPlaneY; plane < planes_; ++plane) {
    const bool is_luma = (plane == kPlaneY);
    const int8_t subsampling_x = is_luma ? 0 : subsampling_x_;
    const int8_t subsampling_y = is_luma ? 0 : subsampling_y_;
    const int plane_width =
        RightShiftWithRounding(upscaled_width_, subsampling_x);
    const int plane_height = RightShiftWithRounding(height_, subsampling_y);
    assert(source_buffer_->left_border(plane) >= kMinLeftBorderPixels &&
           source_buffer_->right_border(plane) >= kMinRightBorderPixels);
    uint8_t* const plane_start = source_buffer_->data(plane);
    ExtendFrameBoundary(plane_start, plane_width, plane_height,
                        source_buffer_->stride(plane),
                        source_buffer_->left_border(plane),
                        source_buffer_->right_border(plane),
                        source_buffer_->top_border(plane),
                        source_buffer_->bottom_border(plane));
  }
  return true;
}
// Convenience overload: evaluates the static DoRestoration() with this
// object's loop restoration parameters, post filter mask and plane count.
bool PostFilter::DoRestoration() const {
return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_);
}
// Returns true if loop restoration has to run: bit 0x08 of
// |do_post_filter_mask| must be set and at least one plane in use must have a
// restoration type other than kLoopRestorationTypeNone. For monochrome input
// only the Y plane is inspected.
bool PostFilter::DoRestoration(const LoopRestoration& loop_restoration,
                               uint8_t do_post_filter_mask, int num_planes) {
  const bool enabled_by_mask = (do_post_filter_mask & 0x08) != 0;
  if (!enabled_by_mask) return false;
  if (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone) return true;
  // Luma has no restoration; chroma can only matter if it is present.
  if (num_planes == kMaxPlanesMonochrome) return false;
  return loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
         loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone;
}
// Dispatches to ExtendFrame<>() with the pixel type matching |bitdepth_|:
// uint8_t for 8-bit content, uint16_t for everything else.
void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start,
                                     const int width, const int height,
                                     const ptrdiff_t stride, const int left,
                                     const int right, const int top,
                                     const int bottom) {
  if (bitdepth_ != 8) {
    ExtendFrame<uint16_t>(frame_start, width, height, stride, left, right, top,
                          bottom);
    return;
  }
  ExtendFrame<uint8_t>(frame_start, width, height, stride, left, right, top,
                       bottom);
}
// Worker loop for threaded deblocking. Repeatedly claims the next job index
// from the shared |job_counter| until all |num_jobs| entries of |jobs| have
// been claimed. For each claimed job, |deblock_filter| is applied to every
// column unit of the job's row, left to right.
void PostFilter::DeblockFilterWorker(const DeblockFilterJob* jobs, int num_jobs,
                                     std::atomic<int>* job_counter,
                                     DeblockFilter deblock_filter) {
  for (;;) {
    const int job_index =
        job_counter->fetch_add(1, std::memory_order_relaxed);
    if (job_index >= num_jobs) break;
    const DeblockFilterJob& job = jobs[job_index];
    const auto plane = static_cast<Plane>(job.plane);
    int column_unit = 0;
    for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
         column4x4 += kNum4x4InLoopFilterMaskUnit) {
      const int unit_id = GetDeblockUnitId(job.row_unit, column_unit);
      (this->*deblock_filter)(plane, job.row4x4, column4x4, unit_id);
      ++column_unit;
    }
  }
}
// Threaded deblocking. One job is created per (plane, row of loop filter mask
// units). The jobs are consumed by the thread pool workers and by the calling
// thread, first for the vertical filter and then, once that pass has finished
// for the whole frame, for the horizontal filter.
//
// Returns false only if the job table cannot be allocated.
bool PostFilter::ApplyDeblockFilterThreaded() {
  const int jobs_per_plane = DivideBy16(frame_header_.rows4x4 + 15);
  const int num_workers = thread_pool_->num_threads();
  // Luma is always filtered; a chroma plane is filtered only if its loop
  // filter level is non-zero.
  int planes[kMaxPlanes];
  planes[0] = kPlaneY;
  int num_planes = 1;
  for (int plane = kPlaneU; plane < planes_; ++plane) {
    if (frame_header_.loop_filter.level[plane + 1] != 0) {
      planes[num_planes++] = plane;
    }
  }
  const int num_jobs = num_planes * jobs_per_plane;
  std::unique_ptr<DeblockFilterJob[]> jobs_unique_ptr(
      new (std::nothrow) DeblockFilterJob[num_jobs]);
  if (jobs_unique_ptr == nullptr) return false;
  DeblockFilterJob* jobs = jobs_unique_ptr.get();

  // The job table is identical for both filter directions, so it is built
  // once here instead of once per pass. The workers only read it.
  int job_index = 0;
  for (int i = 0; i < num_planes; ++i) {
    const int plane = planes[i];
    for (int row4x4 = 0, row_unit = 0; row4x4 < frame_header_.rows4x4;
         row4x4 += kNum4x4InLoopFilterMaskUnit, ++row_unit) {
      assert(job_index < num_jobs);
      DeblockFilterJob& job = jobs[job_index++];
      job.plane = plane;
      job.row4x4 = row4x4;
      job.row_unit = row_unit;
    }
  }
  assert(job_index == num_jobs);

  // The vertical filters are not dependent on each other. So simply schedule
  // them for all possible rows.
  //
  // The horizontal filter for a row/column depends on the vertical filter being
  // finished for the blocks to the top right and to the right. To work around
  // this synchronization, we simply wait for the vertical filter to finish for
  // all rows. Now, the horizontal filters can also be scheduled
  // unconditionally similar to the vertical filters.
  //
  // The only synchronization involved is to know when each directional
  // filter is complete for the entire frame.
  for (DeblockFilter deblock_filter : {&PostFilter::VerticalDeblockFilter,
                                       &PostFilter::HorizontalDeblockFilter}) {
    // |job_counter| and |pending_workers| are captured by reference; this is
    // safe because the Wait() below does not return before every scheduled
    // lambda has called Decrement().
    std::atomic<int> job_counter(0);
    BlockingCounter pending_workers(num_workers);
    for (int i = 0; i < num_workers; ++i) {
      thread_pool_->Schedule([this, jobs, num_jobs, &job_counter,
                              deblock_filter, &pending_workers]() {
        DeblockFilterWorker(jobs, num_jobs, &job_counter, deblock_filter);
        pending_workers.Decrement();
      });
    }
    // Run the jobs on the current thread.
    DeblockFilterWorker(jobs, num_jobs, &job_counter, deblock_filter);
    // Wait for the threadpool jobs to finish.
    pending_workers.Wait();
  }
  return true;
}
bool PostFilter::ApplyDeblockFilter() {
InitDeblockFilterParams();
if (thread_pool_ != nullptr) {
return ApplyDeblockFilterThreaded();
}
for (int plane = kPlaneY; plane < planes_; ++plane) {
if (plane != kPlaneY && frame_header_.loop_filter.level[plane + 1] == 0) {
continue;
}
// Iterate through each 64x64 block and apply deblock filtering.
for (int row4x4 = 0, row_unit = 0; row4x4 < frame_header_.rows4x4;
row4x4 += kNum4x4InLoopFilterMaskUnit, ++row_unit) {
int column4x4;
int column_unit;
for (column4x4 = 0, column_unit = 0; column4x4 < frame_header_.columns4x4;
column4x4 += kNum4x4InLoopFilterMaskUnit, ++column_unit) {
// First apply vertical filtering
const int unit_id = GetDeblockUnitId(row_unit, column_unit);
VerticalDeblockFilter(static_cast<Plane>(plane), row4x4, column4x4,
unit_id);
// Delay one superblock to apply horizontal filtering.
if (column4x4 != 0) {
HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4,
column4x4 - kNum4x4InLoopFilterMaskUnit,
unit_id - 1);
}
}
// Horizontal filtering for the last 64x64 block.
const int unit_id = GetDeblockUnitId(row_unit, column_unit - 1);
HorizontalDeblockFilter(static_cast<Plane>(plane), row4x4,
column4x4 - kNum4x4InLoopFilterMaskUnit, unit_id);
}
}
return true;
}
// Fills |deblock_filter_levels| for every segment in use (all kMaxSegments
// when segmentation is enabled, otherwise only segment 0). Level indices 0
// and 1 are always computed; the remaining indices are computed only when the
// corresponding frame level is non-zero. Does nothing if deblocking is
// disabled.
void PostFilter::ComputeDeblockFilterLevels(
    const int8_t delta_lf[kFrameLfCount],
    uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
                                 [kNumReferenceFrameTypes][2]) const {
  static_assert(kFrameLfCount >= 2, "");
  if (!DoDeblock()) return;
  const int num_segments =
      frame_header_.segmentation.enabled ? kMaxSegments : 1;
  for (int segment_id = 0; segment_id < num_segments; ++segment_id) {
    for (int level_index = 0; level_index < kFrameLfCount; ++level_index) {
      const bool always_computed = level_index < 2;
      if (!always_computed &&
          frame_header_.loop_filter.level[level_index] == 0) {
        continue;
      }
      LoopFilterMask::ComputeDeblockFilterLevels(
          frame_header_, segment_id, level_index, delta_lf,
          deblock_filter_levels[segment_id][level_index]);
    }
  }
}
// Returns the address cdef output for position (|start_x|, |start_y|) of
// |plane| must be written to, and stores the matching stride in
// |*cdef_stride|.
//
// Without loop restoration and with a thread pool, the destination lies in
// |threaded_window_buffer_|, addressed relative to the current window.
// Otherwise it lies in |cdef_buffer_|; when restoration is disabled the
// position is additionally moved by |vertical_shift| rows and
// |horizontal_shift| pixels.
uint8_t* PostFilter::GetCdefBufferAndStride(
    const int start_x, const int start_y, const int plane,
    const int subsampling_x, const int subsampling_y,
    const int window_buffer_plane_size, const int vertical_shift,
    const int horizontal_shift, int* cdef_stride) {
  const bool restoration_enabled = DoRestoration();
  if (!restoration_enabled && thread_pool_ != nullptr) {
    // write output to threaded_window_buffer.
    const int stride = window_buffer_width_ * pixel_size_;
    *cdef_stride = stride;
    const int column_window = start_x % (window_buffer_width_ >> subsampling_x);
    const int row_window = start_y % (window_buffer_height_ >> subsampling_y);
    uint8_t* const plane_base =
        threaded_window_buffer_ + plane * window_buffer_plane_size;
    return plane_base + row_window * stride + column_window * pixel_size_;
  }

  // write output to cdef_buffer_.
  const int stride = cdef_buffer_->stride(plane);
  *cdef_stride = stride;
  // In-place cdef is applied by writing the output to the top-left
  // corner, if restoration is not present. In this case,
  // cdef_buffer_ == source_buffer_.
  ptrdiff_t buffer_offset = 0;
  if (!restoration_enabled) {
    buffer_offset = vertical_shift * stride + horizontal_shift * pixel_size_;
  }
  uint8_t* const position =
      cdef_buffer_->data(plane) + start_y * stride + start_x * pixel_size_;
  return position + buffer_offset;
}
// Applies cdef to one unit of |block_width4x4| x |block_height4x4| 4x4 blocks
// whose top-left 4x4 block is at (|row4x4_start|, |column4x4_start|).
//
// |cdef_block| is scratch storage that PrepareCdefBlock() fills before
// filtering. |index| selects the entries of the frame header's cdef strength
// tables; an |index| of -1 means the unit is not filtered and the source
// pixels are copied to the cdef output unchanged.
//
// Otherwise the unit is processed in 8x8 luma blocks (and the co-located
// (8 >> subsampling_x) x (8 >> subsampling_y) chroma blocks). A block is
// copied instead of filtered when all four of its 4x4 blocks are marked skip,
// or when both its primary and secondary strengths are zero.
template <typename Pixel>
void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
const int block_width4x4,
const int block_height4x4,
const int row4x4_start,
const int column4x4_start) {
const int coeff_shift = bitdepth_ - 8;
const int step = kNum4x4BlocksWide[kBlock8x8];
// Shifts handed to GetCdefBufferAndStride(); it only applies them when the
// output goes to cdef_buffer_ and restoration is disabled.
const int horizontal_shift = -source_buffer_->alignment() / pixel_size_;
const int vertical_shift = -kCdefBorder;
const int window_buffer_plane_size =
window_buffer_width_ * window_buffer_height_ * pixel_size_;
// No filtering for this unit: copy every plane of the unit to the output.
if (index == -1) {
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x;
const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y;
int cdef_stride;
uint8_t* const cdef_buffer = GetCdefBufferAndStride(
start_x, start_y, plane, subsampling_x, subsampling_y,
window_buffer_plane_size, vertical_shift, horizontal_shift,
&cdef_stride);
const int src_stride = source_buffer_->stride(plane);
uint8_t* const src_buffer = source_buffer_->data(plane) +
start_y * src_stride + start_x * pixel_size_;
const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
for (int y = 0; y < block_height; ++y) {
memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride,
block_width * pixel_size_);
}
}
return;
}
PrepareCdefBlock<Pixel>(source_buffer_, planes_, subsampling_x_,
subsampling_y_, frame_header_.width,
frame_header_.height, block_width4x4, block_height4x4,
row4x4_start, column4x4_start, cdef_block,
kRestorationProcessingUnitSizeWithBorders);
for (int row4x4 = row4x4_start; row4x4 < row4x4_start + block_height4x4;
row4x4 += step) {
for (int column4x4 = column4x4_start;
column4x4 < column4x4_start + block_width4x4; column4x4 += step) {
// The block is skipped only if block parameters exist for all four 4x4
// positions and every one of them has its skip flag set.
const bool skip =
block_parameters_.Find(row4x4, column4x4) != nullptr &&
block_parameters_.Find(row4x4 + 1, column4x4) != nullptr &&
block_parameters_.Find(row4x4, column4x4 + 1) != nullptr &&
block_parameters_.Find(row4x4 + 1, column4x4 + 1) != nullptr &&
block_parameters_.Find(row4x4, column4x4)->skip &&
block_parameters_.Find(row4x4 + 1, column4x4)->skip &&
block_parameters_.Find(row4x4, column4x4 + 1)->skip &&
block_parameters_.Find(row4x4 + 1, column4x4 + 1)->skip;
int damping = frame_header_.cdef.damping + coeff_shift;
// |direction_y| is written while processing the luma plane and read again
// for the chroma planes of the same block. When |skip| is false the luma
// branch below always runs first, so it is set before any chroma read.
int direction_y;
int direction;
int variance;
uint8_t primary_strength;
uint8_t secondary_strength;
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
const int block_width = 8 >> subsampling_x;
const int block_height = 8 >> subsampling_y;
int cdef_stride;
uint8_t* const cdef_buffer = GetCdefBufferAndStride(
start_x, start_y, plane, subsampling_x, subsampling_y,
window_buffer_plane_size, vertical_shift, horizontal_shift,
&cdef_stride);
const int src_stride = source_buffer_->stride(plane);
uint8_t* const src_buffer = source_buffer_->data(plane) +
start_y * src_stride +
start_x * pixel_size_;
if (skip) { // No cdef filtering.
for (int y = 0; y < block_height; ++y) {
memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride,
block_width * pixel_size_);
}
continue;
}
if (plane == kPlaneY) {
dsp_.cdef_direction(src_buffer, src_stride, &direction_y, &variance);
primary_strength = frame_header_.cdef.y_primary_strength[index]
<< coeff_shift;
secondary_strength = frame_header_.cdef.y_secondary_strength[index]
<< coeff_shift;
// The direction is chosen from the unadjusted primary strength; the
// strength itself is scaled by the block variance afterwards.
direction = (primary_strength == 0) ? 0 : direction_y;
const int variance_strength =
((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
: 0;
primary_strength =
(variance != 0)
? (primary_strength * (4 + variance_strength) + 8) >> 4
: 0;
} else {
primary_strength = frame_header_.cdef.uv_primary_strength[index]
<< coeff_shift;
secondary_strength = frame_header_.cdef.uv_secondary_strength[index]
<< coeff_shift;
// Chroma reuses the luma direction, remapped through kCdefUvDirection
// for the frame's subsampling.
direction = (primary_strength == 0)
? 0
: kCdefUvDirection[subsampling_x_][subsampling_y_]
[direction_y];
// Chroma uses a damping one lower than luma.
damping = frame_header_.cdef.damping + coeff_shift - 1;
}
// Nothing to filter with: copy the block through.
if ((primary_strength | secondary_strength) == 0) {
for (int y = 0; y < block_height; ++y) {
memcpy(cdef_buffer + y * cdef_stride, src_buffer + y * src_stride,
block_width * pixel_size_);
}
continue;
}
// Locate this block inside |cdef_block|: select the plane's region, step
// over the kCdefBorder rows/columns, then advance to the block's
// position relative to the start of the unit.
uint16_t* cdef_src =
cdef_block + plane * kRestorationProcessingUnitSizeWithBorders *
kRestorationProcessingUnitSizeWithBorders;
cdef_src += kCdefBorder * kRestorationProcessingUnitSizeWithBorders +
kCdefBorder;
cdef_src += (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) *
kRestorationProcessingUnitSizeWithBorders +
(MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x);
dsp_.cdef_filter(cdef_src, kRestorationProcessingUnitSizeWithBorders,
frame_header_.rows4x4, frame_header_.columns4x4,
start_x, start_y, subsampling_x, subsampling_y,
primary_strength, secondary_strength, damping,
direction, cdef_buffer, cdef_stride);
}
}
}
}
// Applies cdef to one row of 64x64 units inside the current window. The row
// starts at 4x4 position (|row4x4|, |column4x4_start|) and ends at the window
// width or the right frame edge, whichever comes first. Each call uses its
// own stack-allocated |cdef_block| scratch buffer.
template <typename Pixel>
void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4,
                                            const int column4x4_start) {
  constexpr int kStep64x64 = 16;  // = 64/4.
  uint16_t cdef_block[kRestorationProcessingUnitSizeWithBorders *
                      kRestorationProcessingUnitSizeWithBorders * 3];
  const int columns_in_window =
      std::min(DivideBy4(window_buffer_width_),
               frame_header_.columns4x4 - column4x4_start);
  // Every unit of the row shares the same height.
  const int block_height4x4 =
      std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
  for (int offset4x4 = 0; offset4x4 < columns_in_window;
       offset4x4 += kStep64x64) {
    const int column4x4 = column4x4_start + offset4x4;
    const int block_width4x4 =
        std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
    const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
    ApplyCdefForOneUnit<Pixel>(cdef_block, index, block_width4x4,
                               block_height4x4, row4x4, column4x4);
  }
}
// Each thread processes one row inside the window.
// Y, U, V planes are processed together inside one thread.
//
// The frame is covered by windows of |window_buffer_width_| x
// |window_buffer_height_| pixels. For each window, one job per row of 64x64
// units is created; part of the jobs go to the thread pool and the rest run
// on the calling thread. When loop restoration is disabled, each finished
// window is copied from |threaded_window_buffer_| into |cdef_buffer_| and, at
// the end, the buffer head pointers are shifted. Returns false if that shift
// fails.
template <typename Pixel>
bool PostFilter::ApplyCdefThreaded() {
assert((window_buffer_height_ & 63) == 0);
const int num_workers = thread_pool_->num_threads();
const int horizontal_shift = -source_buffer_->alignment() / pixel_size_;
const int vertical_shift = -kCdefBorder;
const int window_buffer_plane_size =
window_buffer_width_ * window_buffer_height_ * pixel_size_;
const int window_buffer_height4x4 = DivideBy4(window_buffer_height_);
const int step_64x64 = 16; // = 64/4.
for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
row4x4 += window_buffer_height4x4) {
// The last window of the frame may be shorter than the window buffer.
const int actual_window_height4x4 =
std::min(window_buffer_height4x4, frame_header_.rows4x4 - row4x4);
const int vertical_units_per_window =
DivideBy16(actual_window_height4x4 + 15);
for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
column4x4 += DivideBy4(window_buffer_width_)) {
// Split the rows between the pool's |num_workers| threads and the calling
// thread: the first |jobs_for_threadpool| rows are scheduled, the
// remaining rows run inline below.
const int jobs_for_threadpool =
vertical_units_per_window * num_workers / (num_workers + 1);
BlockingCounter pending_jobs(jobs_for_threadpool);
int job_count = 0;
for (int row64x64 = 0; row64x64 < actual_window_height4x4;
row64x64 += step_64x64) {
if (job_count < jobs_for_threadpool) {
// |pending_jobs| is captured by reference; the Wait() after this loop
// keeps it alive until every scheduled job has decremented it.
thread_pool_->Schedule(
[this, row4x4, column4x4, row64x64, &pending_jobs]() {
ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
pending_jobs.Decrement();
});
} else {
ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
}
++job_count;
}
pending_jobs.Wait();
// With restoration enabled the output was written to |cdef_buffer_|
// directly (see GetCdefBufferAndStride()), so there is nothing to copy.
if (DoRestoration()) continue;
// Copy |threaded_window_buffer_| to cdef_buffer_ (== source_buffer_).
assert(cdef_buffer_ == source_buffer_);
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int cdef_stride = cdef_buffer_->stride(plane);
// The destination is offset by the same shift that is applied to the
// buffer head pointers after the last window.
const ptrdiff_t buffer_offset =
vertical_shift * cdef_stride + horizontal_shift * pixel_size_;
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int plane_row = MultiplyBy4(row4x4) >> subsampling_y;
const int plane_column = MultiplyBy4(column4x4) >> subsampling_x;
int copy_width = std::min(frame_header_.columns4x4 - column4x4,
DivideBy4(window_buffer_width_));
copy_width = MultiplyBy4(copy_width) >> subsampling_x;
int copy_height =
std::min(frame_header_.rows4x4 - row4x4, window_buffer_height4x4);
copy_height = MultiplyBy4(copy_height) >> subsampling_y;
CopyPlane<Pixel>(
threaded_window_buffer_ + plane * window_buffer_plane_size,
window_buffer_width_ * pixel_size_, copy_width, copy_height,
cdef_buffer_->data(plane) + plane_row * cdef_stride +
plane_column * pixel_size_ + buffer_offset,
cdef_stride);
}
}
}
// Without restoration the output was written at a shifted position; move the
// buffer head pointers so that they point at it.
if (!DoRestoration()) {
for (int plane = kPlaneY; plane < planes_; ++plane) {
if (!cdef_buffer_->ShiftBuffer(plane, horizontal_shift, vertical_shift)) {
LIBGAV1_DLOG(ERROR,
"Error shifting frame buffer head pointer at plane: %d",
plane);
return false;
}
}
}
return true;
}
// Applies cdef to the whole frame.
//
// If loop restoration follows, the output goes to the separately allocated
// |cdef_filtered_buffer_|; otherwise cdef writes into |source_buffer_| at a
// shifted position and the buffer head pointers are moved afterwards. With a
// thread pool the work is delegated to ApplyCdefThreaded(). Returns false if
// the buffer allocation or the pointer shift fails.
bool PostFilter::ApplyCdef() {
  if (DoRestoration()) {
    if (!cdef_filtered_buffer_.Realloc(
            bitdepth_, planes_ == kMaxPlanesMonochrome, upscaled_width_,
            height_, subsampling_x_, subsampling_y_, kBorderPixels,
            /*byte_alignment=*/0, nullptr, nullptr, nullptr)) {
      return false;
    }
    cdef_buffer_ = &cdef_filtered_buffer_;
  } else {
    cdef_buffer_ = source_buffer_;
  }

  if (thread_pool_ != nullptr) {
#if LIBGAV1_MAX_BITDEPTH >= 10
    if (bitdepth_ >= 10) {
      return ApplyCdefThreaded<uint16_t>();
    }
#endif
    return ApplyCdefThreaded<uint8_t>();
  }

  constexpr int kStep64x64 = 16;  // = 64/4.
  // Apply cdef on each 8x8 Y block and
  // (8 >> subsampling_x)x(8 >> subsampling_y) UV block.
  for (int row4x4 = 0; row4x4 < frame_header_.rows4x4; row4x4 += kStep64x64) {
    const int unit_height4x4 =
        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
    for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
         column4x4 += kStep64x64) {
      const int unit_width4x4 =
          std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
      const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
#if LIBGAV1_MAX_BITDEPTH >= 10
      if (bitdepth_ >= 10) {
        ApplyCdefForOneUnit<uint16_t>(cdef_block_, index, unit_width4x4,
                                      unit_height4x4, row4x4, column4x4);
        continue;
      }
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
      ApplyCdefForOneUnit<uint8_t>(cdef_block_, index, unit_width4x4,
                                   unit_height4x4, row4x4, column4x4);
    }
  }

  if (DoRestoration()) return true;

  // In-place cdef wrote its output at a shifted position; move the head
  // pointer of every plane to match.
  const int horizontal_shift = -source_buffer_->alignment() / pixel_size_;
  const int vertical_shift = -kCdefBorder;
  for (int plane = kPlaneY; plane < planes_; ++plane) {
    const bool shifted =
        source_buffer_->ShiftBuffer(plane, horizontal_shift, vertical_shift);
    if (!shifted) {
      LIBGAV1_DLOG(ERROR,
                   "Error shifting frame buffer head pointer at plane: %d",
                   plane);
      return false;
    }
  }
  return true;
}
// Upscales |input_buffer| horizontally, in place, in three passes over all
// planes:
//   1. copy each plane into |super_res_buffer_| and extend its borders,
//   2. run the upscale filter from |super_res_buffer_| back into
//      |input_buffer|,
//   3. extend the borders of the upscaled |input_buffer|.
// |super_res_buffer_| must already be allocated (see ApplySuperRes()).
void PostFilter::FrameSuperRes(YuvBuffer* const input_buffer) {
// Copy input_buffer to super_res_buffer_.
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int border_height = kBorderPixels >> subsampling_y;
const int border_width = kBorderPixels >> subsampling_x;
// The copied area is the frame size rounded up to whole 4x4 blocks.
const int plane_width =
MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
const int plane_height =
MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
if (bitdepth_ == 8) {
CopyPlane<uint8_t>(input_buffer->data(plane), input_buffer->stride(plane),
plane_width, plane_height,
super_res_buffer_.data(plane),
super_res_buffer_.stride(plane));
} else {
CopyPlane<uint16_t>(input_buffer->data(plane),
input_buffer->stride(plane), plane_width,
plane_height, super_res_buffer_.data(plane),
super_res_buffer_.stride(plane));
}
// The upscale filter reads pixels to the left and right of each sample
// position, so the copy needs valid borders.
ExtendFrameBoundary(super_res_buffer_.data(plane), plane_width,
plane_height, super_res_buffer_.stride(plane),
border_width, border_width, border_height,
border_height);
}
// Upscale filter and write to frame.
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int downscaled_width = RightShiftWithRounding(width_, subsampling_x);
const int upscaled_width =
RightShiftWithRounding(upscaled_width_, subsampling_x);
const int plane_height = RightShiftWithRounding(height_, subsampling_y);
// |step| is the fixed-point (kSuperResScaleBits fractional bits) distance
// in the downscaled plane between two neighbouring output pixels, rounded
// to nearest. |error| is the drift that this rounding accumulates over a
// full row.
const int superres_width = downscaled_width << kSuperResScaleBits;
const int step = (superres_width + upscaled_width / 2) / upscaled_width;
const int error = step * upscaled_width - superres_width;
// Fixed-point position of the first output pixel; half of |error| is
// subtracted from it.
int initial_subpixel_x =
(-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) +
DivideBy2(upscaled_width)) /
upscaled_width +
(1 << (kSuperResExtraBits - 1)) - error / 2;
initial_subpixel_x &= kSuperResScaleMask;
// NOTE(review): every bitdepth other than 8 uses the <10, uint16_t>
// instantiation, which clips to (1 << 10) - 1. Confirm that 12-bit content
// never reaches this function.
if (bitdepth_ == 8) {
ComputeSuperRes<8, uint8_t>(
super_res_buffer_.data(plane), super_res_buffer_.stride(plane),
upscaled_width, plane_height, initial_subpixel_x, step,
input_buffer->data(plane), input_buffer->stride(plane));
} else {
ComputeSuperRes<10, uint16_t>(
super_res_buffer_.data(plane), super_res_buffer_.stride(plane),
upscaled_width, plane_height, initial_subpixel_x, step,
input_buffer->data(plane), input_buffer->stride(plane));
}
}
// Extend original frame, copy to borders.
for (int plane = kPlaneY; plane < planes_; ++plane) {
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
uint8_t* const frame_start = input_buffer->data(plane);
const int plane_width =
RightShiftWithRounding(upscaled_width_, subsampling_x);
ExtendFrameBoundary(
frame_start, plane_width, input_buffer->displayed_height(plane),
input_buffer->stride(plane), input_buffer->left_border(plane),
input_buffer->right_border(plane), input_buffer->top_border(plane),
input_buffer->bottom_border(plane));
}
}
// Allocates |super_res_buffer_| and upscales every buffer that later stages
// read. Returns false if the allocation fails.
bool PostFilter::ApplySuperRes() {
  const bool allocated = super_res_buffer_.Realloc(
      bitdepth_, planes_ == kMaxPlanesMonochrome,
      MultiplyBy4(frame_header_.columns4x4),
      MultiplyBy4(frame_header_.rows4x4), subsampling_x_, subsampling_y_,
      kBorderPixels,
      /*byte_alignment=*/0, nullptr, nullptr, nullptr);
  if (!allocated) return false;

  // cdef_buffer_ points to the buffer after cdef process (regardless whether
  // cdef filtering is actually applied).
  // source_buffer_ points to the deblocked buffer.
  if (!DoCdef()) {
    FrameSuperRes(source_buffer_);
    return true;
  }
  // If loop restoration is present, it requires both deblocked buffer and
  // cdef filtered buffer. Otherwise, only cdef filtered buffer is required.
  FrameSuperRes(cdef_buffer_);
  if (DoRestoration()) {
    FrameSuperRes(source_buffer_);
  }
  return true;
}
// Runs loop restoration on one row of processing units inside the current
// window. The row is walked in steps of |process_unit_width| columns up to
// |window_width|. For each step the restoration unit covering it is looked up
// (clamped to the last horizontal unit) and the processing unit, trimmed at
// the right plane edge, is passed to ApplyLoopRestorationForOneUnit().
template <typename Pixel>
void PostFilter::ApplyLoopRestorationForOneRowInWindow(
    uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride,
    uint8_t* const deblock_buffer, const ptrdiff_t deblock_buffer_stride,
    const Plane plane, const int plane_height, const int plane_width,
    const int x, const int y, const int row, const int unit_row,
    const int current_process_unit_height, const int process_unit_width,
    const int window_width, const int plane_unit_size,
    const int num_horizontal_units) {
  const int last_unit_column = num_horizontal_units - 1;
  const int first_unit_id_in_row = unit_row * num_horizontal_units;
  for (int column = 0; column < window_width; column += process_unit_width) {
    const int unit_x = x + column;
    const int unit_column = std::min(unit_x / plane_unit_size, last_unit_column);
    const int unit_id = first_unit_id_in_row + unit_column;
    const LoopRestorationType type =
        restoration_info_->loop_restoration_info(plane, unit_id).type;
    // Trim the unit so that it does not reach past the right plane edge.
    const int current_process_unit_width =
        std::min(process_unit_width, plane_width - unit_x);
    ApplyLoopRestorationForOneUnit<Pixel>(
        cdef_buffer, cdef_buffer_stride, deblock_buffer, deblock_buffer_stride,
        plane, plane_height, unit_id, type, x, y, row, column,
        current_process_unit_width, current_process_unit_height,
        process_unit_width, window_buffer_width_);
  }
}
// Applies loop restoration to one processing unit and writes the result into
// |threaded_window_buffer_| at (|row|, |column|).
//
// The unit's top-left pixel is at (|x| + |column|, |y| + |row|) in the plane
// and the unit is |current_process_unit_width| x |current_process_unit_height|
// pixels. When |type| is kLoopRestorationTypeNone the cdef output is copied
// unchanged. Otherwise an input block is assembled from |cdef_buffer| and
// |deblock_buffer| by PrepareLoopRestorationBlock() and filtered with the dsp
// function selected by |type|.
template <typename Pixel>
void PostFilter::ApplyLoopRestorationForOneUnit(
uint8_t* const cdef_buffer, const ptrdiff_t cdef_buffer_stride,
uint8_t* const deblock_buffer, const ptrdiff_t deblock_buffer_stride,
const Plane plane, const int plane_height, const int unit_id,
const LoopRestorationType type, const int x, const int y, const int row,
const int column, const int current_process_unit_width,
const int current_process_unit_height, const int plane_process_unit_width,
const int window_width) {
const int unit_x = x + column;
const int unit_y = y + row;
uint8_t* cdef_unit_buffer =
cdef_buffer + unit_y * cdef_buffer_stride + unit_x * pixel_size_;
// View of the window buffer as a 2D array of Pixels, used as the output.
Array2DView<Pixel> loop_restored_window(
window_buffer_height_, window_buffer_width_,
reinterpret_cast<Pixel*>(threaded_window_buffer_));
// No restoration for this unit: pass the cdef output through row by row.
if (type == kLoopRestorationTypeNone) {
Pixel* dest = &loop_restored_window[row][column];
for (int k = 0; k < current_process_unit_height; ++k) {
memcpy(dest, cdef_unit_buffer, current_process_unit_width * pixel_size_);
dest += window_width;
cdef_unit_buffer += cdef_buffer_stride;
}
return;
}
// The SIMD implementation of wiener filter (currently WienerFilter_SSE4_1())
// over-reads 6 bytes, so add 6 extra bytes at the end of block_buffer for 8
// bit.
alignas(alignof(uint16_t))
uint8_t block_buffer[kRestorationProcessingUnitSizeWithBorders *
kRestorationProcessingUnitSizeWithBorders *
sizeof(Pixel) +
((sizeof(Pixel) == 1) ? 6 : 0)];
const ptrdiff_t block_buffer_stride =
kRestorationProcessingUnitSizeWithBorders * pixel_size_;
// Scratch storage for the filters, allocated on this call's stack.
IntermediateBuffers intermediate_buffers;
RestorationBuffer restoration_buffer = {
{intermediate_buffers.box_filter.output[0],
intermediate_buffers.box_filter.output[1]},
plane_process_unit_width,
{intermediate_buffers.box_filter.intermediate_a,
intermediate_buffers.box_filter.intermediate_b},
kRestorationProcessingUnitSizeWithBorders + kRestorationPadding,
intermediate_buffers.wiener,
kMaxSuperBlockSizeInPixels};
uint8_t* deblock_unit_buffer =
deblock_buffer + unit_y * deblock_buffer_stride + unit_x * pixel_size_;
assert(type == kLoopRestorationTypeSgrProj ||
type == kLoopRestorationTypeWiener);
// NOTE(review): the "- 2" presumably maps the two filtering restoration types
// onto indices 0 and 1 of |loop_restorations| - confirm against the
// LoopRestorationType enum values.
const dsp::LoopRestorationFunc restoration_func =
dsp_.loop_restorations[type - 2];
// The last two arguments tell the helper whether the unit touches the top
// (unit_y == 0) or the bottom edge of the plane.
PrepareLoopRestorationBlock<Pixel>(
cdef_unit_buffer, cdef_buffer_stride, deblock_unit_buffer,
deblock_buffer_stride, block_buffer, block_buffer_stride,
current_process_unit_width, current_process_unit_height, unit_y == 0,
unit_y + current_process_unit_height >= plane_height);
// The filter input starts past the kRestorationBorder rows and columns of
// |block_buffer|.
restoration_func(reinterpret_cast<const uint8_t*>(
block_buffer + kRestorationBorder * block_buffer_stride +
kRestorationBorder * pixel_size_),
&loop_restored_window[row][column],
restoration_info_->loop_restoration_info(
static_cast<Plane>(plane), unit_id),
block_buffer_stride, window_width * pixel_size_,
current_process_unit_width, current_process_unit_height,
&restoration_buffer);
}
// Multi-thread version of loop restoration, based on a moving window of size
// |window_buffer_width_|x|window_buffer_height_|. Inside the moving window, we
// create a filtering job for each row and each filtering job is submitted to
// the thread pool. Each free thread takes one job from the thread pool and
// completes filtering until all jobs are finished. This approach requires an
// extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose
// size is the size of the window. It also needs block buffers (i.e.,
// |block_buffer| and |intermediate_buffers| in
// ApplyLoopRestorationForOneUnit()) to store intermediate results in loop
// restoration for each thread. After all units inside the window are filtered,
// the output is written to the frame buffer.
// Returns true on success. Returns false if shifting the frame buffer head
// pointer fails for any plane that was loop restored.
template <typename Pixel>
bool PostFilter::ApplyLoopRestorationThreaded() {
// When cdef is not applied, make |cdef_buffer_| refer to |source_buffer_| so
// that the code below can read its input through |cdef_buffer_| in both cases.
if (!DoCdef()) cdef_buffer_ = source_buffer_;
// Per-plane size of a processing unit. The chroma entries are reduced by the
// frame's subsampling factors.
const int plane_process_unit_width[kMaxPlanes] = {
kRestorationProcessingUnitSize,
kRestorationProcessingUnitSize >> subsampling_x_,
kRestorationProcessingUnitSize >> subsampling_x_};
const int plane_process_unit_height[kMaxPlanes] = {
kRestorationProcessingUnitSize,
kRestorationProcessingUnitSize >> subsampling_y_,
kRestorationProcessingUnitSize >> subsampling_y_};
// Negative shifts (in pixels): the filtered output is written above and to
// the left of the current plane origin, and the buffer head pointer is moved
// by the same amounts once the plane is done.
const int horizontal_shift = -source_buffer_->alignment() / pixel_size_;
const int vertical_shift = -kRestorationBorder;
for (int plane = kPlaneY; plane < planes_; ++plane) {
if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
// No loop restoration for this plane. If cdef ran, its output still has to
// be copied into the source buffer; otherwise there is nothing to do.
if (!DoCdef()) continue;
CopyPlane<Pixel>(cdef_buffer_->data(plane), cdef_buffer_->stride(plane),
cdef_buffer_->displayed_width(plane),
cdef_buffer_->displayed_height(plane),
source_buffer_->data(plane),
source_buffer_->stride(plane));
continue;
}
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int unit_height_offset = kRestorationUnitOffset >> subsampling_y;
uint8_t* src_buffer = source_buffer_->data(plane);
const int src_stride = source_buffer_->stride(plane);
uint8_t* cdef_buffer = cdef_buffer_->data(plane);
const int cdef_buffer_stride = cdef_buffer_->stride(plane);
uint8_t* deblock_buffer = source_buffer_->data(plane);
const int deblock_buffer_stride = source_buffer_->stride(plane);
const int plane_unit_size = loop_restoration_.unit_size[plane];
const int num_vertical_units =
restoration_info_->num_vertical_units(static_cast<Plane>(plane));
const int num_horizontal_units =
restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
const int plane_width =
RightShiftWithRounding(upscaled_width_, subsampling_x);
const int plane_height = RightShiftWithRounding(height_, subsampling_y);
// Byte offset from a position in the source plane to the position where its
// filtered value is stored (|vertical_shift| rows, |horizontal_shift|
// pixels).
const ptrdiff_t src_unit_buffer_offset =
vertical_shift * src_stride + horizontal_shift * pixel_size_;
ExtendFrameBoundary(cdef_buffer, plane_width, plane_height,
cdef_buffer_stride, kRestorationBorder,
kRestorationBorder, kRestorationBorder,
kRestorationBorder);
// When cdef is not applied, |deblock_buffer| and |cdef_buffer| are the same
// plane, which has already been extended above.
if (DoCdef()) {
ExtendFrameBoundary(deblock_buffer, plane_width, plane_height,
deblock_buffer_stride, kRestorationBorder,
kRestorationBorder, kRestorationBorder,
kRestorationBorder);
}
const int num_workers = thread_pool_->num_threads();
// Walk the plane window by window. |y| and |x| are the top-left corner of
// the current window in plane coordinates.
for (int y = 0; y < plane_height; y += window_buffer_height_) {
// The first window is |unit_height_offset| rows shorter than the others.
const int actual_window_height =
std::min(window_buffer_height_ - ((y == 0) ? unit_height_offset : 0),
plane_height - y);
int vertical_units_per_window =
(actual_window_height + plane_process_unit_height[plane] - 1) /
plane_process_unit_height[plane];
if (y == 0) {
// The first row of loop restoration processing units is not 64x64, but
// 64x56 (|unit_height_offset| = 8 rows less than other restoration
// processing units). For u/v with subsampling, the size is halved. To
// compute the number of vertical units per window, we need to take a
// special handling for it.
const int height_without_first_unit =
actual_window_height -
std::min(actual_window_height,
plane_process_unit_height[plane] - unit_height_offset);
vertical_units_per_window =
(height_without_first_unit + plane_process_unit_height[plane] - 1) /
plane_process_unit_height[plane] +
1;
}
for (int x = 0; x < plane_width; x += window_buffer_width_) {
const int actual_window_width =
std::min(window_buffer_width_, plane_width - x);
// Split the rows of processing units of this window between the thread
// pool and the current thread: the first |jobs_for_threadpool| rows are
// scheduled on the pool, the remaining rows are filtered inline below.
const int jobs_for_threadpool =
vertical_units_per_window * num_workers / (num_workers + 1);
assert(jobs_for_threadpool < vertical_units_per_window);
BlockingCounter pending_jobs(jobs_for_threadpool);
int job_count = 0;
// Assigned at the top of every iteration of the loop below, before it is
// read by the loop increment.
int current_process_unit_height;
for (int row = 0; row < actual_window_height;
row += current_process_unit_height) {
const int unit_y = y + row;
// The processing unit at the top of the frame is shorter by
// |unit_height_offset|; the one at the bottom is clipped to the plane
// height.
const int expected_height = plane_process_unit_height[plane] +
((unit_y == 0) ? -unit_height_offset : 0);
current_process_unit_height =
(unit_y + expected_height <= plane_height)
? expected_height
: plane_height - unit_y;
// Row index of the restoration unit containing this processing unit,
// clamped to the last row of units.
const int unit_row =
std::min((unit_y + unit_height_offset) / plane_unit_size,
num_vertical_units - 1);
const int process_unit_width = plane_process_unit_width[plane];
if (job_count < jobs_for_threadpool) {
// Per-row values are captured by value because they change on every
// iteration; |pending_jobs| is captured by reference and outlives the
// jobs because of the Wait() call below.
thread_pool_->Schedule(
[this, cdef_buffer, cdef_buffer_stride, deblock_buffer,
deblock_buffer_stride, process_unit_width,
current_process_unit_height, actual_window_width,
plane_unit_size, num_horizontal_units, x, y, row, unit_row,
plane_height, plane_width, plane, &pending_jobs]() {
ApplyLoopRestorationForOneRowInWindow<Pixel>(
cdef_buffer, cdef_buffer_stride, deblock_buffer,
deblock_buffer_stride, static_cast<Plane>(plane),
plane_height, plane_width, x, y, row, unit_row,
current_process_unit_height, process_unit_width,
actual_window_width, plane_unit_size,
num_horizontal_units);
pending_jobs.Decrement();
});
} else {
// Run the remaining rows on the current thread.
ApplyLoopRestorationForOneRowInWindow<Pixel>(
cdef_buffer, cdef_buffer_stride, deblock_buffer,
deblock_buffer_stride, static_cast<Plane>(plane), plane_height,
plane_width, x, y, row, unit_row, current_process_unit_height,
process_unit_width, actual_window_width, plane_unit_size,
num_horizontal_units);
}
++job_count;
}
// Wait for all jobs of current window to finish.
pending_jobs.Wait();
// Copy |threaded_window_buffer_| to output frame.
CopyPlane<Pixel>(threaded_window_buffer_,
window_buffer_width_ * pixel_size_,
actual_window_width, actual_window_height,
src_buffer + y * src_stride + x * pixel_size_ +
src_unit_buffer_offset,
src_stride);
}
// The first window covered |unit_height_offset| fewer rows, so pull |y| back
// to make the next window start right after it.
if (y == 0) y -= unit_height_offset;
}
// Move the plane's head pointer to where the filtered output was written.
if (!source_buffer_->ShiftBuffer(plane, horizontal_shift, vertical_shift)) {
LIBGAV1_DLOG(ERROR,
"Error shifting frame buffer head pointer at plane: %d",
plane);
return false;
}
}
return true;
}
// Applies loop restoration to every plane of the frame. When a thread pool is
// available the work is delegated to ApplyLoopRestorationThreaded(); otherwise
// each restoration unit is filtered serially in this function.
// Returns true on success and false if shifting the frame buffer head pointer
// fails for a plane.
bool PostFilter::ApplyLoopRestoration() {
if (thread_pool_ != nullptr) {
assert(threaded_window_buffer_ != nullptr);
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
return ApplyLoopRestorationThreaded<uint16_t>();
}
#endif
return ApplyLoopRestorationThreaded<uint8_t>();
}
// When cdef is not applied, make |cdef_buffer_| refer to |source_buffer_| so
// that the code below can read its input through |cdef_buffer_| in both cases.
if (!DoCdef()) cdef_buffer_ = source_buffer_;
const ptrdiff_t block_buffer_stride =
kRestorationProcessingUnitSizeWithBorders * pixel_size_;
// Per-plane size of a processing unit. The chroma entries are reduced by the
// frame's subsampling factors.
const int plane_process_unit_width[kMaxPlanes] = {
kRestorationProcessingUnitSize,
kRestorationProcessingUnitSize >> subsampling_x_,
kRestorationProcessingUnitSize >> subsampling_x_};
const int plane_process_unit_height[kMaxPlanes] = {
kRestorationProcessingUnitSize,
kRestorationProcessingUnitSize >> subsampling_y_,
kRestorationProcessingUnitSize >> subsampling_y_};
// Scratch storage handed to the dsp restoration functions through
// |restoration_buffer|.
IntermediateBuffers intermediate_buffers;
RestorationBuffer restoration_buffer = {
{intermediate_buffers.box_filter.output[0],
intermediate_buffers.box_filter.output[1]},
plane_process_unit_width[kPlaneY],
{intermediate_buffers.box_filter.intermediate_a,
intermediate_buffers.box_filter.intermediate_b},
kRestorationProcessingUnitSizeWithBorders + kRestorationPadding,
intermediate_buffers.wiener,
kMaxSuperBlockSizeInPixels};
for (int plane = kPlaneY; plane < planes_; ++plane) {
if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
// No loop restoration for this plane. If cdef ran, its output still has to
// be copied into the source buffer; otherwise there is nothing to do.
if (!DoCdef()) continue;
if (cdef_buffer_->bitdepth() == 8) {
CopyPlane<uint8_t>(
cdef_buffer_->data(plane), cdef_buffer_->stride(plane),
cdef_buffer_->displayed_width(plane),
cdef_buffer_->displayed_height(plane), source_buffer_->data(plane),
source_buffer_->stride(plane));
#if LIBGAV1_MAX_BITDEPTH >= 10
} else {
CopyPlane<uint16_t>(
cdef_buffer_->data(plane), cdef_buffer_->stride(plane),
cdef_buffer_->displayed_width(plane),
cdef_buffer_->displayed_height(plane), source_buffer_->data(plane),
source_buffer_->stride(plane));
#endif
}
continue;
}
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int unit_height_offset = kRestorationUnitOffset >> subsampling_y;
restoration_buffer.box_filter_process_output_stride =
plane_process_unit_width[plane];
// The three pointers below advance together, one row of restoration units at
// a time, at the bottom of the |unit_row| loop.
uint8_t* src_buffer = source_buffer_->data(plane);
const ptrdiff_t src_stride = source_buffer_->stride(plane);
uint8_t* cdef_buffer = cdef_buffer_->data(plane);
const ptrdiff_t cdef_buffer_stride = cdef_buffer_->stride(plane);
uint8_t* deblock_buffer = source_buffer_->data(plane);
const ptrdiff_t deblock_buffer_stride = source_buffer_->stride(plane);
const int plane_unit_size = loop_restoration_.unit_size[plane];
const int num_vertical_units =
restoration_info_->num_vertical_units(static_cast<Plane>(plane));
const int num_horizontal_units =
restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
const int plane_width =
RightShiftWithRounding(upscaled_width_, subsampling_x);
const int plane_height = RightShiftWithRounding(height_, subsampling_y);
ExtendFrameBoundary(cdef_buffer, plane_width, plane_height,
cdef_buffer_stride, kRestorationBorder,
kRestorationBorder, kRestorationBorder,
kRestorationBorder);
// When cdef is not applied, |deblock_buffer| and |cdef_buffer| are the same
// plane, which has already been extended above.
if (DoCdef()) {
ExtendFrameBoundary(deblock_buffer, plane_width, plane_height,
deblock_buffer_stride, kRestorationBorder,
kRestorationBorder, kRestorationBorder,
kRestorationBorder);
}
// Number of plane rows covered by the restoration unit rows processed so far.
int loop_restored_rows = 0;
const int horizontal_shift = -source_buffer_->alignment() / pixel_size_;
const int vertical_shift = -kRestorationBorder;
// Byte offset from a position in the source plane to the position where its
// filtered value is stored (|vertical_shift| rows, |horizontal_shift|
// pixels).
const ptrdiff_t src_unit_buffer_offset =
vertical_shift * src_stride + horizontal_shift * pixel_size_;
for (int unit_row = 0; unit_row < num_vertical_units; ++unit_row) {
int current_unit_height = plane_unit_size;
// Note [1]: we need to identify the entire restoration area. So the
// condition check of finding the boundary is first. In contrast, Note [2]
// is a case where condition check of the first row is first.
if (unit_row == num_vertical_units - 1) {
// Take care of the last row. The max height of last row units could be
// 3/2 unit_size.
current_unit_height = plane_height - loop_restored_rows;
} else if (unit_row == 0) {
// The size of restoration units in the first row has to subtract the
// height offset.
current_unit_height -= unit_height_offset;
}
for (int unit_column = 0; unit_column < num_horizontal_units;
++unit_column) {
const int unit_id = unit_row * num_horizontal_units + unit_column;
const LoopRestorationType type =
restoration_info_
->loop_restoration_info(static_cast<Plane>(plane), unit_id)
.type;
// Top-left corner of the current restoration unit in each buffer.
uint8_t* src_unit_buffer =
src_buffer + unit_column * plane_unit_size * pixel_size_;
uint8_t* cdef_unit_buffer =
cdef_buffer + unit_column * plane_unit_size * pixel_size_;
uint8_t* deblock_unit_buffer =
deblock_buffer + unit_column * plane_unit_size * pixel_size_;
// Take care of the last column. The max width of last column unit
// could be 3/2 unit_size.
const int current_unit_width =
(unit_column == num_horizontal_units - 1)
? plane_width - plane_unit_size * unit_column
: plane_unit_size;
if (type == kLoopRestorationTypeNone) {
// This unit is not filtered: copy its cdef pixels row by row to the
// shifted output position.
for (int y = 0; y < current_unit_height; ++y) {
memcpy(src_unit_buffer + src_unit_buffer_offset, cdef_unit_buffer,
current_unit_width * pixel_size_);
src_unit_buffer += src_stride;
cdef_unit_buffer += cdef_buffer_stride;
}
continue;
}
assert(type == kLoopRestorationTypeWiener ||
type == kLoopRestorationTypeSgrProj);
// NOTE(review): the "- 2" presumably maps the Wiener/SgrProj enum values
// to indices 0/1 of |dsp_.loop_restorations| -- confirm against the enum
// definition.
const dsp::LoopRestorationFunc restoration_func =
dsp_.loop_restorations[type - 2];
// Walk the restoration unit one row of processing units at a time. |row|
// is advanced at the bottom of the loop body.
for (int row = 0; row < current_unit_height;) {
// Vertical distance to the next row of processing units. The row at the
// top of the frame is shorter by |unit_height_offset|.
const int current_process_unit_height =
plane_process_unit_height[plane] +
((unit_row + row == 0) ? -unit_height_offset : 0);
for (int column = 0; column < current_unit_width;
column += plane_process_unit_width[plane]) {
const int processing_unit_width = std::min(
plane_process_unit_width[plane], current_unit_width - column);
int processing_unit_height = plane_process_unit_height[plane];
// Note [2]: the height of processing units in the first row has
// special cases where the frame height is less than
// plane_process_unit_height[plane].
if (unit_row + row == 0) {
processing_unit_height = std::min(
plane_process_unit_height[plane] - unit_height_offset,
current_unit_height);
} else if (current_unit_height - row <
plane_process_unit_height[plane]) {
// The height of last row of processing units.
processing_unit_height = current_unit_height - row;
}
// We apply in-place loop restoration, by copying the source block
// to a buffer and computing loop restoration on it. The restored
// pixel values are then stored to the frame buffer. However,
// loop restoration requires (a) 3 pixel extension on current 64x64
// processing unit, (b) unrestored pixels.
// To address this, we store the restored pixels not onto the start
// of current block on the source frame buffer, say point A,
// but to its top by three pixels and to the left by
// alignment/pixel_size_ pixels, say point B, such that
// next processing unit can fetch 3 pixel border of unrestored
// values. And we need to adjust the input frame buffer pointer to
// its left and top corner, point B.
uint8_t* const cdef_process_unit_buffer =
cdef_unit_buffer + column * pixel_size_;
uint8_t* const deblock_process_unit_buffer =
deblock_unit_buffer + column * pixel_size_;
const bool frame_top_border = unit_row + row == 0;
const bool frame_bottom_border =
(unit_row == num_vertical_units - 1) &&
(row + current_process_unit_height >= current_unit_height);
// Gather the processing unit into |block_buffer_|.
if (bitdepth_ == 8) {
PrepareLoopRestorationBlock<uint8_t>(
cdef_process_unit_buffer, cdef_buffer_stride,
deblock_process_unit_buffer, deblock_buffer_stride,
block_buffer_, block_buffer_stride, processing_unit_width,
processing_unit_height, frame_top_border,
frame_bottom_border);
} else {
PrepareLoopRestorationBlock<uint16_t>(
cdef_process_unit_buffer, cdef_buffer_stride,
deblock_process_unit_buffer, deblock_buffer_stride,
block_buffer_, block_buffer_stride, processing_unit_width,
processing_unit_height, frame_top_border,
frame_bottom_border);
}
// Filter from |block_buffer_| (skipping its border) into the shifted
// position in the source frame buffer.
restoration_func(
reinterpret_cast<const uint8_t*>(
block_buffer_ + kRestorationBorder * block_buffer_stride +
kRestorationBorder * pixel_size_),
src_unit_buffer + column * pixel_size_ + src_unit_buffer_offset,
restoration_info_->loop_restoration_info(
static_cast<Plane>(plane), unit_id),
block_buffer_stride, src_stride, processing_unit_width,
processing_unit_height, &restoration_buffer);
}
// Advance to the next row of processing units within this unit.
row += current_process_unit_height;
src_unit_buffer += current_process_unit_height * src_stride;
cdef_unit_buffer += current_process_unit_height * cdef_buffer_stride;
deblock_unit_buffer +=
current_process_unit_height * deblock_buffer_stride;
}
}
// Advance to the next row of restoration units.
loop_restored_rows += current_unit_height;
src_buffer += current_unit_height * src_stride;
cdef_buffer += current_unit_height * cdef_buffer_stride;
deblock_buffer += current_unit_height * deblock_buffer_stride;
}
// Adjust frame buffer pointer once a plane is loop restored.
// If loop restoration is applied to a plane, we write the filtered frame
// to the upper-left side of original source_buffer_->data().
// The new buffer pointer is still within the physical frame buffer.
// Here negative shifts are used, to indicate shifting towards the
// upper-left corner. Shifts are in pixels.
if (!source_buffer_->ShiftBuffer(plane, horizontal_shift, vertical_shift)) {
LIBGAV1_DLOG(ERROR,
"Error shifting frame buffer head pointer at plane: %d",
plane);
return false;
}
}
return true;
}
// Applies the deblock filter to the horizontal edges of |plane|, starting at
// the 4x4 position (|row4x4_start|, |column4x4_start|) and covering at most
// kNum4x4InLoopFilterMaskUnit rows of 4x4 blocks (clipped to the frame
// height). |unit_id| selects the entries of |masks_| from which the edge
// bitmasks and filter levels are read. Filtering is done in place on
// |source_buffer_|.
void PostFilter::HorizontalDeblockFilter(Plane plane, int row4x4_start,
int column4x4_start, int unit_id) {
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int row_step = 1 << subsampling_y;
const int column_step = 1 << subsampling_x;
// Byte distance between two horizontally adjacent 4x4 blocks.
const size_t src_step = 4 * pixel_size_;
// Byte distance between two vertically adjacent 4x4 blocks.
const ptrdiff_t row_stride = MultiplyBy4(source_buffer_->stride(plane));
const ptrdiff_t src_stride = source_buffer_->stride(plane);
uint8_t* src = SetBufferOffset(source_buffer_, plane, row4x4_start,
column4x4_start, subsampling_x, subsampling_y);
const uint64_t single_row_mask = 0xffff;
// 3 (11), 5 (0101).
// Bit pattern of two neighboring edges in a row mask: adjacent bits without
// horizontal subsampling, every other bit with it.
const uint64_t two_block_mask = (subsampling_x > 0) ? 5 : 3;
const LoopFilterType type = kLoopFilterTypeHorizontal;
// Subsampled UV samples correspond to the right/bottom position of
// Y samples.
const int column = subsampling_x;
// AV1 smallest transform size is 4x4, thus minimum horizontal edge size is
// 4x4. For SIMD implementation, sse2 could compute 8 pixels at the same time.
// __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time.
// __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could
// filter 2 horizontal edges using sse2 and 4 edges using AVX2.
// The bitmask enables us to call different SIMD implementations to filter
// 1 edge, or 2 edges or 4 edges.
// TODO(chengchen): Here, the implementation only considers 1 and 2 edges.
// Add support for 4 edges. More branches involved, for example, if input is
// 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using
// sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then
// we apply filtering for 2 edges using sse2, and 4 edges using AVX2.
for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ &&
row4x4 < kNum4x4InLoopFilterMaskUnit;
row4x4 += row_step) {
// The first 4x4 row of the frame is skipped (presumably because the top
// frame boundary is not an edge to be filtered).
if (row4x4_start + row4x4 == 0) {
src += row_stride;
continue;
}
// Subsampled UV samples correspond to the right/bottom position of
// Y samples.
const int row = GetDeblockPosition(row4x4, subsampling_y);
const int index = GetIndex(row);
const int shift = GetShift(row, column);
const int level_offset = LoopFilterMask::GetLevelOffset(row, column);
// Mask of current row. mask4x4 represents the vertical filter length for
// the current horizontal edge is 4, and we need to apply 3-tap filtering.
// Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16.
uint64_t mask4x4 =
(masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId4x4, index) >>
shift) &
single_row_mask;
uint64_t mask8x8 =
(masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId8x8, index) >>
shift) &
single_row_mask;
uint64_t mask16x16 =
(masks_->GetTop(unit_id, plane, kLoopFilterTransformSizeId16x16,
index) >>
shift) &
single_row_mask;
// mask4x4, mask8x8, mask16x16 are mutually exclusive.
assert((mask4x4 & mask8x8) == 0 && (mask4x4 & mask16x16) == 0 &&
(mask8x8 & mask16x16) == 0);
// Apply deblock filter for one row.
uint8_t* src_row = src;
int column_offset = 0;
// Bit 0 of every mask always corresponds to the block at |src_row|; all
// masks are shifted together at the bottom of the loop. The loop ends as
// soon as no edge is left in the row.
for (uint64_t mask = mask4x4 | mask8x8 | mask16x16; mask != 0;) {
// Number of edges consumed in this iteration (1 or 2).
int edge_count = 1;
if ((mask & 1) != 0) {
// Filter parameters of current edge.
const uint8_t level = masks_->GetLevel(unit_id, plane, type,
level_offset + column_offset);
int outer_thresh_0;
int inner_thresh_0;
int hev_thresh_0;
GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0,
&hev_thresh_0);
// Filter parameters of next edge. Clip the index to avoid over
// reading at the edge of the block. The values will be unused in that
// case.
const int level_next_index = level_offset + column_offset + column_step;
const uint8_t level_next =
masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff);
int outer_thresh_1;
int inner_thresh_1;
int hev_thresh_1;
GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1,
&hev_thresh_1);
// Since the masks are mutually exclusive, at most one of the three
// branches below is taken.
if ((mask16x16 & 1) != 0) {
const dsp::LoopFilterSize size = (plane == kPlaneY)
? dsp::kLoopFilterSize14
: dsp::kLoopFilterSize6;
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
if ((mask16x16 & two_block_mask) == two_block_mask) {
edge_count = 2;
// Apply filtering for two edges.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
filter_func(src_row + src_step, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
} else {
// Apply single edge filtering.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
}
}
if ((mask8x8 & 1) != 0) {
const dsp::LoopFilterSize size =
plane == kPlaneY ? dsp::kLoopFilterSize8 : dsp::kLoopFilterSize6;
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
if ((mask8x8 & two_block_mask) == two_block_mask) {
edge_count = 2;
// Apply filtering for two edges.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
filter_func(src_row + src_step, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
} else {
// Apply single edge filtering.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
}
}
if ((mask4x4 & 1) != 0) {
const dsp::LoopFilterSize size = dsp::kLoopFilterSize4;
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
if ((mask4x4 & two_block_mask) == two_block_mask) {
edge_count = 2;
// Apply filtering for two edges.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
filter_func(src_row + src_step, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
} else {
// Apply single edge filtering.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
}
}
}
// Skip past the edge(s) just handled, in the masks, in the level table
// offset and in the pixel buffer.
const int step = edge_count * column_step;
mask4x4 >>= step;
mask8x8 >>= step;
mask16x16 >>= step;
mask >>= step;
column_offset += step;
src_row += MultiplyBy4(edge_count) * pixel_size_;
}
src += row_stride;
}
}
// Applies the deblock filter to the vertical edges of |plane|, starting at the
// 4x4 position (|row4x4_start|, |column4x4_start|) and covering at most
// kNum4x4InLoopFilterMaskUnit rows of 4x4 blocks (clipped to the frame
// height). Two rows of 4x4 blocks are handled per outer loop iteration.
// |unit_id| selects the entries of |masks_| from which the edge bitmasks and
// filter levels are read. Filtering is done in place on |source_buffer_|.
void PostFilter::VerticalDeblockFilter(Plane plane, int row4x4_start,
int column4x4_start, int unit_id) {
const int8_t subsampling_x = (plane == kPlaneY) ? 0 : subsampling_x_;
const int8_t subsampling_y = (plane == kPlaneY) ? 0 : subsampling_y_;
const int row_step = 1 << subsampling_y;
const int two_row_step = row_step << 1;
const int column_step = 1 << subsampling_x;
// Byte distance between two horizontally adjacent 4x4 blocks.
const size_t src_step = (bitdepth_ == 8) ? 4 : 4 * sizeof(uint16_t);
// Byte distance between two vertically adjacent 4x4 blocks.
const ptrdiff_t row_stride = MultiplyBy4(source_buffer_->stride(plane));
const ptrdiff_t two_row_stride = row_stride << 1;
const ptrdiff_t src_stride = source_buffer_->stride(plane);
uint8_t* src = SetBufferOffset(source_buffer_, plane, row4x4_start,
column4x4_start, subsampling_x, subsampling_y);
const uint64_t single_row_mask = 0xffff;
const LoopFilterType type = kLoopFilterTypeVertical;
// Subsampled UV samples correspond to the right/bottom position of
// Y samples.
const int column = subsampling_x;
// AV1 smallest transform size is 4x4, thus minimum vertical edge size is 4x4.
// For SIMD implementation, sse2 could compute 8 pixels at the same time.
// __m128i = 8 x uint16_t, AVX2 could compute 16 pixels at the same time.
// __m256i = 16 x uint16_t, assuming pixel type is 16 bit. It means we could
// filter 2 vertical edges using sse2 and 4 edges using AVX2.
// The bitmask enables us to call different SIMD implementations to filter
// 1 edge, or 2 edges or 4 edges.
// TODO(chengchen): Here, the implementation only considers 1 and 2 edges.
// Add support for 4 edges. More branches involved, for example, if input is
// 8 bit, __m128i = 16 x 8 bit, we could apply filtering for 4 edges using
// sse2, 8 edges using AVX2. If input is 16 bit, __m128 = 8 x 16 bit, then
// we apply filtering for 2 edges using sse2, and 4 edges using AVX2.
for (int row4x4 = 0; MultiplyBy4(row4x4_start + row4x4) < height_ &&
row4x4 < kNum4x4InLoopFilterMaskUnit;
row4x4 += two_row_step) {
// Subsampled UV samples correspond to the right/bottom position of
// Y samples.
const int row = GetDeblockPosition(row4x4, subsampling_y);
const int row_next = row + row_step;
const int index = GetIndex(row);
const int shift = GetShift(row, column);
const int level_offset = LoopFilterMask::GetLevelOffset(row, column);
const int index_next = GetIndex(row_next);
const int shift_next_row = GetShift(row_next, column);
const int level_offset_next_row =
LoopFilterMask::GetLevelOffset(row_next, column);
// TODO(chengchen): replace 0, 1, 2 to meaningful enum names.
// mask of current row. mask4x4 represents the horizontal filter length for
// the current vertical edge is 4, and we need to apply 3-tap filtering.
// Similarly, mask8x8 and mask16x16 represent filter lengths are 8 and 16.
uint64_t mask4x4_0 =
(masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4,
index) >>
shift) &
single_row_mask;
uint64_t mask8x8_0 =
(masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8,
index) >>
shift) &
single_row_mask;
uint64_t mask16x16_0 =
(masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16,
index) >>
shift) &
single_row_mask;
// mask4x4, mask8x8, mask16x16 are mutually exclusive.
assert((mask4x4_0 & mask8x8_0) == 0 && (mask4x4_0 & mask16x16_0) == 0 &&
(mask8x8_0 & mask16x16_0) == 0);
// mask of the next row. With mask of current and the next row, we can call
// the corresponding SIMD function to apply filtering for two vertical
// edges together.
uint64_t mask4x4_1 =
(masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId4x4,
index_next) >>
shift_next_row) &
single_row_mask;
uint64_t mask8x8_1 =
(masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId8x8,
index_next) >>
shift_next_row) &
single_row_mask;
uint64_t mask16x16_1 =
(masks_->GetLeft(unit_id, plane, kLoopFilterTransformSizeId16x16,
index_next) >>
shift_next_row) &
single_row_mask;
// mask4x4, mask8x8, mask16x16 are mutually exclusive.
assert((mask4x4_1 & mask8x8_1) == 0 && (mask4x4_1 & mask16x16_1) == 0 &&
(mask8x8_1 & mask16x16_1) == 0);
// Apply deblock filter for two rows.
uint8_t* src_row = src;
int column_offset = 0;
// Bit 0 of every mask always corresponds to the block at |src_row| (or the
// block below it for the *_1 masks); all masks are shifted together at the
// bottom of the loop. The loop ends as soon as no edge is left in either
// row.
for (uint64_t mask = mask4x4_0 | mask8x8_0 | mask16x16_0 | mask4x4_1 |
mask8x8_1 | mask16x16_1;
mask != 0;) {
if ((mask & 1) != 0) {
// Filter parameters of current row.
const uint8_t level = masks_->GetLevel(unit_id, plane, type,
level_offset + column_offset);
int outer_thresh_0;
int inner_thresh_0;
int hev_thresh_0;
GetDeblockFilterParams(level, &outer_thresh_0, &inner_thresh_0,
&hev_thresh_0);
// Filter parameters of next row. Clip the index to avoid over
// reading at the edge of the block. The values will be unused in that
// case.
const int level_next_index = level_offset_next_row + column_offset;
const uint8_t level_next =
masks_->GetLevel(unit_id, plane, type, level_next_index & 0xff);
int outer_thresh_1;
int inner_thresh_1;
int hev_thresh_1;
GetDeblockFilterParams(level_next, &outer_thresh_1, &inner_thresh_1,
&hev_thresh_1);
// Same column, one row of 4x4 blocks further down.
uint8_t* const src_row_next = src_row + row_stride;
// For each filter size: filter both rows if both have an edge here,
// otherwise only the row whose mask bit is set.
if (((mask16x16_0 | mask16x16_1) & 1) != 0) {
const dsp::LoopFilterSize size = (plane == kPlaneY)
? dsp::kLoopFilterSize14
: dsp::kLoopFilterSize6;
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
if ((mask16x16_0 & mask16x16_1 & 1) != 0) {
// Apply dual vertical edge filtering.
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
filter_func(src_row_next, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
} else if ((mask16x16_0 & 1) != 0) {
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
} else {
filter_func(src_row_next, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
}
}
if (((mask8x8_0 | mask8x8_1) & 1) != 0) {
const dsp::LoopFilterSize size = (plane == kPlaneY)
? dsp::kLoopFilterSize8
: dsp::kLoopFilterSize6;
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
if ((mask8x8_0 & mask8x8_1 & 1) != 0) {
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
filter_func(src_row_next, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
} else if ((mask8x8_0 & 1) != 0) {
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
} else {
filter_func(src_row_next, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
}
}
if (((mask4x4_0 | mask4x4_1) & 1) != 0) {
const dsp::LoopFilterSize size = dsp::kLoopFilterSize4;
const dsp::LoopFilterFunc filter_func = dsp_.loop_filters[size][type];
if ((mask4x4_0 & mask4x4_1 & 1) != 0) {
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
filter_func(src_row_next, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
} else if ((mask4x4_0 & 1) != 0) {
filter_func(src_row, src_stride, outer_thresh_0, inner_thresh_0,
hev_thresh_0);
} else {
filter_func(src_row_next, src_stride, outer_thresh_1,
inner_thresh_1, hev_thresh_1);
}
}
}
// Move one 4x4 column to the right in the masks, in the level table offset
// and in the pixel buffer.
mask4x4_0 >>= column_step;
mask8x8_0 >>= column_step;
mask16x16_0 >>= column_step;
mask4x4_1 >>= column_step;
mask8x8_1 >>= column_step;
mask16x16_1 >>= column_step;
mask >>= column_step;
column_offset += column_step;
src_row += src_step;
}
// Two rows of 4x4 blocks were handled in this iteration.
src += two_row_stride;
}
}
// Fills the |inner_thresh_|, |outer_thresh_| and |hev_thresh_| lookup tables
// with one entry per loop filter level in [0, kMaxLoopFilterValue]. The inner
// threshold depends on the frame's loop filter sharpness; the other two are
// derived from the level (and, for the outer threshold, the inner threshold).
void PostFilter::InitDeblockFilterParams() {
const int8_t sharpness = frame_header_.loop_filter.sharpness;
assert(0 <= sharpness && sharpness < 8);
// ceil(sharpness / 4.0)
const int level_shift = DivideBy4(sharpness + 3);
// Upper bound of the inner threshold. Only applied when sharpness is nonzero.
const int sharp_limit_max = 9 - sharpness;
int level = 0;
while (level <= kMaxLoopFilterValue) {
uint8_t inner = level >> level_shift;
if (sharpness <= 0) {
// No sharpness: only enforce the lower bound of 1.
inner = std::max(inner, static_cast<uint8_t>(1));
} else {
inner = Clip3(inner, 1, sharp_limit_max);
}
inner_thresh_[level] = inner;
outer_thresh_[level] = 2 * (level + 2) + inner;
hev_thresh_[level] = level >> 4;
++level;
}
}
// Reads the deblock filter thresholds stored for |level| in the
// |outer_thresh_|, |inner_thresh_| and |hev_thresh_| tables and writes them to
// the corresponding output pointers. None of the output pointers may be null.
void PostFilter::GetDeblockFilterParams(uint8_t level, int* outer_thresh,
int* inner_thresh,
int* hev_thresh) const {
const int table_index = level;
outer_thresh[0] = outer_thresh_[table_index];
inner_thresh[0] = inner_thresh_[table_index];
hev_thresh[0] = hev_thresh_[table_index];
}
} // namespace libgav1