| // Copyright 2020 The libgav1 Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| #include "src/post_filter.h" |
| #include "src/utils/blocking_counter.h" |
| |
| namespace libgav1 { |
| namespace { |
| |
| template <typename Pixel> |
| void CopyTwoRows(const Pixel* src, const ptrdiff_t src_stride, Pixel** dst, |
| const ptrdiff_t dst_stride, const int width) { |
| for (int i = 0; i < kRestorationVerticalBorder; ++i) { |
| memcpy(*dst, src, sizeof(Pixel) * width); |
| src += src_stride; |
| *dst += dst_stride; |
| } |
| } |
| |
| } // namespace |
| |
| // static |
| template <typename Pixel> |
| void PostFilter::PrepareLoopRestorationBlock( |
| const Pixel* src_buffer, const ptrdiff_t src_stride, |
| const Pixel* deblock_buffer, const ptrdiff_t deblock_stride, Pixel* dst, |
| const ptrdiff_t dst_stride, const int width, const int height, |
| const bool frame_top_border, const bool frame_bottom_border) { |
| src_buffer -= |
| kRestorationVerticalBorder * src_stride + kRestorationHorizontalBorder; |
| deblock_buffer -= kRestorationHorizontalBorder; |
| int h = height; |
| // Top 2 rows. |
| if (frame_top_border) { |
| h += kRestorationVerticalBorder; |
| } else { |
| CopyTwoRows<Pixel>(deblock_buffer, deblock_stride, &dst, dst_stride, |
| width + 2 * kRestorationHorizontalBorder); |
| src_buffer += kRestorationVerticalBorder * src_stride; |
| // If |frame_top_border| is true, then we are in the first superblock row, |
| // so in that case, do not increment |deblock_buffer| since we don't store |
| // anything from the first superblock row into |deblock_buffer|. |
| deblock_buffer += 4 * deblock_stride; |
| } |
| if (frame_bottom_border) h += kRestorationVerticalBorder; |
| // Main body. |
| do { |
| memcpy(dst, src_buffer, |
| sizeof(Pixel) * (width + 2 * kRestorationHorizontalBorder)); |
| src_buffer += src_stride; |
| dst += dst_stride; |
| } while (--h != 0); |
| // Bottom 2 rows. |
| if (!frame_bottom_border) { |
| deblock_buffer += kRestorationVerticalBorder * deblock_stride; |
| CopyTwoRows<Pixel>(deblock_buffer, deblock_stride, &dst, dst_stride, |
| width + 2 * kRestorationHorizontalBorder); |
| } |
| } |
| |
| template void PostFilter::PrepareLoopRestorationBlock<uint8_t>( |
| const uint8_t* src_buffer, ptrdiff_t src_stride, |
| const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dst, |
| ptrdiff_t dst_stride, const int width, const int height, |
| const bool frame_top_border, const bool frame_bottom_border); |
| |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| template void PostFilter::PrepareLoopRestorationBlock<uint16_t>( |
| const uint16_t* src_buffer, ptrdiff_t src_stride, |
| const uint16_t* deblock_buffer, ptrdiff_t deblock_stride, uint16_t* dst, |
| ptrdiff_t dst_stride, const int width, const int height, |
| const bool frame_top_border, const bool frame_bottom_border); |
| #endif |
| |
| template <typename Pixel> |
| void PostFilter::ApplyLoopRestorationForOneRowInWindow( |
| const Pixel* src_buffer, const Plane plane, const int plane_height, |
| const int plane_width, const int y, const int x, const int row, |
| const int unit_row, const int current_process_unit_height, |
| const int plane_unit_size, const int window_width, |
| Array2DView<Pixel>* const loop_restored_window) { |
| const int num_horizontal_units = |
| restoration_info_->num_horizontal_units(static_cast<Plane>(plane)); |
| const ptrdiff_t src_stride = frame_buffer_.stride(plane) / sizeof(Pixel); |
| const RestorationUnitInfo* const restoration_info = |
| restoration_info_->loop_restoration_info(static_cast<Plane>(plane), |
| unit_row * num_horizontal_units); |
| int unit_column = x / plane_unit_size; |
| src_buffer += (y + row) * src_stride + x; |
| int column = 0; |
| do { |
| const int unit_x = x + column; |
| const int unit_y = y + row; |
| const int current_process_unit_width = |
| std::min(plane_unit_size, plane_width - unit_x); |
| const Pixel* src = src_buffer + column; |
| unit_column = std::min(unit_column, num_horizontal_units - 1); |
| if (restoration_info[unit_column].type == kLoopRestorationTypeNone) { |
| const ptrdiff_t dst_stride = loop_restored_window->columns(); |
| Pixel* dst = &(*loop_restored_window)[row][column]; |
| for (int k = 0; k < current_process_unit_height; ++k) { |
| if (DoCdef()) { |
| memmove(dst, src, current_process_unit_width * sizeof(Pixel)); |
| } else { |
| memcpy(dst, src, current_process_unit_width * sizeof(Pixel)); |
| } |
| src += src_stride; |
| dst += dst_stride; |
| } |
| } else { |
| const ptrdiff_t block_buffer_stride = kRestorationUnitWidthWithBorders; |
| // The SIMD implementation of wiener filter over-reads 15 - |
| // |kRestorationHorizontalBorder| bytes, and the SIMD implementation of |
| // self-guided filter over-reads up to 7 bytes which happens when |
| // |current_process_unit_width| equals |kRestorationUnitWidth| - 7, and |
| // the radius of the first pass in sfg is 0. So add 8 extra bytes at the |
| // end of block_buffer for 8 bit. |
| Pixel |
| block_buffer[kRestorationUnitHeightWithBorders * block_buffer_stride + |
| ((sizeof(Pixel) == 1) ? 15 - kRestorationHorizontalBorder |
| : 0)]; |
| RestorationBuffer restoration_buffer; |
| const Pixel* source; |
| ptrdiff_t source_stride; |
| if (DoCdef()) { |
| const int deblock_buffer_units = 64 >> subsampling_y_[plane]; |
| const auto* const deblock_buffer = |
| reinterpret_cast<const Pixel*>(deblock_buffer_.data(plane)); |
| assert(deblock_buffer != nullptr); |
| const ptrdiff_t deblock_buffer_stride = |
| deblock_buffer_.stride(plane) / sizeof(Pixel); |
| const int deblock_unit_y = |
| std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0); |
| const Pixel* const deblock_unit_buffer = |
| deblock_buffer + deblock_unit_y * deblock_buffer_stride + unit_x; |
| PrepareLoopRestorationBlock<Pixel>( |
| src, src_stride, deblock_unit_buffer, deblock_buffer_stride, |
| block_buffer, block_buffer_stride, current_process_unit_width, |
| current_process_unit_height, unit_y == 0, |
| unit_y + current_process_unit_height >= plane_height); |
| source = block_buffer + |
| kRestorationVerticalBorder * block_buffer_stride + |
| kRestorationHorizontalBorder; |
| source_stride = kRestorationUnitWidthWithBorders; |
| } else { |
| source = src; |
| source_stride = src_stride; |
| } |
| const LoopRestorationType type = restoration_info[unit_column].type; |
| assert(type == kLoopRestorationTypeSgrProj || |
| type == kLoopRestorationTypeWiener); |
| const dsp::LoopRestorationFunc restoration_func = |
| dsp_.loop_restorations[type - 2]; |
| restoration_func(source, &(*loop_restored_window)[row][column], |
| restoration_info[unit_column], source_stride, |
| loop_restored_window->columns(), |
| current_process_unit_width, current_process_unit_height, |
| &restoration_buffer); |
| } |
| ++unit_column; |
| column += plane_unit_size; |
| } while (column < window_width); |
| } |
| |
| template <typename Pixel> |
| void PostFilter::ApplyLoopRestorationSingleThread(const int row4x4_start, |
| const int sb4x4) { |
| assert(row4x4_start >= 0); |
| assert(DoRestoration()); |
| for (int plane = 0; plane < planes_; ++plane) { |
| if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { |
| continue; |
| } |
| const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel); |
| const int unit_height_offset = |
| kRestorationUnitOffset >> subsampling_y_[plane]; |
| const int plane_height = |
| RightShiftWithRounding(height_, subsampling_y_[plane]); |
| const int plane_width = |
| RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]); |
| const int num_vertical_units = |
| restoration_info_->num_vertical_units(static_cast<Plane>(plane)); |
| const int plane_unit_size = loop_restoration_.unit_size[plane]; |
| const int plane_process_unit_height = |
| kRestorationUnitHeight >> subsampling_y_[plane]; |
| int y = (row4x4_start == 0) |
| ? 0 |
| : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) - |
| unit_height_offset; |
| int expected_height = plane_process_unit_height - |
| ((row4x4_start == 0) ? unit_height_offset : 0); |
| int current_process_unit_height; |
| for (int sb_y = 0; sb_y < sb4x4; |
| sb_y += 16, y += current_process_unit_height) { |
| if (y >= plane_height) break; |
| const int unit_row = std::min((y + unit_height_offset) / plane_unit_size, |
| num_vertical_units - 1); |
| current_process_unit_height = std::min(expected_height, plane_height - y); |
| expected_height = plane_process_unit_height; |
| Array2DView<Pixel> loop_restored_window( |
| current_process_unit_height, static_cast<int>(stride), |
| reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) + |
| y * stride); |
| ApplyLoopRestorationForOneRowInWindow<Pixel>( |
| reinterpret_cast<Pixel*>(superres_buffer_[plane]), |
| static_cast<Plane>(plane), plane_height, plane_width, y, 0, 0, |
| unit_row, current_process_unit_height, plane_unit_size, plane_width, |
| &loop_restored_window); |
| } |
| } |
| } |
| |
| // Multi-thread version of loop restoration, based on a moving window of size |
| // |window_buffer_width_|x|window_buffer_height_|. Inside the moving window, we |
| // create a filtering job for each row and each filtering job is submitted to |
| // the thread pool. Each free thread takes one job from the thread pool and |
| // completes filtering until all jobs are finished. This approach requires an |
| // extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose |
| // size is the size of the window. It also needs block buffers (i.e., |
| // |block_buffer| in ApplyLoopRestorationForOneRowInWindow()) to store |
| // intermediate results in loop restoration for each thread. After all units |
| // inside the window are filtered, the output is written to the frame buffer. |
| template <typename Pixel> |
| void PostFilter::ApplyLoopRestorationThreaded() { |
| const int plane_process_unit_height[kMaxPlanes] = { |
| kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU], |
| kRestorationUnitHeight >> subsampling_y_[kPlaneV]}; |
| Array2DView<Pixel> loop_restored_window; |
| if (!DoCdef()) { |
| loop_restored_window.Reset( |
| window_buffer_height_, window_buffer_width_, |
| reinterpret_cast<Pixel*>(threaded_window_buffer_)); |
| } |
| |
| for (int plane = kPlaneY; plane < planes_; ++plane) { |
| if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { |
| continue; |
| } |
| |
| const int unit_height_offset = |
| kRestorationUnitOffset >> subsampling_y_[plane]; |
| auto* const src_buffer = reinterpret_cast<Pixel*>(superres_buffer_[plane]); |
| const ptrdiff_t src_stride = frame_buffer_.stride(plane) / sizeof(Pixel); |
| const int plane_unit_size = loop_restoration_.unit_size[plane]; |
| const int num_vertical_units = |
| restoration_info_->num_vertical_units(static_cast<Plane>(plane)); |
| const int plane_width = |
| RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]); |
| const int plane_height = |
| RightShiftWithRounding(height_, subsampling_y_[plane]); |
| PostFilter::ExtendFrame<Pixel>( |
| src_buffer, plane_width, plane_height, src_stride, |
| kRestorationHorizontalBorder, kRestorationHorizontalBorder, |
| kRestorationVerticalBorder, kRestorationVerticalBorder); |
| |
| const int num_workers = thread_pool_->num_threads(); |
| for (int y = 0; y < plane_height; y += window_buffer_height_) { |
| const int actual_window_height = |
| std::min(window_buffer_height_ - ((y == 0) ? unit_height_offset : 0), |
| plane_height - y); |
| int vertical_units_per_window = |
| (actual_window_height + plane_process_unit_height[plane] - 1) / |
| plane_process_unit_height[plane]; |
| if (y == 0) { |
| // The first row of loop restoration processing units is not 64x64, but |
| // 64x56 (|unit_height_offset| = 8 rows less than other restoration |
| // processing units). For u/v with subsampling, the size is halved. To |
| // compute the number of vertical units per window, we need to take a |
| // special handling for it. |
| const int height_without_first_unit = |
| actual_window_height - |
| std::min(actual_window_height, |
| plane_process_unit_height[plane] - unit_height_offset); |
| vertical_units_per_window = |
| (height_without_first_unit + plane_process_unit_height[plane] - 1) / |
| plane_process_unit_height[plane] + |
| 1; |
| } |
| const int jobs_for_threadpool = |
| vertical_units_per_window * num_workers / (num_workers + 1); |
| for (int x = 0; x < plane_width; x += window_buffer_width_) { |
| const int actual_window_width = |
| std::min(window_buffer_width_, plane_width - x); |
| assert(jobs_for_threadpool < vertical_units_per_window); |
| if (DoCdef()) { |
| loop_restored_window.Reset( |
| actual_window_height, static_cast<int>(src_stride), |
| reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) + |
| y * src_stride + x); |
| } |
| BlockingCounter pending_jobs(jobs_for_threadpool); |
| int job_count = 0; |
| int current_process_unit_height; |
| for (int row = 0; row < actual_window_height; |
| row += current_process_unit_height) { |
| const int unit_y = y + row; |
| const int expected_height = plane_process_unit_height[plane] - |
| ((unit_y == 0) ? unit_height_offset : 0); |
| current_process_unit_height = |
| std::min(expected_height, plane_height - unit_y); |
| const int unit_row = |
| std::min((unit_y + unit_height_offset) / plane_unit_size, |
| num_vertical_units - 1); |
| |
| if (job_count < jobs_for_threadpool) { |
| thread_pool_->Schedule( |
| [this, src_buffer, plane, plane_height, plane_width, y, x, row, |
| unit_row, current_process_unit_height, plane_unit_size, |
| actual_window_width, &loop_restored_window, &pending_jobs]() { |
| ApplyLoopRestorationForOneRowInWindow<Pixel>( |
| src_buffer, static_cast<Plane>(plane), plane_height, |
| plane_width, y, x, row, unit_row, |
| current_process_unit_height, plane_unit_size, |
| actual_window_width, &loop_restored_window); |
| pending_jobs.Decrement(); |
| }); |
| } else { |
| ApplyLoopRestorationForOneRowInWindow<Pixel>( |
| src_buffer, static_cast<Plane>(plane), plane_height, |
| plane_width, y, x, row, unit_row, current_process_unit_height, |
| plane_unit_size, actual_window_width, &loop_restored_window); |
| } |
| ++job_count; |
| } |
| // Wait for all jobs of current window to finish. |
| pending_jobs.Wait(); |
| if (!DoCdef()) { |
| // Copy |threaded_window_buffer_| to output frame. |
| CopyPlane<Pixel>( |
| reinterpret_cast<const Pixel*>(threaded_window_buffer_), |
| window_buffer_width_, actual_window_width, actual_window_height, |
| reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) + |
| y * src_stride + x, |
| src_stride); |
| } |
| } |
| if (y == 0) y -= unit_height_offset; |
| } |
| } |
| } |
| |
| void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) { |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| if (bitdepth_ >= 10) { |
| ApplyLoopRestorationSingleThread<uint16_t>(row4x4_start, sb4x4); |
| return; |
| } |
| #endif |
| ApplyLoopRestorationSingleThread<uint8_t>(row4x4_start, sb4x4); |
| } |
| |
| void PostFilter::ApplyLoopRestoration() { |
| assert(threaded_window_buffer_ != nullptr); |
| #if LIBGAV1_MAX_BITDEPTH >= 10 |
| if (bitdepth_ >= 10) { |
| ApplyLoopRestorationThreaded<uint16_t>(); |
| return; |
| } |
| #endif |
| ApplyLoopRestorationThreaded<uint8_t>(); |
| } |
| |
| } // namespace libgav1 |