blob: e40c692d53ba0ab4baa228375a8c4840d76eb818 [file] [log] [blame]
// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/decoder_impl.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <iterator>
#include <new>
#include <utility>
#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/film_grain.h"
#include "src/frame_buffer_utils.h"
#include "src/frame_scratch_buffer.h"
#include "src/loop_restoration_info.h"
#include "src/obu_parser.h"
#include "src/post_filter.h"
#include "src/prediction_mask.h"
#include "src/quantizer.h"
#include "src/threading_strategy.h"
#include "src/utils/blocking_counter.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
#include "src/utils/parameter_tree.h"
#include "src/utils/raw_bit_reader.h"
#include "src/utils/segmentation.h"
#include "src/utils/threadpool.h"
#include "src/yuv_buffer.h"
namespace libgav1 {
namespace {
// Maximum superblock dimensions in units of 4x4 blocks: a 128x128 superblock
// spans 128/4 = 32 4x4 blocks in each direction. (Presumably used by decoding
// code later in this file — not referenced in this chunk.)
constexpr int kMaxBlockWidth4x4 = 32;
constexpr int kMaxBlockHeight4x4 = 32;
// Computes the bottom border size in pixels. If CDEF, loop restoration or
// SuperRes is enabled, adds extra border pixels to facilitate those steps to
// happen nearly in-place (a few extra rows instead of an entire frame buffer).
// The logic in this function should match the corresponding logic for
// |vertical_shift| in the PostFilter constructor.
int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
                          const bool do_superres, const int subsampling_y) {
  // CDEF demands its own border; when CDEF is enabled, loop restoration is
  // safe without an extra border of its own. Only when CDEF is off does
  // restoration contribute a border.
  int extra_border =
      do_cdef ? kCdefBorder : (do_restoration ? kRestorationVerticalBorder : 0);
  if (do_superres) extra_border += kSuperResVerticalBorder;
  // A subsampled bottom border covers twice as many luma rows, so double the
  // extra pixel count when |subsampling_y| is 1.
  extra_border <<= subsampling_y;
  // The total border must be a multiple of 2.
  return Align(kBorderPixels + extra_border, 2);
}
// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
// |count| condition variables in
// |frame_scratch_buffer->superblock_row_progress_condvar|, so that waiters
// wake up, observe the failure and exit.
void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
                            int count) {
  {
    // Write the flag under the mutex so threads that re-check it after waking
    // are guaranteed to observe the updated value.
    std::lock_guard<std::mutex> lock(
        frame_scratch_buffer->superblock_row_mutex);
    frame_scratch_buffer->tile_decoding_failed = true;
  }
  std::condition_variable* const row_condvars =
      frame_scratch_buffer->superblock_row_progress_condvar.get();
  int i = 0;
  while (i < count) {
    row_condvars[i].notify_one();
    ++i;
  }
}
// Helper class that releases the frame scratch buffer in the destructor.
class FrameScratchBufferReleaser {
public:
FrameScratchBufferReleaser(
FrameScratchBufferPool* frame_scratch_buffer_pool,
std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
: frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
frame_scratch_buffer_(frame_scratch_buffer) {}
~FrameScratchBufferReleaser() {
frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
}
private:
FrameScratchBufferPool* const frame_scratch_buffer_pool_;
std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
};
// Sets the |frame|'s segmentation map for two cases. The third case
// (segmentation enabled with update_map set) is handled in
// Tile::DecodeBlock().
void SetSegmentationMap(const ObuFrameHeader& frame_header,
                        const SegmentationMap* prev_segment_ids,
                        RefCountedBuffer* const frame) {
  if (!frame_header.segmentation.enabled) {
    // Segmentation is off: every segment_id is 0.
    frame->segmentation_map()->Clear();
    return;
  }
  // When update_map is set, the map is populated during block decoding.
  if (frame_header.segmentation.update_map) return;
  // Carry the segmentation map over from the previous frame. A null
  // |prev_segment_ids| is treated as a map containing all 0s.
  if (prev_segment_ids == nullptr) {
    frame->segmentation_map()->Clear();
  } else {
    frame->segmentation_map()->CopyFrom(*prev_segment_ids);
  }
}
// Decodes all tiles on the calling thread, interleaving tile decoding and
// post filtering one superblock row at a time so the filters can run nearly
// in-place over the frame buffer.
StatusCode DecodeTilesNonFrameParallel(
    const ObuSequenceHeader& sequence_header,
    const ObuFrameHeader& frame_header,
    const Vector<std::unique_ptr<Tile>>& tiles,
    FrameScratchBuffer* const frame_scratch_buffer,
    PostFilter* const post_filter) {
  // Decode in superblock row order.
  // A superblock spans 32 4x4 blocks when 128x128 superblocks are in use and
  // 16 otherwise.
  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
  if (tile_scratch_buffer == nullptr) return kLibgav1StatusOutOfMemory;
  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
       row4x4 += block_width4x4) {
    // Parse and decode this superblock row in every tile before filtering it.
    for (const auto& tile_ptr : tiles) {
      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
              row4x4, tile_scratch_buffer.get())) {
        return kLibgav1StatusUnknownError;
      }
    }
    // Apply all post filters (including deblocking) for this superblock row.
    // The third argument flags the last row of the frame.
    post_filter->ApplyFilteringForOneSuperBlockRow(
        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
        /*do_deblock=*/true);
  }
  // Return the scratch buffer to the pool for reuse. (On the early-return
  // error paths above the buffer is simply destroyed instead.)
  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
      std::move(tile_scratch_buffer));
  return kStatusOk;
}
// Decodes all tiles using the tile thread pool workers plus the calling
// thread. Tile indices are handed out through the atomic |tile_counter|; once
// every tile has been parsed and decoded, post filtering is applied with the
// post filter thread pool.
StatusCode DecodeTilesThreadedNonFrameParallel(
    const Vector<std::unique_ptr<Tile>>& tiles,
    FrameScratchBuffer* const frame_scratch_buffer,
    PostFilter* const post_filter,
    BlockingCounterWithStatus* const pending_tiles) {
  ThreadingStrategy& threading_strategy =
      frame_scratch_buffer->threading_strategy;
  const int num_workers = threading_strategy.tile_thread_count();
  BlockingCounterWithStatus pending_workers(num_workers);
  std::atomic<int> tile_counter(0);
  const int tile_count = static_cast<int>(tiles.size());
  bool tile_decoding_failed = false;
  // Submit tile decoding jobs to the thread pool.
  for (int i = 0; i < num_workers; ++i) {
    threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
                                                     &tile_counter,
                                                     &pending_workers,
                                                     &pending_tiles]() {
      bool failed = false;
      int index;
      // Claim the next unprocessed tile index until all tiles are taken.
      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
             tile_count) {
        if (!failed) {
          const auto& tile_ptr = tiles[index];
          if (!tile_ptr->ParseAndDecode()) {
            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
            failed = true;
          }
        } else {
          // After a failure, keep draining the remaining indices so that
          // |pending_tiles| still reaches zero and the Wait() below returns.
          pending_tiles->Decrement(false);
        }
      }
      pending_workers.Decrement(!failed);
    });
  }
  // Have the current thread partake in tile decoding.
  int index;
  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
         tile_count) {
    if (!tile_decoding_failed) {
      const auto& tile_ptr = tiles[index];
      if (!tile_ptr->ParseAndDecode()) {
        LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
        tile_decoding_failed = true;
      }
    } else {
      pending_tiles->Decrement(false);
    }
  }
  // Wait until all the workers are done. This ensures that all the tiles have
  // been parsed.
  tile_decoding_failed |= !pending_workers.Wait();
  // Wait until all the tiles have been decoded.
  tile_decoding_failed |= !pending_tiles->Wait();
  if (tile_decoding_failed) return kStatusUnknownError;
  assert(threading_strategy.post_filter_thread_pool() != nullptr);
  post_filter->ApplyFilteringThreaded();
  return kStatusOk;
}
// Frame parallel, single-threaded within the frame: parses every tile, marks
// the frame as parsed (publishing the frame context and segmentation map),
// then decodes and post filters the frame one superblock row at a time,
// publishing row progress so that frames which reference this one can proceed.
StatusCode DecodeTilesFrameParallel(
    const ObuSequenceHeader& sequence_header,
    const ObuFrameHeader& frame_header,
    const Vector<std::unique_ptr<Tile>>& tiles,
    const SymbolDecoderContext& saved_symbol_decoder_context,
    const SegmentationMap* const prev_segment_ids,
    FrameScratchBuffer* const frame_scratch_buffer,
    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
  // Parse the frame.
  for (const auto& tile : tiles) {
    if (!tile->Parse()) {
      LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
      return kStatusUnknownError;
    }
  }
  // When frame-end CDF update is enabled, the frame context comes from the
  // saved context rather than the post-parse state.
  if (frame_header.enable_frame_end_update_cdf) {
    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
  }
  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
  // Mark frame as parsed.
  current_frame->SetFrameState(kFrameStateParsed);
  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
  if (tile_scratch_buffer == nullptr) {
    return kStatusOutOfMemory;
  }
  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
  // Decode in superblock row order (inter prediction in the Tile class will
  // block until the required superblocks in the reference frame are decoded).
  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
       row4x4 += block_width4x4) {
    for (const auto& tile_ptr : tiles) {
      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
              row4x4, tile_scratch_buffer.get())) {
        LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
                     tile_ptr->number());
        return kStatusUnknownError;
      }
    }
    // Post filter the row and publish progress (a negative progress_row means
    // no new fully-filtered row is available yet).
    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
        /*do_deblock=*/true);
    if (progress_row >= 0) {
      current_frame->SetProgress(progress_row);
    }
  }
  // Mark frame as decoded (we no longer care about row-level progress since the
  // entire frame has been decoded).
  current_frame->SetFrameState(kFrameStateDecoded);
  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
      std::move(tile_scratch_buffer));
  return kStatusOk;
}
// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
// deblocking filter for tile boundaries for the superblock row at |row4x4|.
// The interior of each tile is deblocked by the worker threads; this function
// only covers the column ranges that straddle tile boundaries.
void ApplyDeblockingFilterForTileBoundaries(
    PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
    const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
    int tile_columns, bool decode_entire_tiles_in_worker_threads) {
  // Apply vertical deblock filtering for the first 64 columns of each tile.
  for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
    const Tile& tile = *tile_row_base[tile_column];
    post_filter->ApplyDeblockFilter(
        kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
        tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
  }
  if (decode_entire_tiles_in_worker_threads &&
      row4x4 == tile_row_base[0]->row4x4_start()) {
    // This is the first superblock row of a tile row. In this case, apply
    // horizontal deblock filtering for the entire superblock row.
    post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
                                    frame_header.columns4x4, block_width4x4);
  } else {
    // Apply horizontal deblock filtering for the first 64 columns of the
    // first tile.
    const Tile& first_tile = *tile_row_base[0];
    post_filter->ApplyDeblockFilter(
        kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
        first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
    // Apply horizontal deblock filtering for the last 64 columns of the
    // previous tile and the first 64 columns of the current tile.
    for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
      const Tile& tile = *tile_row_base[tile_column];
      // If the previous tile has more than 64 columns, then include those
      // for the horizontal deblock.
      const Tile& previous_tile = *tile_row_base[tile_column - 1];
      const int column4x4_start =
          tile.column4x4_start() -
          ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
            previous_tile.column4x4_start())
               ? kNum4x4InLoopFilterUnit
               : 0);
      post_filter->ApplyDeblockFilter(
          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
          tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
    }
    // Apply horizontal deblock filtering for the last 64 columns of the
    // last tile.
    const Tile& last_tile = *tile_row_base[tile_columns - 1];
    // Identify the last column4x4 value and do horizontal filtering for
    // that column4x4. The value of last column4x4 is the nearest multiple
    // of 16 that is before tile.column4x4_end().
    const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
    // If column4x4_start is the same as tile.column4x4_start() then it
    // means that the last tile has <= 64 columns. So there is nothing left
    // to deblock (since it was already deblocked in the loop above).
    if (column4x4_start != last_tile.column4x4_start()) {
      post_filter->ApplyDeblockFilter(
          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
          last_tile.column4x4_end(), block_width4x4);
    }
  }
}
// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
// superblock row starting at |row4x4| for tile at index |tile_index| in the
// list of tiles |tiles|. If the decoding is successful, then it does the
// following:
//   * Schedule the next superblock row in the current tile column for decoding
//     (the next superblock row may be in a different tile than the current
//     one).
//   * If an entire superblock row of the frame has been decoded, it notifies
//     the waiters (if there are any).
// On failure it sets |frame_scratch_buffer->tile_decoding_failed| and wakes
// every per-row waiter via SetFailureAndNotifyAll().
void DecodeSuperBlockRowInTile(
    const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
    const int superblock_size4x4, const int tile_columns,
    const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
    PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
  std::unique_ptr<TileScratchBuffer> scratch_buffer =
      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
  if (scratch_buffer == nullptr) {
    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
    return;
  }
  Tile& tile = *tiles[tile_index];
  const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
      row4x4, scratch_buffer.get());
  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
      std::move(scratch_buffer));
  if (!ok) {
    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
    return;
  }
  if (post_filter->DoDeblock()) {
    // Apply vertical deblock filtering for all the columns in this tile except
    // for the first 64 columns.
    post_filter->ApplyDeblockFilter(
        kLoopFilterTypeVertical, row4x4,
        tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
        superblock_size4x4);
    // Apply horizontal deblock filtering for all the columns in this tile
    // except for the first and the last 64 columns.
    // Note about the last tile of each row: For the last tile, column4x4_end
    // may not be a multiple of 16. In that case it is still okay to simply
    // subtract 16 since ApplyDeblockFilter() will only do the filters in
    // increments of 64 columns (or 32 columns for chroma with subsampling).
    post_filter->ApplyDeblockFilter(
        kLoopFilterTypeHorizontal, row4x4,
        tile.column4x4_start() + kNum4x4InLoopFilterUnit,
        tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
  }
  // Record this tile's completion of the frame-wide superblock row and notify
  // the post filtering thread once all |tile_columns| tiles have finished it.
  const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
  const int index = row4x4 >> superblock_size4x4_log2;
  int* const superblock_row_progress =
      frame_scratch_buffer->superblock_row_progress.get();
  std::condition_variable* const superblock_row_progress_condvar =
      frame_scratch_buffer->superblock_row_progress_condvar.get();
  bool notify;
  {
    std::lock_guard<std::mutex> lock(
        frame_scratch_buffer->superblock_row_mutex);
    notify = ++superblock_row_progress[index] == tile_columns;
  }
  if (notify) {
    // We are done decoding this superblock row. Notify the post filtering
    // thread.
    superblock_row_progress_condvar[index].notify_one();
  }
  // Schedule the next superblock row (if one exists).
  ThreadPool& thread_pool =
      *frame_scratch_buffer->threading_strategy.thread_pool();
  const int next_row4x4 = row4x4 + superblock_size4x4;
  if (!tile.IsRow4x4Inside(next_row4x4)) {
    // The next row falls in the tile directly below in the same tile column.
    tile_index += tile_columns;
  }
  if (tile_index >= tiles.size()) return;
  // Increment before Schedule() so the pending-jobs count never under-reports
  // outstanding work.
  pending_jobs->IncrementBy(1);
  thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
                        tile_columns, superblock_rows, frame_scratch_buffer,
                        post_filter, pending_jobs]() {
    DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
                              superblock_size4x4, tile_columns, superblock_rows,
                              frame_scratch_buffer, post_filter, pending_jobs);
    pending_jobs->Decrement();
  });
}
// Frame parallel decoding with a per-frame thread pool. Phase 1: the workers
// and the calling thread parse all tiles. Phase 2: worker threads decode —
// entire tiles when there are at least as many workers as tile columns,
// otherwise one superblock row per job via DecodeSuperBlockRowInTile() — while
// the calling thread waits on |superblock_row_progress_condvar| and applies
// the post filters row by row.
StatusCode DecodeTilesThreadedFrameParallel(
    const ObuSequenceHeader& sequence_header,
    const ObuFrameHeader& frame_header,
    const Vector<std::unique_ptr<Tile>>& tiles,
    const SymbolDecoderContext& saved_symbol_decoder_context,
    const SegmentationMap* const prev_segment_ids,
    FrameScratchBuffer* const frame_scratch_buffer,
    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
  // Parse the frame.
  ThreadPool& thread_pool =
      *frame_scratch_buffer->threading_strategy.thread_pool();
  std::atomic<int> tile_counter(0);
  const int tile_count = static_cast<int>(tiles.size());
  const int num_workers = thread_pool.num_threads();
  BlockingCounterWithStatus parse_workers(num_workers);
  // Submit tile parsing jobs to the thread pool.
  for (int i = 0; i < num_workers; ++i) {
    thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
      bool failed = false;
      int index;
      // |tile_counter| hands out tile indices to whichever thread is free.
      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
             tile_count) {
        if (!failed) {
          const auto& tile_ptr = tiles[index];
          if (!tile_ptr->Parse()) {
            LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
            failed = true;
          }
        }
      }
      parse_workers.Decrement(!failed);
    });
  }
  // Have the current thread participate in parsing.
  bool failed = false;
  int index;
  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
         tile_count) {
    if (!failed) {
      const auto& tile_ptr = tiles[index];
      if (!tile_ptr->Parse()) {
        LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
        failed = true;
      }
    }
  }
  // Wait until all the parse workers are done. This ensures that all the tiles
  // have been parsed.
  if (!parse_workers.Wait() || failed) {
    return kLibgav1StatusUnknownError;
  }
  // Publish the frame context and segmentation map, then mark the frame as
  // parsed so dependent frames can start referencing it.
  if (frame_header.enable_frame_end_update_cdf) {
    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
  }
  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
  current_frame->SetFrameState(kFrameStateParsed);
  // Decode the frame.
  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
  const int block_width4x4_log2 =
      sequence_header.use_128x128_superblock ? 5 : 4;
  const int superblock_rows =
      (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
  // Per-superblock-row progress counters and condition variables used to
  // signal the post filtering loop below.
  if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
      !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
          superblock_rows)) {
    return kLibgav1StatusOutOfMemory;
  }
  int* const superblock_row_progress =
      frame_scratch_buffer->superblock_row_progress.get();
  memset(superblock_row_progress, 0,
         superblock_rows * sizeof(superblock_row_progress[0]));
  frame_scratch_buffer->tile_decoding_failed = false;
  const int tile_columns = frame_header.tile_info.tile_columns;
  const bool decode_entire_tiles_in_worker_threads =
      num_workers >= tile_columns;
  BlockingCounter pending_jobs(
      decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
  if (decode_entire_tiles_in_worker_threads) {
    // Submit tile decoding jobs to the thread pool.
    tile_counter = 0;
    for (int i = 0; i < num_workers; ++i) {
      thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
                            frame_scratch_buffer, superblock_rows]() {
        bool failed = false;
        int index;
        while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
               tile_count) {
          if (failed) continue;
          const auto& tile_ptr = tiles[index];
          // Tile::Decode() reports per-row progress through the shared
          // counters/condvars so the post filter loop below can proceed.
          if (!tile_ptr->Decode(
                  &frame_scratch_buffer->superblock_row_mutex,
                  frame_scratch_buffer->superblock_row_progress.get(),
                  frame_scratch_buffer->superblock_row_progress_condvar
                      .get())) {
            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
            failed = true;
            SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
          }
        }
        pending_jobs.Decrement();
      });
    }
  } else {
    // Schedule the jobs for first tile row. Each job re-schedules its own
    // successor (see DecodeSuperBlockRowInTile()).
    for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
      thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
                            superblock_rows, frame_scratch_buffer, post_filter,
                            &pending_jobs]() {
        DecodeSuperBlockRowInTile(
            tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
            frame_scratch_buffer, post_filter, &pending_jobs);
        pending_jobs.Decrement();
      });
    }
  }
  // Current thread will do the post filters.
  std::condition_variable* const superblock_row_progress_condvar =
      frame_scratch_buffer->superblock_row_progress_condvar.get();
  const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
  for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
       row4x4 += block_width4x4, ++index) {
    // Advance to the next tile row when this superblock row falls below the
    // current one.
    if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
      tile_row_base += tile_columns;
    }
    {
      // Wait until every tile column has decoded this superblock row (or a
      // failure has been flagged).
      std::unique_lock<std::mutex> lock(
          frame_scratch_buffer->superblock_row_mutex);
      while (superblock_row_progress[index] != tile_columns &&
             !frame_scratch_buffer->tile_decoding_failed) {
        superblock_row_progress_condvar[index].wait(lock);
      }
      if (frame_scratch_buffer->tile_decoding_failed) break;
    }
    if (post_filter->DoDeblock()) {
      // Apply deblocking filter for the tile boundaries of this superblock row.
      // The deblocking filter for the internal blocks will be applied in the
      // tile worker threads. In this thread, we will only have to apply
      // deblocking filter for the tile boundaries.
      ApplyDeblockingFilterForTileBoundaries(
          post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
          tile_columns, decode_entire_tiles_in_worker_threads);
    }
    // Apply all the post filters other than deblocking.
    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
        /*do_deblock=*/false);
    if (progress_row >= 0) {
      current_frame->SetProgress(progress_row);
    }
  }
  // Wait until all the pending jobs are done. This ensures that all the tiles
  // have been decoded and wrapped up.
  pending_jobs.Wait();
  {
    std::lock_guard<std::mutex> lock(
        frame_scratch_buffer->superblock_row_mutex);
    if (frame_scratch_buffer->tile_decoding_failed) {
      return kLibgav1StatusUnknownError;
    }
  }
  current_frame->SetFrameState(kFrameStateDecoded);
  return kStatusOk;
}
} // namespace
// static
// Validates |settings|, allocates a DecoderImpl, runs its Init() and hands
// ownership to |output| on success.
StatusCode DecoderImpl::Create(const DecoderSettings* settings,
                               std::unique_ptr<DecoderImpl>* output) {
  if (settings->threads <= 0) {
    LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
    return kStatusInvalidArgument;
  }
  // Frame parallel mode requires the caller to reclaim input buffers through
  // the release callback.
  if (settings->frame_parallel && settings->release_input_buffer == nullptr) {
    LIBGAV1_DLOG(ERROR,
                 "release_input_buffer callback must not be null when "
                 "frame_parallel is true.");
    return kStatusInvalidArgument;
  }
  std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
  if (impl == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
    return kStatusOutOfMemory;
  }
  const StatusCode init_status = impl->Init();
  if (init_status != kStatusOk) return init_status;
  *output = std::move(impl);
  return kStatusOk;
}
// Wires the caller-supplied frame buffer callbacks into the buffer pool,
// copies the settings, and initializes the dsp function tables before any
// decoding starts.
DecoderImpl::DecoderImpl(const DecoderSettings* settings)
    : buffer_pool_(settings->on_frame_buffer_size_changed,
                   settings->get_frame_buffer, settings->release_frame_buffer,
                   settings->callback_private_data),
      settings_(*settings) {
  dsp::DspInit();
}
DecoderImpl::~DecoderImpl() {
  // Clean up and wait until all the threads have stopped. We just have to pass
  // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
  // path that clears all the threads and structs.
  SignalFailure(kStatusUnknownError);
  // Release any other frame buffer references that we may be holding on to.
  ReleaseOutputFrame();
  output_frame_queue_.Clear();
  // Drop the reference frame buffers so the buffer pool can reclaim them.
  for (auto& reference_frame : state_.reference_frame) {
    reference_frame = nullptr;
  }
}
// One-time allocations needed before the first frame can be decoded: the
// wedge prediction masks and the output frame queue.
StatusCode DecoderImpl::Init() {
  StatusCode status = kStatusOk;
  if (!GenerateWedgeMask(&wedge_masks_)) {
    LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
    status = kStatusOutOfMemory;
  } else if (!output_frame_queue_.Init(kMaxLayers)) {
    LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
    status = kStatusOutOfMemory;
  }
  return status;
}
// Called once, on the first temporal unit. If frame parallel decoding was
// requested, probe-parses the first frame (with a throwaway DecoderState) to
// learn the tile configuration, then sizes the frame thread pool and scratch
// buffer pool accordingly. Also sizes the temporal unit queue: one slot per
// frame thread, or a single slot in non-frame-parallel mode.
StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
    const uint8_t* data, size_t size) {
  is_frame_parallel_ = false;
  if (settings_.frame_parallel) {
    // Use a local DecoderState so this probe parse does not disturb the real
    // decoder state.
    DecoderState state;
    std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
        data, size, settings_.operating_point, &buffer_pool_, &state));
    if (obu == nullptr) {
      LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
      return kStatusOutOfMemory;
    }
    RefCountedBufferPtr current_frame;
    const StatusCode status = obu->ParseOneFrame(&current_frame);
    if (status != kStatusOk) {
      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
      return status;
    }
    // The probe frame buffer is not needed; release it immediately.
    current_frame = nullptr;
    // We assume that the first frame that was parsed will contain the frame
    // header. This assumption is usually true in practice. So we will simply
    // not use frame parallel mode if this is not the case.
    if (settings_.threads > 1 &&
        !InitializeThreadPoolsForFrameParallel(
            settings_.threads, obu->frame_header().tile_info.tile_count,
            obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
            &frame_scratch_buffer_pool_)) {
      return kStatusOutOfMemory;
    }
  }
  // One in-flight temporal unit per frame thread (or just one when decoding
  // serially).
  const int max_allowed_frames =
      (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
  assert(max_allowed_frames > 0);
  if (!temporal_units_.Init(max_allowed_frames)) {
    LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
    return kStatusOutOfMemory;
  }
  is_frame_parallel_ = frame_thread_pool_ != nullptr;
  return kStatusOk;
}
// Accepts one temporal unit of encoded data. In frame parallel mode the data
// is parsed and scheduled for decoding immediately; otherwise it is queued
// and decoded lazily by DequeueFrame().
StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
                                     int64_t user_private_data,
                                     void* buffer_private_data) {
  if (data == nullptr || size == 0) return kStatusInvalidArgument;
  if (HasFailure()) return kStatusUnknownError;
  if (!seen_first_frame_) {
    // The first temporal unit decides whether frame parallel decoding can be
    // used, so the thread pools and queues are set up here.
    seen_first_frame_ = true;
    const StatusCode status =
        InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
    if (status != kStatusOk) return SignalFailure(status);
  }
  // Back-pressure: the caller must dequeue before enqueueing more.
  if (temporal_units_.Full()) return kStatusTryAgain;
  if (is_frame_parallel_) {
    return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
  }
  temporal_units_.Push(
      TemporalUnit(data, size, user_private_data, buffer_private_data));
  return kStatusOk;
}
// Records |status| as the sticky failure status and tears down decoding
// state: aborts buffer pool waiters, destroys the frame thread pool, and
// drains the temporal unit queue (handing input buffers back through the
// release callback). Returns |status| unchanged for convenient tail calls.
StatusCode DecoderImpl::SignalFailure(StatusCode status) {
  // kStatusOk and kStatusTryAgain are not failures; pass them through.
  if (status == kStatusOk || status == kStatusTryAgain) return status;
  // Set the |failure_status_| first so that any pending jobs in
  // |frame_thread_pool_| will exit right away when the thread pool is being
  // released below.
  {
    std::lock_guard<std::mutex> lock(mutex_);
    failure_status_ = status;
  }
  // Make sure all waiting threads exit.
  buffer_pool_.Abort();
  // Destroying the pool joins the worker threads.
  frame_thread_pool_ = nullptr;
  // Return every queued input buffer to the caller before dropping the queue
  // entries.
  while (!temporal_units_.Empty()) {
    if (settings_.release_input_buffer != nullptr) {
      settings_.release_input_buffer(
          settings_.callback_private_data,
          temporal_units_.Front().buffer_private_data);
    }
    temporal_units_.Pop();
  }
  return status;
}
// DequeueFrame() follows the following policy to avoid holding unnecessary
// frame buffer references in output_frame_: output_frame_ must be null when
// DequeueFrame() returns false.
//
// Non-frame-parallel mode decodes the front temporal unit on demand; frame
// parallel mode waits (or polls, per settings_.blocking_dequeue) for the
// already-scheduled decode to finish and then copies out the highest output
// layer.
StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
  if (out_ptr == nullptr) {
    LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
    return kStatusInvalidArgument;
  }
  // We assume a call to DequeueFrame() indicates that the caller is no longer
  // using the previous output frame, so we can release it.
  ReleaseOutputFrame();
  if (temporal_units_.Empty()) {
    // No input frames to decode.
    *out_ptr = nullptr;
    return kStatusNothingToDequeue;
  }
  TemporalUnit& temporal_unit = temporal_units_.Front();
  if (!is_frame_parallel_) {
    // If |output_frame_queue_| is not empty, then return the first frame from
    // that queue.
    if (!output_frame_queue_.Empty()) {
      RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
      output_frame_queue_.Pop();
      buffer_.user_private_data = temporal_unit.user_private_data;
      // The temporal unit is fully consumed only once its last queued frame
      // has been handed out.
      if (output_frame_queue_.Empty()) {
        temporal_units_.Pop();
      }
      const StatusCode status = CopyFrameToOutputBuffer(frame);
      if (status != kStatusOk) {
        return status;
      }
      *out_ptr = &buffer_;
      return kStatusOk;
    }
    // Decode the next available temporal unit and return.
    const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
    if (status != kStatusOk) {
      // In case of failure, discard all the output frames that we may be
      // holding on references to.
      output_frame_queue_.Clear();
    }
    if (settings_.release_input_buffer != nullptr) {
      settings_.release_input_buffer(settings_.callback_private_data,
                                     temporal_unit.buffer_private_data);
    }
    if (output_frame_queue_.Empty()) {
      temporal_units_.Pop();
    }
    return status;
  }
  // Frame parallel mode: wait for (or poll) the scheduled decode of the front
  // temporal unit.
  {
    std::unique_lock<std::mutex> lock(mutex_);
    if (settings_.blocking_dequeue) {
      while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
        decoded_condvar_.wait(lock);
      }
    } else {
      if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
        return kStatusTryAgain;
      }
    }
    if (failure_status_ != kStatusOk) {
      // Copy before unlocking: SignalFailure() re-acquires |mutex_|.
      const StatusCode failure_status = failure_status_;
      lock.unlock();
      return SignalFailure(failure_status);
    }
  }
  // Hand the input buffer back to the caller exactly once per temporal unit.
  if (settings_.release_input_buffer != nullptr &&
      !temporal_unit.released_input_buffer) {
    temporal_unit.released_input_buffer = true;
    settings_.release_input_buffer(settings_.callback_private_data,
                                   temporal_unit.buffer_private_data);
  }
  if (temporal_unit.status != kStatusOk) {
    temporal_units_.Pop();
    return SignalFailure(temporal_unit.status);
  }
  if (!temporal_unit.has_displayable_frame) {
    *out_ptr = nullptr;
    temporal_units_.Pop();
    return kStatusOk;
  }
  // Output layers are returned from the back of the array (highest layer
  // first); the temporal unit is popped when the last one is consumed.
  assert(temporal_unit.output_layer_count > 0);
  StatusCode status = CopyFrameToOutputBuffer(
      temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
  temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
      nullptr;
  if (status != kStatusOk) {
    temporal_units_.Pop();
    return SignalFailure(status);
  }
  buffer_.user_private_data = temporal_unit.user_private_data;
  *out_ptr = &buffer_;
  if (--temporal_unit.output_layer_count == 0) {
    temporal_units_.Pop();
  }
  return kStatusOk;
}
// Parses every frame in one temporal unit and schedules each parsed frame for
// decoding on |frame_thread_pool_| (frame parallel mode). The temporal unit is
// pushed into |temporal_units_| before any frame is scheduled so that the
// worker lambdas can safely hold a pointer back into the queue entry.
//
// Returns kStatusOk on success; a parse or allocation error otherwise. The
// function cannot fail once the temporal unit has been queued.
StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
                                         int64_t user_private_data,
                                         void* buffer_private_data) {
  TemporalUnit temporal_unit(data, size, user_private_data,
                             buffer_private_data);
  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
      temporal_unit.data, temporal_unit.size, settings_.operating_point,
      &buffer_pool_, &state_));
  if (obu == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
    return kStatusOutOfMemory;
  }
  // Seed the parser with the last seen sequence header (if any), since a
  // temporal unit is not required to repeat it.
  if (has_sequence_header_) {
    obu->set_sequence_header(sequence_header_);
  }
  StatusCode status;
  int position_in_temporal_unit = 0;
  while (obu->HasData()) {
    RefCountedBufferPtr current_frame;
    status = obu->ParseOneFrame(&current_frame);
    if (status != kStatusOk) {
      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
      return status;
    }
    if (IsNewSequenceHeader(*obu)) {
      // The frame buffer parameters may have changed: notify the buffer pool.
      // Borders are sized for the worst case (all post filters enabled).
      const ObuSequenceHeader& sequence_header = obu->sequence_header();
      const Libgav1ImageFormat image_format =
          ComposeImageFormat(sequence_header.color_config.is_monochrome,
                             sequence_header.color_config.subsampling_x,
                             sequence_header.color_config.subsampling_y);
      const int max_bottom_border = GetBottomBorderPixels(
          /*do_cdef=*/true, /*do_restoration=*/true,
          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
      // TODO(vigneshv): This may not be the right place to call this callback
      // for the frame parallel case. Investigate and fix it.
      if (!buffer_pool_.OnFrameBufferSizeChanged(
              sequence_header.color_config.bitdepth, image_format,
              sequence_header.max_frame_width, sequence_header.max_frame_height,
              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
        return kStatusUnknownError;
      }
    }
    // This can happen when there are multiple spatial/temporal layers and if
    // all the layers are outside the current operating point.
    if (current_frame == nullptr) {
      continue;
    }
    // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
    // in the code below after |temporal_unit| is std::move'd into the
    // |temporal_units_| queue.
    if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
                                           position_in_temporal_unit++)) {
      LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
      return kStatusOutOfMemory;
    }
    state_.UpdateReferenceFrames(current_frame,
                                 obu->frame_header().refresh_frame_flags);
  }
  // This function cannot fail after this point. So it is okay to move the
  // |temporal_unit| into |temporal_units_| queue.
  temporal_units_.Push(std::move(temporal_unit));
  if (temporal_units_.Back().frames.empty()) {
    // Nothing to decode in this temporal unit; mark it decoded right away.
    std::lock_guard<std::mutex> lock(mutex_);
    temporal_units_.Back().has_displayable_frame = false;
    temporal_units_.Back().decoded = true;
    return kStatusOk;
  }
  for (auto& frame : temporal_units_.Back().frames) {
    EncodedFrame* const encoded_frame = &frame;
    encoded_frame->temporal_unit = &temporal_units_.Back();
    frame_thread_pool_->Schedule([this, encoded_frame]() {
      if (HasFailure()) return;
      const StatusCode status = DecodeFrame(encoded_frame);
      // Release the per-frame references; the displayable output (if any) now
      // lives in the temporal unit's output layers.
      encoded_frame->state = {};
      encoded_frame->frame = nullptr;
      TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
      std::lock_guard<std::mutex> lock(mutex_);
      if (failure_status_ != kStatusOk) return;
      // temporal_unit's status defaults to kStatusOk. So we need to set it only
      // on error. If |failure_status_| is not kStatusOk at this point, it means
      // that there has already been a failure. So we don't care about this
      // subsequent failure. We will simply return the error code of the first
      // failure.
      if (status != kStatusOk) {
        temporal_unit.status = status;
        if (failure_status_ == kStatusOk) {
          failure_status_ = status;
        }
      }
      temporal_unit.decoded =
          ++temporal_unit.decoded_count == temporal_unit.frames.size();
      // When outputting all layers, sort them by their position in the
      // temporal unit so they are emitted in bitstream order.
      if (temporal_unit.decoded && settings_.output_all_layers &&
          temporal_unit.output_layer_count > 1) {
        std::sort(
            temporal_unit.output_layers,
            temporal_unit.output_layers + temporal_unit.output_layer_count);
      }
      // Wake up any thread waiting on |decoded_condvar_|.
      if (temporal_unit.decoded || failure_status_ != kStatusOk) {
        decoded_condvar_.notify_one();
      }
    });
  }
  return kStatusOk;
}
// Decodes a single frame that was previously parsed by ParseAndSchedule()
// (frame parallel mode). Runs on a frame thread pool worker. On success, a
// displayable frame (if any) is recorded in the owning temporal unit's output
// layers under |mutex_|.
StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
  const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
  const ObuFrameHeader& frame_header = encoded_frame->frame_header;
  RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
      frame_scratch_buffer_pool_.Get();
  if (frame_scratch_buffer == nullptr) {
    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
    return kStatusOutOfMemory;
  }
  // |frame_scratch_buffer| will be released when this local variable goes out
  // of scope (i.e.) on any return path in this function.
  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
  StatusCode status;
  if (!frame_header.show_existing_frame) {
    if (encoded_frame->tile_buffers.empty()) {
      // This means that the last call to ParseOneFrame() did not actually
      // have any tile groups. This could happen in rare cases (for example,
      // if there is a Metadata OBU after the TileGroup OBU). We currently do
      // not have a reason to handle those cases, so we simply continue.
      return kStatusOk;
    }
    status = DecodeTiles(sequence_header, frame_header,
                         encoded_frame->tile_buffers, encoded_frame->state,
                         frame_scratch_buffer.get(), current_frame.get());
    if (status != kStatusOk) {
      return status;
    }
  } else {
    // show_existing_frame: the frame was decoded earlier (possibly by another
    // worker); just wait for that decode to complete.
    if (!current_frame->WaitUntilDecoded()) {
      return kStatusUnknownError;
    }
  }
  if (!frame_header.show_frame && !frame_header.show_existing_frame) {
    // This frame is not displayable. Not an error.
    return kStatusOk;
  }
  RefCountedBufferPtr film_grain_frame;
  status = ApplyFilmGrain(
      sequence_header, frame_header, current_frame, &film_grain_frame,
      frame_scratch_buffer->threading_strategy.thread_pool());
  if (status != kStatusOk) {
    return status;
  }
  // Record the displayable frame. |mutex_| is needed since multiple frame
  // workers may reach this point concurrently.
  TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
  std::lock_guard<std::mutex> lock(mutex_);
  if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
    assert(temporal_unit.output_frame_position >= 0);
    // A displayable frame was already found in this temporal unit. This can
    // happen if there are multiple spatial/temporal layers. Since
    // |settings_.output_all_layers| is false, we will output only the last
    // displayable frame.
    if (temporal_unit.output_frame_position >
        encoded_frame->position_in_temporal_unit) {
      return kStatusOk;
    }
    // Replace any output frame that we may have seen before with the current
    // frame.
    assert(temporal_unit.output_layer_count == 1);
    --temporal_unit.output_layer_count;
  }
  temporal_unit.has_displayable_frame = true;
  temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
      std::move(film_grain_frame);
  temporal_unit.output_layers[temporal_unit.output_layer_count]
      .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
  ++temporal_unit.output_layer_count;
  temporal_unit.output_frame_position =
      encoded_frame->position_in_temporal_unit;
  return kStatusOk;
}
// Parses and decodes one temporal unit serially (non frame parallel mode).
// On success, *out_ptr points to |buffer_| holding the displayable frame, or
// is set to nullptr if the temporal unit has no displayable frame.
StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
                                           const DecoderBuffer** out_ptr) {
  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
      temporal_unit.data, temporal_unit.size, settings_.operating_point,
      &buffer_pool_, &state_));
  if (obu == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
    return kStatusOutOfMemory;
  }
  // Seed the parser with the last seen sequence header (if any), since a
  // temporal unit is not required to repeat it.
  if (has_sequence_header_) {
    obu->set_sequence_header(sequence_header_);
  }
  StatusCode status;
  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
      frame_scratch_buffer_pool_.Get();
  if (frame_scratch_buffer == nullptr) {
    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
    return kStatusOutOfMemory;
  }
  // |frame_scratch_buffer| will be released when this local variable goes out
  // of scope (i.e.) on any return path in this function.
  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
  while (obu->HasData()) {
    RefCountedBufferPtr current_frame;
    status = obu->ParseOneFrame(&current_frame);
    if (status != kStatusOk) {
      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
      return status;
    }
    if (IsNewSequenceHeader(*obu)) {
      // The frame buffer parameters may have changed: notify the buffer pool.
      // Borders are sized for the worst case (all post filters enabled).
      const ObuSequenceHeader& sequence_header = obu->sequence_header();
      const Libgav1ImageFormat image_format =
          ComposeImageFormat(sequence_header.color_config.is_monochrome,
                             sequence_header.color_config.subsampling_x,
                             sequence_header.color_config.subsampling_y);
      const int max_bottom_border = GetBottomBorderPixels(
          /*do_cdef=*/true, /*do_restoration=*/true,
          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
      if (!buffer_pool_.OnFrameBufferSizeChanged(
              sequence_header.color_config.bitdepth, image_format,
              sequence_header.max_frame_width, sequence_header.max_frame_height,
              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
        return kStatusUnknownError;
      }
    }
    if (!obu->frame_header().show_existing_frame) {
      if (obu->tile_buffers().empty()) {
        // This means that the last call to ParseOneFrame() did not actually
        // have any tile groups. This could happen in rare cases (for example,
        // if there is a Metadata OBU after the TileGroup OBU). We currently do
        // not have a reason to handle those cases, so we simply continue.
        continue;
      }
      status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
                           obu->tile_buffers(), state_,
                           frame_scratch_buffer.get(), current_frame.get());
      if (status != kStatusOk) {
        return status;
      }
    }
    state_.UpdateReferenceFrames(current_frame,
                                 obu->frame_header().refresh_frame_flags);
    if (obu->frame_header().show_frame ||
        obu->frame_header().show_existing_frame) {
      if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
        // There is more than one displayable frame in the current operating
        // point and |settings_.output_all_layers| is false. In this case, we
        // simply return the last displayable frame as the output frame and
        // ignore the rest.
        assert(output_frame_queue_.Size() == 1);
        output_frame_queue_.Pop();
      }
      RefCountedBufferPtr film_grain_frame;
      status = ApplyFilmGrain(
          obu->sequence_header(), obu->frame_header(), current_frame,
          &film_grain_frame,
          frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
      if (status != kStatusOk) return status;
      output_frame_queue_.Push(std::move(film_grain_frame));
    }
  }
  if (output_frame_queue_.Empty()) {
    // No displayable frame in the temporal unit. Not an error.
    *out_ptr = nullptr;
    return kStatusOk;
  }
  status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
  output_frame_queue_.Pop();
  if (status != kStatusOk) {
    return status;
  }
  buffer_.user_private_data = temporal_unit.user_private_data;
  *out_ptr = &buffer_;
  return kStatusOk;
}
// Populates the public output buffer |buffer_| with the metadata and plane
// pointers of |frame|, and keeps a reference to |frame| in |output_frame_| so
// the underlying pixels stay alive until ReleaseOutputFrame() is called.
// Returns kStatusInvalidArgument if the chroma subsampling combination does
// not map to a known image format.
StatusCode DecoderImpl::CopyFrameToOutputBuffer(
    const RefCountedBufferPtr& frame) {
  YuvBuffer* const yuv = frame->buffer();
  buffer_.chroma_sample_position = frame->chroma_sample_position();
  // Derive the image format from monochrome flag and chroma subsampling.
  if (yuv->is_monochrome()) {
    buffer_.image_format = kImageFormatMonochrome400;
  } else {
    const int sub_x = yuv->subsampling_x();
    const int sub_y = yuv->subsampling_y();
    if (sub_x == 0 && sub_y == 0) {
      buffer_.image_format = kImageFormatYuv444;
    } else if (sub_x == 1 && sub_y == 0) {
      buffer_.image_format = kImageFormatYuv422;
    } else if (sub_x == 1 && sub_y == 1) {
      buffer_.image_format = kImageFormatYuv420;
    } else {
      LIBGAV1_DLOG(ERROR,
                   "Invalid chroma subsampling values: cannot determine buffer "
                   "image format.");
      return kStatusInvalidArgument;
    }
  }
  // Color description comes from the active sequence header.
  const auto& color_config = sequence_header_.color_config;
  buffer_.color_range = color_config.color_range;
  buffer_.color_primary = color_config.color_primary;
  buffer_.transfer_characteristics = color_config.transfer_characteristics;
  buffer_.matrix_coefficients = color_config.matrix_coefficients;
  buffer_.bitdepth = yuv->bitdepth();
  const int num_planes =
      yuv->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
  // Fill in the valid planes; zero out the remaining plane slots.
  for (int plane = 0; plane < kMaxPlanes; ++plane) {
    const bool valid = plane < num_planes;
    buffer_.stride[plane] = valid ? yuv->stride(plane) : 0;
    buffer_.plane[plane] = valid ? yuv->data(plane) : nullptr;
    buffer_.displayed_width[plane] = valid ? yuv->width(plane) : 0;
    buffer_.displayed_height[plane] = valid ? yuv->height(plane) : 0;
  }
  buffer_.spatial_id = frame->spatial_id();
  buffer_.temporal_id = frame->temporal_id();
  buffer_.buffer_private_data = frame->buffer_private_data();
  output_frame_ = frame;
  return kStatusOk;
}
// Invalidates the plane pointers handed out via |buffer_| and drops the
// reference that was keeping the output frame's pixels alive.
void DecoderImpl::ReleaseOutputFrame() {
  std::fill(std::begin(buffer_.plane), std::end(buffer_.plane), nullptr);
  output_frame_ = nullptr;
}
// Allocates/configures all per-frame buffers and threading state, creates one
// Tile object per tile, and dispatches to the appropriate tile-decoding
// routine (frame parallel / threaded / single threaded). On success,
// |current_frame| holds the decoded (and post-filtered) frame.
StatusCode DecoderImpl::DecodeTiles(
    const ObuSequenceHeader& sequence_header,
    const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
    const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
    RefCountedBuffer* const current_frame) {
  frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
      sequence_header.color_config.bitdepth);
  if (!frame_scratch_buffer->loop_restoration_info.Reset(
          &frame_header.loop_restoration, frame_header.upscaled_width,
          frame_header.height, sequence_header.color_config.subsampling_x,
          sequence_header.color_config.subsampling_y,
          sequence_header.color_config.is_monochrome)) {
    LIBGAV1_DLOG(ERROR,
                 "Failed to allocate memory for loop restoration info units.");
    return kStatusOutOfMemory;
  }
  // Determine which post filters are enabled for this frame; these drive the
  // border sizes and scratch buffer allocations below.
  const bool do_cdef =
      PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
  const int num_planes = sequence_header.color_config.is_monochrome
                             ? kMaxPlanesMonochrome
                             : kMaxPlanes;
  const bool do_restoration = PostFilter::DoRestoration(
      frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
  const bool do_superres =
      PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
  // Use kBorderPixels for the left, right, and top borders. Only the bottom
  // border may need to be bigger. SuperRes border is needed only if we are
  // applying SuperRes in-place which is being done only in single threaded
  // mode.
  const int bottom_border = GetBottomBorderPixels(
      do_cdef, do_restoration,
      do_superres &&
          frame_scratch_buffer->threading_strategy.post_filter_thread_pool() ==
              nullptr,
      sequence_header.color_config.subsampling_y);
  current_frame->set_chroma_sample_position(
      sequence_header.color_config.chroma_sample_position);
  if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
                              sequence_header.color_config.is_monochrome,
                              frame_header.upscaled_width, frame_header.height,
                              sequence_header.color_config.subsampling_x,
                              sequence_header.color_config.subsampling_y,
                              /*left_border=*/kBorderPixels,
                              /*right_border=*/kBorderPixels,
                              /*top_border=*/kBorderPixels, bottom_border)) {
    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
    return kStatusOutOfMemory;
  }
  // One cdef index entry covers a 64x64 area (16 4x4 blocks in each
  // dimension), hence the DivideBy16.
  if (sequence_header.enable_cdef) {
    if (!frame_scratch_buffer->cdef_index.Reset(
            DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
            /*zero_initialize=*/false)) {
      LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
      return kStatusOutOfMemory;
    }
  }
  if (!frame_scratch_buffer->inter_transform_sizes.Reset(
          frame_header.rows4x4 + kMaxBlockHeight4x4,
          frame_header.columns4x4 + kMaxBlockWidth4x4,
          /*zero_initialize=*/false)) {
    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
    return kStatusOutOfMemory;
  }
  // Temporal motion field buffers are at 8x8 granularity (half of 4x4).
  if (frame_header.use_ref_frame_mvs) {
    if (!frame_scratch_buffer->motion_field.mv.Reset(
            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
            /*zero_initialize=*/false) ||
        !frame_scratch_buffer->motion_field.reference_offset.Reset(
            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
            /*zero_initialize=*/false)) {
      LIBGAV1_DLOG(ERROR,
                   "Failed to allocate memory for temporal motion vectors.");
      return kStatusOutOfMemory;
    }
    // For each motion vector, only mv[0] needs to be initialized to
    // kInvalidMvValue, mv[1] is not necessary to be initialized and can be
    // set to an arbitrary value. For simplicity, mv[1] is set to 0.
    // The following memory initialization of contiguous memory is very fast. It
    // is not recommended to make the initialization multi-threaded, unless the
    // memory which needs to be initialized in each thread is still contiguous.
    MotionVector invalid_mv;
    invalid_mv.mv[0] = kInvalidMvValue;
    invalid_mv.mv[1] = 0;
    MotionVector* const motion_field_mv =
        &frame_scratch_buffer->motion_field.mv[0][0];
    std::fill(motion_field_mv,
              motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
              invalid_mv);
  }
  // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
  // that the block parameters cache can be filled in for the last row/column
  // without having to check for boundary conditions.
  if (!frame_scratch_buffer->block_parameters_holder.Reset(
          frame_header.rows4x4 + kMaxBlockHeight4x4,
          frame_header.columns4x4 + kMaxBlockWidth4x4,
          sequence_header.use_128x128_superblock)) {
    return kStatusOutOfMemory;
  }
  const dsp::Dsp* const dsp =
      dsp::GetDspTable(sequence_header.color_config.bitdepth);
  if (dsp == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
                 sequence_header.color_config.bitdepth);
    return kStatusInternalError;
  }
  const int tile_count = frame_header.tile_info.tile_count;
  assert(tile_count >= 1);
  Vector<std::unique_ptr<Tile>> tiles;
  if (!tiles.reserve(tile_count)) {
    LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
    return kStatusOutOfMemory;
  }
  ThreadingStrategy& threading_strategy =
      frame_scratch_buffer->threading_strategy;
  // In frame parallel mode the threading strategy was configured up front;
  // otherwise it is (re)computed per frame here.
  if (!is_frame_parallel_ &&
      !threading_strategy.Reset(frame_header, settings_.threads)) {
    return kStatusOutOfMemory;
  }
  // A residual buffer pool is needed whenever tiles may be decoded
  // concurrently (row threading or frame parallel mode).
  if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
    if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
      frame_scratch_buffer->residual_buffer_pool.reset(
          new (std::nothrow) ResidualBufferPool(
              sequence_header.use_128x128_superblock,
              sequence_header.color_config.subsampling_x,
              sequence_header.color_config.subsampling_y,
              sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
                                                         : sizeof(int32_t)));
      if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
        LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
        return kStatusOutOfMemory;
      }
    } else {
      frame_scratch_buffer->residual_buffer_pool->Reset(
          sequence_header.use_128x128_superblock,
          sequence_header.color_config.subsampling_x,
          sequence_header.color_config.subsampling_y,
          sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
                                                     : sizeof(int32_t));
    }
  }
  // Threaded post filtering needs a windowed scratch buffer sized to the
  // thread pool's window.
  if (threading_strategy.post_filter_thread_pool() != nullptr &&
      (do_cdef || do_restoration)) {
    const int window_buffer_width = PostFilter::GetWindowBufferWidth(
        threading_strategy.post_filter_thread_pool(), frame_header);
    size_t threaded_window_buffer_size =
        window_buffer_width *
        PostFilter::GetWindowBufferHeight(
            threading_strategy.post_filter_thread_pool(), frame_header) *
        (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
                                                    : sizeof(uint16_t));
    if (do_cdef) {
      // TODO(chengchen): for cdef U, V planes, if there's subsampling, we can
      // use smaller buffer.
      threaded_window_buffer_size *= num_planes;
    }
    // To avoid false sharing, PostFilter's window width in bytes should be a
    // multiple of the cache line size. For simplicity, we check the window
    // width in pixels.
    assert(window_buffer_width % kCacheLineSize == 0);
    if (!frame_scratch_buffer->threaded_window_buffer.Resize(
            threaded_window_buffer_size)) {
      LIBGAV1_DLOG(ERROR,
                   "Failed to resize threaded loop restoration buffer.\n");
      return kStatusOutOfMemory;
    }
  }
  if (do_cdef && do_restoration) {
    // We need to store 4 rows per 64x64 unit.
    const int num_deblock_units = MultiplyBy4(Ceil(frame_header.rows4x4, 16));
    // subsampling_y is set to zero irrespective of the actual frame's
    // subsampling since we need to store exactly |num_deblock_units| rows of
    // the deblocked pixels.
    if (!frame_scratch_buffer->deblock_buffer.Realloc(
            sequence_header.color_config.bitdepth,
            sequence_header.color_config.is_monochrome,
            frame_header.upscaled_width, num_deblock_units,
            sequence_header.color_config.subsampling_x,
            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
            kBorderPixels, nullptr, nullptr, nullptr)) {
      return kStatusOutOfMemory;
    }
  }
  // SuperRes needs one line buffer per worker thread (plus one for the
  // calling thread).
  if (do_superres) {
    const int num_threads =
        1 + ((threading_strategy.post_filter_thread_pool() == nullptr)
                 ? 0
                 : threading_strategy.post_filter_thread_pool()->num_threads());
    const size_t superres_line_buffer_size =
        num_threads *
        (MultiplyBy4(frame_header.columns4x4) +
         MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
        (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
                                                    : sizeof(uint16_t));
    if (!frame_scratch_buffer->superres_line_buffer.Resize(
            superres_line_buffer_size)) {
      LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
      return kStatusOutOfMemory;
    }
  }
  PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
                         current_frame->buffer(), dsp,
                         settings_.post_filter_mask);
  if (is_frame_parallel_) {
    // We can parse the current frame if all the reference frames have been
    // parsed.
    for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
      if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
        continue;
      }
      if (!state.reference_frame[i]->WaitUntilParsed()) {
        return kStatusUnknownError;
      }
    }
  }
  // If prev_segment_ids is a null pointer, it is treated as if it pointed to
  // a segmentation map containing all 0s.
  const SegmentationMap* prev_segment_ids = nullptr;
  if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
    frame_scratch_buffer->symbol_decoder_context.Initialize(
        frame_header.quantizer.base_index);
  } else {
    // Inherit the symbol decoder context (and possibly the segmentation map)
    // from the primary reference frame.
    const int index =
        frame_header
            .reference_frame_index[frame_header.primary_reference_frame];
    assert(index != -1);
    const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
    frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
    if (frame_header.segmentation.enabled &&
        prev_frame->columns4x4() == frame_header.columns4x4 &&
        prev_frame->rows4x4() == frame_header.rows4x4) {
      prev_segment_ids = prev_frame->segmentation_map();
    }
  }
  // The Tile class must make use of a separate buffer to store the unfiltered
  // pixels for the intra prediction of the next superblock row. This is done
  // only when one of the following conditions are true:
  //   * is_frame_parallel_ is true.
  //   * settings_.threads == 1.
  // In the non-frame-parallel multi-threaded case, we do not run the post
  // filters in the decode loop. So this buffer need not be used.
  const bool use_intra_prediction_buffer =
      is_frame_parallel_ || settings_.threads == 1;
  if (use_intra_prediction_buffer) {
    if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
            frame_header.tile_info.tile_rows)) {
      LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
      return kStatusOutOfMemory;
    }
    IntraPredictionBuffer* const intra_prediction_buffers =
        frame_scratch_buffer->intra_prediction_buffers.get();
    for (int plane = 0; plane < num_planes; ++plane) {
      // One frame-width row of pixels per tile row and plane (chroma planes
      // are horizontally subsampled).
      const int subsampling =
          (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
      const size_t intra_prediction_buffer_size =
          ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
           (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
                                                       : sizeof(uint16_t)));
      for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
           ++tile_row) {
        if (!intra_prediction_buffers[tile_row][plane].Resize(
                intra_prediction_buffer_size)) {
          LIBGAV1_DLOG(ERROR,
                       "Failed to allocate intra prediction buffer for tile "
                       "row %d plane %d.\n",
                       tile_row, plane);
          return kStatusOutOfMemory;
        }
      }
    }
  }
  SymbolDecoderContext saved_symbol_decoder_context;
  BlockingCounterWithStatus pending_tiles(tile_count);
  for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
    std::unique_ptr<Tile> tile = Tile::Create(
        tile_number, tile_buffers[tile_number].data,
        tile_buffers[tile_number].size, sequence_header, frame_header,
        current_frame, state, frame_scratch_buffer, wedge_masks_,
        &saved_symbol_decoder_context, prev_segment_ids, &post_filter, dsp,
        threading_strategy.row_thread_pool(tile_number), &pending_tiles,
        is_frame_parallel_, use_intra_prediction_buffer);
    if (tile == nullptr) {
      LIBGAV1_DLOG(ERROR, "Failed to create tile.");
      return kStatusOutOfMemory;
    }
    tiles.push_back_unchecked(std::move(tile));
  }
  assert(tiles.size() == static_cast<size_t>(tile_count));
  // Dispatch to the decode loop that matches the threading configuration.
  if (is_frame_parallel_) {
    if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
      return DecodeTilesFrameParallel(
          sequence_header, frame_header, tiles, saved_symbol_decoder_context,
          prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
    }
    return DecodeTilesThreadedFrameParallel(
        sequence_header, frame_header, tiles, saved_symbol_decoder_context,
        prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
  }
  StatusCode status;
  if (settings_.threads == 1) {
    status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
                                         frame_scratch_buffer, &post_filter);
  } else {
    status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
                                                 &post_filter, &pending_tiles);
  }
  if (status != kStatusOk) return status;
  if (frame_header.enable_frame_end_update_cdf) {
    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
  }
  // Save the CDF context and segmentation map for use by subsequent frames
  // that reference this one.
  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
  return kStatusOk;
}
// Applies film grain synthesis to |displayable_frame| and returns the result
// in |*film_grain_frame|. If film grain is not applicable (or is disabled via
// the post filter mask), |*film_grain_frame| simply aliases
// |displayable_frame|. The grain is added in place when the frame is not also
// held as a reference frame; otherwise a new buffer is allocated from the
// pool. |thread_pool| may be null (single threaded grain synthesis).
StatusCode DecoderImpl::ApplyFilmGrain(
    const ObuSequenceHeader& sequence_header,
    const ObuFrameHeader& frame_header,
    const RefCountedBufferPtr& displayable_frame,
    RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
  // 0x10 is the film grain bit of |settings_.post_filter_mask|.
  if (!sequence_header.film_grain_params_present ||
      !displayable_frame->film_grain_params().apply_grain ||
      (settings_.post_filter_mask & 0x10) == 0) {
    *film_grain_frame = displayable_frame;
    return kStatusOk;
  }
  if (!frame_header.show_existing_frame &&
      frame_header.refresh_frame_flags == 0) {
    // If show_existing_frame is true, then the current frame is a previously
    // saved reference frame. If refresh_frame_flags is nonzero, then the
    // state_.UpdateReferenceFrames() call above has saved the current frame as
    // a reference frame. Therefore, if both of these conditions are false, then
    // the current frame is not saved as a reference frame. displayable_frame
    // should hold the only reference to the current frame.
    assert(displayable_frame.use_count() == 1);
    // Add film grain noise in place.
    *film_grain_frame = displayable_frame;
  } else {
    // The frame is also a reference frame and must not be modified; write the
    // noisy output into a fresh buffer from the pool.
    *film_grain_frame = buffer_pool_.GetFreeBuffer();
    if (*film_grain_frame == nullptr) {
      LIBGAV1_DLOG(ERROR,
                   "Could not get film_grain_frame from the buffer pool.");
      return kStatusResourceExhausted;
    }
    if (!(*film_grain_frame)
             ->Realloc(displayable_frame->buffer()->bitdepth(),
                       displayable_frame->buffer()->is_monochrome(),
                       displayable_frame->upscaled_width(),
                       displayable_frame->frame_height(),
                       displayable_frame->buffer()->subsampling_x(),
                       displayable_frame->buffer()->subsampling_y(),
                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
      LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
      return kStatusOutOfMemory;
    }
    (*film_grain_frame)
        ->set_chroma_sample_position(
            displayable_frame->chroma_sample_position());
    (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
    (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
  }
  const bool color_matrix_is_identity =
      sequence_header.color_config.matrix_coefficients ==
      kMatrixCoefficientsIdentity;
  // U and V are assumed to share a stride in both the input and the output.
  assert(displayable_frame->buffer()->stride(kPlaneU) ==
         displayable_frame->buffer()->stride(kPlaneV));
  const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
  assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
         (*film_grain_frame)->buffer()->stride(kPlaneV));
  const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
#if LIBGAV1_MAX_BITDEPTH >= 10
  // High bitdepth path: pixels are 16 bits wide.
  if (displayable_frame->buffer()->bitdepth() > 8) {
    FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
                             displayable_frame->buffer()->is_monochrome(),
                             color_matrix_is_identity,
                             displayable_frame->buffer()->subsampling_x(),
                             displayable_frame->buffer()->subsampling_y(),
                             displayable_frame->upscaled_width(),
                             displayable_frame->frame_height(), thread_pool);
    if (!film_grain.AddNoise(
            displayable_frame->buffer()->data(kPlaneY),
            displayable_frame->buffer()->stride(kPlaneY),
            displayable_frame->buffer()->data(kPlaneU),
            displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
            (*film_grain_frame)->buffer()->data(kPlaneY),
            (*film_grain_frame)->buffer()->stride(kPlaneY),
            (*film_grain_frame)->buffer()->data(kPlaneU),
            (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
      LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
      return kStatusOutOfMemory;
    }
    return kStatusOk;
  }
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
  // 8-bit path.
  FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
                          displayable_frame->buffer()->is_monochrome(),
                          color_matrix_is_identity,
                          displayable_frame->buffer()->subsampling_x(),
                          displayable_frame->buffer()->subsampling_y(),
                          displayable_frame->upscaled_width(),
                          displayable_frame->frame_height(), thread_pool);
  if (!film_grain.AddNoise(
          displayable_frame->buffer()->data(kPlaneY),
          displayable_frame->buffer()->stride(kPlaneY),
          displayable_frame->buffer()->data(kPlaneU),
          displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
          (*film_grain_frame)->buffer()->data(kPlaneY),
          (*film_grain_frame)->buffer()->stride(kPlaneY),
          (*film_grain_frame)->buffer()->data(kPlaneU),
          (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
    LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
    return kStatusOutOfMemory;
  }
  return kStatusOk;
}
bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
[](const ObuHeader& obu_header) {
return obu_header.type == kObuSequenceHeader;
}) == obu.obu_headers().end()) {
return false;
}
const ObuSequenceHeader sequence_header = obu.sequence_header();
const bool sequence_header_changed =
!has_sequence_header_ ||
sequence_header_.color_config.bitdepth !=
sequence_header.color_config.bitdepth ||
sequence_header_.color_config.is_monochrome !=
sequence_header.color_config.is_monochrome ||
sequence_header_.color_config.subsampling_x !=
sequence_header.color_config.subsampling_x ||
sequence_header_.color_config.subsampling_y !=
sequence_header.color_config.subsampling_y ||
sequence_header_.max_frame_width != sequence_header.max_frame_width ||
sequence_header_.max_frame_height != sequence_header.max_frame_height;
sequence_header_ = sequence_header;
has_sequence_header_ = true;
return sequence_header_changed;
}
} // namespace libgav1