blob: 2864c34141ac669858d38aa06a1b32ce75f403f3 [file] [log] [blame]
// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/threading_strategy.h"
#include <algorithm>
#include <cassert>
#include <memory>
#include "src/frame_scratch_buffer.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
#include "src/utils/vector.h"
namespace libgav1 {
namespace {
#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
constexpr int kFrameParallelThresholdMultiplier = 4;
#else
constexpr int kFrameParallelThresholdMultiplier =
LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
#endif
// Computes the number of frame threads to be used based on the following
// heuristic:
// * If |thread_count| == 1, return 0.
// * If |thread_count| <= |tile_count| * 4, return 0.
// * Otherwise, return the largest value of i which satisfies the following
// condition: i + i * tile_columns <= thread_count. This ensures that there
// are at least |tile_columns| worker threads for each frame thread.
// * This function will never return 1 or a value > |thread_count|.
//
// This heuristic is based empirical performance data. The in-frame threading
// model (combination of tile multithreading, superblock row multithreading and
// post filter multithreading) performs better than the frame parallel model
// until we reach the threshold of |thread_count| > |tile_count| *
// kFrameParallelThresholdMultiplier.
//
// It is a function of |tile_count| since tile threading and superblock row
// multithreading will scale only as a factor of |tile_count|. The threshold 4
// is arrived at based on empirical data. The general idea is that superblock
// row multithreading plateaus at 4 * |tile_count| because in most practical
// cases there aren't more than that many superblock rows and columns available
// to work on in parallel.
int ComputeFrameThreadCount(int thread_count, int tile_count,
int tile_columns) {
assert(thread_count > 0);
if (thread_count == 1) return 0;
return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
? 0
: std::max(2, thread_count / (1 + tile_columns));
}
} // namespace
bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
int thread_count) {
assert(thread_count > 0);
frame_parallel_ = false;
if (thread_count == 1) {
thread_pool_.reset(nullptr);
tile_thread_count_ = 0;
max_tile_index_for_row_threads_ = 0;
return true;
}
// We do work in the current thread, so it is sufficient to create
// |thread_count|-1 threads in the threadpool.
thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
thread_pool_ = ThreadPool::Create("libgav1", thread_count);
if (thread_pool_ == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
thread_count);
tile_thread_count_ = 0;
max_tile_index_for_row_threads_ = 0;
return false;
}
}
// Prefer tile threads first (but only if there is more than one tile).
const int tile_count = frame_header.tile_info.tile_count;
if (tile_count > 1) {
// We want 1 + tile_thread_count_ <= tile_count because the current thread
// is also used to decode tiles. This is equivalent to
// tile_thread_count_ <= tile_count - 1.
tile_thread_count_ = std::min(thread_count, tile_count - 1);
thread_count -= tile_thread_count_;
if (thread_count == 0) {
max_tile_index_for_row_threads_ = 0;
return true;
}
} else {
tile_thread_count_ = 0;
}
#if defined(__ANDROID__)
// Assign the remaining threads for each Tile. The heuristic used here is that
// we will assign two threads for each Tile. So for example, if |thread_count|
// is 2, for a stream with 2 tiles the first tile would get both the threads
// and the second tile would have row multi-threading turned off. This
// heuristic is based on the fact that row multi-threading is fast enough only
// when there are at least two threads to do the decoding (since one thread
// always does the parsing).
//
// This heuristic might stop working when SIMD optimizations make the decoding
// much faster and the parsing thread is only as fast as the decoding threads.
// So we will have to revisit this later to make sure that this is still
// optimal.
//
// Note that while this heuristic significantly improves performance on high
// end devices (like the Pixel 3), there are some performance regressions in
// some lower end devices (in some cases) and that needs to be revisited as we
// bring in more optimizations. Overall, the gains because of this heuristic
// seems to be much larger than the regressions.
for (int i = 0; i < tile_count; ++i) {
max_tile_index_for_row_threads_ = i + 1;
thread_count -= 2;
if (thread_count <= 0) break;
}
#else // !defined(__ANDROID__)
// Assign the remaining threads to each Tile.
for (int i = 0; i < tile_count; ++i) {
const int count = thread_count / tile_count +
static_cast<int>(i < thread_count % tile_count);
if (count == 0) {
// Once we see a 0 value, all subsequent values will be 0 since it is
// supposed to be assigned in a round-robin fashion.
break;
}
max_tile_index_for_row_threads_ = i + 1;
}
#endif // defined(__ANDROID__)
return true;
}
bool ThreadingStrategy::Reset(int thread_count) {
assert(thread_count > 0);
frame_parallel_ = true;
// In frame parallel mode, we simply access the underlying |thread_pool_|
// directly. So ensure all the other threadpool getter functions return
// nullptr. Also, superblock row multithreading is always disabled in frame
// parallel mode.
tile_thread_count_ = 0;
max_tile_index_for_row_threads_ = 0;
if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
if (thread_pool_ == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
thread_count);
return false;
}
}
return true;
}
bool InitializeThreadPoolsForFrameParallel(
int thread_count, int tile_count, int tile_columns,
std::unique_ptr<ThreadPool>* const frame_thread_pool,
FrameScratchBufferPool* const frame_scratch_buffer_pool) {
assert(*frame_thread_pool == nullptr);
thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
const int frame_threads =
ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
if (frame_threads == 0) return true;
*frame_thread_pool = ThreadPool::Create(frame_threads);
if (*frame_thread_pool == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
frame_threads);
return false;
}
int remaining_threads = thread_count - frame_threads;
if (remaining_threads == 0) return true;
int threads_per_frame = remaining_threads / frame_threads;
const int extra_threads = remaining_threads % frame_threads;
Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
if (!frame_scratch_buffers.reserve(frame_threads)) return false;
// Create the tile thread pools.
for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
frame_scratch_buffer_pool->Get();
if (frame_scratch_buffer == nullptr) {
return false;
}
// If the number of tile threads cannot be divided equally amongst all the
// frame threads, assign one extra thread to the first |extra_threads| frame
// threads.
const int current_frame_thread_count =
threads_per_frame + static_cast<int>(i < extra_threads);
if (!frame_scratch_buffer->threading_strategy.Reset(
current_frame_thread_count)) {
return false;
}
remaining_threads -= current_frame_thread_count;
frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
}
// We release the frame scratch buffers in reverse order so that the extra
// threads are allocated to buffers in the top of the stack.
for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
--i) {
frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
}
return true;
}
} // namespace libgav1