blob: be11f7473e5b3ecefb611fb690f3f4163a3d51b6 [file] [log] [blame]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast
#include <executorch/backends/vulkan/runtime/utils/VecUtils.h>
#include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <utility>
namespace vkcompute {
namespace vkapi {
namespace {
// On Mali gpus timestamp_period seems to return 0.
// For some reason when 52.08 is used op runtimes seem to make more sense
// TODO: Figure out what is special about 52.08
constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);
} // namespace
#define EARLY_RETURN_IF_UNINITIALIZED() \
if (VK_NULL_HANDLE == querypool_) { \
return; \
}
QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
: config_(config),
ns_per_tick_(1u),
device_(VK_NULL_HANDLE),
querypool_(VK_NULL_HANDLE),
num_queries_(0u),
shader_durations_(0),
mutex_{} {
initialize(adapter_p);
}
QueryPool::~QueryPool() {
EARLY_RETURN_IF_UNINITIALIZED();
vkDestroyQueryPool(device_, querypool_, nullptr);
}
void QueryPool::initialize(const Adapter* adapter_p) {
// No-op if adapter_p is nullptr or querypool is already created
if (!adapter_p || querypool_ != VK_NULL_HANDLE) {
return;
}
device_ = adapter_p->device_handle();
ns_per_tick_ = std::lround(adapter_p->timestamp_period());
ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;
shader_durations_.reserve(config_.initial_reserve_size);
const VkQueryPoolCreateInfo info{
VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
nullptr, // pNext
0u, // flags
VK_QUERY_TYPE_TIMESTAMP, // queryType
config_.max_query_count, // queryCount
0u, // pipelineStatistics
};
VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
}
size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
VK_CHECK_COND(
num_queries_ < config_.max_query_count,
"Vulkan QueryPool: Exceeded the maximum number of queries "
"allowed by the queryPool (",
config_.max_query_count,
")!");
cmd.write_timestamp(querypool_, num_queries_++);
return num_queries_ - 1;
}
void QueryPool::reset_querypool(const CommandBuffer& cmd) {
EARLY_RETURN_IF_UNINITIALIZED();
std::lock_guard<std::mutex> lock(mutex_);
cmd.reset_querypool(querypool_, 0u, config_.max_query_count);
reset_state();
}
void QueryPool::reset_state() {
num_queries_ = 0u;
shader_durations_.clear();
}
void QueryPool::shader_profile_begin(
const CommandBuffer& cmd,
const uint32_t dispatch_id,
const std::string& kernel_name,
const VkExtent3D global_workgroup_size,
const VkExtent3D local_workgroup_size) {
EARLY_RETURN_IF_UNINITIALIZED();
std::lock_guard<std::mutex> lock(mutex_);
uint32_t query_idx = write_timestamp(cmd);
ShaderDuration log_entry{
utils::safe_downcast<uint32_t>(shader_durations_.size()),
// Execution Properties
dispatch_id,
kernel_name,
global_workgroup_size,
local_workgroup_size,
// Query indexes
query_idx, // start query idx
UINT32_MAX, // end query idx
// Timings
0u, // start time
0u, // end time
0u, // duration
};
shader_durations_.emplace_back(log_entry);
}
void QueryPool::shader_profile_end(const CommandBuffer& cmd) {
EARLY_RETURN_IF_UNINITIALIZED();
std::lock_guard<std::mutex> lock(mutex_);
size_t query_idx = write_timestamp(cmd);
shader_durations_.back().end_query_idx = query_idx;
}
void QueryPool::extract_results() {
EARLY_RETURN_IF_UNINITIALIZED();
std::lock_guard<std::mutex> lock(mutex_);
const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;
std::vector<uint64_t> query_data;
query_data.resize(num_queries_);
VK_CHECK(vkGetQueryPoolResults(
device_,
querypool_,
0u, // firstQuery
num_queries_, // queryCount
sizeof(uint64_t) * num_queries_, // dataSize
query_data.data(), // pData
sizeof(uint64_t), // stride
flags)); // flags
for (ShaderDuration& entry : shader_durations_) {
entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
}
}
std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) {
os << "{" << extents.width << ", " << extents.height << ", " << extents.depth
<< "}";
return os;
}
std::string stringize(const VkExtent3D& extents) {
std::stringstream ss;
ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
<< "}";
return ss.str();
}
std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
QueryPool::get_shader_timestamp_data() {
if (VK_NULL_HANDLE == querypool_) {
return {};
}
std::lock_guard<std::mutex> lock(mutex_);
std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
shader_timestamp_data;
for (ShaderDuration& entry : shader_durations_) {
shader_timestamp_data.emplace_back(std::make_tuple(
entry.kernel_name,
entry.dispatch_id,
entry.start_time_ns,
entry.end_time_ns));
}
return shader_timestamp_data;
}
std::string QueryPool::generate_string_report() {
std::lock_guard<std::mutex> lock(mutex_);
std::stringstream ss;
int kernel_name_w = 40;
int global_size_w = 25;
int local_size_w = 25;
int duration_w = 25;
ss << std::left;
ss << std::setw(kernel_name_w) << "Kernel Name";
ss << std::setw(global_size_w) << "Global Workgroup Size";
ss << std::setw(local_size_w) << "Local Workgroup Size";
ss << std::right << std::setw(duration_w) << "Duration (ns)";
ss << std::endl;
ss << std::left;
ss << std::setw(kernel_name_w) << "===========";
ss << std::setw(global_size_w) << "=====================";
ss << std::setw(local_size_w) << "====================";
ss << std::right << std::setw(duration_w) << "=============";
ss << std::endl;
for (ShaderDuration& entry : shader_durations_) {
std::chrono::duration<size_t, std::nano> exec_duration_ns(
entry.execution_duration_ns);
ss << std::left;
ss << std::setw(kernel_name_w) << entry.kernel_name;
ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size);
ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
ss << std::endl;
}
return ss.str();
}
void QueryPool::print_results() {
EARLY_RETURN_IF_UNINITIALIZED();
std::cout << generate_string_report() << std::endl;
}
unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) {
for (ShaderDuration& entry : shader_durations_) {
if (entry.kernel_name == kernel_name) {
std::chrono::duration<size_t, std::nano> exec_duration_ns(
entry.execution_duration_ns);
return exec_duration_ns.count();
}
}
return 0;
}
unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) {
uint64_t total_ns = 0;
uint32_t count = 0;
for (ShaderDuration& entry : shader_durations_) {
if (entry.kernel_name == kernel_name) {
std::chrono::duration<size_t, std::nano> exec_duration_ns(
entry.execution_duration_ns);
total_ns += exec_duration_ns.count();
count++;
}
}
if (count == 0) {
return 0;
}
return total_ns / count;
}
} // namespace vkapi
} // namespace vkcompute