backends/vulkan/runtime/vk_api/QueryPool.cpp - platform/external/executorch - Git at Google

 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */

 // @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast

 #include <executorch/backends/vulkan/runtime/utils/VecUtils.h>

 #include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>

 #include <cmath>
 #include <iomanip>
 #include <iostream>
 #include <utility>

 namespace vkcompute {
 namespace vkapi {

 namespace {

 // On Mali gpus timestamp_period seems to return 0.
 // For some reason when 52.08 is used op runtimes seem to make more sense
 // TODO: Figure out what is special about 52.08
 constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);

 } // namespace

 #define EARLY_RETURN_IF_UNINITIALIZED() \
   if (VK_NULL_HANDLE == querypool_) {   \
     return;                             \
   }

 QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
     : config_(config),
       ns_per_tick_(1u),
       device_(VK_NULL_HANDLE),
       querypool_(VK_NULL_HANDLE),
       num_queries_(0u),
       shader_durations_(0),
       mutex_{} {
   initialize(adapter_p);
 }

 QueryPool::~QueryPool() {
   EARLY_RETURN_IF_UNINITIALIZED();
   vkDestroyQueryPool(device_, querypool_, nullptr);
 }

 void QueryPool::initialize(const Adapter* adapter_p) {
   // No-op if adapter_p is nullptr or querypool is already created
   if (!adapter_p || querypool_ != VK_NULL_HANDLE) {
     return;
   }

   device_ = adapter_p->device_handle();

   ns_per_tick_ = std::lround(adapter_p->timestamp_period());
   ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;

   shader_durations_.reserve(config_.initial_reserve_size);

   const VkQueryPoolCreateInfo info{
       VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
       nullptr, // pNext
       0u, // flags
       VK_QUERY_TYPE_TIMESTAMP, // queryType
       config_.max_query_count, // queryCount
       0u, // pipelineStatistics
   };

   VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
 }

 size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
   VK_CHECK_COND(
       num_queries_ < config_.max_query_count,
       "Vulkan QueryPool: Exceeded the maximum number of queries "
       "allowed by the queryPool (",
       config_.max_query_count,
       ")!");

   cmd.write_timestamp(querypool_, num_queries_++);
   return num_queries_ - 1;
 }

 void QueryPool::reset_querypool(const CommandBuffer& cmd) {
   EARLY_RETURN_IF_UNINITIALIZED();
   std::lock_guard<std::mutex> lock(mutex_);

   cmd.reset_querypool(querypool_, 0u, config_.max_query_count);
   reset_state();
 }

 void QueryPool::reset_state() {
   num_queries_ = 0u;
   shader_durations_.clear();
 }

 void QueryPool::shader_profile_begin(
     const CommandBuffer& cmd,
     const uint32_t dispatch_id,
     const std::string& kernel_name,
     const VkExtent3D global_workgroup_size,
     const VkExtent3D local_workgroup_size) {
   EARLY_RETURN_IF_UNINITIALIZED();
   std::lock_guard<std::mutex> lock(mutex_);

   uint32_t query_idx = write_timestamp(cmd);

   ShaderDuration log_entry{
       utils::safe_downcast<uint32_t>(shader_durations_.size()),
       // Execution Properties
       dispatch_id,
       kernel_name,
       global_workgroup_size,
       local_workgroup_size,
       // Query indexes
       query_idx, // start query idx
       UINT32_MAX, // end query idx
       // Timings
       0u, // start time
       0u, // end time
       0u, // duration
   };

   shader_durations_.emplace_back(log_entry);
 }

 void QueryPool::shader_profile_end(const CommandBuffer& cmd) {
   EARLY_RETURN_IF_UNINITIALIZED();
   std::lock_guard<std::mutex> lock(mutex_);

   size_t query_idx = write_timestamp(cmd);
   shader_durations_.back().end_query_idx = query_idx;
 }

 void QueryPool::extract_results() {
   EARLY_RETURN_IF_UNINITIALIZED();
   std::lock_guard<std::mutex> lock(mutex_);

   const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;

   std::vector<uint64_t> query_data;
   query_data.resize(num_queries_);

   VK_CHECK(vkGetQueryPoolResults(
       device_,
       querypool_,
       0u, // firstQuery
       num_queries_, // queryCount
       sizeof(uint64_t) * num_queries_, // dataSize
       query_data.data(), // pData
       sizeof(uint64_t), // stride
       flags)); // flags

   for (ShaderDuration& entry : shader_durations_) {
     entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
     entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
     entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
   }
 }

 std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) {
   os << "{" << extents.width << ", " << extents.height << ", " << extents.depth
      << "}";
   return os;
 }

 std::string stringize(const VkExtent3D& extents) {
   std::stringstream ss;
   ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
      << "}";
   return ss.str();
 }
 std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
 QueryPool::get_shader_timestamp_data() {
   if (VK_NULL_HANDLE == querypool_) {
     return {};
   }
   std::lock_guard<std::mutex> lock(mutex_);
   std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
       shader_timestamp_data;
   for (ShaderDuration& entry : shader_durations_) {
     shader_timestamp_data.emplace_back(std::make_tuple(
         entry.kernel_name,
         entry.dispatch_id,
         entry.start_time_ns,
         entry.end_time_ns));
   }
   return shader_timestamp_data;
 }

 std::string QueryPool::generate_string_report() {
   std::lock_guard<std::mutex> lock(mutex_);

   std::stringstream ss;

   int kernel_name_w = 40;
   int global_size_w = 25;
   int local_size_w = 25;
   int duration_w = 25;

   ss << std::left;
   ss << std::setw(kernel_name_w) << "Kernel Name";
   ss << std::setw(global_size_w) << "Global Workgroup Size";
   ss << std::setw(local_size_w) << "Local Workgroup Size";
   ss << std::right << std::setw(duration_w) << "Duration (ns)";
   ss << std::endl;

   ss << std::left;
   ss << std::setw(kernel_name_w) << "===========";
   ss << std::setw(global_size_w) << "=====================";
   ss << std::setw(local_size_w) << "====================";
   ss << std::right << std::setw(duration_w) << "=============";
   ss << std::endl;

   for (ShaderDuration& entry : shader_durations_) {
     std::chrono::duration<size_t, std::nano> exec_duration_ns(
         entry.execution_duration_ns);

     ss << std::left;
     ss << std::setw(kernel_name_w) << entry.kernel_name;
     ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
     ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size);
     ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
     ss << std::endl;
   }

   return ss.str();
 }

 void QueryPool::print_results() {
   EARLY_RETURN_IF_UNINITIALIZED();
   std::cout << generate_string_report() << std::endl;
 }

 unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) {
   for (ShaderDuration& entry : shader_durations_) {
     if (entry.kernel_name == kernel_name) {
       std::chrono::duration<size_t, std::nano> exec_duration_ns(
           entry.execution_duration_ns);
       return exec_duration_ns.count();
     }
   }
   return 0;
 }

 unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) {
   uint64_t total_ns = 0;
   uint32_t count = 0;
   for (ShaderDuration& entry : shader_durations_) {
     if (entry.kernel_name == kernel_name) {
       std::chrono::duration<size_t, std::nano> exec_duration_ns(
           entry.execution_duration_ns);
       total_ns += exec_duration_ns.count();
       count++;
     }
   }
   if (count == 0) {
     return 0;
   }
   return total_ns / count;
 }
 } // namespace vkapi
 } // namespace vkcompute
	/*
	* Copyright (c) Meta Platforms, Inc. and affiliates.
	* All rights reserved.
	*
	* This source code is licensed under the BSD-style license found in the
	* LICENSE file in the root directory of this source tree.
	*/

	// @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast

	#include <executorch/backends/vulkan/runtime/utils/VecUtils.h>

	#include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>

	#include <cmath>
	#include <iomanip>
	#include <iostream>
	#include <utility>

	namespace vkcompute {
	namespace vkapi {

	namespace {

	// On Mali gpus timestamp_period seems to return 0.
	// For some reason when 52.08 is used op runtimes seem to make more sense
	// TODO: Figure out what is special about 52.08
	constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);

	} // namespace

	#define EARLY_RETURN_IF_UNINITIALIZED() \
	if (VK_NULL_HANDLE == querypool_) { \
	return; \
	}

	QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
	: config_(config),
	ns_per_tick_(1u),
	device_(VK_NULL_HANDLE),
	querypool_(VK_NULL_HANDLE),
	num_queries_(0u),
	shader_durations_(0),
	mutex_{} {
	initialize(adapter_p);
	}

	QueryPool::~QueryPool() {
	EARLY_RETURN_IF_UNINITIALIZED();
	vkDestroyQueryPool(device_, querypool_, nullptr);
	}

	void QueryPool::initialize(const Adapter* adapter_p) {
	// No-op if adapter_p is nullptr or querypool is already created
	if (!adapter_p \|\| querypool_ != VK_NULL_HANDLE) {
	return;
	}

	device_ = adapter_p->device_handle();

	ns_per_tick_ = std::lround(adapter_p->timestamp_period());
	ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;

	shader_durations_.reserve(config_.initial_reserve_size);

	const VkQueryPoolCreateInfo info{
	VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
	nullptr, // pNext
	0u, // flags
	VK_QUERY_TYPE_TIMESTAMP, // queryType
	config_.max_query_count, // queryCount
	0u, // pipelineStatistics
	};

	VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
	}

	size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
	VK_CHECK_COND(
	num_queries_ < config_.max_query_count,
	"Vulkan QueryPool: Exceeded the maximum number of queries "
	"allowed by the queryPool (",
	config_.max_query_count,
	")!");

	cmd.write_timestamp(querypool_, num_queries_++);
	return num_queries_ - 1;
	}

	void QueryPool::reset_querypool(const CommandBuffer& cmd) {
	EARLY_RETURN_IF_UNINITIALIZED();
	std::lock_guard<std::mutex> lock(mutex_);

	cmd.reset_querypool(querypool_, 0u, config_.max_query_count);
	reset_state();
	}

	void QueryPool::reset_state() {
	num_queries_ = 0u;
	shader_durations_.clear();
	}

	void QueryPool::shader_profile_begin(
	const CommandBuffer& cmd,
	const uint32_t dispatch_id,
	const std::string& kernel_name,
	const VkExtent3D global_workgroup_size,
	const VkExtent3D local_workgroup_size) {
	EARLY_RETURN_IF_UNINITIALIZED();
	std::lock_guard<std::mutex> lock(mutex_);

	uint32_t query_idx = write_timestamp(cmd);

	ShaderDuration log_entry{
	utils::safe_downcast<uint32_t>(shader_durations_.size()),
	// Execution Properties
	dispatch_id,
	kernel_name,
	global_workgroup_size,
	local_workgroup_size,
	// Query indexes
	query_idx, // start query idx
	UINT32_MAX, // end query idx
	// Timings
	0u, // start time
	0u, // end time
	0u, // duration
	};

	shader_durations_.emplace_back(log_entry);
	}

	void QueryPool::shader_profile_end(const CommandBuffer& cmd) {
	EARLY_RETURN_IF_UNINITIALIZED();
	std::lock_guard<std::mutex> lock(mutex_);

	size_t query_idx = write_timestamp(cmd);
	shader_durations_.back().end_query_idx = query_idx;
	}

	void QueryPool::extract_results() {
	EARLY_RETURN_IF_UNINITIALIZED();
	std::lock_guard<std::mutex> lock(mutex_);

	const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;

	std::vector<uint64_t> query_data;
	query_data.resize(num_queries_);

	VK_CHECK(vkGetQueryPoolResults(
	device_,
	querypool_,
	0u, // firstQuery
	num_queries_, // queryCount
	sizeof(uint64_t) * num_queries_, // dataSize
	query_data.data(), // pData
	sizeof(uint64_t), // stride
	flags)); // flags

	for (ShaderDuration& entry : shader_durations_) {
	entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
	entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
	entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
	}
	}

	std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) {
	os << "{" << extents.width << ", " << extents.height << ", " << extents.depth
	<< "}";
	return os;
	}

	std::string stringize(const VkExtent3D& extents) {
	std::stringstream ss;
	ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
	<< "}";
	return ss.str();
	}
	std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
	QueryPool::get_shader_timestamp_data() {
	if (VK_NULL_HANDLE == querypool_) {
	return {};
	}
	std::lock_guard<std::mutex> lock(mutex_);
	std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
	shader_timestamp_data;
	for (ShaderDuration& entry : shader_durations_) {
	shader_timestamp_data.emplace_back(std::make_tuple(
	entry.kernel_name,
	entry.dispatch_id,
	entry.start_time_ns,
	entry.end_time_ns));
	}
	return shader_timestamp_data;
	}

	std::string QueryPool::generate_string_report() {
	std::lock_guard<std::mutex> lock(mutex_);

	std::stringstream ss;

	int kernel_name_w = 40;
	int global_size_w = 25;
	int local_size_w = 25;
	int duration_w = 25;

	ss << std::left;
	ss << std::setw(kernel_name_w) << "Kernel Name";
	ss << std::setw(global_size_w) << "Global Workgroup Size";
	ss << std::setw(local_size_w) << "Local Workgroup Size";
	ss << std::right << std::setw(duration_w) << "Duration (ns)";
	ss << std::endl;

	ss << std::left;
	ss << std::setw(kernel_name_w) << "===========";
	ss << std::setw(global_size_w) << "=====================";
	ss << std::setw(local_size_w) << "====================";
	ss << std::right << std::setw(duration_w) << "=============";
	ss << std::endl;

	for (ShaderDuration& entry : shader_durations_) {
	std::chrono::duration<size_t, std::nano> exec_duration_ns(
	entry.execution_duration_ns);

	ss << std::left;
	ss << std::setw(kernel_name_w) << entry.kernel_name;
	ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
	ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size);
	ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
	ss << std::endl;
	}

	return ss.str();
	}

	void QueryPool::print_results() {
	EARLY_RETURN_IF_UNINITIALIZED();
	std::cout << generate_string_report() << std::endl;
	}

	unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) {
	for (ShaderDuration& entry : shader_durations_) {
	if (entry.kernel_name == kernel_name) {
	std::chrono::duration<size_t, std::nano> exec_duration_ns(
	entry.execution_duration_ns);
	return exec_duration_ns.count();
	}
	}
	return 0;
	}

	unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) {
	uint64_t total_ns = 0;
	uint32_t count = 0;
	for (ShaderDuration& entry : shader_durations_) {
	if (entry.kernel_name == kernel_name) {
	std::chrono::duration<size_t, std::nano> exec_duration_ns(
	entry.execution_duration_ns);
	total_ns += exec_duration_ns.count();
	count++;
	}
	}
	if (count == 0) {
	return 0;
	}
	return total_ns / count;
	}
	} // namespace vkapi
	} // namespace vkcompute