// blob: 4b37a28119055a24c3fb18ae86e9e4c2616ef97c [file]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
#include <executorch/backends/vulkan/runtime/utils/MacroUtils.h>
#include <executorch/backends/vulkan/runtime/vk_api/Adapter.h>
#include <executorch/backends/vulkan/runtime/vk_api/Command.h>
#include <executorch/backends/vulkan/runtime/vk_api/Descriptor.h>
#include <executorch/backends/vulkan/runtime/vk_api/Fence.h>
#include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>
#include <executorch/backends/vulkan/runtime/vk_api/Runtime.h>
#include <executorch/backends/vulkan/runtime/vk_api/VkUtils.h>
namespace vkcompute {
namespace api {
// Bundle of configuration values used to construct a Context.
struct ContextConfig final {
  // Number of dispatches to record into a command buffer before it is
  // automatically submitted to the GPU (see Context::submit_compute_job).
  uint32_t cmd_submit_frequency;
  // Configuration forwarded to the Context's CommandPool.
  vkapi::CommandPoolConfig cmd_pool_config;
  // Configuration forwarded to the Context's DescriptorPool.
  vkapi::DescriptorPoolConfig descriptor_pool_config;
  // Configuration forwarded to the Context's QueryPool (used for profiling).
  vkapi::QueryPoolConfig query_pool_config;
};
//
// Vulkan Context holds onto all relevant Vulkan state as it pertains to our
// use of Vulkan in PyTorch. A Context is associated with one, and only one,
// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch
// are associated with a Context to make tensor <-> device affinity explicit.
// The context is currently a global object, but technically it does not need
// to be if we were to make it explicit to the user.
//
class Context final {
 public:
  // Creates a Context bound to the adapter at index `adapter_i`, using the
  // pool sizing and submission settings in the given ContextConfig.
  explicit Context(size_t adapter_i, const ContextConfig&);

  // Owns Vulkan handles and mutexes; neither copyable nor movable.
  Context(const Context&) = delete;
  Context& operator=(const Context&) = delete;

  Context(Context&&) = delete;
  Context& operator=(Context&&) = delete;

  ~Context();

 private:
  // Config
  ContextConfig config_;

  // Important handles
  // Non-owning pointer to the adapter this Context is associated with; also
  // provides access to the device-level caches exposed below.
  vkapi::Adapter* adapter_p_;
  VkDevice device_;
  vkapi::Adapter::Queue queue_;

  // Resource Pools
  vkapi::CommandPool command_pool_;
  vkapi::DescriptorPool descriptor_pool_;
  vkapi::FencePool fences_;

  // Diagnostics
  // Uninitialized by default; see initialize_querypool().
  vkapi::QueryPool querypool_;

  // Command buffers submission
  // Guards recording into cmd_ and the submission counter; may be managed
  // externally via dispatch_lock() (see submit_compute_job).
  std::mutex cmd_mutex_;
  vkapi::CommandBuffer cmd_;
  // Number of dispatches recorded into the current command buffer; compared
  // against config_.cmd_submit_frequency to decide when to submit.
  uint32_t submit_count_;

  // Memory Management
  // Lists of resources registered for deferred destruction via
  // register_*_cleanup(), each protected by its own mutex.
  std::mutex buffer_clearlist_mutex_;
  std::vector<vkapi::VulkanBuffer> buffers_to_clear_;
  std::mutex image_clearlist_mutex_;
  std::vector<vkapi::VulkanImage> images_to_clear_;

  // Misc
  VkImageTiling preferred_image_tiling_;

 public:
  // Adapter access
  inline vkapi::Adapter* adapter_ptr() {
    return adapter_p_;
  }

  inline VkDevice device() {
    return device_;
  }

  inline VkQueue queue() {
    return queue_.handle;
  }

  // Device Caches (owned by the Adapter, surfaced here for convenience)
  inline vkapi::ShaderLayoutCache& shader_layout_cache() {
    return adapter_ptr()->shader_layout_cache();
  }

  inline vkapi::ShaderCache& shader_cache() {
    return adapter_ptr()->shader_cache();
  }

  inline vkapi::PipelineLayoutCache& pipeline_layout_cache() {
    return adapter_ptr()->pipeline_layout_cache();
  }

  inline vkapi::ComputePipelineCache& pipeline_cache() {
    return adapter_ptr()->compute_pipeline_cache();
  }

  // Resource Pools
  inline vkapi::DescriptorPool& descriptor_pool() {
    return descriptor_pool_;
  }

  inline vkapi::FencePool& fences() {
    return fences_;
  }

  // Diagnostics
  inline vkapi::QueryPool& querypool() {
    return querypool_;
  }

  inline VkImageTiling preferred_image_tiling() {
    return preferred_image_tiling_;
  }

  /*
   * By default, the querypool attached to a Context instance is uninitialized.
   * This function triggers the querypool to be created via vkCreateQueryPool.
   */
  void initialize_querypool();

  /*
   * Encodes a vkResetQueryPool command to the current command buffer, and
   * resets the internal state of the querypool. If the querypool is not
   * initialized this function is a no-op.
   */
  void cmd_reset_querypool();

  /*
   * Encodes a vkCmdWriteTimestamp command to the current command buffer and
   * records some metadata about the shader that will be dispatched. If the
   * querypool is not initialized this function is a no-op.
   */
  void report_shader_dispatch_start(
      const std::string& shader_name,
      const utils::uvec3& global_wg_size,
      const utils::uvec3& local_wg_size,
      const uint32_t dispatch_id = UINT32_MAX);

  /*
   * Encodes a vkCmdWriteTimestamp command to the current command buffer to
   * record when the last shader that was dispatched has completed execution.
   * If the querypool is not initialized this function is a no-op.
   */
  void report_shader_dispatch_end();

  // Memory Management

  // Takes ownership of `buffer` by moving it onto the clear-list so it can be
  // destroyed later — presumably once pending GPU work that references it has
  // completed; confirm against flush() in Context.cpp.
  void register_buffer_cleanup(vkapi::VulkanBuffer& buffer) {
    std::lock_guard<std::mutex> bufferlist_lock(buffer_clearlist_mutex_);
    buffers_to_clear_.emplace_back(std::move(buffer));
  }

  // Image counterpart of register_buffer_cleanup(); takes ownership of
  // `image` by moving it onto the image clear-list.
  void register_image_cleanup(vkapi::VulkanImage& image) {
    std::lock_guard<std::mutex> imagelist_lock(image_clearlist_mutex_);
    images_to_clear_.emplace_back(std::move(image));
  }

  // GPU RPC

  // Returns a lock over the command buffer mutex, for callers that manage
  // recording/submission externally (see the fence-handling contract in
  // submit_compute_job).
  inline std::unique_lock<std::mutex> dispatch_lock() {
    return std::unique_lock<std::mutex>(cmd_mutex_);
  }

  // Obtains and begins a new command buffer from the pool if one is not
  // already active. Callers must hold cmd_mutex_.
  inline void set_cmd(bool reusable = false) {
    if (!cmd_) {
      cmd_ = command_pool_.get_new_cmd(reusable);
      cmd_.begin();
    }
  }

  vkapi::DescriptorSet get_descriptor_set(
      const vkapi::ShaderInfo&,
      const utils::uvec3&,
      const vkapi::SpecVarList&);

  // Convenience overload with no specialization constants.
  inline vkapi::DescriptorSet get_descriptor_set(
      const vkapi::ShaderInfo& shader_descriptor,
      const utils::uvec3& local_work_group_size) {
    return get_descriptor_set(shader_descriptor, local_work_group_size, {});
  }

  void register_shader_dispatch(
      const vkapi::DescriptorSet&,
      vkapi::PipelineBarrier&,
      const vkapi::ShaderInfo&,
      const utils::uvec3&);

  void register_blit(
      vkapi::PipelineBarrier&,
      vkapi::VulkanImage& src,
      vkapi::VulkanImage& dst);

  // Records a compute dispatch into the current command buffer and may submit
  // it; see the definition below for the full contract.
  template <typename... Arguments>
  bool submit_compute_job(
      const vkapi::ShaderInfo&,
      vkapi::PipelineBarrier&,
      const utils::uvec3&,
      const utils::uvec3&,
      const vkapi::SpecVarList&,
      VkFence fence_handle,
      const uint32_t dispatch_id,
      Arguments&&...);

  void submit_cmd_to_gpu(
      VkFence fence_handle = VK_NULL_HANDLE,
      const bool final_use = false);

  // NOTE(review): presumably submits/retires pending state and destroys
  // resources registered via register_*_cleanup — confirm in Context.cpp.
  void flush();
};
// Reports whether a Vulkan Context can be used — presumably checks that the
// runtime and at least one adapter are available; confirm in Context.cpp.
bool available();

// The global runtime is retrieved using this function, where it is declared as
// a static local variable.
Context* context();
namespace detail {

// Flags `any_is_empty` when `buffer` has no memory allocated to it.
inline void arg_is_empty(
    bool& any_is_empty,
    const vkapi::VulkanBuffer& buffer) {
  // bool(buffer) evaluates to false if no memory has been allocated
  if (!buffer) {
    any_is_empty = true;
  }
}

// Flags `any_is_empty` when `image` has no memory allocated to it.
inline void arg_is_empty(bool& any_is_empty, const vkapi::VulkanImage& image) {
  // bool(image) evaluates to false if no memory has been allocated
  if (!image) {
    any_is_empty = true;
  }
}

// Flags `any_is_empty` when `bind_info` does not reference a valid buffer.
inline void arg_is_empty(
    bool& any_is_empty,
    const vkapi::BufferBindInfo& bind_info) {
  if (bind_info.handle == VK_NULL_HANDLE) {
    any_is_empty = true;
  }
}

/*
 Reports if any VulkanBuffer or VulkanImage argument in a variadic argument
 list does not have any memory associated with it.
*/
template <typename... Arguments>
inline bool any_arg_is_empty(Arguments&&... arguments) {
  bool any_is_empty = false;
  // Array-initializer pack expansion: each element runs one emptiness check.
  const int expander[]{
      0,
      (arg_is_empty(any_is_empty, std::forward<Arguments>(arguments)), 0)...,
  };
  (void)expander;
  return any_is_empty;
}

// Binds each argument to `descriptor_set` at the binding index given by its
// position in the argument list.
template <size_t... Indices, typename... Arguments>
inline void bind(
    vkapi::DescriptorSet& descriptor_set,
    const std::index_sequence<Indices...>&,
    Arguments&&... arguments) {
  const int expander[]{
      0,
      (descriptor_set.bind(Indices, std::forward<Arguments>(arguments)), 0)...,
  };
  (void)expander;
}

} // namespace detail
/*
 Records a compute shader dispatch into the current command buffer. If the
 number of submit_*_job calls exceeds the configured frequency, or if a fence
 is provided, then the command buffer is submitted to the GPU for execution.
 Returns a bool indicating whether or not the function call resulted in a GPU
 queue submission.
*/
template <typename... Arguments>
inline bool Context::submit_compute_job(
    const vkapi::ShaderInfo& shader,
    vkapi::PipelineBarrier& pipeline_barrier,
    const utils::uvec3& global_work_group,
    const utils::uvec3& local_work_group_size,
    const vkapi::SpecVarList& specialization_constants,
    VkFence fence_handle,
    const uint32_t dispatch_id,
    Arguments&&... arguments) {
  // If any of the provided arguments does not have memory associated with it,
  // then exit early as there is no work to be done. However, if a fence has
  // been passed and the command buffer is not empty, then the current command
  // buffer must still be submitted so that the fence can be signaled.
  if (detail::any_arg_is_empty(arguments...)) {
    if (fence_handle != VK_NULL_HANDLE && submit_count_ > 0) {
      submit_cmd_to_gpu(fence_handle);
      return true;
    }
    // NOTE(review): if a fence was passed but the command buffer is empty,
    // nothing is submitted and the fence will never be signaled; callers are
    // presumably expected to check the return value before waiting — confirm.
    return false;
  }
  // Serialize recording to the shared command buffer. The lock is default
  // constructed (not associated with the mutex just yet), since in some cases
  // locking is externally managed.
  std::unique_lock<std::mutex> cmd_lock;
  // If a fence was passed, then assume that the host intends to sync with
  // the GPU, implying there will be imminent calls to fence.wait() and flush().
  // We therefore assume the mutex is externally managed in this case, and the
  // calling thread has already locked the mutex prior to calling the function,
  // and will release the mutex manually after calling flush(). This will
  // prevent more dispatches from being recorded until we have flushed the
  // Context.
  if (fence_handle == VK_NULL_HANDLE) {
    cmd_lock = std::unique_lock<std::mutex>(cmd_mutex_);
  }
  // Ensure a command buffer is open for recording.
  set_cmd();

  // Write the start timestamp and dispatch metadata (no-op if the querypool
  // is uninitialized).
  report_shader_dispatch_start(
      shader.kernel_name,
      global_work_group,
      local_work_group_size,
      dispatch_id);

  // Factor out template parameter independent code to minimize code bloat.
  vkapi::DescriptorSet descriptor_set = get_descriptor_set(
      shader, local_work_group_size, specialization_constants);

  // Bind each argument at the descriptor binding index matching its position
  // in the argument list.
  detail::bind(
      descriptor_set,
      std::index_sequence_for<Arguments...>{},
      std::forward<Arguments>(arguments)...);

  // Factor out template parameter independent code to minimize code bloat.
  register_shader_dispatch(
      descriptor_set, pipeline_barrier, shader, global_work_group);

  report_shader_dispatch_end();

  submit_count_++;
  // Submit when the host needs to sync (a fence was passed) or when enough
  // dispatches have accumulated in the current command buffer.
  if (fence_handle != VK_NULL_HANDLE ||
      submit_count_ >= config_.cmd_submit_frequency) {
    submit_cmd_to_gpu(fence_handle);
    return true;
  }
  return false;
}
} // namespace api
} // namespace vkcompute