// blob: 4b37a28119055a24c3fb18ae86e9e4c2616ef97c [file]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
#include <executorch/backends/vulkan/runtime/utils/MacroUtils.h>
#include <executorch/backends/vulkan/runtime/vk_api/Adapter.h>
#include <executorch/backends/vulkan/runtime/vk_api/Command.h>
#include <executorch/backends/vulkan/runtime/vk_api/Descriptor.h>
#include <executorch/backends/vulkan/runtime/vk_api/Fence.h>
#include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>
#include <executorch/backends/vulkan/runtime/vk_api/Runtime.h>
#include <executorch/backends/vulkan/runtime/vk_api/VkUtils.h>
namespace vkcompute {
namespace api {
// Bundle of configuration values used to construct a Context.
struct ContextConfig final {
  // Number of dispatches to record into a command buffer before it is
  // automatically submitted to the GPU (see Context::submit_compute_job).
  uint32_t cmd_submit_frequency;
  // Configuration forwarded to the Context's CommandPool.
  vkapi::CommandPoolConfig cmd_pool_config;
  // Configuration forwarded to the Context's DescriptorPool.
  vkapi::DescriptorPoolConfig descriptor_pool_config;
  // Configuration forwarded to the Context's QueryPool (used for profiling).
  vkapi::QueryPoolConfig query_pool_config;
};
//
// Vulkan Context holds onto all relevant Vulkan state as it pertains to our
// use of Vulkan in PyTorch. A Context is associated with one, and only one,
// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch
// are associated with a Context to make tensor <-> device affinity explicit.
// The context is currently a global object, but technically it does not need
// to be if we were to make it explicit to the user.
//
class Context final {
 public:
  // Creates a Context bound to the adapter at index `adapter_i`, using the
  // pool sizing and submission settings in the given ContextConfig.
  explicit Context(size_t adapter_i, const ContextConfig&);

  // Owns Vulkan handles and mutexes; neither copyable nor movable.
  Context(const Context&) = delete;
  Context& operator=(const Context&) = delete;

  Context(Context&&) = delete;
  Context& operator=(Context&&) = delete;

  ~Context();

 private:
  // Config
  ContextConfig config_;

  // Important handles
  // Non-owning pointer to the adapter this Context is associated with; also
  // provides access to the device-level caches exposed below.
  vkapi::Adapter* adapter_p_;
  VkDevice device_;
  vkapi::Adapter::Queue queue_;

  // Resource Pools
  vkapi::CommandPool command_pool_;
  vkapi::DescriptorPool descriptor_pool_;
  vkapi::FencePool fences_;

  // Diagnostics
  // Uninitialized by default; see initialize_querypool().
  vkapi::QueryPool querypool_;

  // Command buffers submission
  // Guards recording into cmd_ and the submission counter; may be managed
  // externally via dispatch_lock() (see submit_compute_job).
  std::mutex cmd_mutex_;
  vkapi::CommandBuffer cmd_;
  // Number of dispatches recorded into the current command buffer; compared
  // against config_.cmd_submit_frequency to decide when to submit.
  uint32_t submit_count_;

  // Memory Management
  // Lists of resources registered for deferred destruction via
  // register_*_cleanup(), each protected by its own mutex.
  std::mutex buffer_clearlist_mutex_;
  std::vector<vkapi::VulkanBuffer> buffers_to_clear_;
  std::mutex image_clearlist_mutex_;
  std::vector<vkapi::VulkanImage> images_to_clear_;

  // Misc
  VkImageTiling preferred_image_tiling_;

 public:
  // Adapter access
  inline vkapi::Adapter* adapter_ptr() {
    return adapter_p_;
  }

  inline VkDevice device() {
    return device_;
  }

  inline VkQueue queue() {
    return queue_.handle;
  }

  // Device Caches (owned by the Adapter, surfaced here for convenience)
  inline vkapi::ShaderLayoutCache& shader_layout_cache() {
    return adapter_ptr()->shader_layout_cache();
  }

  inline vkapi::ShaderCache& shader_cache() {
    return adapter_ptr()->shader_cache();
  }

  inline vkapi::PipelineLayoutCache& pipeline_layout_cache() {
    return adapter_ptr()->pipeline_layout_cache();
  }

  inline vkapi::ComputePipelineCache& pipeline_cache() {
    return adapter_ptr()->compute_pipeline_cache();
  }

  // Resource Pools
  inline vkapi::DescriptorPool& descriptor_pool() {
    return descriptor_pool_;
  }

  inline vkapi::FencePool& fences() {
    return fences_;
  }

  // Diagnostics
  inline vkapi::QueryPool& querypool() {
    return querypool_;
  }

  inline VkImageTiling preferred_image_tiling() {
    return preferred_image_tiling_;
  }

  /*
   * By default, the querypool attached to a Context instance is uninitialized.
   * This function triggers the querypool to be created via vkCreateQueryPool.
   */
  void initialize_querypool();

  /*
   * Encodes a vkResetQueryPool command to the current command buffer, and
   * resets the internal state of the querypool. If the querypool is not
   * initialized this function is a no-op.
   */
  void cmd_reset_querypool();

  /*
   * Encodes a vkCmdWriteTimestamp command to the current command buffer and
   * records some metadata about the shader that will be dispatched. If the
   * querypool is not initialized this function is a no-op.
   */
  void report_shader_dispatch_start(
      const std::string& shader_name,
      const utils::uvec3& global_wg_size,
      const utils::uvec3& local_wg_size,
      const uint32_t dispatch_id = UINT32_MAX);

  /*
   * Encodes a vkCmdWriteTimestamp command to the current command buffer to
   * record when the last shader that was dispatched has completed execution.
   * If the querypool is not initialized this function is a no-op.
   */
  void report_shader_dispatch_end();

  // Memory Management

  // Takes ownership of `buffer` by moving it onto the clear-list so it can be
  // destroyed later — presumably once pending GPU work that references it has
  // completed; confirm against flush() in Context.cpp.
  void register_buffer_cleanup(vkapi::VulkanBuffer& buffer) {
    std::lock_guard<std::mutex> bufferlist_lock(buffer_clearlist_mutex_);
    buffers_to_clear_.emplace_back(std::move(buffer));
  }

  // Image counterpart of register_buffer_cleanup(); takes ownership of
  // `image` by moving it onto the image clear-list.
  void register_image_cleanup(vkapi::VulkanImage& image) {
    std::lock_guard<std::mutex> imagelist_lock(image_clearlist_mutex_);
    images_to_clear_.emplace_back(std::move(image));
  }

  // GPU RPC

  // Returns a lock over the command buffer mutex, for callers that manage
  // recording/submission externally (see the fence-handling contract in
  // submit_compute_job).
  inline std::unique_lock<std::mutex> dispatch_lock() {
    return std::unique_lock<std::mutex>(cmd_mutex_);
  }

  // Obtains and begins a new command buffer from the pool if one is not
  // already active. Callers must hold cmd_mutex_.
  inline void set_cmd(bool reusable = false) {
    if (!cmd_) {
      cmd_ = command_pool_.get_new_cmd(reusable);
      cmd_.begin();
    }
  }

  vkapi::DescriptorSet get_descriptor_set(
      const vkapi::ShaderInfo&,
      const utils::uvec3&,
      const vkapi::SpecVarList&);

  // Convenience overload with no specialization constants.
  inline vkapi::DescriptorSet get_descriptor_set(
      const vkapi::ShaderInfo& shader_descriptor,
      const utils::uvec3& local_work_group_size) {
    return get_descriptor_set(shader_descriptor, local_work_group_size, {});
  }

  void register_shader_dispatch(
      const vkapi::DescriptorSet&,
      vkapi::PipelineBarrier&,
      const vkapi::ShaderInfo&,
      const utils::uvec3&);

  void register_blit(
      vkapi::PipelineBarrier&,
      vkapi::VulkanImage& src,
      vkapi::VulkanImage& dst);

  // Records a compute dispatch into the current command buffer and may submit
  // it; see the definition below for the full contract.
  template <typename... Arguments>
  bool submit_compute_job(
      const vkapi::ShaderInfo&,
      vkapi::PipelineBarrier&,
      const utils::uvec3&,
      const utils::uvec3&,
      const vkapi::SpecVarList&,
      VkFence fence_handle,
      const uint32_t dispatch_id,
      Arguments&&...);

  void submit_cmd_to_gpu(
      VkFence fence_handle = VK_NULL_HANDLE,
      const bool final_use = false);

  // NOTE(review): presumably submits/retires pending state and destroys
  // resources registered via register_*_cleanup — confirm in Context.cpp.
  void flush();
};
// Reports whether a Vulkan Context can be used — presumably checks that the
// runtime and at least one adapter are available; confirm in Context.cpp.
bool available();

// The global runtime is retrieved using this function, where it is declared as
// a static local variable.
Context* context();
namespace detail {

// Flags `any_is_empty` when `buffer` has no memory allocated to it.
inline void arg_is_empty(
    bool& any_is_empty,
    const vkapi::VulkanBuffer& buffer) {
  // bool(buffer) evaluates to false if no memory has been allocated
  if (!buffer) {
    any_is_empty = true;
  }
}

// Flags `any_is_empty` when `image` has no memory allocated to it.
inline void arg_is_empty(bool& any_is_empty, const vkapi::VulkanImage& image) {
  // bool(image) evaluates to false if no memory has been allocated
  if (!image) {
    any_is_empty = true;
  }
}

// Flags `any_is_empty` when `bind_info` does not reference a valid buffer.
inline void arg_is_empty(
    bool& any_is_empty,
    const vkapi::BufferBindInfo& bind_info) {
  if (bind_info.handle == VK_NULL_HANDLE) {
    any_is_empty = true;
  }
}

/*
 Reports if any VulkanBuffer or VulkanImage argument in a variadic argument
 list does not have any memory associated with it.
*/
template <typename... Arguments>
inline bool any_arg_is_empty(Arguments&&... arguments) {
  bool any_is_empty = false;
  // Array-initializer pack expansion: each element runs one emptiness check.
  const int expander[]{
      0,
      (arg_is_empty(any_is_empty, std::forward<Arguments>(arguments)), 0)...,
  };
  (void)expander;
  return any_is_empty;
}

// Binds each argument to `descriptor_set` at the binding index given by its
// position in the argument list.
template <size_t... Indices, typename... Arguments>
inline void bind(
    vkapi::DescriptorSet& descriptor_set,
    const std::index_sequence<Indices...>&,
    Arguments&&... arguments) {
  const int expander[]{
      0,
      (descriptor_set.bind(Indices, std::forward<Arguments>(arguments)), 0)...,
  };
  (void)expander;
}

} // namespace detail
/*
 Records a compute shader dispatch into the current command buffer. If the
 number of submit_*_job calls exceeds the configured frequency, or if a fence
 is provided, then the command buffer is submitted to the GPU for execution.
 Returns a bool indicating whether or not the function call resulted in a GPU
 queue submission.
*/
template <typename... Arguments>
inline bool Context::submit_compute_job(
    const vkapi::ShaderInfo& shader,
    vkapi::PipelineBarrier& pipeline_barrier,
    const utils::uvec3& global_work_group,
    const utils::uvec3& local_work_group_size,
    const vkapi::SpecVarList& specialization_constants,
    VkFence fence_handle,
    const uint32_t dispatch_id,
    Arguments&&... arguments) {
  // If any of the provided arguments does not have memory associated with it,
  // then exit early as there is no work to be done. However, if a fence has
  // been passed and the command buffer is not empty, then the current command
  // buffer must still be submitted so that the fence can be signaled.
  if (detail::any_arg_is_empty(arguments...)) {
    if (fence_handle != VK_NULL_HANDLE && submit_count_ > 0) {
      submit_cmd_to_gpu(fence_handle);
      return true;
    }
    // NOTE(review): if a fence was passed but the command buffer is empty,
    // nothing is submitted and the fence will never be signaled; callers are
    // presumably expected to check the return value before waiting — confirm.
    return false;
  }
  // Serialize recording to the shared command buffer. The lock is default
  // constructed (not associated with the mutex just yet), since in some cases
  // locking is externally managed.
  std::unique_lock<std::mutex> cmd_lock;
  // If a fence was passed, then assume that the host intends to sync with
  // the GPU, implying there will be imminent calls to fence.wait() and flush().
  // We therefore assume the mutex is externally managed in this case, and the
  // calling thread has already locked the mutex prior to calling the function,
  // and will release the mutex manually after calling flush(). This will
  // prevent more dispatches from being recorded until we have flushed the
  // Context.
  if (fence_handle == VK_NULL_HANDLE) {
    cmd_lock = std::unique_lock<std::mutex>(cmd_mutex_);
  }
  // Ensure a command buffer is open for recording.
  set_cmd();

  // Write the start timestamp and dispatch metadata (no-op if the querypool
  // is uninitialized).
  report_shader_dispatch_start(
      shader.kernel_name,
      global_work_group,
      local_work_group_size,
      dispatch_id);

  // Factor out template parameter independent code to minimize code bloat.
  vkapi::DescriptorSet descriptor_set = get_descriptor_set(
      shader, local_work_group_size, specialization_constants);

  // Bind each argument at the descriptor binding index matching its position
  // in the argument list.
  detail::bind(
      descriptor_set,
      std::index_sequence_for<Arguments...>{},
      std::forward<Arguments>(arguments)...);

  // Factor out template parameter independent code to minimize code bloat.
  register_shader_dispatch(
      descriptor_set, pipeline_barrier, shader, global_work_group);

  report_shader_dispatch_end();

  submit_count_++;
  // Submit when the host needs to sync (a fence was passed) or when enough
  // dispatches have accumulated in the current command buffer.
  if (fence_handle != VK_NULL_HANDLE ||
      submit_count_ >= config_.cmd_submit_frequency) {
    submit_cmd_to_gpu(fence_handle);
    return true;
  }
  return false;
}
} // namespace api
} // namespace vkcompute