/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
#include <optional>
#include <stack>
#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>
#include <executorch/backends/vulkan/runtime/graph/containers/SharedObject.h>
#include <executorch/backends/vulkan/runtime/graph/containers/Value.h>
#include <executorch/backends/vulkan/runtime/graph/ops/ExecuteNode.h>
#include <executorch/backends/vulkan/runtime/graph/ops/PrepackNode.h>
namespace vkcompute {
// Define valid scalar types that the Value class can accept
template <typename T>
struct is_valid_scalar_type : std::false_type {};
template <>
struct is_valid_scalar_type<int64_t> : std::true_type {};
template <>
struct is_valid_scalar_type<double> : std::true_type {};
template <>
struct is_valid_scalar_type<bool> : std::true_type {};
//
// Guarded Pointer Classes
//
class ComputeGraph;
#define DECL_VALUE_PTR_CLASS(classname, ctype) \
class classname final { \
ComputeGraph* const graph_; \
ctype* ptr_; \
\
public: \
explicit classname(ComputeGraph* const graph, const ValueRef idx); \
ctype* operator->() const; \
ctype& operator*() const; \
~classname(); \
};
DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor)
DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef)
DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer)
DECL_VALUE_PTR_CLASS(IntListPtr, std::vector<int64_t>)
DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector<double>)
DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector<bool>)
DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector<ValueRef>)
DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt);
#undef DECL_VALUE_PTR_CLASS
//
// TmpTensor
//
/*
* This struct is used to recycle the memory of temporary tensors that are
* created during the execution of a node. Upon construction, this struct will
* check the `tmp_shared_object_idxs_` stack of the provided `ComputeGraph`
* instance to see if any shared object indices are available; if not, a new
* one is created. A tensor value is then added to the `ComputeGraph` instance
* with the requested specifications. Upon destruction, the shared object index
* of the temporary tensor is returned to `tmp_shared_object_idxs_`.
*
* Note that instances of this struct can be used as if they were `ValueRef`
* due to the implementation of a custom casting operator.
*
* This struct should only be used to create tensors whose lifetimes exist only
* within a well-defined scope (i.e. within a function).
*/
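/*
* A minimal usage sketch (hypothetical sizes and dtype; `graph` is assumed to
* be a `ComputeGraph` accessible from the enclosing scope):
*
*   {
*     TmpTensor tmp(&graph, {4, 64, 64}, vkapi::kFloat);
*     // `tmp` converts implicitly to ValueRef, so it can be passed directly
*     // to any function that expects a ValueRef while it is alive.
*   }
*   // At the closing brace, the shared object index used by `tmp` is returned
*   // to `tmp_shared_object_idxs_` and may be reused by later TmpTensors.
*/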
struct TmpTensor {
ComputeGraph* graph_p;
int64_t sobj_idx;
ValueRef vref;
//
// Match all available overloads of `add_tensor`
//
TmpTensor(
ComputeGraph* const graph_ptr,
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout);
TmpTensor(
ComputeGraph* const graph_ptr,
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::StorageType storage_type);
TmpTensor(
ComputeGraph* const graph_ptr,
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::GPUMemoryLayout memory_layout);
TmpTensor(
ComputeGraph* const graph_ptr,
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype);
// No copy construction or assignment
TmpTensor(TmpTensor& other) = delete;
TmpTensor& operator=(TmpTensor& other) = delete;
// No move construction or assignment
TmpTensor(TmpTensor&& other) = delete;
TmpTensor& operator=(TmpTensor&& other) = delete;
// Custom cast to ValueRef
operator ValueRef() const {
return vref;
};
~TmpTensor();
private:
// Helper function to get the first available shared object index, or to
// request that a new one be created.
int64_t get_sobj_idx();
};
//
// ComputeGraph
//
/*
* This is the core data structure used to execute Vulkan models in graph mode.
* As opposed to ATen/eager mode, where a command buffer is encoded for every
* inference (since ops are executed as the model runs), in graph mode the ops
* that compose the model are intended to be parsed only once, at which point a
* command buffer will be encoded. Model inference will then execute the cached
* command buffer without needing to encode a new one.
*/
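/*
* A hypothetical end-to-end sketch of the intended usage (in practice the
* graph is built by the Vulkan delegate runtime from a serialized model, op
* nodes are added through the operator registry, and the data pointers and
* element counts below are placeholders):
*
*   GraphConfig config;
*   ComputeGraph graph(config);
*
*   // Build the graph: declare I/O tensors (op nodes not shown).
*   IOValueRef in = graph.add_input_tensor({1, 3, 64, 64}, vkapi::kFloat);
*   ValueRef out = graph.add_tensor({1, 3, 64, 64}, vkapi::kFloat);
*   ValueRef out_staging = graph.set_output_tensor(out);
*
*   // One-time preparation: allocate resources and encode command buffers.
*   graph.prepare();
*   graph.encode_prepack();
*   graph.prepack();
*   graph.encode_execute();
*
*   // Per-inference: transfer inputs, replay the cached command buffer, then
*   // transfer outputs back to the CPU.
*   graph.copy_into_staging(in.staging, input_data, input_numel);
*   graph.execute();
*   graph.copy_from_staging(out_staging, output_data, output_numel);
*/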
class ComputeGraph final {
public:
explicit ComputeGraph(GraphConfig config);
ComputeGraph(ComputeGraph&&) = default;
ComputeGraph& operator=(ComputeGraph&&) = default;
~ComputeGraph();
private:
GraphConfig config_;
vkapi::DescriptorPoolConfig prepack_descriptor_counts_;
vkapi::DescriptorPoolConfig execute_descriptor_counts_;
std::unique_ptr<api::Context> context_;
std::vector<SharedObject> shared_objects_;
// This stack is used by `TmpTensor` instances to recycle shared objects
// for temporary tensors. See the comments of `TmpTensor` for more details
std::stack<int64_t> tmp_shared_object_idxs_;
std::vector<Value> values_;
std::vector<api::ParamsBuffer> param_ubos_;
std::vector<std::unique_ptr<PrepackNode>> prepack_nodes_;
std::vector<std::unique_ptr<ExecuteNode>> execute_nodes_;
std::vector<IOValueRef> inputs_;
std::vector<IOValueRef> outputs_;
protected:
size_t values_in_use_ = 0;
public:
//
// Accessors
//
inline api::Context* context() {
return context_.get();
}
inline std::vector<IOValueRef>& inputs() {
return inputs_;
}
inline std::vector<IOValueRef>& outputs() {
return outputs_;
}
inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
return prepack_nodes_;
}
inline std::vector<std::unique_ptr<ExecuteNode>>& execute_nodes() {
return execute_nodes_;
}
inline GraphConfig& graphconfig() {
return config_;
}
//
// Value Extraction
//
#define GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ptr_type, short_name, type_name) \
inline ptr_type get_##short_name(const ValueRef idx) { \
return ptr_type(this, idx); \
} \
inline bool val_is_##short_name(const ValueRef idx) { \
return values_.at(idx).is##type_name(); \
}
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(vTensorPtr, tensor, Tensor)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(TensorRefPtr, tref, TensorRef)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(StagingPtr, staging, Staging)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(IntListPtr, int_list, IntList)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList)
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt);
#undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS
#define GET_AND_CHECK_VAL_AS_TYPE_FNS(ctype, short_name, type_name) \
inline ctype get_##short_name(const ValueRef idx) { \
return values_.at(idx).to##type_name(); \
} \
inline bool val_is_##short_name(const ValueRef idx) { \
return values_.at(idx).is##type_name(); \
}
GET_AND_CHECK_VAL_AS_TYPE_FNS(int64_t, int, Int)
GET_AND_CHECK_VAL_AS_TYPE_FNS(double, double, Double)
GET_AND_CHECK_VAL_AS_TYPE_FNS(bool, bool, Bool)
GET_AND_CHECK_VAL_AS_TYPE_FNS(std::string, string, String)
#undef GET_AND_CHECK_VAL_AS_TYPE_FNS
inline bool val_is_none(const ValueRef idx) {
return values_.at(idx).isNone();
}
inline TypeTag get_val_type(const ValueRef idx) {
return values_.at(idx).type();
}
//
// Tensor Properties Accessors
//
std::vector<int64_t> sizes_of(const ValueRef idx) const;
/*
* Returns the size of the tensor at `idx` along the specified dimension.
* Negative indexing is allowed.
*/
template <typename T>
T size_at(const int64_t dim, const ValueRef idx) const {
const Value& val = values_.at(idx);
if (val.isTensor()) {
return static_cast<T>(utils::val_at(dim, val.toConstTensor().sizes()));
} else if (val.isTensorRef()) {
return static_cast<T>(utils::val_at(dim, val.toConstTensorRef().sizes));
}
VK_THROW("Could not get sizes of value with type ", val.type());
}
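// For example (hypothetical values): for a tensor at `idx` with sizes
// {2, 3, 4}, size_at<int32_t>(-1, idx) returns 4 and size_at<int32_t>(0, idx)
// returns 2.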
vkapi::ScalarType dtype_of(const ValueRef idx) const;
inline utils::uvec3 image_extents_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().image_extents();
}
inline int32_t numel_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().numel();
}
inline utils::StorageType storage_type_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().storage_type();
}
inline bool is_buffer_storage(const ValueRef idx) const {
return values_.at(idx).toConstTensor().has_buffer_storage();
}
inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base)
const {
return values_.at(maybe_view)
.toConstTensor()
.is_view_of(values_.at(base).toConstTensor());
}
inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().gpu_memory_layout();
}
inline int32_t packed_dim_whcn_idx_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().packed_dim_whcn_idx();
}
inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().sizes_ubo();
}
inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().strides_ubo();
}
inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().numel_ubo();
}
inline vkapi::BufferBindInfo axis_mapping_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().axis_mapping_ubo();
}
inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) {
return values_.at(idx).toTensor().texture_limits_ubo();
}
//
// Scalar Value Extraction
//
template <typename T>
T extract_scalar(const ValueRef idx) {
Value& value = values_.at(idx);
if (value.isInt()) {
return static_cast<T>(value.toInt());
}
if (value.isDouble()) {
return static_cast<T>(value.toDouble());
}
if (value.isBool()) {
return static_cast<T>(value.toBool());
}
VK_THROW("Cannot extract scalar from Value with type ", value.type());
}
template <typename T>
std::optional<T> extract_optional_scalar(const ValueRef idx) {
if (val_is_none(idx)) {
return ::std::nullopt;
} else {
return extract_scalar<T>(idx);
}
}
std::string extract_string(const ValueRef idx) {
return values_.at(idx).toString();
}
//
// Utility functions
//
/*
* Returns a suggested storage type (i.e. buffer or texture) that can be used
* to construct `api::vTensor`s. The storage type is typically determined by
* the GPU reported by the Vulkan context, unless a storage type override is
* defined in the graph configuration. Some GPU architectures work better with
* buffer storage, and others with texture storage. Currently, only texture
* storage is supported.
*/
utils::StorageType suggested_storage_type();
/*
* Returns a suggested memory layout (i.e. channels, width, or height packed)
* that can be used to construct `api::vTensor`s. The memory layout impacts
* which dimension will be treated as the vectorized dimension. For texture
* storage, elements along the vectorized dimension are packed into texels.
* The suggested memory layout is determined based on the sizes of the tensor,
* unless a memory layout override is defined in the graph configuration.
*/
utils::GPUMemoryLayout suggested_memory_layout(
const std::vector<int64_t>& sizes);
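// A sketch of typical use when an operator does not constrain the tensor
// format (`sizes` and `dtype` are assumed to be in scope):
//
//   ValueRef t = add_tensor(
//       sizes, dtype, suggested_storage_type(), suggested_memory_layout(sizes));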
//
// Graph Building
//
private:
void check_no_active_value_ptrs();
public:
/*
* Add a `api::vTensor` value to the graph with the specified properties.
* There are various convenience overloads of this function that may be used
* instead.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx = -1);
/*
* Add a `api::vTensor` value to the graph with the specified properties. The
* suggested memory layout will be used to construct the `api::vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::StorageType storage_type,
const int64_t shared_object_idx = -1);
/*
* Add a `api::vTensor` value to the graph with the specified properties. The
* suggested storage type will be used to construct the `api::vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx = -1);
/*
* Add a `api::vTensor` value to the graph with the specified properties. The
* suggested storage type and memory layout will be used to construct the
* `api::vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const int64_t shared_object_idx = -1);
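// Sketch: tensors whose lifetimes do not overlap can share backing memory by
// passing the same shared object index (hypothetical sizes and index):
//
//   ValueRef a = add_tensor({1, 64, 64}, vkapi::kFloat, 0);
//   ValueRef b = add_tensor({1, 32, 32}, vkapi::kFloat, 0);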
/*
* Add a `api::vTensor` value to the graph with the properties of `vref`.
*/
ValueRef add_tensor_like(
const ValueRef vref,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout);
/*
* Add a `api::vTensor` value to the graph with the properties of `vref`. The
* suggested storage type will be used to construct the `api::vTensor`.
*/
ValueRef add_tensor_like(
const ValueRef vref,
const utils::GPUMemoryLayout memory_layout);
/*
* Use the copy constructor of `api::vTensor` to create a "view" of the
* `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for
* more details.
*/
ValueRef add_tensor_view(const ValueRef vref);
/*
* Use the copy constructor of `api::vTensor` to create a "view" of the
* `vTensor` value at `vref` with different sizes and dim order. See the copy
* constructor of `api::vTensor` for more details.
*/
ValueRef add_tensor_view(
const ValueRef vref,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dim_order,
const size_t offset_numel = 0);
/*
* Add a `TensorRef` value to the graph with the specified properties. A
* `TensorRef` is a reference to a `api::vTensor` whose data is stored in an
* external CPU buffer.
*/
ValueRef add_tensorref(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const void* const data);
/*
* Add a staging buffer to the graph. Staging buffers are data buffers that
* use memory that is visible to both the CPU and GPU, and are therefore used
* as an intermediary when transferring data between the CPU and GPU.
*/
ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel);
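// Sketch: stage data coming from the CPU (hypothetical names; the copy
// helpers are declared under "Input/Output" below):
//
//   std::vector<float> host_data(n);
//   ValueRef staging = add_staging(vkapi::kFloat, host_data.size());
//   copy_into_staging(staging, host_data.data(), host_data.size());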
ValueRef add_none();
template <typename T>
typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
add_scalar(T value);
template <typename T>
typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
add_scalar_list(std::vector<T>&& value);
ValueRef add_value_list(std::vector<ValueRef>&& value);
ValueRef add_string(std::string&& str);
ValueRef add_symint(const int32_t val);
ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
template <typename Block>
vkapi::BufferBindInfo create_params_buffer(const Block& data) {
param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data));
return vkapi::BufferBindInfo(param_ubos_.back().buffer());
}
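// Sketch: upload a small parameter block as a UBO for a compute shader. The
// struct below is hypothetical and its layout must match the shader's
// declaration:
//
//   struct WgParams final {
//     int32_t ntexels;
//     int32_t packed_dim;
//   };
//   vkapi::BufferBindInfo ubo =
//       create_params_buffer(WgParams{ntexels, packed_dim});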
/*
* Given a ValueRef, do the following depending on the type of the Value:
* - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object
* backing the SymInt.
* - If it is a regular Int, create a new ParamsBuffer using the integer value
* and return the BufferBindInfo of the created ParamsBuffer.
*/
vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx);
void set_symint(const ValueRef idx, const int32_t val);
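// Sketch: a SymInt lets an integer shader parameter change between inferences
// without re-encoding the command buffer (hypothetical usage):
//
//   ValueRef len = add_symint(0);
//   vkapi::BufferBindInfo len_ubo = get_or_create_int_param_buffer(len);
//   // ... bind len_ubo when adding the dispatch ...
//   set_symint(len, 128); // takes effect the next time execute() is called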
/*
* Convenience function to add an input tensor along with its staging buffer
*/
inline IOValueRef add_input_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const int64_t shared_object_idx = -1) {
ValueRef t = add_tensor(sizes, dtype, shared_object_idx);
ValueRef staging = set_input_tensor(t);
return {t, staging};
}
/*
* Convenience function to add an input tensor with a specific memory layout
* along with its staging buffer
*/
inline IOValueRef add_input_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx = -1) {
ValueRef t = add_tensor(sizes, dtype, memory_layout, shared_object_idx);
ValueRef staging = set_input_tensor(t);
return {t, staging};
}
/*
* Convenience function to add an input tensor with a specific storage type
* along with its staging buffer
*/
inline IOValueRef add_input_tensor(
const std::vector<int64_t>& sizes,
const vkapi::ScalarType dtype,
const utils::StorageType storage_type,
const int64_t shared_object_idx = -1) {
ValueRef t = add_tensor(sizes, dtype, storage_type, shared_object_idx);
ValueRef staging = set_input_tensor(t);
return {t, staging};
}
SharedObject& get_shared_object(const int64_t idx);
//
// Graph Preparation
//
void update_descriptor_counts(
const vkapi::ShaderInfo& shader_info,
bool execute);
void prepare();
//
// Dispatch Utilities
//
/*
* Create a global workgroup size for a given `api::vTensor` value assuming
* that every shader invocation calculates one texel element of the output
* tensor.
*
* For tensors that use texture storage, the image extents of the
* `api::vTensor` will be used to set the global workgroup size.
*
* For tensors that use buffer storage, the number of texels in the texel
* buffer will be used to set the x component of the global workgroup size.
* All other components will be set to 1 (i.e. {ntexels, 1, 1} will be
* returned).
*/
utils::uvec3 create_global_wg_size(const ValueRef idx);
/*
* Suggest a local workgroup size for a given global workgroup size.
*
* The local workgroup size will be formed to try to minimize the number of
* inactive invocations.
*
* Currently, the local workgroup size is hard-coded to contain a total of 64
* shader invocations. In the future, this value can be configured.
*/
utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size);
/*
* Convenience function to suggest a local workgroup size for a given
* `api::vTensor` value, assuming that every shader invocation calculates one
* texel element of the output tensor.
*/
utils::uvec3 create_local_wg_size(const ValueRef idx);
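// Sketch: compute dispatch extents for a shader that writes one texel per
// invocation of an output tensor (`out` is assumed to be a tensor ValueRef):
//
//   utils::uvec3 global_wg_size = create_global_wg_size(out);
//   utils::uvec3 local_wg_size = create_local_wg_size(global_wg_size);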
//
// Input/Output
//
void
copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
//
// Graph Prepacking
//
void encode_prepack();
void prepack() const;
//
// Graph Execution
//
void encode_execute();
void execute() const;
//
// Dynamic Shape support
//
void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
void propagate_resize();
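// Sketch: handle an input whose shape changed between inferences
// (hypothetical sizes; the index refers to the position in inputs()):
//
//   resize_input(0, {1, 3, 128, 128});
//   propagate_resize();
//   // then copy the new input into staging and call execute() as usual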
//
// Miscellaneous Utilities
//
/*
* Check whether the GPU supports 8-bit buffers.
*/
inline bool int8_buffers_enabled() const {
return context_->adapter_ptr()->has_full_int8_buffers_support();
}
//
// Debug support (implemented in Logging.cpp)
//
void print_readable();
//
// Friend classes
//
friend class vTensorPtr;
friend class TensorRefPtr;
friend class StagingPtr;
friend class IntListPtr;
friend class DoubleListPtr;
friend class BoolListPtr;
friend class ValueListPtr;
friend class SymIntPtr;
friend struct TmpTensor;
};
template <typename T>
inline typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
ComputeGraph::add_scalar(T value) {
ValueRef idx(static_cast<int>(values_.size()));
check_no_active_value_ptrs();
values_.emplace_back(value);
return idx;
}
template <typename T>
inline typename std::enable_if<is_valid_scalar_type<T>::value, ValueRef>::type
ComputeGraph::add_scalar_list(std::vector<T>&& value) {
ValueRef idx(static_cast<int>(values_.size()));
check_no_active_value_ptrs();
values_.emplace_back(std::move(value));
return idx;
}
} // namespace vkcompute