backends/vulkan/test/utils/test_utils.cpp - platform/external/executorch - Git at Google

 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */

 #include <executorch/backends/vulkan/test/utils/test_utils.h>

 #include <executorch/runtime/core/exec_aten/exec_aten.h>

 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

 #include <cassert>
 #include <random>

 using namespace vkcompute;

 //
 // Operator Recording Functions
 //

 void record_nchw_to_buffer_op(
     api::Context* const context,
     vkapi::VulkanBuffer& src_buffer,
     api::vTensor& v_dst) {
   vkapi::PipelineBarrier pipeline_barrier{};

   context->submit_compute_job(
       get_nchw_to_tensor_shader(v_dst),
       pipeline_barrier,
       {uint32_t(v_dst.numel()), 1, 1},
       {64, 1, 1},
       {},
       VK_NULL_HANDLE,
       0,
       v_dst.buffer(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       src_buffer,
       v_dst.sizes_ubo(),
       v_dst.strides_ubo(),
       v_dst.numel_ubo());
 }

 void record_buffer_to_nchw_op(
     api::Context* const context,
     api::vTensor& v_src,
     vkapi::VulkanBuffer& dst_buffer) {
   vkapi::PipelineBarrier pipeline_barrier{};
   context->submit_compute_job(
       get_tensor_to_nchw_shader(v_src),
       pipeline_barrier,
       {uint32_t(v_src.numel()), 1, 1},
       {64, 1, 1},
       {},
       VK_NULL_HANDLE,
       0,
       dst_buffer,
       v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_src.sizes_ubo(),
       v_src.strides_ubo(),
       v_src.numel_ubo());
 }

 void record_nchw_to_image_op(
     api::Context* const context,
     vkapi::VulkanBuffer& src_buffer,
     api::vTensor& v_dst) {
   vkapi::PipelineBarrier pipeline_barrier{};
   vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};

   context->submit_compute_job(
       get_nchw_to_tensor_shader(
           v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
       pipeline_barrier,
       v_dst.logical_limits(),
       adaptive_work_group_size(v_dst.logical_limits()),
       specialization_constants,
       VK_NULL_HANDLE,
       0,
       v_dst.image(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       src_buffer,
       v_dst.sizes_ubo());
 }

 void record_image_to_nchw_op(
     api::Context* const context,
     api::vTensor& v_src,
     vkapi::VulkanBuffer& dst_buffer) {
   vkapi::PipelineBarrier pipeline_barrier{};
   vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()};

   context->submit_compute_job(
       get_tensor_to_nchw_shader(v_src),
       pipeline_barrier,
       v_src.logical_limits(),
       adaptive_work_group_size(v_src.logical_limits()),
       specialization_constants,
       VK_NULL_HANDLE,
       0,
       dst_buffer,
       v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_src.sizes_ubo());
 }

 void record_bitw8_image_to_nchw_nobitw8buffer_op(
     api::Context* const context,
     api::vTensor& v_src,
     api::StagingBuffer& dst_buffer) {
   vkapi::PipelineBarrier pipeline_barrier{};
   uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
   utils::uvec3 global_wg_size = {buffer_len, 1, 1};

   std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
   add_storage_type_suffix(kernel_name, v_src);
   add_dtype_suffix(kernel_name, v_src);

   context->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),
       pipeline_barrier,
       global_wg_size,
       adaptive_work_group_size(global_wg_size),
       {v_src.hashed_layout()},
       VK_NULL_HANDLE,
       0,
       dst_buffer.buffer(),
       v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_src.sizes_ubo(),
       v_src.numel_ubo());
 }

 void record_conv2d_prepack_weights_op(
     api::Context* const context,
     vkapi::VulkanBuffer& src_buffer,
     api::vTensor& v_dst,
     const std::vector<int64_t>& original_sizes,
     const bool transposed) {
   vkapi::PipelineBarrier pipeline_barrier{};

   std::string kernel_name;
   if (transposed) {
     kernel_name = "conv_transpose2d";
   } else {
     kernel_name = "conv2d";
   }
   kernel_name += "_prepack_weights";
   add_dtype_suffix(kernel_name, v_dst);
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

   api::ParamsBuffer original_sizes_ubo(
       context, utils::make_ivec4(original_sizes, /*reverse = */ true));

   vkapi::SpecVarList specialization_constants = {};
   context->submit_compute_job(
       shader,
       pipeline_barrier,
       v_dst.logical_limits(),
       adaptive_work_group_size(v_dst.logical_limits()),
       specialization_constants,
       VK_NULL_HANDLE,
       0,
       v_dst.image(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       src_buffer,
       v_dst.sizes_ubo(),
       original_sizes_ubo.buffer());
 }

 void record_binary_op(
     api::Context* const context,
     const std::string& op_name,
     api::vTensor& v_in1,
     api::vTensor& v_in2,
     api::vTensor& v_dst) {
   std::string kernel_name = "binary_" + op_name + "_nobroadcast__test";
   add_dtype_suffix(kernel_name, v_dst);

   vkapi::PipelineBarrier pipeline_barrier{};
   vkapi::SpecVarList specialization_constants = {};
   context->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),
       pipeline_barrier,
       v_dst.logical_limits(),
       adaptive_work_group_size(v_dst.logical_limits()),
       specialization_constants,
       VK_NULL_HANDLE,
       0,
       v_dst.image(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       v_dst.sizes_ubo());
 }

 void execute_and_check_add(
     api::vTensor& a,
     api::vTensor& b,
     api::vTensor& c,
     float a_val,
     float b_val) {
   // Fill input tensors
   fill_vtensor(a, a_val);
   fill_vtensor(b, b_val);

   // a + b = c
   record_binary_op(api::context(), "add", a, b, c);

   // Extract output tensor
   std::vector<float> data_out = extract_vtensor(c);

   // Check output
   for (size_t i = 0; i < data_out.size(); ++i) {
     CHECK_VALUE(data_out, i, (a_val + b_val));
   }
 }

 void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) {
   std::string kernel_name("idx_fill_buffer");
   switch (v_ten.dtype()) {
     case vkapi::kFloat:
       kernel_name += "_float";
       break;
     case vkapi::kHalf:
       kernel_name += "_half";
       break;
     case vkapi::kQInt8:
       kernel_name += "_int8";
       break;
     case vkapi::kQUInt8:
       kernel_name += "_uint8";
       break;
     default:
       throw std::runtime_error("Unsupported dtype");
       break;
   }

   api::ParamsBuffer params(api::context(), int32_t(v_ten.numel()));

   {
     vkapi::PipelineBarrier pipeline_barrier{};
     vkapi::SpecVarList specialization_constants = {};
     api::context()->submit_compute_job(
         VK_KERNEL_FROM_STR(kernel_name),
         pipeline_barrier,
         {uint32_t(v_ten.numel()), 1, 1},
         {64, 1, 1},
         specialization_constants,
         VK_NULL_HANDLE,
         0,
         v_ten.buffer(
             pipeline_barrier,
             vkapi::PipelineStage::COMPUTE,
             vkapi::MemoryAccessType::READ),
         params.buffer());
   }
 }

 void record_scalar_add_buffer(
     api::Context* context,
     api::vTensor& v_ten,
     float offset) {
   vkapi::PipelineBarrier pipeline_barrier{};
   vkapi::SpecVarList specialization_constants = {SV(offset)};
   std::string kernel = "scalar_add_buffer";
   add_dtype_suffix(kernel, v_ten);
   api::context()->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel),
       pipeline_barrier,
       {uint32_t(v_ten.numel()), 1, 1},
       {64, 1, 1},
       specialization_constants,
       VK_NULL_HANDLE,
       0,
       v_ten.buffer(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE),
       v_ten.numel_ubo());
 }

 void record_reference_matmul(
     api::Context* context,
     api::vTensor& out,
     api::vTensor& mat1,
     api::vTensor& mat2) {
   vkapi::PipelineBarrier pipeline_barrier{};
   api::context()->submit_compute_job(
       VK_KERNEL(reference_matmul),
       pipeline_barrier,
       {uint32_t(out.size(1)), uint32_t(out.size(0)), 1},
       {64, 1, 1},
       {},
       VK_NULL_HANDLE,
       0,
       out.buffer(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       out.sizes_ubo(),
       out.strides_ubo(),
       mat1.sizes_ubo(),
       mat1.strides_ubo(),
       mat2.sizes_ubo(),
       mat2.strides_ubo());
 }

 void record_matmul_texture3d(
     api::Context* context,
     api::vTensor& out,
     api::vTensor& mat1,
     api::vTensor& mat2) {
   std::string kernel_name = "matmul_naive";
   kernel_name.reserve(kShaderNameReserve);
   add_storage_type_suffix(kernel_name, out.storage_type());
   add_dtype_suffix(kernel_name, out.dtype());

   utils::uvec3 global_wg_size = out.logical_limits();

   vkapi::PipelineBarrier pipeline_barrier{};
   api::context()->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),
       pipeline_barrier,
       global_wg_size,
       {8, 8, 1},
       {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
       VK_NULL_HANDLE,
       0,
       out.image(
           pipeline_barrier,
           vkapi::PipelineStage::COMPUTE,
           vkapi::MemoryAccessType::WRITE),
       mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
       out.sizes_ubo(),
       out.logical_limits_ubo(),
       mat1.sizes_ubo(),
       mat2.sizes_ubo());
 }

 //
 // Input & Output Utilities
 //

 #define FORALL_SUPPORTED_TYPES(_) \
   _(uint8_t, Byte)                \
   _(int8_t, Char)                 \
   _(int32_t, Int)                 \
   _(executorch::aten::Half, Half) \
   _(float, Float)                 \
   _(int8_t, QInt8)

 void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
   api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());

 #define CASE(ctype, name)                                     \
   case vkapi::ScalarType::name: {                             \
     std::vector<ctype> data_converted;                        \
     data_converted.resize(data.size());                       \
     for (int i = 0; i < data.size(); ++i) {                   \
       data_converted[i] = ctype(data[i]);                     \
     }                                                         \
     staging_buffer.copy_from(                                 \
         data_converted.data(), vten.staging_buffer_nbytes()); \
   } break;

   switch (vten.dtype()) {
     FORALL_SUPPORTED_TYPES(CASE)
     default:
       VK_THROW("Unsupported dtype");
   }

 #undef CASE

   if (vten.storage_type() == utils::StorageType::BUFFER) {
     record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
   } else {
     record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
   }
 }

 void fill_vtensor(api::vTensor& vten, float val, bool iota) {
   std::vector<float> vten_data(vten.staging_buffer_numel());
   if (iota) {
     std::iota(vten_data.begin(), vten_data.end(), val);
   } else {
     std::fill(vten_data.begin(), vten_data.end(), val);
   }

   fill_vtensor(vten, vten_data);
 }

 std::vector<float> create_random_float_buffer(
     const size_t numel,
     const float min,
     const float max) {
   std::vector<float> data(numel);
   std::default_random_engine rng;
   std::uniform_real_distribution<float> dist(min, max);

   for (size_t i = 0; i < data.size(); ++i) {
     data[i] = dist(rng);
   }
   return data;
 }

 std::vector<uint8_t> create_random_uint8_buffer(
     const size_t numel,
     const uint8_t min,
     const uint8_t max) {
   std::vector<uint8_t> data(numel);
   std::default_random_engine rng;
   std::uniform_real_distribution<float> dist(min, max);

   for (size_t i = 0; i < data.size(); ++i) {
     data[i] = (uint8_t)dist(rng);
   }
   return data;
 }

 void fill_vtensor(
     ComputeGraph& graph,
     const IOValueRef idx,
     float val,
     bool iota) {
   vTensorPtr t = graph.get_tensor(idx.value);
   std::vector<float> data(t->numel());
   if (t->storage_type() != utils::kBuffer) {
     data.resize(t->staging_buffer_numel());
   }
   if (iota) {
     std::iota(data.begin(), data.end(), val);
   } else {
     std::fill(data.begin(), data.end(), val);
   }

   graph.copy_into_staging(idx.staging, data.data(), data.size());
 }

 void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
   api::StagingBuffer staging_buffer(
       api::context(), vten.dtype(), vten.staging_buffer_numel());

   if (vten.storage_type() == utils::StorageType::BUFFER) {
     record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   } else {
     record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
   }

   vkapi::VulkanFence fence = api::context()->fences().get_fence();
   api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();

 #define CASE(ctype, name)                                     \
   case vkapi::ScalarType::name: {                             \
     std::vector<ctype> data_converted(data.size());           \
     staging_buffer.copy_to(                                   \
         data_converted.data(), vten.staging_buffer_nbytes()); \
     for (int i = 0; i < data.size(); ++i) {                   \
       data[i] = float(data_converted[i]);                     \
     }                                                         \
   } break;

   switch (vten.dtype()) {
     FORALL_SUPPORTED_TYPES(CASE)
     default:
       VK_THROW("Unsupported dtype");
   }

 #undef CASE
 }

 //
 // Context Management
 //

 void submit_to_gpu() {
   vkapi::VulkanFence fence = api::context()->fences().get_fence();
   api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();
 }

 vkapi::Allocation allocate_memory_for(const api::vTensor& vten) {
   VmaAllocationCreateInfo alloc_create_info =
       api::context()->adapter_ptr()->vma().gpuonly_resource_create_info();
   return api::context()->adapter_ptr()->vma().create_allocation(
       vten.get_memory_requirements(), alloc_create_info);
 }

 VmaTotalStatistics get_vma_stats() {
   return api::context()->adapter_ptr()->vma().get_memory_statistics();
 }

 size_t get_vma_allocation_count() {
   return get_vma_stats().total.statistics.allocationCount;
 }

 //
 // Graph Test Utilities
 //

 void execute_graph_and_check_output(
     ComputeGraph& graph,
     std::vector<float> input_vals,
     std::vector<float> expected_outputs) {
   assert(input_vals.size() == graph.inputs().size());
   assert(expected_outputs.size() == graph.outputs().size());

   for (size_t i = 0; i < graph.inputs().size(); ++i) {
     fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i));
   }

   graph.execute();

   for (size_t i = 0; i < graph.outputs().size(); ++i) {
     IOValueRef out_ioval = graph.outputs().at(i);
     vTensorPtr t_out = graph.get_tensor(out_ioval.value);

     std::vector<float> output_data(t_out->staging_buffer_numel());
     graph.copy_from_staging(
         out_ioval.staging, output_data.data(), output_data.size());

     for (size_t j = 0; j < t_out->numel(); ++j) {
       CHECK_VALUE(output_data, j, expected_outputs.at(i));
     }
   }
 }

 bool check_close(float a, float b, float atol, float rtol) {
   float max = std::max(std::abs(a), std::abs(b));
   float diff = std::abs(a - b);
   return diff <= (atol + rtol * max);
 }
	/*
	* Copyright (c) Meta Platforms, Inc. and affiliates.
	* All rights reserved.
	*
	* This source code is licensed under the BSD-style license found in the
	* LICENSE file in the root directory of this source tree.
	*/

	#include <executorch/backends/vulkan/test/utils/test_utils.h>

	#include <executorch/runtime/core/exec_aten/exec_aten.h>

	#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

	#include <cassert>
	#include <random>

	using namespace vkcompute;

	//
	// Operator Recording Functions
	//

	void record_nchw_to_buffer_op(
	api::Context* const context,
	vkapi::VulkanBuffer& src_buffer,
	api::vTensor& v_dst) {
	vkapi::PipelineBarrier pipeline_barrier{};

	context->submit_compute_job(
	get_nchw_to_tensor_shader(v_dst),
	pipeline_barrier,
	{uint32_t(v_dst.numel()), 1, 1},
	{64, 1, 1},
	{},
	VK_NULL_HANDLE,
	0,
	v_dst.buffer(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::WRITE),
	src_buffer,
	v_dst.sizes_ubo(),
	v_dst.strides_ubo(),
	v_dst.numel_ubo());
	}

	void record_buffer_to_nchw_op(
	api::Context* const context,
	api::vTensor& v_src,
	vkapi::VulkanBuffer& dst_buffer) {
	vkapi::PipelineBarrier pipeline_barrier{};
	context->submit_compute_job(
	get_tensor_to_nchw_shader(v_src),
	pipeline_barrier,
	{uint32_t(v_src.numel()), 1, 1},
	{64, 1, 1},
	{},
	VK_NULL_HANDLE,
	0,
	dst_buffer,
	v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	v_src.sizes_ubo(),
	v_src.strides_ubo(),
	v_src.numel_ubo());
	}

	void record_nchw_to_image_op(
	api::Context* const context,
	vkapi::VulkanBuffer& src_buffer,
	api::vTensor& v_dst) {
	vkapi::PipelineBarrier pipeline_barrier{};
	vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};

	context->submit_compute_job(
	get_nchw_to_tensor_shader(
	v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
	pipeline_barrier,
	v_dst.logical_limits(),
	adaptive_work_group_size(v_dst.logical_limits()),
	specialization_constants,
	VK_NULL_HANDLE,
	0,
	v_dst.image(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::WRITE),
	src_buffer,
	v_dst.sizes_ubo());
	}

	void record_image_to_nchw_op(
	api::Context* const context,
	api::vTensor& v_src,
	vkapi::VulkanBuffer& dst_buffer) {
	vkapi::PipelineBarrier pipeline_barrier{};
	vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()};

	context->submit_compute_job(
	get_tensor_to_nchw_shader(v_src),
	pipeline_barrier,
	v_src.logical_limits(),
	adaptive_work_group_size(v_src.logical_limits()),
	specialization_constants,
	VK_NULL_HANDLE,
	0,
	dst_buffer,
	v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	v_src.sizes_ubo());
	}

	void record_bitw8_image_to_nchw_nobitw8buffer_op(
	api::Context* const context,
	api::vTensor& v_src,
	api::StagingBuffer& dst_buffer) {
	vkapi::PipelineBarrier pipeline_barrier{};
	uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
	utils::uvec3 global_wg_size = {buffer_len, 1, 1};

	std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
	add_storage_type_suffix(kernel_name, v_src);
	add_dtype_suffix(kernel_name, v_src);

	context->submit_compute_job(
	VK_KERNEL_FROM_STR(kernel_name),
	pipeline_barrier,
	global_wg_size,
	adaptive_work_group_size(global_wg_size),
	{v_src.hashed_layout()},
	VK_NULL_HANDLE,
	0,
	dst_buffer.buffer(),
	v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	v_src.sizes_ubo(),
	v_src.numel_ubo());
	}

	void record_conv2d_prepack_weights_op(
	api::Context* const context,
	vkapi::VulkanBuffer& src_buffer,
	api::vTensor& v_dst,
	const std::vector<int64_t>& original_sizes,
	const bool transposed) {
	vkapi::PipelineBarrier pipeline_barrier{};

	std::string kernel_name;
	if (transposed) {
	kernel_name = "conv_transpose2d";
	} else {
	kernel_name = "conv2d";
	}
	kernel_name += "_prepack_weights";
	add_dtype_suffix(kernel_name, v_dst);
	vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);

	api::ParamsBuffer original_sizes_ubo(
	context, utils::make_ivec4(original_sizes, /reverse = / true));

	vkapi::SpecVarList specialization_constants = {};
	context->submit_compute_job(
	shader,
	pipeline_barrier,
	v_dst.logical_limits(),
	adaptive_work_group_size(v_dst.logical_limits()),
	specialization_constants,
	VK_NULL_HANDLE,
	0,
	v_dst.image(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::WRITE),
	src_buffer,
	v_dst.sizes_ubo(),
	original_sizes_ubo.buffer());
	}

	void record_binary_op(
	api::Context* const context,
	const std::string& op_name,
	api::vTensor& v_in1,
	api::vTensor& v_in2,
	api::vTensor& v_dst) {
	std::string kernel_name = "binary_" + op_name + "_nobroadcast__test";
	add_dtype_suffix(kernel_name, v_dst);

	vkapi::PipelineBarrier pipeline_barrier{};
	vkapi::SpecVarList specialization_constants = {};
	context->submit_compute_job(
	VK_KERNEL_FROM_STR(kernel_name),
	pipeline_barrier,
	v_dst.logical_limits(),
	adaptive_work_group_size(v_dst.logical_limits()),
	specialization_constants,
	VK_NULL_HANDLE,
	0,
	v_dst.image(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::WRITE),
	v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	v_dst.sizes_ubo());
	}

	void execute_and_check_add(
	api::vTensor& a,
	api::vTensor& b,
	api::vTensor& c,
	float a_val,
	float b_val) {
	// Fill input tensors
	fill_vtensor(a, a_val);
	fill_vtensor(b, b_val);

	// a + b = c
	record_binary_op(api::context(), "add", a, b, c);

	// Extract output tensor
	std::vector<float> data_out = extract_vtensor(c);

	// Check output
	for (size_t i = 0; i < data_out.size(); ++i) {
	CHECK_VALUE(data_out, i, (a_val + b_val));
	}
	}

	void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) {
	std::string kernel_name("idx_fill_buffer");
	switch (v_ten.dtype()) {
	case vkapi::kFloat:
	kernel_name += "_float";
	break;
	case vkapi::kHalf:
	kernel_name += "_half";
	break;
	case vkapi::kQInt8:
	kernel_name += "_int8";
	break;
	case vkapi::kQUInt8:
	kernel_name += "_uint8";
	break;
	default:
	throw std::runtime_error("Unsupported dtype");
	break;
	}

	api::ParamsBuffer params(api::context(), int32_t(v_ten.numel()));

	{
	vkapi::PipelineBarrier pipeline_barrier{};
	vkapi::SpecVarList specialization_constants = {};
	api::context()->submit_compute_job(
	VK_KERNEL_FROM_STR(kernel_name),
	pipeline_barrier,
	{uint32_t(v_ten.numel()), 1, 1},
	{64, 1, 1},
	specialization_constants,
	VK_NULL_HANDLE,
	0,
	v_ten.buffer(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::READ),
	params.buffer());
	}
	}

	void record_scalar_add_buffer(
	api::Context* context,
	api::vTensor& v_ten,
	float offset) {
	vkapi::PipelineBarrier pipeline_barrier{};
	vkapi::SpecVarList specialization_constants = {SV(offset)};
	std::string kernel = "scalar_add_buffer";
	add_dtype_suffix(kernel, v_ten);
	api::context()->submit_compute_job(
	VK_KERNEL_FROM_STR(kernel),
	pipeline_barrier,
	{uint32_t(v_ten.numel()), 1, 1},
	{64, 1, 1},
	specialization_constants,
	VK_NULL_HANDLE,
	0,
	v_ten.buffer(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::READ \| vkapi::MemoryAccessType::WRITE),
	v_ten.numel_ubo());
	}

	void record_reference_matmul(
	api::Context* context,
	api::vTensor& out,
	api::vTensor& mat1,
	api::vTensor& mat2) {
	vkapi::PipelineBarrier pipeline_barrier{};
	api::context()->submit_compute_job(
	VK_KERNEL(reference_matmul),
	pipeline_barrier,
	{uint32_t(out.size(1)), uint32_t(out.size(0)), 1},
	{64, 1, 1},
	{},
	VK_NULL_HANDLE,
	0,
	out.buffer(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::WRITE),
	mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	out.sizes_ubo(),
	out.strides_ubo(),
	mat1.sizes_ubo(),
	mat1.strides_ubo(),
	mat2.sizes_ubo(),
	mat2.strides_ubo());
	}

	void record_matmul_texture3d(
	api::Context* context,
	api::vTensor& out,
	api::vTensor& mat1,
	api::vTensor& mat2) {
	std::string kernel_name = "matmul_naive";
	kernel_name.reserve(kShaderNameReserve);
	add_storage_type_suffix(kernel_name, out.storage_type());
	add_dtype_suffix(kernel_name, out.dtype());

	utils::uvec3 global_wg_size = out.logical_limits();

	vkapi::PipelineBarrier pipeline_barrier{};
	api::context()->submit_compute_job(
	VK_KERNEL_FROM_STR(kernel_name),
	pipeline_barrier,
	global_wg_size,
	{8, 8, 1},
	{out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
	VK_NULL_HANDLE,
	0,
	out.image(
	pipeline_barrier,
	vkapi::PipelineStage::COMPUTE,
	vkapi::MemoryAccessType::WRITE),
	mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
	out.sizes_ubo(),
	out.logical_limits_ubo(),
	mat1.sizes_ubo(),
	mat2.sizes_ubo());
	}

	//
	// Input & Output Utilities
	//

	#define FORALL_SUPPORTED_TYPES(_) \
	_(uint8_t, Byte) \
	_(int8_t, Char) \
	_(int32_t, Int) \
	_(executorch::aten::Half, Half) \
	_(float, Float) \
	_(int8_t, QInt8)

	void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
	api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());

	#define CASE(ctype, name) \
	case vkapi::ScalarType::name: { \
	std::vector<ctype> data_converted; \
	data_converted.resize(data.size()); \
	for (int i = 0; i < data.size(); ++i) { \
	data_converted[i] = ctype(data[i]); \
	} \
	staging_buffer.copy_from( \
	data_converted.data(), vten.staging_buffer_nbytes()); \
	} break;

	switch (vten.dtype()) {
	FORALL_SUPPORTED_TYPES(CASE)
	default:
	VK_THROW("Unsupported dtype");
	}

	#undef CASE

	if (vten.storage_type() == utils::StorageType::BUFFER) {
	record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
	} else {
	record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
	}
	}

	void fill_vtensor(api::vTensor& vten, float val, bool iota) {
	std::vector<float> vten_data(vten.staging_buffer_numel());
	if (iota) {
	std::iota(vten_data.begin(), vten_data.end(), val);
	} else {
	std::fill(vten_data.begin(), vten_data.end(), val);
	}

	fill_vtensor(vten, vten_data);
	}

	std::vector<float> create_random_float_buffer(
	const size_t numel,
	const float min,
	const float max) {
	std::vector<float> data(numel);
	std::default_random_engine rng;
	std::uniform_real_distribution<float> dist(min, max);

	for (size_t i = 0; i < data.size(); ++i) {
	data[i] = dist(rng);
	}
	return data;
	}

	std::vector<uint8_t> create_random_uint8_buffer(
	const size_t numel,
	const uint8_t min,
	const uint8_t max) {
	std::vector<uint8_t> data(numel);
	std::default_random_engine rng;
	std::uniform_real_distribution<float> dist(min, max);

	for (size_t i = 0; i < data.size(); ++i) {
	data[i] = (uint8_t)dist(rng);
	}
	return data;
	}

	void fill_vtensor(
	ComputeGraph& graph,
	const IOValueRef idx,
	float val,
	bool iota) {
	vTensorPtr t = graph.get_tensor(idx.value);
	std::vector<float> data(t->numel());
	if (t->storage_type() != utils::kBuffer) {
	data.resize(t->staging_buffer_numel());
	}
	if (iota) {
	std::iota(data.begin(), data.end(), val);
	} else {
	std::fill(data.begin(), data.end(), val);
	}

	graph.copy_into_staging(idx.staging, data.data(), data.size());
	}

	void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
	api::StagingBuffer staging_buffer(
	api::context(), vten.dtype(), vten.staging_buffer_numel());

	if (vten.storage_type() == utils::StorageType::BUFFER) {
	record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
	} else {
	record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
	}

	vkapi::VulkanFence fence = api::context()->fences().get_fence();
	api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
	fence.wait();

	#define CASE(ctype, name) \
	case vkapi::ScalarType::name: { \
	std::vector<ctype> data_converted(data.size()); \
	staging_buffer.copy_to( \
	data_converted.data(), vten.staging_buffer_nbytes()); \
	for (int i = 0; i < data.size(); ++i) { \
	data[i] = float(data_converted[i]); \
	} \
	} break;

	switch (vten.dtype()) {
	FORALL_SUPPORTED_TYPES(CASE)
	default:
	VK_THROW("Unsupported dtype");
	}

	#undef CASE
	}

	//
	// Context Management
	//

	void submit_to_gpu() {
	vkapi::VulkanFence fence = api::context()->fences().get_fence();
	api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
	fence.wait();
	}

	vkapi::Allocation allocate_memory_for(const api::vTensor& vten) {
	VmaAllocationCreateInfo alloc_create_info =
	api::context()->adapter_ptr()->vma().gpuonly_resource_create_info();
	return api::context()->adapter_ptr()->vma().create_allocation(
	vten.get_memory_requirements(), alloc_create_info);
	}

	VmaTotalStatistics get_vma_stats() {
	return api::context()->adapter_ptr()->vma().get_memory_statistics();
	}

	size_t get_vma_allocation_count() {
	return get_vma_stats().total.statistics.allocationCount;
	}

	//
	// Graph Test Utilities
	//

	void execute_graph_and_check_output(
	ComputeGraph& graph,
	std::vector<float> input_vals,
	std::vector<float> expected_outputs) {
	assert(input_vals.size() == graph.inputs().size());
	assert(expected_outputs.size() == graph.outputs().size());

	for (size_t i = 0; i < graph.inputs().size(); ++i) {
	fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i));
	}

	graph.execute();

	for (size_t i = 0; i < graph.outputs().size(); ++i) {
	IOValueRef out_ioval = graph.outputs().at(i);
	vTensorPtr t_out = graph.get_tensor(out_ioval.value);

	std::vector<float> output_data(t_out->staging_buffer_numel());
	graph.copy_from_staging(
	out_ioval.staging, output_data.data(), output_data.size());

	for (size_t j = 0; j < t_out->numel(); ++j) {
	CHECK_VALUE(output_data, j, expected_outputs.at(i));
	}
	}
	}

	bool check_close(float a, float b, float atol, float rtol) {
	float max = std::max(std::abs(a), std::abs(b));
	float diff = std::abs(a - b);
	return diff <= (atol + rtol * max);
	}