blob: 3b6195a5c26a6ce2677bb6a1f23e474cbe1517c9 [file]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <executorch/backends/vulkan/test/utils/test_utils.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <cassert>
#include <random>
using namespace vkcompute;
//
// Operator Recording Functions
//
void record_nchw_to_buffer_op(
api::Context* const context,
vkapi::VulkanBuffer& src_buffer,
api::vTensor& v_dst) {
vkapi::PipelineBarrier pipeline_barrier{};
context->submit_compute_job(
get_nchw_to_tensor_shader(v_dst),
pipeline_barrier,
{uint32_t(v_dst.numel()), 1, 1},
{64, 1, 1},
{},
VK_NULL_HANDLE,
0,
v_dst.buffer(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
src_buffer,
v_dst.sizes_ubo(),
v_dst.strides_ubo(),
v_dst.numel_ubo());
}
void record_buffer_to_nchw_op(
api::Context* const context,
api::vTensor& v_src,
vkapi::VulkanBuffer& dst_buffer) {
vkapi::PipelineBarrier pipeline_barrier{};
context->submit_compute_job(
get_tensor_to_nchw_shader(v_src),
pipeline_barrier,
{uint32_t(v_src.numel()), 1, 1},
{64, 1, 1},
{},
VK_NULL_HANDLE,
0,
dst_buffer,
v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
v_src.sizes_ubo(),
v_src.strides_ubo(),
v_src.numel_ubo());
}
void record_nchw_to_image_op(
api::Context* const context,
vkapi::VulkanBuffer& src_buffer,
api::vTensor& v_dst) {
vkapi::PipelineBarrier pipeline_barrier{};
vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()};
context->submit_compute_job(
get_nchw_to_tensor_shader(
v_dst, context->adapter_ptr()->has_full_int8_buffers_support()),
pipeline_barrier,
v_dst.logical_limits(),
adaptive_work_group_size(v_dst.logical_limits()),
specialization_constants,
VK_NULL_HANDLE,
0,
v_dst.image(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
src_buffer,
v_dst.sizes_ubo());
}
void record_image_to_nchw_op(
api::Context* const context,
api::vTensor& v_src,
vkapi::VulkanBuffer& dst_buffer) {
vkapi::PipelineBarrier pipeline_barrier{};
vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()};
context->submit_compute_job(
get_tensor_to_nchw_shader(v_src),
pipeline_barrier,
v_src.logical_limits(),
adaptive_work_group_size(v_src.logical_limits()),
specialization_constants,
VK_NULL_HANDLE,
0,
dst_buffer,
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
v_src.sizes_ubo());
}
void record_bitw8_image_to_nchw_nobitw8buffer_op(
api::Context* const context,
api::vTensor& v_src,
api::StagingBuffer& dst_buffer) {
vkapi::PipelineBarrier pipeline_barrier{};
uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
utils::uvec3 global_wg_size = {buffer_len, 1, 1};
std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
add_storage_type_suffix(kernel_name, v_src);
add_dtype_suffix(kernel_name, v_src);
context->submit_compute_job(
VK_KERNEL_FROM_STR(kernel_name),
pipeline_barrier,
global_wg_size,
adaptive_work_group_size(global_wg_size),
{v_src.hashed_layout()},
VK_NULL_HANDLE,
0,
dst_buffer.buffer(),
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
v_src.sizes_ubo(),
v_src.numel_ubo());
}
void record_conv2d_prepack_weights_op(
api::Context* const context,
vkapi::VulkanBuffer& src_buffer,
api::vTensor& v_dst,
const std::vector<int64_t>& original_sizes,
const bool transposed) {
vkapi::PipelineBarrier pipeline_barrier{};
std::string kernel_name;
if (transposed) {
kernel_name = "conv_transpose2d";
} else {
kernel_name = "conv2d";
}
kernel_name += "_prepack_weights";
add_dtype_suffix(kernel_name, v_dst);
vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
api::ParamsBuffer original_sizes_ubo(
context, utils::make_ivec4(original_sizes, /*reverse = */ true));
vkapi::SpecVarList specialization_constants = {};
context->submit_compute_job(
shader,
pipeline_barrier,
v_dst.logical_limits(),
adaptive_work_group_size(v_dst.logical_limits()),
specialization_constants,
VK_NULL_HANDLE,
0,
v_dst.image(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
src_buffer,
v_dst.sizes_ubo(),
original_sizes_ubo.buffer());
}
void record_binary_op(
api::Context* const context,
const std::string& op_name,
api::vTensor& v_in1,
api::vTensor& v_in2,
api::vTensor& v_dst) {
std::string kernel_name = "binary_" + op_name + "_nobroadcast__test";
add_dtype_suffix(kernel_name, v_dst);
vkapi::PipelineBarrier pipeline_barrier{};
vkapi::SpecVarList specialization_constants = {};
context->submit_compute_job(
VK_KERNEL_FROM_STR(kernel_name),
pipeline_barrier,
v_dst.logical_limits(),
adaptive_work_group_size(v_dst.logical_limits()),
specialization_constants,
VK_NULL_HANDLE,
0,
v_dst.image(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
v_dst.sizes_ubo());
}
void execute_and_check_add(
api::vTensor& a,
api::vTensor& b,
api::vTensor& c,
float a_val,
float b_val) {
// Fill input tensors
fill_vtensor(a, a_val);
fill_vtensor(b, b_val);
// a + b = c
record_binary_op(api::context(), "add", a, b, c);
// Extract output tensor
std::vector<float> data_out = extract_vtensor(c);
// Check output
for (size_t i = 0; i < data_out.size(); ++i) {
CHECK_VALUE(data_out, i, (a_val + b_val));
}
}
void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) {
std::string kernel_name("idx_fill_buffer");
switch (v_ten.dtype()) {
case vkapi::kFloat:
kernel_name += "_float";
break;
case vkapi::kHalf:
kernel_name += "_half";
break;
case vkapi::kQInt8:
kernel_name += "_int8";
break;
case vkapi::kQUInt8:
kernel_name += "_uint8";
break;
default:
throw std::runtime_error("Unsupported dtype");
break;
}
api::ParamsBuffer params(api::context(), int32_t(v_ten.numel()));
{
vkapi::PipelineBarrier pipeline_barrier{};
vkapi::SpecVarList specialization_constants = {};
api::context()->submit_compute_job(
VK_KERNEL_FROM_STR(kernel_name),
pipeline_barrier,
{uint32_t(v_ten.numel()), 1, 1},
{64, 1, 1},
specialization_constants,
VK_NULL_HANDLE,
0,
v_ten.buffer(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::READ),
params.buffer());
}
}
void record_scalar_add_buffer(
api::Context* context,
api::vTensor& v_ten,
float offset) {
vkapi::PipelineBarrier pipeline_barrier{};
vkapi::SpecVarList specialization_constants = {SV(offset)};
std::string kernel = "scalar_add_buffer";
add_dtype_suffix(kernel, v_ten);
api::context()->submit_compute_job(
VK_KERNEL_FROM_STR(kernel),
pipeline_barrier,
{uint32_t(v_ten.numel()), 1, 1},
{64, 1, 1},
specialization_constants,
VK_NULL_HANDLE,
0,
v_ten.buffer(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE),
v_ten.numel_ubo());
}
void record_reference_matmul(
api::Context* context,
api::vTensor& out,
api::vTensor& mat1,
api::vTensor& mat2) {
vkapi::PipelineBarrier pipeline_barrier{};
api::context()->submit_compute_job(
VK_KERNEL(reference_matmul),
pipeline_barrier,
{uint32_t(out.size(1)), uint32_t(out.size(0)), 1},
{64, 1, 1},
{},
VK_NULL_HANDLE,
0,
out.buffer(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
out.sizes_ubo(),
out.strides_ubo(),
mat1.sizes_ubo(),
mat1.strides_ubo(),
mat2.sizes_ubo(),
mat2.strides_ubo());
}
void record_matmul_texture3d(
api::Context* context,
api::vTensor& out,
api::vTensor& mat1,
api::vTensor& mat2) {
std::string kernel_name = "matmul_naive";
kernel_name.reserve(kShaderNameReserve);
add_storage_type_suffix(kernel_name, out.storage_type());
add_dtype_suffix(kernel_name, out.dtype());
utils::uvec3 global_wg_size = out.logical_limits();
vkapi::PipelineBarrier pipeline_barrier{};
api::context()->submit_compute_job(
VK_KERNEL_FROM_STR(kernel_name),
pipeline_barrier,
global_wg_size,
{8, 8, 1},
{out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()},
VK_NULL_HANDLE,
0,
out.image(
pipeline_barrier,
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
out.sizes_ubo(),
out.logical_limits_ubo(),
mat1.sizes_ubo(),
mat2.sizes_ubo());
}
//
// Input & Output Utilities
//
#define FORALL_SUPPORTED_TYPES(_) \
_(uint8_t, Byte) \
_(int8_t, Char) \
_(int32_t, Int) \
_(executorch::aten::Half, Half) \
_(float, Float) \
_(int8_t, QInt8)
void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());
#define CASE(ctype, name) \
case vkapi::ScalarType::name: { \
std::vector<ctype> data_converted; \
data_converted.resize(data.size()); \
for (int i = 0; i < data.size(); ++i) { \
data_converted[i] = ctype(data[i]); \
} \
staging_buffer.copy_from( \
data_converted.data(), vten.staging_buffer_nbytes()); \
} break;
switch (vten.dtype()) {
FORALL_SUPPORTED_TYPES(CASE)
default:
VK_THROW("Unsupported dtype");
}
#undef CASE
if (vten.storage_type() == utils::StorageType::BUFFER) {
record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
} else {
record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
}
}
void fill_vtensor(api::vTensor& vten, float val, bool iota) {
std::vector<float> vten_data(vten.staging_buffer_numel());
if (iota) {
std::iota(vten_data.begin(), vten_data.end(), val);
} else {
std::fill(vten_data.begin(), vten_data.end(), val);
}
fill_vtensor(vten, vten_data);
}
std::vector<float> create_random_float_buffer(
const size_t numel,
const float min,
const float max) {
std::vector<float> data(numel);
std::default_random_engine rng;
std::uniform_real_distribution<float> dist(min, max);
for (size_t i = 0; i < data.size(); ++i) {
data[i] = dist(rng);
}
return data;
}
std::vector<uint8_t> create_random_uint8_buffer(
const size_t numel,
const uint8_t min,
const uint8_t max) {
std::vector<uint8_t> data(numel);
std::default_random_engine rng;
std::uniform_real_distribution<float> dist(min, max);
for (size_t i = 0; i < data.size(); ++i) {
data[i] = (uint8_t)dist(rng);
}
return data;
}
void fill_vtensor(
ComputeGraph& graph,
const IOValueRef idx,
float val,
bool iota) {
vTensorPtr t = graph.get_tensor(idx.value);
std::vector<float> data(t->numel());
if (t->storage_type() != utils::kBuffer) {
data.resize(t->staging_buffer_numel());
}
if (iota) {
std::iota(data.begin(), data.end(), val);
} else {
std::fill(data.begin(), data.end(), val);
}
graph.copy_into_staging(idx.staging, data.data(), data.size());
}
void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
api::StagingBuffer staging_buffer(
api::context(), vten.dtype(), vten.staging_buffer_numel());
if (vten.storage_type() == utils::StorageType::BUFFER) {
record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
} else {
record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
}
vkapi::VulkanFence fence = api::context()->fences().get_fence();
api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
fence.wait();
#define CASE(ctype, name) \
case vkapi::ScalarType::name: { \
std::vector<ctype> data_converted(data.size()); \
staging_buffer.copy_to( \
data_converted.data(), vten.staging_buffer_nbytes()); \
for (int i = 0; i < data.size(); ++i) { \
data[i] = float(data_converted[i]); \
} \
} break;
switch (vten.dtype()) {
FORALL_SUPPORTED_TYPES(CASE)
default:
VK_THROW("Unsupported dtype");
}
#undef CASE
}
//
// Context Management
//
void submit_to_gpu() {
vkapi::VulkanFence fence = api::context()->fences().get_fence();
api::context()->submit_cmd_to_gpu(fence.get_submit_handle());
fence.wait();
}
vkapi::Allocation allocate_memory_for(const api::vTensor& vten) {
VmaAllocationCreateInfo alloc_create_info =
api::context()->adapter_ptr()->vma().gpuonly_resource_create_info();
return api::context()->adapter_ptr()->vma().create_allocation(
vten.get_memory_requirements(), alloc_create_info);
}
VmaTotalStatistics get_vma_stats() {
return api::context()->adapter_ptr()->vma().get_memory_statistics();
}
size_t get_vma_allocation_count() {
return get_vma_stats().total.statistics.allocationCount;
}
//
// Graph Test Utilities
//
void execute_graph_and_check_output(
ComputeGraph& graph,
std::vector<float> input_vals,
std::vector<float> expected_outputs) {
assert(input_vals.size() == graph.inputs().size());
assert(expected_outputs.size() == graph.outputs().size());
for (size_t i = 0; i < graph.inputs().size(); ++i) {
fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i));
}
graph.execute();
for (size_t i = 0; i < graph.outputs().size(); ++i) {
IOValueRef out_ioval = graph.outputs().at(i);
vTensorPtr t_out = graph.get_tensor(out_ioval.value);
std::vector<float> output_data(t_out->staging_buffer_numel());
graph.copy_from_staging(
out_ioval.staging, output_data.data(), output_data.size());
for (size_t j = 0; j < t_out->numel(); ++j) {
CHECK_VALUE(output_data, j, expected_outputs.at(i));
}
}
}
bool check_close(float a, float b, float atol, float rtol) {
float max = std::max(std::abs(a), std::abs(b));
float diff = std::abs(a - b);
return diff <= (atol + rtol * max);
}