/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cinttypes>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <vector>
#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/runtime.h>
#include <executorch/util/read_file.h>
#include <executorch/util/util.h>
#include <gflags/gflags.h>
using namespace torch::executor;
/**
* @file
*
* In some hardware environments, the same model may run on different cores for
* different inference requests. The same core may also see a power-cycle (i.e.,
* power down and then back up) in between two inference requests.
*
* For ExecuTorch to work efficiently in these environments, we want to
 * initialize the Method once for the model and avoid re-initializing it
* for every inference. This can be achieved by restricting the runtime contexts
* (torch::executor::Program and torch::executor::Method) to live in a
* pre-allocated, shared, and persistent memory.
*
* This tool demonstrates that the memory can be managed this way.
*/
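// Statically allocated pool that backs the MemoryAllocator handed to the
// Method as its runtime (method) allocator. Because it lives in static
// storage, it persists for the lifetime of the process and can be reused
// across inference requests.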
static uint8_t method_allocator_pool[2 * 1024U * 1024U]; // 2 MB
#define MAX_INPUTS_PER_MODEL 16
#define MAX_OUTPUTS_PER_MODEL 8
DEFINE_string(
model_path,
"model.pte",
"Model serialized in flatbuffer format.");
// These functions represent the work done on a worker core.
namespace worker {
Program* load_program(
const void* file_data,
size_t file_data_len,
MemoryAllocator& allocator) {
// Wrap the data in a DataLoader. The Program will take a pointer to it, so it
// must live for at least as long as the Program instance.
auto loader = allocator.allocateInstance<util::BufferDataLoader>();
ET_CHECK(loader != nullptr);
new (loader) util::BufferDataLoader(file_data, file_data_len);
// Load the program.
Result<Program> program_result = Program::load(loader);
ET_CHECK(program_result.ok());
// Move the Program into worker memory.
auto program = allocator.allocateInstance<Program>();
ET_CHECK(program != nullptr);
new (program) Program(std::move(program_result.get()));
return program;
}
MemoryManager* create_memory_manager(
MethodMeta* method_meta,
MemoryAllocator& worker_allocator) {
  // Create the method allocator, backed by the statically-allocated pool.
auto* method_allocator = worker_allocator.allocateInstance<MemoryAllocator>();
ET_CHECK(method_allocator != nullptr);
new (method_allocator)
MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool);
// Create the memory planned buffers.
size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
Span<uint8_t>* memory_planned_buffers =
worker_allocator.allocateList<Span<uint8_t>>(num_memory_planned_buffers);
ET_CHECK(memory_planned_buffers != nullptr);
for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
const size_t buffer_size =
method_meta->memory_planned_buffer_size(id).get();
ET_LOG(
Info, "Setting up planned buffer id %zu, size %zu.", id, buffer_size);
void* buffer = worker_allocator.allocate(buffer_size);
ET_CHECK(buffer != nullptr);
memory_planned_buffers[id] = {(uint8_t*)buffer, buffer_size};
ET_LOG(
Info,
"Created memory_planned_buffers with size %zu and addr %p",
buffer_size,
buffer);
}
auto* planned_memory =
worker_allocator.allocateInstance<HierarchicalAllocator>();
ET_CHECK(planned_memory != nullptr);
new (planned_memory) HierarchicalAllocator(
{memory_planned_buffers, num_memory_planned_buffers});
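  // The HierarchicalAllocator serves these buffers to the Method when it asks
  // for the tensor memory that was planned ahead of time during export.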
  // Constant tensor data is provided directly by the Program (through its
  // DataLoader), so no separate constant-memory allocator is needed here.
  // Assemble the allocators into the MemoryManager that the Method will use.
auto* memory_manager = worker_allocator.allocateInstance<MemoryManager>();
ET_CHECK(memory_manager != nullptr);
new (memory_manager) MemoryManager(method_allocator, planned_memory);
return memory_manager;
}
Method* init_method(
Program* program,
const char* method_name,
MemoryAllocator& worker_allocator,
std::vector<size_t>& input_sizes,
std::vector<size_t>& output_sizes) {
Result<MethodMeta> method_meta = program->method_meta(method_name);
ET_CHECK(method_meta.ok());
MemoryManager* memory_manager =
create_memory_manager(&method_meta.get(), worker_allocator);
//
// Create and load a method from the program, using the provided
// allocators. The Method is what actually runs the model. It is
  // mutable, so should only be used by a single thread at a time, but it can
// be reused.
//
auto* method = worker_allocator.allocateInstance<Method>();
ET_CHECK(method != nullptr);
auto method_res = program->load_method(method_name, memory_manager);
ET_CHECK_MSG(
method_res.error() == Error::Ok,
"loading method('%s') failed with status 0x%" PRIx32,
method_name,
      static_cast<uint32_t>(method_res.error()));
new (method) Method(std::move(method_res.get()));
ET_LOG(Info, "Model method '%s' initialized.", method_name);
// Gather the byte size of each input/output tensor.
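  // These sizes are reported back to the caller (the control core in this
  // example) so it can allocate matching I/O buffers without touching the
  // Method directly.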
const size_t input_size = method->inputs_size();
for (size_t i = 0; i < input_size; i++) {
if (!method->get_input(i).isTensor()) {
ET_LOG(Info, "input %zu is not a tensor, skipping", i);
continue;
}
const auto& t = method->get_input(i).toTensor();
input_sizes.push_back(t.nbytes());
}
  const size_t output_size = method->outputs_size();
  for (size_t i = 0; i < output_size; i++) {
    if (!method->get_output(i).isTensor()) {
      ET_LOG(Info, "output %zu is not a tensor, skipping", i);
      continue;
    }
    const auto& t = method->get_output(i).toTensor();
    output_sizes.push_back(t.nbytes());
  }
return method;
}
void inference_loop(
Method* method,
const std::vector<void*>& input_buffers,
const std::vector<void*>& output_buffers) {
  ET_LOG(
      Info,
      "Assigning input pointers, receiving %zu inputs",
      input_buffers.size());
// Prepare the inputs.
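  // Point each input tensor at a caller-provided buffer. The Method does not
  // take ownership; the caller frees the buffers after execution.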
{
size_t bufi = 0;
for (size_t i = 0; i < method->inputs_size(); i++) {
if (!method->get_input(i).isTensor()) {
ET_LOG(Info, "input %zu is not a tensor, skipping", i);
continue;
}
const auto& t = method->get_input(i).toTensor();
ET_CHECK_MSG(
bufi < input_buffers.size(), "Not enough input buffers for model");
t.set_data(input_buffers[bufi++]);
}
}
ET_LOG(Info, "Inputs prepared.");
// Prepare the outputs.
{
size_t bufi = 0;
for (size_t i = 0; i < method->outputs_size(); i++) {
if (!method->get_output(i).isTensor()) {
ET_LOG(Info, "output %zu is not a tensor, skipping", i);
continue;
}
const auto& t = method->get_output(i).toTensor();
ET_CHECK_MSG(
bufi < output_buffers.size(), "Not enough output buffers for model");
t.set_data(output_buffers[bufi++]);
}
}
ET_LOG(Info, "Outputs prepared.");
// Run the model.
Error status = method->execute();
ET_CHECK_MSG(
status == Error::Ok,
"method->execute() failed with status 0x%" PRIx32,
      static_cast<uint32_t>(status));
ET_LOG(Info, "Model executed successfully.");
}
} // namespace worker
/*
 * This is an example of how the ExecuTorch stack can run in a multi-processor
 * setup where a control core handles memory management and a worker core runs
 * the actual inference.
 */
int main(int argc, char** argv) {
torch::executor::runtime_init();
gflags::ParseCommandLineFlags(&argc, &argv, true);
/*
* Step 1: The model gets loaded from file to memory on the control core
*/
std::shared_ptr<char> file_data;
size_t file_size;
Error err = torch::executor::util::read_file_content(
FLAGS_model_path.c_str(), &file_data, &file_size);
ET_CHECK_MSG(err == Error::Ok, "read_file_content failed: %d", int(err));
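  // Note: file_data must outlive the Program and Method, because the
  // BufferDataLoader created on the worker core points directly into this
  // buffer.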
/*
* Step 2: Prepare the memory space required for worker core
*/
// The actual allocation size can be backend/model specific and smaller
constexpr size_t kWorkerBufferSize = 1 * 1024U * 1024U; // 1 MB
auto worker_buffer = std::make_unique<uint8_t[]>(kWorkerBufferSize);
MemoryAllocator worker_allocator(kWorkerBufferSize, worker_buffer.get());
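  // Every worker-side structure (data loader, Program, Method, and planned
  // buffers) is carved out of this single buffer, which stands in for the
  // pre-allocated, persistent memory described in the file comment above.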
/*
* Step 3: The worker core sets up the corresponding data structures for the
* program
*/
Program* program =
worker::load_program(file_data.get(), file_size, worker_allocator);
ET_LOG(
Info,
"Loaded %s and constructed program at %p",
FLAGS_model_path.c_str(),
program);
ET_CHECK(program != nullptr);
/*
* Step 4: The worker core sets up the Method. Here we let the control
* core read out the I/O info from the Method. This can also be done on
* the control core from the program flatbuffer, though there is no
* direct API at the moment.
*/
// Get the method name to execute.
const char* method_name = nullptr;
{
// Use the first method in the program.
const auto method_name_result = program->get_method_name(0);
ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
method_name = *method_name_result;
}
ET_LOG(Info, "Using method %s", method_name);
std::vector<size_t> input_sizes;
std::vector<size_t> output_sizes;
Method* method = worker::init_method(
program, method_name, worker_allocator, input_sizes, output_sizes);
  ET_LOG(
      Info,
      "Number of inputs is %zu and number of outputs is %zu",
      input_sizes.size(),
      output_sizes.size());
/*
 * Step 5: The control core or the application code prepares the I/O
*/
// Allocate and initialize input/output tensor buffers for the inference
std::vector<void*> input_buffers;
  for (size_t buffer_size : input_sizes) {
    void* buffer = malloc(buffer_size);
    ET_CHECK(buffer != nullptr);
    memset(buffer, 0, buffer_size);
    input_buffers.push_back(buffer);
  }
ET_LOG(Info, "Allocated the inputs");
std::vector<void*> output_buffers;
  for (size_t buffer_size : output_sizes) {
    void* buffer = malloc(buffer_size);
    ET_CHECK(buffer != nullptr);
    memset(buffer, 0, buffer_size);
    output_buffers.push_back(buffer);
  }
ET_LOG(Info, "Allocated the outputs");
/*
* Step 6: The control core forwards the inference request and the worker
* core runs the program.
*/
// Run the inference on the inputs. CHECK-fails on error.
worker::inference_loop(method, input_buffers, output_buffers);
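  // The Method stays initialized in worker memory, so additional inference
  // requests could reuse it by calling inference_loop() again with fresh
  // buffers, without re-loading the Program or re-initializing the Method.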
for (void* buffer : input_buffers) {
free(buffer);
}
for (void* buffer : output_buffers) {
free(buffer);
}
return 0;
}