/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cinttypes>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <vector>
#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/runtime.h>
#include <executorch/util/read_file.h>
#include <executorch/util/util.h>
#include <gflags/gflags.h>
using namespace torch::executor;
/**
* @file
*
* In some hardware environments, the same model may run on different cores for
* different inference requests. The same core may also see a power-cycle (i.e.,
* power down and then back up) in between two inference requests.
*
* For ExecuTorch to work efficiently in these environments, we want to
 * initialize the Method once for the model and avoid re-initializing it
* for every inference. This can be achieved by restricting the runtime contexts
* (torch::executor::Program and torch::executor::Method) to live in a
* pre-allocated, shared, and persistent memory.
*
* This tool demonstrates that the memory can be managed this way.
*/
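// Statically allocated pool that backs the MemoryAllocator handed to the
// Method as its runtime (method) allocator. Because it lives in static
// storage, it persists for the lifetime of the process and can be reused
// across inference requests.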
static uint8_t method_allocator_pool[2 * 1024U * 1024U]; // 2 MB
#define MAX_INPUTS_PER_MODEL 16
#define MAX_OUTPUTS_PER_MODEL 8
DEFINE_string(
model_path,
"model.pte",
"Model serialized in flatbuffer format.");
// These functions represent the work done on a worker core.
namespace worker {
Program* load_program(
const void* file_data,
size_t file_data_len,
MemoryAllocator& allocator) {
// Wrap the data in a DataLoader. The Program will take a pointer to it, so it
// must live for at least as long as the Program instance.
auto loader = allocator.allocateInstance<util::BufferDataLoader>();
ET_CHECK(loader != nullptr);
new (loader) util::BufferDataLoader(file_data, file_data_len);
// Load the program.
Result<Program> program_result = Program::load(loader);
ET_CHECK(program_result.ok());
// Move the Program into worker memory.
auto program = allocator.allocateInstance<Program>();
ET_CHECK(program != nullptr);
new (program) Program(std::move(program_result.get()));
return program;
}
MemoryManager* create_memory_manager(
MethodMeta* method_meta,
MemoryAllocator& worker_allocator) {
  // Create the method allocator, backed by the statically-allocated pool.
auto* method_allocator = worker_allocator.allocateInstance<MemoryAllocator>();
ET_CHECK(method_allocator != nullptr);
new (method_allocator)
MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool);
// Create the memory planned buffers.
size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
Span<uint8_t>* memory_planned_buffers =
worker_allocator.allocateList<Span<uint8_t>>(num_memory_planned_buffers);
ET_CHECK(memory_planned_buffers != nullptr);
for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
const size_t buffer_size =
method_meta->memory_planned_buffer_size(id).get();
ET_LOG(
Info, "Setting up planned buffer id %zu, size %zu.", id, buffer_size);
void* buffer = worker_allocator.allocate(buffer_size);
ET_CHECK(buffer != nullptr);
memory_planned_buffers[id] = {(uint8_t*)buffer, buffer_size};
ET_LOG(
Info,
"Created memory_planned_buffers with size %zu and addr %p",
buffer_size,
buffer);
}
auto* planned_memory =
worker_allocator.allocateInstance<HierarchicalAllocator>();
ET_CHECK(planned_memory != nullptr);
new (planned_memory) HierarchicalAllocator(
{memory_planned_buffers, num_memory_planned_buffers});
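  // The HierarchicalAllocator serves these buffers to the Method when it asks
  // for the tensor memory that was planned ahead of time during export.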
  // Constant tensor data is provided directly by the Program (through its
  // DataLoader), so no separate constant-memory allocator is needed here.
  // Assemble the allocators into the MemoryManager that the Method will use.
auto* memory_manager = worker_allocator.allocateInstance<MemoryManager>();
ET_CHECK(memory_manager != nullptr);
new (memory_manager) MemoryManager(method_allocator, planned_memory);
return memory_manager;
}
Method* init_method(
Program* program,
const char* method_name,
MemoryAllocator& worker_allocator,
std::vector<size_t>& input_sizes,
std::vector<size_t>& output_sizes) {
Result<MethodMeta> method_meta = program->method_meta(method_name);
ET_CHECK(method_meta.ok());
MemoryManager* memory_manager =
create_memory_manager(&method_meta.get(), worker_allocator);
//
// Create and load a method from the program, using the provided
// allocators. The Method is what actually runs the model. It is
  // mutable, so should only be used by a single thread at a time, but it can
// be reused.
//
auto* method = worker_allocator.allocateInstance<Method>();
ET_CHECK(method != nullptr);
auto method_res = program->load_method(method_name, memory_manager);
ET_CHECK_MSG(
method_res.error() == Error::Ok,
"loading method('%s') failed with status 0x%" PRIx32,
method_name,
      static_cast<uint32_t>(method_res.error()));
new (method) Method(std::move(method_res.get()));
ET_LOG(Info, "Model method '%s' initialized.", method_name);
// Gather the byte size of each input/output tensor.
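  // These sizes are reported back to the caller (the control core in this
  // example) so it can allocate matching I/O buffers without touching the
  // Method directly.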
const size_t input_size = method->inputs_size();
for (size_t i = 0; i < input_size; i++) {
if (!method->get_input(i).isTensor()) {
ET_LOG(Info, "input %zu is not a tensor, skipping", i);
continue;
}
const auto& t = method->get_input(i).toTensor();
input_sizes.push_back(t.nbytes());
}
  const size_t output_size = method->outputs_size();
  for (size_t i = 0; i < output_size; i++) {
    if (!method->get_output(i).isTensor()) {
      ET_LOG(Info, "output %zu is not a tensor, skipping", i);
      continue;
    }
    const auto& t = method->get_output(i).toTensor();
    output_sizes.push_back(t.nbytes());
  }
return method;
}
void inference_loop(
Method* method,
const std::vector<void*>& input_buffers,
const std::vector<void*>& output_buffers) {
  ET_LOG(
      Info,
      "Assigning input pointers, receiving %zu inputs",
      input_buffers.size());
// Prepare the inputs.
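  // Point each input tensor at a caller-provided buffer. The Method does not
  // take ownership; the caller frees the buffers after execution.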
{
size_t bufi = 0;
for (size_t i = 0; i < method->inputs_size(); i++) {
if (!method->get_input(i).isTensor()) {
ET_LOG(Info, "input %zu is not a tensor, skipping", i);
continue;
}
const auto& t = method->get_input(i).toTensor();
ET_CHECK_MSG(
bufi < input_buffers.size(), "Not enough input buffers for model");
t.set_data(input_buffers[bufi++]);
}
}
ET_LOG(Info, "Inputs prepared.");
// Prepare the outputs.
{
size_t bufi = 0;
for (size_t i = 0; i < method->outputs_size(); i++) {
if (!method->get_output(i).isTensor()) {
ET_LOG(Info, "output %zu is not a tensor, skipping", i);
continue;
}
const auto& t = method->get_output(i).toTensor();
ET_CHECK_MSG(
bufi < output_buffers.size(), "Not enough output buffers for model");
t.set_data(output_buffers[bufi++]);
}
}
ET_LOG(Info, "Outputs prepared.");
// Run the model.
Error status = method->execute();
ET_CHECK_MSG(
status == Error::Ok,
"method->execute() failed with status 0x%" PRIx32,
      static_cast<uint32_t>(status));
ET_LOG(Info, "Model executed successfully.");
}
} // namespace worker
/*
 * This is an example of how the ExecuTorch stack can run in a multi-processor
 * setup where a control core handles memory management and a worker core runs
 * the actual inference.
 */
int main(int argc, char** argv) {
torch::executor::runtime_init();
gflags::ParseCommandLineFlags(&argc, &argv, true);
/*
* Step 1: The model gets loaded from file to memory on the control core
*/
std::shared_ptr<char> file_data;
size_t file_size;
Error err = torch::executor::util::read_file_content(
FLAGS_model_path.c_str(), &file_data, &file_size);
ET_CHECK_MSG(err == Error::Ok, "read_file_content failed: %d", int(err));
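  // Note: file_data must outlive the Program and Method, because the
  // BufferDataLoader created on the worker core points directly into this
  // buffer.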
/*
* Step 2: Prepare the memory space required for worker core
*/
// The actual allocation size can be backend/model specific and smaller
constexpr size_t kWorkerBufferSize = 1 * 1024U * 1024U; // 1 MB
auto worker_buffer = std::make_unique<uint8_t[]>(kWorkerBufferSize);
MemoryAllocator worker_allocator(kWorkerBufferSize, worker_buffer.get());
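  // Every worker-side structure (data loader, Program, Method, and planned
  // buffers) is carved out of this single buffer, which stands in for the
  // pre-allocated, persistent memory described in the file comment above.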
/*
* Step 3: The worker core sets up the corresponding data structures for the
* program
*/
Program* program =
worker::load_program(file_data.get(), file_size, worker_allocator);
ET_LOG(
Info,
"Loaded %s and constructed program at %p",
FLAGS_model_path.c_str(),
program);
ET_CHECK(program != nullptr);
/*
* Step 4: The worker core sets up the Method. Here we let the control
* core read out the I/O info from the Method. This can also be done on
* the control core from the program flatbuffer, though there is no
* direct API at the moment.
*/
// Get the method name to execute.
const char* method_name = nullptr;
{
// Use the first method in the program.
const auto method_name_result = program->get_method_name(0);
ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
method_name = *method_name_result;
}
ET_LOG(Info, "Using method %s", method_name);
std::vector<size_t> input_sizes;
std::vector<size_t> output_sizes;
Method* method = worker::init_method(
program, method_name, worker_allocator, input_sizes, output_sizes);
  ET_LOG(
      Info,
      "Number of inputs is %zu and number of outputs is %zu",
      input_sizes.size(),
      output_sizes.size());
/*
 * Step 5: The control core or the application code prepares the I/O
*/
// Allocate and initialize input/output tensor buffers for the inference
std::vector<void*> input_buffers;
  for (size_t buffer_size : input_sizes) {
    void* buffer = malloc(buffer_size);
    ET_CHECK(buffer != nullptr);
    memset(buffer, 0, buffer_size);
    input_buffers.push_back(buffer);
  }
ET_LOG(Info, "Allocated the inputs");
std::vector<void*> output_buffers;
  for (size_t buffer_size : output_sizes) {
    void* buffer = malloc(buffer_size);
    ET_CHECK(buffer != nullptr);
    memset(buffer, 0, buffer_size);
    output_buffers.push_back(buffer);
  }
ET_LOG(Info, "Allocated the outputs");
/*
* Step 6: The control core forwards the inference request and the worker
* core runs the program.
*/
// Run the inference on the inputs. CHECK-fails on error.
worker::inference_loop(method, input_buffers, output_buffers);
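  // The Method stays initialized in worker memory, so additional inference
  // requests could reuse it by calling inference_loop() again with fresh
  // buffers, without re-loading the Program or re-initializing the Method.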
for (void* buffer : input_buffers) {
free(buffer);
}
for (void* buffer : output_buffers) {
free(buffer);
}
return 0;
}