blob: 178b60de79e0d51b4c89d081c3822f0f6660062b [file] [log] [blame]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <gflags/gflags.h>
#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/profiler.h>
#include <executorch/runtime/platform/runtime.h>
#include <executorch/sdk/etdump/etdump.h>
#include <executorch/util/bundled_program_verification.h>
#include <executorch/util/util.h>
#ifdef USE_ATEN_LIB
#include <c10/core/impl/LocalDispatchKeySet.h>
#endif
#if !defined(USE_ATEN_LIB)
#include <executorch/backends/xnnpack/threadpool/fb/threadpool_use_n_threads.h>
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#endif
// This tool includes all of the headers necessary to execute a model.
// Demonstrate that those headers do not expose the internal flatbuffers
// headers.
#ifdef FLATBUFFERS_VERSION_MAJOR
// FLATBUFFERS_VERSION_MAJOR is defined by flatbuffers/base.h, which is included
// by all other flatbuffers library headers and by any generated headers. If
// it's present, it means that this file is including a flatbuffers header
// somewhere.
#error "The executorch headers must not expose flatbuffers.h"
#endif
// Bring the ExecuTorch runtime types (Program, Method, Result, ...) into scope.
using namespace torch::executor;
// Statically-allocated pool backing the "runtime" MemoryAllocator, which holds
// the dynamic C++ metadata/objects created while loading and running a Method.
static constexpr size_t kRuntimeMemorySize = 4 * 1024U * 1024U; // 4 MB
static uint8_t runtime_pool[kRuntimeMemorySize];
// Statically-allocated pool used to materialize inputs read from a bundled
// program (see bundled_input_allocator in main()).
static constexpr size_t kBundledAllocatorPoolSize = 16 * 1024U;
static uint8_t bundled_allocator_pool[kBundledAllocatorPoolSize];
// ---- Command-line flags (gflags). Accessed below as FLAGS_<name>. ----
DEFINE_bool(
bundled_program,
false,
"True for running bundled program, false for executorch_flatbuffer::program");
DEFINE_bool(
generate_etdump,
false,
"If enabled etdump containing profiling data will be generated");
DEFINE_string(
etdump_path,
"etdump.etdp",
"If etdump generation is enabled an etdump will be written out to this path");
DEFINE_string(
prof_result_path,
"prof_result.bin",
"Executorch profiler output path.");
DEFINE_bool(print_output, false, "Prints output of the model.");
DEFINE_int32(num_iters, 1, "Number of inference iterations to run.");
DEFINE_string(
model_path,
"model.pte",
"Model serialized in flatbuffer format.");
DEFINE_int32(num_threads, 1, "Number of threads to use.");
DEFINE_int32(
testset_idx,
0,
"Index of bundled verification set to be run "
"by bundled model for verification");
DEFINE_double(
rtol,
1e-5,
"The relative tolerance used for bundled program verification.");
DEFINE_double(
atol,
1e-8,
"The absolute tolerance used for bundled program verification.");
/**
* Helps handle bundled and non-bundled program inputs.
*/
/**
 * Helps handle bundled and non-bundled program inputs.
 *
 * Wraps a DataLoader over the plain Program data, and (when the input file
 * was a bundled program) keeps the full file contents alive so that the
 * bundled inputs/expected outputs can be consulted later.
 */
class ProgramData {
 public:
  /**
   * Tries loading the named file as a plain Program or a bundled program,
   * failing with an ET_CHECK on any failure.
   *
   * @param filename Path of the file to load.
   */
  static ProgramData load_or_die(std::string& filename) {
    // Create a DataLoader that wraps the input file. It may be a plain
    // Program, or it may be a BundledProgram that contains a Program.
    Result<util::FileDataLoader> loader =
        util::FileDataLoader::From(filename.c_str());
    ET_CHECK_MSG(
        loader.ok(),
        "Could not create loader for file '%s': 0x%x",
        filename.c_str(),
        (unsigned int)loader.error());

    // Figure out the file type. Create a scope to destroy the header after
    // the check.
    {
      Result<FreeableBuffer> header =
          loader->Load(/*offset=*/0, Program::kMinHeadBytes);
      ET_CHECK_MSG(
          header.ok(),
          "Could not load header of file '%s': 0x%x",
          filename.c_str(),
          // BUGFIX: report the header-load error. The original printed
          // loader.error(), which is guaranteed to be Ok at this point
          // (the loader.ok() check above passed), masking the real code.
          (unsigned int)header.error());
      Program::HeaderStatus hs =
          Program::check_header(header->data(), header->size());
      if (hs == Program::HeaderStatus::CompatibleVersion) {
        // It's a plain Program. We can use the existing loader, and there is
        // no bundled program data.
        return ProgramData(
            new util::FileDataLoader(std::move(*loader)),
            /*bundled_program_data=*/FreeableBuffer());
      }
    }

    // Try treating it as a bundled program: read in the entire file.
    // NOTE(review): assumes loader->size() succeeds for a loader that already
    // served a header read; a failure here would surface via file_data below.
    Result<FreeableBuffer> file_data = loader->Load(0, loader->size().get());
    ET_CHECK_MSG(
        file_data.ok(),
        "Could not load contents of file '%s': 0x%x",
        filename.c_str(),
        (unsigned int)file_data.error());

    // Find the offset to the embedded Program.
    const void* program_data;
    size_t program_data_len;
    Error status = torch::executor::util::GetProgramData(
        const_cast<void*>(file_data->data()),
        file_data->size(),
        &program_data,
        &program_data_len);
    ET_CHECK_MSG(
        status == Error::Ok,
        "GetProgramData() failed on file '%s': 0x%x",
        filename.c_str(),
        (unsigned int)status);

    // Wrap the Program in a loader, and pass on the FreeableBuffer that
    // contains the full bundled program data.
    return ProgramData(
        new util::BufferDataLoader(program_data, program_data_len),
        std::move(*file_data));
  }

  /**
   * Returns the loader for the plain Program. May or may not be inside a
   * bundled program wrapper.
   */
  DataLoader* program_loader() {
    return loader_.get();
  }

  /**
   * If the file was a bundled program, returns a pointer to the file data.
   * Otherwise returns nullptr.
   */
  const void* bundled_program_data() const {
    if (bundled_program_data_.size() > 0) {
      return bundled_program_data_.data();
    } else {
      return nullptr;
    }
  }

 private:
  /// Takes ownership of both params.
  ProgramData(DataLoader* loader, FreeableBuffer&& bundled_program_data)
      : loader_(loader),
        bundled_program_data_(std::move(bundled_program_data)) {}

  std::unique_ptr<DataLoader> loader_; // Owns the loader over the Program.
  FreeableBuffer bundled_program_data_; // Empty for non-bundled files.
};
/**
 * Loads a (possibly bundled) ExecuTorch program, runs its first method for
 * --num_iters iterations, optionally verifies bundled outputs / prints
 * outputs, and writes out profiler / etdump data.
 *
 * @return 0 on success; 1 on argument/load failure; -1 on etdump failure.
 */
int main(int argc, char** argv) {
  torch::executor::runtime_init();

  gflags::ParseCommandLineFlags(&argc, &argv, true);
  if (argc != 1) {
    std::string msg = "Extra commandline args: ";
    for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) {
      msg += argv[i];
    }
    ET_LOG(Error, "%s", msg.c_str());
    return 1;
  }
  ET_CHECK_MSG(
      FLAGS_num_threads >= 1,
      // BUGFIX: corrected the "specifiy" typo in this user-facing message.
      "Please specify valid number of threads to use.");

  // Load the file.
  auto program_data = ProgramData::load_or_die(FLAGS_model_path);

  // Parse the program file. This is immutable, and can also be reused between
  // multiple execution invocations across multiple threads.
  uint32_t prof_tok = EXECUTORCH_BEGIN_PROF("de-serialize model");
  Result<Program> program =
      torch::executor::Program::Load(program_data.program_loader());
  EXECUTORCH_END_PROF(prof_tok);
  if (!program.ok()) {
    ET_LOG(Error, "Failed to parse model file %s", FLAGS_model_path.c_str());
    return 1;
  }
  ET_LOG(Info, "Model file %s is loaded.", FLAGS_model_path.c_str());

  // Use the first method in the program.
  const size_t method_index = 0;
  const char* method_name = nullptr;
  {
    const auto method_name_result = program->get_method_name(method_index);
    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
    method_name = *method_name_result;
  }
  ET_LOG(Info, "Running method %s", method_name);

  //
  // The runtime does not use malloc/new; it allocates all memory using the
  // MemoryManger provided by the client. Clients are responsible for allocating
  // the memory ahead of time, or providing MemoryAllocator subclasses that can
  // do it dynamically.
  //

  // The runtime allocator is used to allocate all dynamic C++ metadata/objects
  // used to represent the loaded program. This allocator is only used during
  // Program::load_method(), which will return an error if there was not enough
  // memory.
  //
  // The amount of memory required depends on the loaded program and the runtime
  // code itself. The amount of memory here is usually determined by running the
  // program and seeing how much memory is actually used, though it's possible
  // to subclass MemoryAllocator so that it calls malloc() under the hood.
  MemoryAllocator runtime_allocator{
      MemoryAllocator(kRuntimeMemorySize, runtime_pool)};
  runtime_allocator.enable_profiling("runtime allocator");

  // The non-const allocator is used to provide the memory-planned buffers that
  // back mutable tensors. Since it was planned ahead of time, the Program knows
  // how big each of the allocators needs to be.
  //
  // These buffers correspond to different hardware memory banks. Most mobile
  // environments will only have a single buffer. Some embedded environments may
  // have more than one for, e.g., slow/large DRAM and fast/small SRAM.
  std::vector<std::unique_ptr<uint8_t[]>> non_const_buffers;
  std::vector<MemoryAllocator> non_const_allocators;
  size_t num_non_const_buffers = 0;
  {
    auto result = program->num_non_const_buffers(method_name);
    ET_CHECK_MSG(
        result.ok(),
        "Failed to get number of non-const buffers for method %s: 0x%x",
        method_name,
        (unsigned int)result.error());
    num_non_const_buffers = *result;
  }
  // Note that this loop starts at ID 1, because ID 0 is reserved. But, the
  // HierarchicalAllocator indices are zero-based, so it's later adjusted by -1.
  // TODO(T142455629): Make HierarchicalAllocator ID-based to avoid this
  // memory_id-1.
  for (size_t id = 1; id < num_non_const_buffers; ++id) {
    auto buffer_size = program->get_non_const_buffer_size(id, method_name);
    ET_CHECK_MSG(
        buffer_size.ok(),
        "Failed to get size of non-const buffer %zu for method %s: 0x%x",
        id,
        method_name,
        (unsigned int)buffer_size.error());
    ET_LOG(
        Info, "Setting up non-const buffer %zu, size %zu.", id, *buffer_size);
    non_const_buffers.push_back(std::make_unique<uint8_t[]>(*buffer_size));
    // Since the list of allocators began empty, buffer ID N will live at index
    // N-1.
    non_const_allocators.push_back(
        MemoryAllocator(*buffer_size, non_const_buffers.back().get()));
    non_const_allocators.back().enable_profiling("non_const_allocators");
  }
  HierarchicalAllocator non_const_allocator(
      non_const_allocators.size(), non_const_allocators.data());

  // The constant allocator is not currently used. Please initialize with a
  // zero-sized allocator.
  MemoryAllocator const_allocator{MemoryAllocator(0, nullptr)};
  const_allocator.enable_profiling("const allocator");

  // The kernel temporary allocator is not currently used. Please initialize
  // with a zero-sized allocator.
  MemoryAllocator temp_allocator{MemoryAllocator(0, nullptr)};
  temp_allocator.enable_profiling("temp allocator");

  // Allocator for bundled input.
  MemoryAllocator bundled_input_allocator{
      MemoryAllocator(kBundledAllocatorPoolSize, bundled_allocator_pool)};

  // Assemble all of the allocators into the MemoryManager that the Executor
  // will use.
  MemoryManager memory_manager(
      &const_allocator,
      &non_const_allocator,
      &runtime_allocator,
      &temp_allocator);

  //
  // Load the named method from the Program, using the provided allocators. The
  // Method is what actually performs the inference. It is mutable and not
  // locked, so should only used by a single thread at a time. But, it can be
  // reused for multiple inferences.
  //
  prof_tok = EXECUTORCH_BEGIN_PROF("load model");
  Result<Method> method = program->load_method(method_name, &memory_manager);
  EXECUTORCH_END_PROF(prof_tok);
  ET_CHECK_MSG(
      method.ok(),
      "load_method() failed with status 0x%" PRIx32,
      method.error());
  ET_LOG(Info, "Model initialized.");

#ifdef USE_ATEN_LIB
  // [TLS handling] This is to workaround an assertion failure
  // (https://fburl.com/code/302jyn8d) running `gelu` in ATen mode in fbcode
  // (such as bento). The problem is Executorch ATen mode doesn't have Thread
  // Local State, but `torch-cpp` is assuming tls init is done. There are two
  // more checks: MKLDNN disabled and C10_MOBILE, if any of them is true we
  // won't be hitting this assertion error. However in `torch-cpp` lib both
  // checks are false. Production impact: this should not make any impact in
  // production environment, given that in xplat we are depending on a library
  // that enables C10_MOBILE (`torch_mobile_core`).
  c10::impl::ExcludeDispatchKeyGuard no_autograd(c10::autograd_dispatch_keyset);
#endif
#if !defined(USE_ATEN_LIB)
  // To enable intra-op parallelism
  // This sets the # of threads to use for running executorch model
  // to num_threads. Applicable to lean mode.
  torch::executorch::threadpool::UseNThreadsThreadPoolGuard thread_pool_guard(
      FLAGS_num_threads);
  ET_CHECK_MSG(
      thread_pool_guard.guard_armed(),
      "Could not set # of threads to use. "
      "Num threads requested is %d, Threadpool size is: %ld",
      FLAGS_num_threads,
      torch::executorch::threadpool::get_threadpool()->get_thread_count());
#endif

  // Run the model multiple times if requested.
  Error status;
  // BUGFIX: the counter now matches the flag's signed int32 type. The original
  // `size_t i < FLAGS_num_iters` comparison converted a negative --num_iters
  // to a huge unsigned bound (a near-infinite loop); now it runs 0 iterations.
  for (int32_t i = 0; i < FLAGS_num_iters; i++) {
    // Prepare the inputs.
    exec_aten::ArrayRef<void*> inputs;
    if (FLAGS_bundled_program) {
      // Use the inputs embedded in the bundled program.
      status = torch::executor::util::LoadBundledInput(
          *method,
          program_data.bundled_program_data(),
          &bundled_input_allocator,
          method_index,
          FLAGS_testset_idx);
      ET_CHECK_MSG(
          status == Error::Ok,
          "LoadBundledInput failed with status 0x%" PRIx32,
          status);
    } else {
      // Use ones-initialized inputs.
      inputs = torch::executor::util::PrepareInputTensors(*method);
    }
    ET_LOG(Info, "Inputs prepared.");

    // Run the model.
    EXECUTORCH_PROFILE_CREATE_BLOCK("inference loop");
    prof_tok = EXECUTORCH_BEGIN_PROF("run model");
    status = method->execute();
    EXECUTORCH_END_PROF(prof_tok);
    ET_CHECK_MSG(
        status == Error::Ok,
        "method->execute() failed with status 0x%" PRIx32,
        status);
    ET_LOG(Info, "Model executed successfully.");

    // Handle the outputs.
    if (FLAGS_bundled_program) {
      status = torch::executor::util::VerifyResultWithBundledExpectedOutput(
          *method,
          program_data.bundled_program_data(),
          &bundled_input_allocator,
          method_index,
          FLAGS_testset_idx,
          FLAGS_rtol,
          FLAGS_atol);
      ET_CHECK_MSG(
          status == Error::Ok,
          "Bundle verification failed with status 0x%" PRIx32,
          status);
      ET_LOG(Info, "Model verified successfully.");
    } else {
      torch::executor::util::FreeInputs(inputs);
    }
  }

  // Print the outputs if requested.
  if (FLAGS_print_output) {
    auto output_list = std::make_unique<EValue[]>(method->outputs_size());
    status = method->get_outputs(output_list.get(), method->outputs_size());
    ET_CHECK_MSG(
        status == Error::Ok,
        "get_outputs failed with status 0x%" PRIx32,
        status);
    // TODO(T139071931): Don't assume that all outputs are tensors.
    for (size_t i = 0; i < method->outputs_size(); i++) {
      auto output_tensor = output_list[i].toTensor();
      const float* data_output = output_tensor.const_data_ptr<float>();
      // Hoist numel() out of the loop and reuse output_tensor instead of
      // re-converting the EValue each iteration; the counter uses the same
      // (likely signed) type as numel() to avoid a mixed-sign comparison.
      const auto num_elements = output_tensor.numel();
      for (decltype(output_tensor.numel()) j = 0; j < num_elements; ++j) {
        ET_LOG(Info, "%f", data_output[j]);
      }
    }
  }

  // Dump the profiling data to the specified file.
  torch::executor::prof_result_t prof_result;
  EXECUTORCH_DUMP_PROFILE_RESULTS(&prof_result);
  if (prof_result.num_bytes != 0) {
    FILE* ptr = fopen(FLAGS_prof_result_path.c_str(), "w+");
    // BUGFIX: fopen() returns NULL on failure (bad path, permissions); the
    // original passed it straight to fwrite(), which is undefined behavior.
    ET_CHECK_MSG(
        ptr != nullptr,
        "Could not open profiler output file %s",
        FLAGS_prof_result_path.c_str());
    fwrite(prof_result.prof_data, 1, prof_result.num_bytes, ptr);
    fclose(ptr);
  }

  if (FLAGS_generate_etdump) {
    ETDump et_dump(runtime_allocator);
    auto ret =
        et_dump.serialize_prof_results_to_etdump(FLAGS_etdump_path.c_str());
    if (ret != torch::executor::Error::Ok) {
      ET_LOG(Error, "Failed to serialize and write out etdump data.");
      return -1;
    }
  }

  return 0;
}