blob: 3b3ba57093a279c502a50c3dca1e214c25d4dc29 [file] [log] [blame] [edit]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <algorithm>
#include <cerrno>
#include <cinttypes>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include <pybind11/iostream.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <executorch/devtools/bundled_program/bundled_program.h>
#include <executorch/devtools/bundled_program/schema/bundled_program_schema_generated.h>
#include <executorch/devtools/etdump/etdump_flatcc.h>
#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/extension/data_loader/mmap_data_loader.h>
#include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
#include <executorch/runtime/core/data_loader.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/kernel/operator_registry.h>
#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/platform.h>
#include <executorch/runtime/platform/profiler.h>
#include <executorch/runtime/platform/runtime.h>
#include <ATen/Functions.h>
#include <ATen/Tensor.h>
#include <ATen/core/functional.h>
#include <c10/core/ScalarTypeToTypeMeta.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/python.h>
#ifndef USE_ATEN_LIB
#include <c10/core/impl/LocalDispatchKeySet.h>
#include <executorch/extension/aten_util/aten_bridge.h>
#endif
/// Throws a runtime_error with the provided message if `error` is not `Ok`.
/// Implemented as a GNU statement-expression so it can appear in expression
/// position; pybind11 translates the C++ exception into a Python
/// RuntimeError at the binding boundary.
#define THROW_IF_ERROR(error, message, ...) \
({ \
if ((error) != Error::Ok) { \
char msg_buf[128]; \
snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \
/* pybind will convert this to a python exception. */ \
throw std::runtime_error(msg_buf); \
} \
})
/// Same as THROW_IF_ERROR, but throws std::out_of_range, which pybind11
/// translates into a Python IndexError — use for indexed accessors.
#define THROW_INDEX_IF_ERROR(error, message, ...) \
({ \
if ((error) != Error::Ok) { \
char msg_buf[128]; \
snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \
/* pybind will convert this to a python exception. */ \
throw std::out_of_range(msg_buf); \
} \
})
// Overrides the ExecuTorch PAL logging hook. The default implementation
// (posix.cpp) writes to stderr with fprintf, which python cannot intercept;
// routing through std::cerr lets pybind11's scoped_estream_redirect capture
// and redirect runtime log output into the python environment.
void et_pal_emit_log_message(
    et_timestamp_t timestamp,
    et_pal_log_level_t level,
    const char* filename,
    ET_UNUSED const char* function,
    size_t line,
    const char* message,
    ET_UNUSED size_t length) {
  auto& sink = std::cerr;
  sink << '[' << filename << ':' << line << "] " << message << std::endl;
}
namespace py = pybind11;
using executorch::bundled_program::verify_method_outputs;
using ::executorch::extension::BufferDataLoader;
using ::executorch::extension::MallocMemoryAllocator;
using ::executorch::extension::MmapDataLoader;
using ::executorch::runtime::ArrayRef;
using ::executorch::runtime::DataLoader;
using ::executorch::runtime::Error;
using ::executorch::runtime::EValue;
using ::executorch::runtime::EventTracerDebugLogLevel;
using ::executorch::runtime::get_registered_kernels;
using ::executorch::runtime::HierarchicalAllocator;
using ::executorch::runtime::Kernel;
using ::executorch::runtime::MemoryAllocator;
using ::executorch::runtime::MemoryManager;
using ::executorch::runtime::Method;
using ::executorch::runtime::prof_result_t;
using ::executorch::runtime::Program;
using ::executorch::runtime::Result;
using ::executorch::runtime::Span;
using ::executorch::runtime::Tag;
using torch::executor::etdump_result;
using torch::executor::ETDumpGen;
#ifndef USE_ATEN_LIB
using ::executorch::extension::alias_attensor_to_etensor;
using ::executorch::extension::alias_etensor_to_attensor;
using ::executorch::extension::torch_to_executorch_scalar_type;
#endif // !USE_ATEN_LIB
namespace executorch {
namespace extension {
namespace pybindings {
namespace {
/// Writes `size` bytes from `buf` to a newly created/truncated file at
/// `path`.
///
/// Throws std::runtime_error if the file cannot be opened, fully written,
/// or cleanly closed.
void write_data_to_file(const std::string& path, void* buf, size_t size) {
  FILE* f = fopen(path.c_str(), "w+");
  if (!f) {
    throw std::runtime_error(
        "Failed to open file " + path + ": " + strerror(errno));
  }
  size_t num_written = fwrite(buf, 1, size, f);
  if (num_written != size) {
    fclose(f);
    throw std::runtime_error("Failed to write etdump to file " + path);
  }
  int err = fclose(f);
  if (err) {
    // Bug fix: fclose() returns EOF on failure and reports the cause via
    // errno; the previous code passed the EOF return value to strerror(),
    // producing a meaningless error message.
    throw std::runtime_error(
        "Failed to close etdump file " + path + ": " + strerror(errno));
  }
}
// Points each non-memory-planned tensor output of `method` at its
// caller-provided storage buffer. Throws (via THROW_IF_ERROR) when the
// storage count does not match the method's output count, or when the
// runtime rejects one of the buffers.
void setup_output_storage(
    Method& method,
    const std::vector<Span<uint8_t>>& output_storages) {
  if (output_storages.size() != method.outputs_size()) {
    THROW_IF_ERROR(
        Error::InvalidArgument,
        "number of output storages %zu does not match number of outputs %zu",
        output_storages.size(),
        method.outputs_size());
  }
  size_t index = 0;
  for (const auto& storage : output_storages) {
    // An empty buffer marks a non-tensor or memory-planned output; nothing
    // to wire up for those.
    if (storage.size() != 0) {
      const Error status =
          method.set_output_data_ptr(storage.data(), storage.size(), index);
      // Non-tensor and memory-planned outputs were skipped above, so any
      // failure here is a real error.
      THROW_IF_ERROR(
          status,
          "set_output_data_ptr failed for output %zu with error 0x%" PRIx32,
          index,
          static_cast<uint32_t>(status));
    }
    ++index;
  }
}
/// Owns a loaded ExecuTorch Program together with everything needed to run
/// it: the data loader backing the program, memory arenas sized to serve
/// every method, eagerly loaded Method instances, and an optional ETDump
/// event tracer plus its debug buffer.
///
/// Members are destroyed in reverse declaration order, so methods_ is torn
/// down before program_, program_ before loader_, and everything before
/// memory_ is released last (see member comments at the bottom).
class Module final {
 public:
  /// Loads the program from `loader`, sizes the non-const arenas to the
  /// maximum required by any method, and eagerly loads every method.
  ///
  /// Throws std::runtime_error (via THROW_IF_ERROR) if program or method
  /// loading fails.
  explicit Module(
      std::unique_ptr<DataLoader> loader,
      std::unique_ptr<ETDumpGen> tracer = nullptr,
      size_t debug_buffer_size = 0,
      Program::Verification program_verification =
          Program::Verification::InternalConsistency)
      : loader_(std::move(loader)),
        event_tracer_(std::move(tracer)),
        debug_buffer_size_(debug_buffer_size) {
    ::executorch::runtime::runtime_init();
    Result<Program> program =
        Program::load(loader_.get(), program_verification);
    THROW_IF_ERROR(
        program.error(),
        "loading program failed with error: 0x%" PRIx32,
        static_cast<uint32_t>(program.error()));
    program_ = std::make_unique<Program>(std::move(program.get()));
    // Figure out the size of each non_const layer we need to support every
    // method in the program. Map will be easier to use than a list because we
    // dont know how many non_const arenas there will be
    std::map<size_t, int64_t> non_const_buffer_sizes;
    for (size_t i = 0; i < program_->num_methods(); ++i) {
      auto name = program_->get_method_name(i).get();
      auto method_meta = program_->method_meta(name).get();
      for (size_t j = 0; j < method_meta.num_non_const_buffers(); j++) {
        int64_t buffer_size = method_meta.non_const_buffer_size(j).get();
        // Keep the largest requirement seen for arena `j` across all
        // methods, so one set of arenas can serve every method.
        if (non_const_buffer_sizes.find(j) == non_const_buffer_sizes.end()) {
          non_const_buffer_sizes.insert({j, buffer_size});
        } else {
          non_const_buffer_sizes[j] =
              std::max(non_const_buffer_sizes[j], buffer_size);
        }
      }
    }
    // Allocate the arenas. Using vector because we need to remember the size as
    // well, so vector is easier then unique_ptr.
    // NOTE(review): this is a local (moved into memory_ below) despite the
    // member-style trailing underscore in its name.
    std::vector<std::vector<uint8_t>> non_const_buffers_;
    for (std::map<size_t, int64_t>::iterator i = non_const_buffer_sizes.begin();
         i != non_const_buffer_sizes.end();
         i++) {
      non_const_buffers_.push_back(std::vector<uint8_t>(i->second));
    }
    memory_ = std::make_unique<Memory>(std::move(non_const_buffers_));
    if (event_tracer_ && debug_buffer_size > 0) {
      // If a debug buffer was requested for the ETDump, allocate it and make
      // sure its lifetime is as long as the event_tracer.
      debug_buffer_ = std::make_unique<uint8_t[]>(debug_buffer_size);
      event_tracer_->set_debug_buffer(get_etdump_debug_buffer());
      event_tracer_->set_event_tracer_debug_level(
          EventTracerDebugLogLevel::kIntermediateOutputs);
    }

    // Load methods
    for (size_t i = 0; i < program_->num_methods(); ++i) {
      auto name = program_->get_method_name(i).get();
      // It's safe to use the same memory manager for all modules because
      // we can guarantee that only one will be executing at a time.
      // Everything in this module runs on a single thread.
      Result<Method> method = program_->load_method(
          name, memory_->mem_manager(), event_tracer_.get());
      THROW_IF_ERROR(
          method.error(),
          "loading method %s failed with error 0x%" PRIx32,
          name,
          static_cast<uint32_t>(method.error()));
      methods_.insert(
          {std::string(name),
           std::make_unique<Method>(std::move(method.get()))});
    }
  }

  Module(const Module&) = delete;
  Module& operator=(const Module&) = delete;
  Module(Module&&) = default;
  Module& operator=(Module&&) = default;

  /// Executes the specified method on the provided inputs and returns its
  /// outputs. When `output_storages` is provided, non-memory-planned tensor
  /// outputs are written into the supplied buffers.
  std::vector<EValue> run_method(
      const std::string& method_name,
      const std::vector<EValue>& args,
      const std::optional<std::vector<Span<uint8_t>>>& output_storages =
          std::nullopt) {
    auto& method = get_method(method_name);
    exec_aten::ArrayRef<EValue> input_evalue_list(args.data(), args.size());

    Error set_inputs_status = method.set_inputs(input_evalue_list);
    THROW_IF_ERROR(
        set_inputs_status,
        "method->set_inputs() for method '%s' failed with error 0x%" PRIx32,
        method_name.c_str(),
        static_cast<uint32_t>(set_inputs_status));

#ifdef USE_ATEN_LIB
    // [TLS handling] This is to workaround an assertion failure
    // (https://fburl.com/code/302jyn8d) running `gelu` in ATen mode in fbcode
    // (such as bento). The problem is ExecuTorch ATen mode doesn't have
    // Thread Local State, but `torch-cpp` is assuming tls init is done. There
    // are two more checks: MKLDNN disabled and C10_MOBILE, if any of them is
    // true we won't be hitting this assertion error. However in `torch-cpp`
    // lib both checks are false. Production impact: this should not make any
    // impact in production environment, given that in xplat we are depending
    // on a library that enables C10_MOBILE (`torch_mobile_core`).
    c10::impl::ExcludeDispatchKeyGuard no_autograd(
        c10::autograd_dispatch_keyset);
#endif
    if (output_storages) {
      setup_output_storage(method, *output_storages);
    }
    Error execute_status = method.execute();
    THROW_IF_ERROR(
        execute_status,
        "method->execute() failed with error 0x%" PRIx32,
        static_cast<uint32_t>(execute_status));
    // process outputs
    return get_outputs(method_name);
  }

  /// Copies the current output EValues of `method_name` into a new vector.
  std::vector<EValue> get_outputs(const std::string& method_name) {
    auto& method = methods_[method_name];
    std::vector<EValue> result(method->outputs_size());

    Error get_outputs_status =
        method->get_outputs(result.data(), method->outputs_size());
    THROW_IF_ERROR(
        get_outputs_status,
        "method->get_outputs() for method '%s' failed with error 0x%" PRIx32,
        method_name.c_str(),
        static_cast<uint32_t>(get_outputs_status));

    return result;
  }

  /// Returns the pre-loaded Method, throwing std::runtime_error if the
  /// program has no method with that name.
  Method& get_method(const std::string& method_name) {
    if (methods_.count(method_name) == 0) {
      THROW_IF_ERROR(
          Error::InvalidArgument,
          "no such method in program: %s",
          method_name.c_str());
    }
    return *methods_[method_name].get();
  }

  /// Returns the names of all methods in the program.
  std::vector<std::string> method_names() const {
    std::vector<std::string> names;
    for (const auto& method : methods_) {
      names.push_back(method.first);
    }
    return names;
  }

  /// True if this module was constructed with an ETDump event tracer.
  bool has_etdump() {
    return static_cast<bool>(event_tracer_);
  }

  /// Precondition: has_etdump() is true.
  ETDumpGen& etdump() {
    return *event_tracer_;
  }

  /// True if a debug buffer was allocated for the ETDump tracer.
  bool has_etdump_debug_buffer() const {
    return static_cast<bool>(debug_buffer_);
  }

  /// Non-owning view of the ETDump debug buffer (empty if none allocated).
  Span<uint8_t> get_etdump_debug_buffer() {
    return Span<uint8_t>(debug_buffer_.get(), debug_buffer_size_);
  }

 private:
  /// A wrapper/util class for executorch memory allocations/manager.
  class Memory {
   public:
    explicit Memory(std::vector<std::vector<uint8_t>>&& non_const_buffers)
        : runtime_allocator_(),
          non_const_buffers_(std::move(non_const_buffers)),
          non_const_spans_(create_non_const_spans()),
          non_const_allocator_(
              {non_const_spans_.data(), non_const_spans_.size()}),
          mem_manager_(
              &const_allocator_,
              &non_const_allocator_,
              &runtime_allocator_,
              &temp_allocator_) {}

    /// Returns a pointer to the internal memory manager, the Memory instance
    /// must outlive this pointer.
    MemoryManager* mem_manager() {
      return &mem_manager_;
    }

    Memory(const Memory&) = delete;
    Memory& operator=(const Memory&) = delete;

   private:
    // Zero-sized allocators for const and temp data; malloc-backed
    // allocator for runtime allocations.
    MemoryAllocator const_allocator_{MemoryAllocator(0, nullptr)};
    MallocMemoryAllocator runtime_allocator_;
    MemoryAllocator temp_allocator_{MemoryAllocator(0, nullptr)};
    std::vector<std::vector<uint8_t>> non_const_buffers_;
    std::vector<Span<uint8_t>> non_const_spans_;
    HierarchicalAllocator non_const_allocator_;
    MemoryManager mem_manager_;

    // Builds non-owning spans over non_const_buffers_ for the
    // HierarchicalAllocator. Called in the member-init list, so it must run
    // after non_const_buffers_ is initialized (declaration order above).
    std::vector<Span<uint8_t>> create_non_const_spans() {
      std::vector<Span<uint8_t>> result;
      for (size_t i = 0; i < non_const_buffers_.size(); i++) {
        result.push_back(
            {non_const_buffers_[i].data(), non_const_buffers_[i].size()});
      }
      return result;
    }
  };

  std::unique_ptr<Memory> memory_;
  std::unique_ptr<DataLoader> loader_; // program_ points to this.
  std::unique_ptr<const Program> program_; // methods_ entries points to this.
  std::unordered_map<std::string, std::unique_ptr<Method>> methods_;
  std::unique_ptr<ETDumpGen> event_tracer_;
  std::unique_ptr<uint8_t[]> debug_buffer_;
  size_t debug_buffer_size_;
};
inline std::unique_ptr<Module> load_module_from_buffer(
const void* ptr,
size_t ptr_len,
bool enable_etdump,
size_t debug_buffer_size,
Program::Verification program_verification) {
EXECUTORCH_SCOPE_PROF("load_module_from_buffer");
auto loader = std::make_unique<BufferDataLoader>(ptr, ptr_len);
return std::make_unique<Module>(
std::move(loader),
enable_etdump ? std::make_unique<torch::executor::ETDumpGen>() : nullptr,
debug_buffer_size,
program_verification);
}
/// Creates a Module backed by an mmap'd file at `path`.
///
/// Throws std::runtime_error (via THROW_IF_ERROR) if the file cannot be
/// opened or mapped.
inline std::unique_ptr<Module> load_module_from_file(
    const std::string& path,
    bool enable_etdump,
    size_t debug_buffer_size,
    Program::Verification program_verification) {
  EXECUTORCH_SCOPE_PROF("load_module_from_file");
  Result<MmapDataLoader> res = MmapDataLoader::from(
      path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
  // Bug fix: format previously read "0x:%" PRIx32, rendering as "0x:<hex>".
  THROW_IF_ERROR(
      res.error(),
      "Failed to create MmapDataLoader from file %s, error: 0x%" PRIx32,
      path.c_str(),
      static_cast<uint32_t>(res.error()));

  auto loader = std::make_unique<MmapDataLoader>(std::move(res.get()));
  return std::make_unique<Module>(
      std::move(loader),
      enable_etdump ? std::make_unique<torch::executor::ETDumpGen>() : nullptr,
      debug_buffer_size,
      program_verification);
}
static constexpr size_t kDEFAULT_BUNDLED_INPUT_POOL_SIZE = 16 * 1024U;
/// Python-facing wrapper around a serialized BundledProgram. Holds the
/// python `bytes` object so the underlying buffer stays alive, and exposes
/// pointers into the embedded ExecuTorch program.
struct PyBundledModule final {
  // NOTE(review): `bundled_input_pool_size` is accepted but never used in
  // this class — presumably a remnant of an earlier allocation scheme;
  // confirm with callers before removing it from the binding.
  explicit PyBundledModule(
      const py::bytes& buffer,
      uint32_t bundled_input_pool_size)
      : bundled_program_ptr_(buffer),
        program_ptr_(static_cast<const void*>(
            bundled_program_flatbuffer::GetBundledProgram(
                get_bundled_program_ptr())
                ->program()
                ->data())),
        program_len_(bundled_program_flatbuffer::GetBundledProgram(
                         get_bundled_program_ptr())
                         ->program()
                         ->size()) {}

  /// Factory used by the `_load_bundled_program_from_buffer` binding.
  static std::unique_ptr<PyBundledModule> load_from_buffer(
      const py::bytes& buffer,
      uint32_t bundled_input_pool_size) {
    return std::make_unique<PyBundledModule>(buffer, bundled_input_pool_size);
  }

  /// Raw pointer to the serialized bundled program (owned by the bytes).
  const void* get_bundled_program_ptr() {
    return bundled_program_ptr_.cast<std::string_view>().data();
  }

  /// Pointer to the ExecuTorch program embedded in the bundle.
  const void* get_program_ptr() {
    return program_ptr_;
  }

  /// Size in bytes of the embedded ExecuTorch program.
  size_t get_program_len() {
    return program_len_;
  }

 private:
  // Store the bytes object instead of a raw pointer so that this module will
  // keep the bytes alive.
  const py::bytes bundled_program_ptr_;
  const void* program_ptr_;
  size_t program_len_;
};
/// Expose a subset of TensorInfo information to python. Holds a shared_ptr
/// to the owning Module so the underlying metadata stays valid.
struct PyTensorInfo final {
  explicit PyTensorInfo(
      std::shared_ptr<Module> module,
      torch::executor::TensorInfo info)
      : module_(std::move(module)), info_(info) {}

  /// Tensor dimensions as a python tuple.
  py::tuple sizes() const {
    const auto dims = info_.sizes();
    py::tuple result(dims.size());
    size_t idx = 0;
    for (const auto& dim : dims) {
      result[idx++] = py::cast(dim);
    }
    return result;
  }

  /// Scalar type as the underlying integer value of exec_aten::ScalarType.
  int8_t dtype() const {
    return static_cast<std::underlying_type<exec_aten::ScalarType>::type>(
        info_.scalar_type());
  }

  bool is_memory_planned() const {
    return info_.is_memory_planned();
  }

  size_t nbytes() const {
    return info_.nbytes();
  }

  /// Python __repr__, e.g. "TensorInfo(sizes=[1, 2], dtype=..., ...)".
  std::string repr() const {
    // Join the sizes with ", " using a separator-before-element loop.
    std::string size_str = "[";
    bool first = true;
    for (const auto& dim : info_.sizes()) {
      if (!first) {
        size_str.append(", ");
      }
      size_str.append(std::to_string(dim));
      first = false;
    }
    size_str.append("]");
    return "TensorInfo(sizes=" + size_str + ", dtype=" +
        std::string(executorch::runtime::toString(info_.scalar_type())) +
        ", is_memory_planned=" +
        (info_.is_memory_planned() ? "True" : "False") +
        ", nbytes=" + std::to_string(info_.nbytes()) + ")";
  }

 private:
  // TensorInfo relies on module to be alive.
  std::shared_ptr<Module> module_;
  torch::executor::TensorInfo info_;
};
/// Expose a subset of MethodMeta information to python.
struct PyMethodMeta final {
  explicit PyMethodMeta(
      std::shared_ptr<Module> module,
      torch::executor::MethodMeta meta)
      : module_(std::move(module)), meta_(meta) {}

  /// Name of the method this metadata describes.
  const char* name() const {
    return meta_.name();
  }

  size_t num_inputs() const {
    return meta_.num_inputs();
  }

  /// Tensor metadata for input `index`. Throws std::out_of_range (surfaced
  /// in python as IndexError) if the runtime reports an error for `index`.
  std::unique_ptr<PyTensorInfo> input_tensor_meta(size_t index) const {
    const auto result = meta_.input_tensor_meta(index);
    THROW_INDEX_IF_ERROR(
        result.error(), "Cannot get input tensor meta at %zu", index);
    return std::make_unique<PyTensorInfo>(module_, result.get());
  }

  size_t num_outputs() const {
    return meta_.num_outputs();
  }

  /// Tensor metadata for output `index`. Throws std::out_of_range (surfaced
  /// in python as IndexError) if the runtime reports an error for `index`.
  std::unique_ptr<PyTensorInfo> output_tensor_meta(size_t index) const {
    const auto result = meta_.output_tensor_meta(index);
    THROW_INDEX_IF_ERROR(
        result.error(), "Cannot get output tensor meta at %zu", index);
    return std::make_unique<PyTensorInfo>(module_, result.get());
  }

  /// Python __repr__ listing per-input and per-output tensor metadata.
  py::str repr() const {
    py::list input_meta_strs;
    for (size_t i = 0; i < meta_.num_inputs(); ++i) {
      input_meta_strs.append(py::str(input_tensor_meta(i)->repr()));
    }
    py::list output_meta_strs;
    for (size_t i = 0; i < meta_.num_outputs(); ++i) {
      output_meta_strs.append(py::str(output_tensor_meta(i)->repr()));
    }
    // Add quotes to be more similar to Python's repr for strings.
    py::str format =
        "MethodMeta(name='{}', num_inputs={}, input_tensor_meta={}, num_outputs={}, output_tensor_meta={})";
    return format.format(
        std::string(meta_.name()),
        std::to_string(meta_.num_inputs()),
        input_meta_strs,
        std::to_string(meta_.num_outputs()),
        output_meta_strs);
  }

 private:
  // Must keep the Module object alive or else the meta object is invalidated.
  std::shared_ptr<Module> module_;
  torch::executor::MethodMeta meta_;
};
struct PyModule final {
explicit PyModule(
const py::bytes& buffer,
bool enable_etdump,
size_t debug_buffer_size = 0,
Program::Verification program_verification =
Program::Verification::InternalConsistency)
: module_(load_module_from_buffer(
buffer.cast<std::string_view>().data(),
py::len(buffer),
enable_etdump,
debug_buffer_size,
program_verification)) {}
explicit PyModule(
const void* ptr,
size_t ptr_len,
bool enable_etdump,
size_t debug_buffer_size = 0,
Program::Verification program_verification =
Program::Verification::InternalConsistency)
: module_(load_module_from_buffer(
ptr,
ptr_len,
enable_etdump,
debug_buffer_size,
program_verification)) {}
explicit PyModule(
const std::string& path,
bool enable_etdump,
size_t debug_buffer_size = 0,
Program::Verification program_verification =
Program::Verification::InternalConsistency)
: module_(load_module_from_file(
path,
enable_etdump,
debug_buffer_size,
program_verification)) {}
PyModule(const PyModule&) = delete;
PyModule& operator=(const PyModule&) = delete;
PyModule(PyModule&&) = default;
PyModule& operator=(PyModule&&) = default;
// Module is only valid as long as the python buffer is alive.
static std::unique_ptr<PyModule> load_from_buffer(
const py::bytes& buffer,
bool enable_etdump,
size_t debug_buffer_size = 0,
Program::Verification program_verification =
Program::Verification::InternalConsistency) {
return std::make_unique<PyModule>(
buffer, enable_etdump, debug_buffer_size, program_verification);
}
static std::unique_ptr<PyModule> load_from_file(
const std::string& path,
bool enable_etdump,
size_t debug_buffer_size = 0,
Program::Verification program_verification =
Program::Verification::InternalConsistency) {
return std::make_unique<PyModule>(
path, enable_etdump, debug_buffer_size, program_verification);
}
static std::unique_ptr<PyModule> load_from_bundled_program(
PyBundledModule& m,
bool enable_etdump,
size_t debug_buffer_size = 0) {
return std::make_unique<PyModule>(
m.get_program_ptr(),
m.get_program_len(),
enable_etdump,
debug_buffer_size);
}
py::list run_method(
const std::string& method_name,
const py::sequence& inputs,
bool clone_outputs = true) {
const auto inputs_size = py::len(inputs);
std::vector<EValue> cpp_inputs;
cpp_inputs.reserve(inputs_size);
#ifndef USE_ATEN_LIB // Portable mode
// So the ETensors and their metadata stay in scope for
// Module->run_method.
std::vector<torch::executor::TensorImpl> input_tensors;
std::vector<std::vector<torch::executor::Tensor::SizesType>> input_sizes;
std::vector<std::vector<torch::executor::Tensor::StridesType>>
input_strides;
std::vector<std::vector<torch::executor::Tensor::DimOrderType>>
input_dim_order;
// We store pointers to these vector elements so important to reserve so
// that we don't lose those on a vector resize. Don't need to do this for
// the others since they are vectors of vectors, and we don't store a
// pointer to the root level vector data.
input_tensors.reserve(inputs_size);
#endif
// Convert python objects into EValues.
for (size_t i = 0; i < inputs_size; ++i) {
auto python_input = inputs[i];
const std::string& type_str = py::str(python_input.get_type());
if (type_str == "<class 'torch.Tensor'>") {
auto at_tensor = python_input.cast<at::Tensor>();
// alias_etensor_to_attensor will assert on this later, so to better
// propogate up to python we check early and throw an exception.
if (!at_tensor.is_contiguous()) {
auto error_msg = "Input " + std::to_string(i) + "for method " +
method_name + " is not contiguous.";
throw std::runtime_error(error_msg);
}
#ifdef USE_ATEN_LIB
EValue evalue(at_tensor);
#else
// convert at::Tensor to torch::executor::Tensor
auto type =
torch_to_executorch_scalar_type(at_tensor.options().dtype());
size_t dim = at_tensor.dim();
// cant directly alias at::Tensor sizes and strides due to int64 vs
// int32 typing conflict
input_sizes.emplace_back(
at_tensor.sizes().begin(), at_tensor.sizes().end());
input_strides.emplace_back(
at_tensor.strides().begin(), at_tensor.strides().end());
// Only works for MemoryFormat::Contiguous inputs
std::vector<torch::executor::Tensor::DimOrderType> dim_order;
for (size_t cur_dim = 0; cur_dim < dim; cur_dim++) {
dim_order.push_back(cur_dim);
}
input_dim_order.push_back(std::move(dim_order));
input_tensors.emplace_back(
type,
dim,
input_sizes.back().data(),
nullptr,
input_dim_order.back().data(),
input_strides.back().data());
torch::executor::Tensor temp =
torch::executor::Tensor(&input_tensors.back());
alias_etensor_to_attensor(at_tensor, temp);
EValue evalue(temp);
#endif
cpp_inputs.push_back(evalue);
} else if (py::isinstance<py::none>(python_input)) {
cpp_inputs.push_back(EValue());
} else if (py::isinstance<py::bool_>(python_input)) {
cpp_inputs.push_back(EValue(py::cast<bool>(python_input)));
} else if (py::isinstance<py::int_>(python_input)) {
cpp_inputs.push_back(EValue(py::cast<int64_t>(python_input)));
} else {
ET_ASSERT_UNREACHABLE_MSG("Unsupported pytype: %s", type_str.c_str());
}
}
const auto& method = module_->get_method(method_name);
const auto num_outputs = method.outputs_size();
output_storages_ = make_output_storages(method);
std::vector<Span<uint8_t>> output_storage_spans(num_outputs);
for (int i = 0; i < output_storages_.size(); ++i) {
output_storage_spans[i] =
Span<uint8_t>(output_storages_[i].data(), output_storages_[i].size());
}
auto outputs =
module_->run_method(method_name, cpp_inputs, output_storage_spans);
// Retrieve outputs
return get_outputs_as_py_list(outputs, clone_outputs);
}
py::list forward(const py::sequence& inputs, bool clone_outputs = true) {
return run_method("forward", inputs, clone_outputs);
}
py::list forward_single_input(
const torch::Tensor& inputTensor,
bool clone_outputs = true) {
py::list py_list;
py_list.append(py::cast(inputTensor));
return run_method("forward", py_list, clone_outputs);
}
bool has_etdump() {
return module_->has_etdump();
}
void write_etdump_result_to_file(
const std::string& path,
const py::object& debug_buffer_path) {
if (!has_etdump()) {
throw std::runtime_error("No etdump found");
}
auto& etdump = module_->etdump();
etdump_result result = etdump.get_etdump_data();
if (result.buf != nullptr && result.size > 0) {
write_data_to_file(path, result.buf, result.size);
free(result.buf);
if (module_->has_etdump_debug_buffer() &&
py::isinstance<py::str>(debug_buffer_path)) {
// Also write out the debug buffer to a separate file if requested.
std::string debug_buffer_path_str =
py::cast<py::str>(debug_buffer_path);
const auto debug_buffer = module_->get_etdump_debug_buffer();
write_data_to_file(
debug_buffer_path_str, debug_buffer.data(), debug_buffer.size());
}
} else {
ET_LOG(
Info,
"No etdump data found, try rebuilding with "
"the CMake option EXECUTORCH_ENABLE_EVENT_TRACER or with "
"buck run --config executorch.event_tracer_enabled=true");
}
}
void load_bundled_input(
PyBundledModule& m,
const std::string method_name,
size_t testset_idx) {
const void* bundled_program_ptr = m.get_bundled_program_ptr();
Error status = executorch::bundled_program::load_bundled_input(
module_->get_method(method_name), bundled_program_ptr, testset_idx);
THROW_IF_ERROR(
status,
"load_bundled_input failed with status 0x%" PRIx32,
static_cast<uint32_t>(status));
}
py::list verify_result_with_bundled_expected_output(
PyBundledModule& m,
const std::string method_name,
size_t testset_idx,
double rtol = 1e-5,
double atol = 1e-8) {
const void* bundled_program_ptr = m.get_bundled_program_ptr();
auto& method = module_->get_method(method_name);
Error status = executorch::bundled_program::load_bundled_input(
method, bundled_program_ptr, testset_idx);
THROW_IF_ERROR(
status,
"load_bundled_input failed with status 0x%" PRIx32,
static_cast<uint32_t>(status));
py::list outputs = plan_execute(method_name);
status = executorch::bundled_program::verify_method_outputs(
method, bundled_program_ptr, testset_idx, rtol, atol);
THROW_IF_ERROR(
status,
"Result verification failed with status %" PRIu32,
static_cast<uint32_t>(status));
return outputs;
}
py::list plan_execute(
const std::string method_name,
bool clone_outputs = true) {
auto& method = module_->get_method(method_name);
// Need to pre-allocate space for outputs just like in run_method.
const auto num_outputs = method.outputs_size();
output_storages_ = make_output_storages(method);
std::vector<Span<uint8_t>> output_storage_spans(num_outputs);
for (int i = 0; i < output_storages_.size(); ++i) {
output_storage_spans[i] =
Span<uint8_t>(output_storages_[i].data(), output_storages_[i].size());
}
setup_output_storage(method, output_storage_spans);
auto status = method.execute();
THROW_IF_ERROR(
status,
"executing execution plan for method 'forward' failed with error: 0x%" PRIx32,
static_cast<uint32_t>(status));
const auto outputs = module_->get_outputs(method_name);
return get_outputs_as_py_list(outputs, clone_outputs);
}
py::list get_outputs_as_py_list(
const std::vector<EValue>& outputs,
bool clone_outputs = true) {
const auto outputs_size = outputs.size();
py::list list(outputs_size);
for (size_t i = 0; i < outputs_size; ++i) {
auto& v = outputs[i];
if (Tag::None == v.tag) {
list[i] = py::none();
} else if (Tag::Int == v.tag) {
list[i] = py::cast(v.toInt());
} else if (Tag::Double == v.tag) {
list[i] = py::cast(v.toDouble());
} else if (Tag::Bool == v.tag) {
list[i] = py::cast(v.toBool());
} else if (Tag::String == v.tag) {
list[i] = py::cast(std::string(v.toString().data()));
} else if (Tag::Tensor == v.tag) {
#ifdef USE_ATEN_LIB
// Clone so the outputs in python do not share a lifetime with the
// module object
if (clone_outputs) {
list[i] = py::cast(v.toTensor().clone());
} else {
list[i] = py::cast(v.toTensor());
}
#else
if (clone_outputs) {
list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone());
} else {
list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()));
}
#endif
} else {
ET_ASSERT_UNREACHABLE_MSG("Invalid model output type");
}
}
return list;
}
std::unique_ptr<PyMethodMeta> method_meta(const std::string method_name) {
auto& method = module_->get_method(method_name);
return std::make_unique<PyMethodMeta>(module_, method.method_meta());
}
std::vector<std::string> method_names() {
return module_->method_names();
}
private:
std::shared_ptr<Module> module_;
// Need to keep-alive output storages until they can be compared in case of
// bundled programs.
std::vector<std::vector<uint8_t>> output_storages_;
std::vector<std::vector<uint8_t>> make_output_storages(const Method& method) {
const auto num_outputs = method.outputs_size();
// Create a buffer for each output tensor. Memory planned outputs and non
// tensor outputs get an empty buffer in this list which is ignored later.
std::vector<std::vector<uint8_t>> output_storages;
output_storages_.reserve(num_outputs);
auto meta = method.method_meta();
for (size_t i = 0; i < num_outputs; ++i) {
auto output_type = meta.output_tag(i);
THROW_IF_ERROR(
output_type.error(), "Failed to get output type for output %zu", i);
if (output_type.get() != Tag::Tensor) {
// Skip allocating storage for non-tensor outputs.
output_storages.emplace_back();
continue;
}
const auto& output_tensor_meta =
method.method_meta().output_tensor_meta(i);
THROW_IF_ERROR(
output_tensor_meta.error(),
"Failed to get output tensor meta for output %zu",
i);
if (output_tensor_meta.get().is_memory_planned()) {
// Skip allocating storage for planned memory outputs.
output_storages.emplace_back();
continue;
}
// Allocate storage for the output tensor.
const size_t output_size = output_tensor_meta.get().nbytes();
output_storages.emplace_back(output_size);
}
return output_storages;
}
};
/// Starts a new named block in the ExecuTorch profiler; bound to python as
/// `_create_profile_block`.
void create_profile_block(const std::string& name) {
  EXECUTORCH_PROFILE_CREATE_BLOCK(name.c_str());
}
/// Returns the names of all kernels registered with the runtime as a python
/// list of strings; bound to python as `_get_operator_names`.
py::list get_operator_names() {
  py::list names;
  Span<const Kernel> kernels = get_registered_kernels();
  for (const Kernel& kernel : kernels) {
    // Defensive: skip any kernel entry without a name.
    if (kernel.name_ == nullptr) {
      continue;
    }
    names.append(py::cast(kernel.name_));
  }
  return names;
}
} // namespace
/// Defines the python extension module. Every binding passes `call_guard`
/// so C++ writes to cout/cerr during the call are redirected into python.
PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) {
  // Redirects cout and cerr for function calls this guards to the python env.
  auto call_guard = py::
      call_guard<py::scoped_ostream_redirect, py::scoped_estream_redirect>();

  // Bind the verification enum to python.
  py::enum_<Program::Verification>(m, "Verification")
      .value("Minimal", Program::Verification::Minimal)
      .value("InternalConsistency", Program::Verification::InternalConsistency);

  // Module loading entry points.
  m.def(
      "_load_for_executorch",
      PyModule::load_from_file,
      py::arg("path"),
      py::arg("enable_etdump") = false,
      py::arg("debug_buffer_size") = 0,
      py::arg("program_verification") =
          Program::Verification::InternalConsistency,
      call_guard);
  m.def(
      "_load_for_executorch_from_buffer",
      &PyModule::load_from_buffer,
      py::arg("buffer"),
      py::arg("enable_etdump") = false,
      py::arg("debug_buffer_size") = 0,
      py::arg("program_verification") =
          Program::Verification::InternalConsistency,
      call_guard);
  m.def(
      "_load_for_executorch_from_bundled_program",
      &PyModule::load_from_bundled_program,
      py::arg("ptr"),
      py::arg("enable_etdump") = false,
      py::arg("debug_buffer_size") = 0,
      call_guard);
  m.def(
      "_load_bundled_program_from_buffer",
      &PyBundledModule::load_from_buffer,
      py::arg("buffer"),
      py::arg("non_const_pool_size") = kDEFAULT_BUNDLED_INPUT_POOL_SIZE,
      call_guard);

  // Profiler helpers.
  m.def(
      "_dump_profile_results",
      []() {
        prof_result_t prof_result;
        EXECUTORCH_DUMP_PROFILE_RESULTS(&prof_result);
        return py::bytes(
            reinterpret_cast<const char*>(prof_result.prof_data),
            prof_result.num_bytes);
      },
      call_guard);
  m.def("_get_operator_names", &get_operator_names);
  m.def("_create_profile_block", &create_profile_block, call_guard);
  m.def(
      "_reset_profile_results",
      []() { EXECUTORCH_RESET_PROFILE_RESULTS(); },
      call_guard);

  py::class_<PyModule>(m, "ExecuTorchModule")
      .def("load_bundled_input", &PyModule::load_bundled_input, call_guard)
      .def(
          "verify_result_with_bundled_expected_output",
          &PyModule::verify_result_with_bundled_expected_output,
          py::arg("bundle"),
          py::arg("method_name"),
          py::arg("testset_idx"),
          py::arg("rtol") = 1e-5,
          py::arg("atol") = 1e-8,
          call_guard)
      .def(
          "plan_execute",
          &PyModule::plan_execute,
          py::arg("method_name"),
          py::arg("clone_outputs") = true,
          call_guard)
      .def(
          "method_meta",
          &PyModule::method_meta,
          py::arg("method_name"),
          call_guard)
      .def("method_names", &PyModule::method_names, call_guard)
      .def(
          "run_method",
          &PyModule::run_method,
          py::arg("method_name"),
          py::arg("inputs") = py::list(),
          py::arg("clone_outputs") = true,
          call_guard)
      .def(
          "forward",
          &PyModule::forward,
          py::arg("inputs") = py::list(),
          py::arg("clone_outputs") = true,
          call_guard)
      .def("has_etdump", &PyModule::has_etdump, call_guard)
      .def(
          "write_etdump_result_to_file",
          &PyModule::write_etdump_result_to_file,
          py::arg("path"),
          py::arg("debug_buffer_path") = py::none(),
          call_guard)
      // Overload taking a sequence of inputs.
      .def(
          "__call__",
          &PyModule::forward,
          py::arg("inputs") = py::list(),
          py::arg("clone_outputs") = true,
          call_guard)
      // NOTE(review): this overload takes a single Tensor, yet "inputs"
      // defaults to py::list(), which cannot convert to a Tensor —
      // presumably the default is never exercised because the first
      // overload matches list arguments; confirm before relying on it.
      .def(
          "__call__",
          &PyModule::forward_single_input,
          py::arg("inputs") = py::list(),
          py::arg("clone_outputs") = true,
          call_guard);

  // Registered so python can hold bundled programs; no methods exposed.
  py::class_<PyBundledModule>(m, "BundledModule");
  py::class_<PyTensorInfo>(m, "TensorInfo")
      .def("sizes", &PyTensorInfo::sizes, call_guard)
      .def("dtype", &PyTensorInfo::dtype, call_guard)
      .def("is_memory_planned", &PyTensorInfo::is_memory_planned, call_guard)
      .def("nbytes", &PyTensorInfo::nbytes, call_guard)
      .def("__repr__", &PyTensorInfo::repr, call_guard);
  py::class_<PyMethodMeta>(m, "MethodMeta")
      .def("name", &PyMethodMeta::name, call_guard)
      .def("num_inputs", &PyMethodMeta::num_inputs, call_guard)
      .def("num_outputs", &PyMethodMeta::num_outputs, call_guard)
      .def(
          "input_tensor_meta",
          &PyMethodMeta::input_tensor_meta,
          py::arg("index"),
          call_guard)
      .def(
          "output_tensor_meta",
          &PyMethodMeta::output_tensor_meta,
          py::arg("index"),
          call_guard)
      .def("__repr__", &PyMethodMeta::repr, call_guard);
}
} // namespace pybindings
} // namespace extension
} // namespace executorch