// runtime/ExecutionBuilder.h - platform/packages/modules/NeuralNetworks - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
 #define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H

 #include <ControlFlow.h>
 #include <CpuExecutor.h>
 #include <android-base/thread_annotations.h>
 #include <nnapi/IBurst.h>
 #include <nnapi/IPreparedModel.h>
 #include <nnapi/Types.h>
 #include <nnapi/Validation.h>

 #include <memory>
 #include <mutex>
 #include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>

 #include "ExecutionCallback.h"
 #include "Memory.h"
 #include "ModelArgumentInfo.h"
 #include "ModelBuilder.h"
 #include "NeuralNetworks.h"

 namespace android {
 namespace nn {

 class BurstBuilder;
 class CompilationBuilder;
 class Device;
 class DynamicTemporaries;
 class ExecutionPlan;
 class ExecutionStep;
 class ModelBuilder;
 class RuntimeMemory;
 class RuntimePreparedModel;
 class RuntimeExecution;
 class StepExecutor;

 // Describes a single execution created from a CompilationBuilder: the input/output
 // arguments, the timing and timeout settings, and the execution state.
 //
 // The class is abstract: computeInternal() and computeFencedInternal() are pure virtual
 // and must be provided by a subclass.
 class ExecutionBuilder {
     // StepExecutor accesses mInputs and mOutputs directly when mapping arguments.
     friend class StepExecutor;

    public:
     explicit ExecutionBuilder(const CompilationBuilder* compilation);
     virtual ~ExecutionBuilder() = default;

     // Argument setters.  index selects the input or output; the argument is supplied
     // either as a raw buffer or as a region (offset, length) of a RuntimeMemory.
     // NOTE(review): the int return value is presumably an ANEURALNETWORKS_* result code;
     // confirm against the implementation.
     int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                  size_t length);
     int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const RuntimeMemory* memory, size_t offset, size_t length);
     int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                   size_t length);
     int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                             const RuntimeMemory* memory, size_t offset, size_t length);

     int setMeasureTiming(bool measure);

     int getDuration(int32_t durationCode, uint64_t* duration) const;

     int setTimeoutDuration(uint64_t duration);

     std::optional<uint64_t> getTimeoutDuration() const;

     int setLoopTimeout(uint64_t duration);

     // Returns the value stored in mLoopTimeoutDuration.
     uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }

     int enableInputAndOutputPadding(bool enable);

     int setReusable(bool reusable);

     int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
                       int* sync_fence);

     // Asynchronous entry point: synchronizationCallback must not be nullptr (enforced with
     // CHECK) and is forwarded to compute().
     int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
         CHECK(synchronizationCallback != nullptr);
         return compute(synchronizationCallback);
     }
     // Synchronous entry point: calls compute() with neither a callback nor a burst.
     int computeSynchronously() { return compute(nullptr); }
     // Burst entry point: calls compute() with no callback and the given burst.
     int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }

     // Initialize output dimensional information from ModelArgumentInfo.
     std::vector<OutputShape> getInitialOutputShapes() const;

     int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
     int getOutputOperandRank(uint32_t index, uint32_t* rank);

     // Handshake with lower-level execution support
     bool measureTiming() const { return mMeasureTiming; }
     // Stores timing in mTimingWithoutFencedExecutionCallback.
     void reportTimingWithoutFencedExecutionCallback(Timing timing) {
         mTimingWithoutFencedExecutionCallback = timing;
     }

     const CompilationBuilder* getCompilation() const { return mCompilation; }
     const ModelBuilder* getModel() const { return mModel; }
     const ModelBuilder* getSourceModel(uint32_t index) const;
     // sourceOperandIndex is a (source model index, operand index) pair.  The result of
     // getSourceModel() is dereferenced without a null check.
     const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
         return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
     }

     // This method will be called at the end of all computation paths to change the state
     // of the execution object and update output shapes / memories.
     int finishComputation(int result, const std::vector<OutputShape>& outputShapes);
     // Convenience overload: converts error to a result code, delegates to the int overload,
     // and converts the returned result code back to an ErrorStatus.
     ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
         const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes);
         return convertResultCodeToErrorStatus(result);
     }

     // Returns a reference to mFencedExecutionCallback.  const-qualified because it only
     // reads the member, so it can also be called through a const ExecutionBuilder.
     const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() const {
         return mFencedExecutionCallback;
     }

     // True while the state is State::COMPUTATION.  Locks mStateMutex for the read.
     bool inFlight() const {
         std::lock_guard<std::mutex> lock(mStateMutex);
         return mState == State::COMPUTATION;
     }

     // Unchecked element access: index is not range-checked here.
     const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
     const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

     // Forwards to getRunTimePoolInfo() of the entry of mMemories at poolIndex.
     std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
         return mMemories[poolIndex]->getRunTimePoolInfo();
     }

    protected:
     // If a callback is provided, then this is asynchronous. If a callback is
     // not provided (i.e., is nullptr), then this is synchronous.
     //
     // If burst is provided, then the burst path will be used. If a burst is not
     // provided (i.e., is nullptr), then a synchronous execution will occur.
     //
     // Providing both synchronizationCallback and burstBuilder is an error.
     int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                 BurstBuilder* burstBuilder = nullptr);

     // Subclass hook for compute().  The tuple holds an int result code, the output shapes,
     // and the timing.
     virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
             const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

     // Subclass hook for computeFenced().
     // NOTE(review): the tuple presumably holds (result code, sync fence fd, callback), as
     // documented on StepExecutor::computeFenced -- confirm.
     virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
             const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
             const OptionalTimePoint& deadline) = 0;

     // This method handles the common preparation and validation logic of compute and computeFenced.
     // It will be called at the start of every computation.
     int prepareForCompute(const char* name);

     const CompilationBuilder* mCompilation;

     // Update output dimensional information from OutputShape to ModelArgumentInfo.
     bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

     bool updateMemories();

     const ModelBuilder* mModel;
     const ExecutionPlan* mPlan;

     // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
     // from CompilationBuilder when the ExecutionBuilder is constructed.
     bool mAllowCpuFallback;

     // The information we'll send to the driver about the inputs and outputs.
     // Note that we build this in two steps:
     // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
     //    If set from a pointer, don't set the location in the Request::Argument but store it
     //    instead in mInputBuffers or mOutputBuffers.
     // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
     //    the m*Buffers entries.  Copy the input values into the shared memory.
     // We do this to avoid creating a lot of shared memory objects if we have a lot of
     // parameters specified via pointers.  We also avoid copying in the case where
     // some of the nodes will be interpreted on the CPU anyway.
     // NOTE(review): mInputBuffers / mOutputBuffers are not declared in this class; this
     // description may be stale -- confirm.
     std::vector<ModelArgumentInfo> mInputs;
     std::vector<ModelArgumentInfo> mOutputs;
     MemoryTracker mMemories;

     // Do we ask the driver to measure timing?
     bool mMeasureTiming = false;

     // Timing reported from the driver.  This field is only used if
     // mFencedExecutionCallback is nullptr.
     Timing mTimingWithoutFencedExecutionCallback = {};

     // Amount of time to complete or abort the execution.
     std::optional<uint64_t> mTimeoutDuration;

     // Amount of time to complete or abort a loop.
     uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

     // The state of the execution.
     // Properties can only be set when the execution is in the state State::PREPARATION.
     // Timing and output shapes can only be queried when the execution is in the state
     // State::COMPLETED.
     enum class State { PREPARATION, COMPUTATION, COMPLETED };
     State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
     // True once the state has left State::PREPARATION.  Locks mStateMutex for the read.
     bool computationStarted() const {
         std::lock_guard<std::mutex> lock(mStateMutex);
         return mState != State::PREPARATION;
     }
     // True when the state is State::COMPLETED.  Locks mStateMutex for the read.
     bool completed() const {
         std::lock_guard<std::mutex> lock(mStateMutex);
         return mState == State::COMPLETED;
     }

     // Mutex to guard mState. Note that this is not strictly needed because we provide
     // no thread-safety guarantee to the ANeuralNetworksExecution object.
     // Declared mutable so that the const accessors above can lock it.
     mutable std::mutex mStateMutex;

     // Return false if the execution is in a bad state for starting computation.
     // Otherwise, return true and set the state to State::COMPUTATION.
     bool checkAndSetComputationState(const char* name);

     // With what error status has execution completed?
     enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
     Completion mCompletion = Completion::OTHER_ERROR;
     // May only be called once the execution has completed (enforced with CHECK).
     Completion completedWith() const {
         CHECK(completed());
         return mCompletion;
     }

     // The result code of request validation.
     // It is only evaluated once at the first time it's needed.
     std::optional<int> mValidationResultCode;
     int getValidationResultCode();

     // Does every tensor output operand of the model have a fully specified shape?
     // It is only evaluated once at the first time it's needed.
     std::optional<bool> mOutputsFullySpecified;
     bool areOutputsFullySpecified();

     // The callback used to query execution related info in the case of fenced
     // execution; otherwise, nullptr.  If the execution plan has multiple steps,
     // this is the callback associated with the last step.  If the last step
     // doesn't support fenced execution (e.g., the driver is too old), or if the
     // launch of execution on the driver fails, then this callback will be
     // nullptr.
     ExecuteFencedInfoCallback mFencedExecutionCallback;

     // Whether set{Input,Output}[FromMemory] can accept padded length or not.
     bool mInputAndOutputPaddingEnabled = false;

     // enableInputAndOutputPadding may only be called before any call of
     // set{Input,Output}[FromMemory]
     bool mHasCalledSetInputOutput = false;

     // Can compute APIs be invoked multiple times on the execution object?
     bool mReusable = false;
 };

 // ExecutionBuilder for an execution plan with a SIMPLE body, i.e. the whole model will be
 // executed on a single device.
 class SimpleExecutionBuilder : public ExecutionBuilder {
    public:
     // explicit, matching the base-class constructor, so that a CompilationBuilder pointer
     // is never implicitly converted into an execution builder.
     explicit SimpleExecutionBuilder(const CompilationBuilder* compilation);

     // Implementations of the pure-virtual hooks declared by ExecutionBuilder.
     std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
             const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

     std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
             const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
             const OptionalTimePoint& deadline) override;

    private:
     // NOTE(review): presumably the one StepExecutor that runs the whole model; where it is
     // created is not visible in this header -- confirm.
     std::shared_ptr<StepExecutor> mExecutor;
 };

 // ExecutionBuilder for an execution plan with a COMPOUND body, i.e. partitioned execution
 // with multiple steps.
 class CompoundExecutionBuilder : public ExecutionBuilder {
    public:
     // explicit, matching the base-class constructor, so that a CompilationBuilder pointer
     // is never implicitly converted into an execution builder.
     explicit CompoundExecutionBuilder(const CompilationBuilder* compilation);

     // Implementations of the pure-virtual hooks declared by ExecutionBuilder.
     std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
             const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

     std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
             const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
             const OptionalTimePoint& deadline) override;
 };

 // class StepExecutor is used to execute a single "step" in a
 // potentially multiple step execution process.  The graph associated
 // with that step is executed in its entirety on a single device (or
 // on the CPU).
 class StepExecutor {
    public:
     // executionBuilder
     //     Describes the full (possibly multiple-"step") execution.
     // model
     //     The model to be executed by the executor.  Possibly a single
     //     "step" model of a multiple-"step" executionBuilder.
     // device, preparedModel
     //     The device on which to execute the "step", and the prepared
     //     model to execute on that device. For non-fallback StepExecutor,
     //     neither is nullptr; for fallback StepExecutor, both are ignored in
     //     StepExecutor::computeOnCpuFallback and may be nullptr.
     // reusable
     //     If true, multiple StepExecutor::compute/computeFenced may be called on this
     //     object; otherwise, only one StepExecutor::compute/computeFenced may be called.
     //     reusable must be false if mDynamicTemporaries != nullptr.
     // step
     //     Contains the output index mapping from the excerpted "step" model to
     //     main model if the execution has multiple "steps". Must be nullptr
     //     otherwise.
     //     (step == nullptr) == (dynamicTemporaries == nullptr)
     // dynamicTemporaries
     //     If the execution has multiple "steps", describes the temporaries
     //     of source models that do not have fully specified types and are outputs
     //     of "step" models. Must be nullptr otherwise.
     //     (step == nullptr) == (dynamicTemporaries == nullptr)
     StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                  std::shared_ptr<Device> device,
                  std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                  const ExecutionStep* step = nullptr,
                  DynamicTemporaries* dynamicTemporaries = nullptr);

     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
     // in the case where we have a single-"step" execution (i.e., the executor
     // is executing the entire model from the ExecutionBuilder).
     void mapInputsAndOutputsTrivially();

     // Update output shapes with shapes returned from execution.
     struct UpdateOutputShapes {
         // These fields are meaningless unless updateOutputShapes() returns true.
         // They are nevertheless default-initialized to false so that copying or printing
         // an instance that was never filled in does not read indeterminate values.
         bool updatedDynamicTemporary = false;  // did shape (dimensions, size) information
                                                // change for at least one dynamic temporary?
         bool mainOutputInsufficient = false;  // is at least one main model output written by
                                               // this execution marked !isSufficient?
         bool zeroSizedInput = false;  // is at least one output of this execution step a
                                       // zero-sized tensor that needs to be read by some other
                                       // step of the same execution?
     };
     bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                             std::vector<OutputShape>* to, UpdateOutputShapes* update);

     // Map inputs and outputs from ExecutionBuilder to StepExecutor,
     // one at a time.  Note that these are input/output indexes, not
     // operand indexes.  The indexes are not range-checked here.
     //
     // For mapOutputToInput(), outputDimensions may be nullptr if the input
     // operand has fully specified dimensions.
     void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
         mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
     }
     void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
         mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
     }
     void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
                           const Dimensions* outputDimensions) {
         mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
                          outputDimensions);
     }

     // dimensions must either have zero rank or must be
     // consistent with and at least as well specified as operand dimensions
     // (i.e., either rank must match, or operand rank must be zero; and for each
     // individual dimension, either dimension must match, or operand dimension
     // must be zero).
     //
     // Unlike the map*() helpers above, the argument slot is looked up with the
     // range-checked vector::at().
     int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
                            uint32_t length, const Dimensions& dimensions = {}) {
         return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
                                           length, dimensions, &mInputs.at(inputIndex));
     }
     int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
                             uint32_t length, const Dimensions& dimensions = {}) {
         return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
                                           length, dimensions, &mOutputs.at(outputIndex));
     }

     // Executes using the (device, preparedModel) specified at construction time.
     std::tuple<int, std::vector<OutputShape>, Timing> compute(
             const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

     // Re-compiles and executes using the CPU, regardless of the (device,
     // preparedModel) specified at construction time.
     std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

     bool isCpu() const;

     // Perform fenced execution and return error_code, sync_fence_fd and a
     // callback.
     std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
             const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
             const OptionalTimePoint& deadline);

     // Do the dynamic temporaries defined by this step have valid allocations?
     // (true if there are no dynamic temporaries defined by this step.)
     bool areDynamicTemporariesAllocated() const;

    private:
     // builderDimensions may be nullptr if executorInputOrOutput has fully
     // specified dimensions.
     void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                           ModelArgumentInfo* executorInputOrOutput,
                           const Dimensions* builderDimensions = nullptr);

     // dimensions must either have zero rank or
     // must be consistent with and at least as well specified as operand
     // dimensions (i.e., either rank must match, or operand rank must be zero;
     // and for each individual dimension, either dimension must match, or
     // operand dimension must be zero).
     int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
                                    uint32_t offset, uint32_t length, const Dimensions& dimensions,
                                    ModelArgumentInfo* inputOrOutputInfo);

     // describes the full (possibly multiple-"step") execution
     ExecutionBuilder* mExecutionBuilder;

     // describes the single execution step
     const ExecutionStep* mExecutionStep;

     // describes the dynamic temporaries
     DynamicTemporaries* mDynamicTemporaries;

     // model to be executed on the executor, in both original and
     // compiled forms; and device on which to execute it
     const ModelBuilder* mModel;
     std::shared_ptr<Device> mDevice;
     std::shared_ptr<RuntimePreparedModel> mPreparedModel;

     // The reusable execution to launch multiple computations.
     // It is only created once at the first time it's needed.
     std::shared_ptr<RuntimeExecution> mExecution;
     // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
     std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();

     // The information we'll send to the driver about the inputs and outputs.
     // Note that we build this in two steps:
     // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
     //    If set from a pointer, don't set the location in the Request::Argument but store it
     //    instead in mInputBuffers or mOutputBuffers.
     // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
     //    the m*Buffers entries.  Copy the input values into the shared memory.
     // We do this to avoid creating a lot of shared memory objects if we have a lot of
     // parameters specified via pointers.  We also avoid copying in the case where
     // some of the nodes will be interpreted on the CPU anyway.
     // NOTE(review): mInputBuffers / mOutputBuffers are not declared in this class; this
     // description may be stale -- confirm.
     std::vector<ModelArgumentInfo> mInputs;
     std::vector<ModelArgumentInfo> mOutputs;
     MemoryTracker mMemories;

     // Whether compute/computeFenced may be invoked multiple times.
     bool mReusable = false;
 };

 // Returns a std::string built from the given UpdateOutputShapes value (taken by value).
 // NOTE(review): presumably a human-readable rendering for logging; the definition is not
 // in this header -- confirm.
 std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

 }  // namespace nn
 }  // namespace android

 #endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
	#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H

	#include <ControlFlow.h>
	#include <CpuExecutor.h>
	#include <android-base/thread_annotations.h>
	#include <nnapi/IBurst.h>
	#include <nnapi/IPreparedModel.h>
	#include <nnapi/Types.h>
	#include <nnapi/Validation.h>

	#include <memory>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	#include "ExecutionCallback.h"
	#include "Memory.h"
	#include "ModelArgumentInfo.h"
	#include "ModelBuilder.h"
	#include "NeuralNetworks.h"

	namespace android {
	namespace nn {

	class BurstBuilder;
	class CompilationBuilder;
	class Device;
	class DynamicTemporaries;
	class ExecutionPlan;
	class ExecutionStep;
	class ModelBuilder;
	class RuntimeMemory;
	class RuntimePreparedModel;
	class RuntimeExecution;
	class StepExecutor;

	// Describes a single execution created from a CompilationBuilder: the input/output
	// arguments, the timing and timeout settings, and the execution state.
	//
	// The class is abstract: computeInternal() and computeFencedInternal() are pure virtual
	// and must be provided by a subclass.
	class ExecutionBuilder {
	    // StepExecutor accesses mInputs and mOutputs directly when mapping arguments.
	    friend class StepExecutor;

	   public:
	    explicit ExecutionBuilder(const CompilationBuilder* compilation);
	    virtual ~ExecutionBuilder() = default;

	    // Argument setters.  index selects the input or output; the argument is supplied
	    // either as a raw buffer or as a region (offset, length) of a RuntimeMemory.
	    // NOTE(review): the int return value is presumably an ANEURALNETWORKS_* result code;
	    // confirm against the implementation.
	    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
	                 size_t length);
	    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
	                           const RuntimeMemory* memory, size_t offset, size_t length);
	    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
	                  size_t length);
	    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
	                            const RuntimeMemory* memory, size_t offset, size_t length);

	    int setMeasureTiming(bool measure);

	    int getDuration(int32_t durationCode, uint64_t* duration) const;

	    int setTimeoutDuration(uint64_t duration);

	    std::optional<uint64_t> getTimeoutDuration() const;

	    int setLoopTimeout(uint64_t duration);

	    // Returns the value stored in mLoopTimeoutDuration.
	    uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }

	    int enableInputAndOutputPadding(bool enable);

	    int setReusable(bool reusable);

	    int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
	                      int* sync_fence);

	    // Asynchronous entry point: synchronizationCallback must not be nullptr (enforced with
	    // CHECK) and is forwarded to compute().
	    int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
	        CHECK(synchronizationCallback != nullptr);
	        return compute(synchronizationCallback);
	    }
	    // Synchronous entry point: calls compute() with neither a callback nor a burst.
	    int computeSynchronously() { return compute(nullptr); }
	    // Burst entry point: calls compute() with no callback and the given burst.
	    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }

	    // Initialize output dimensional information from ModelArgumentInfo.
	    std::vector<OutputShape> getInitialOutputShapes() const;

	    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
	    int getOutputOperandRank(uint32_t index, uint32_t* rank);

	    // Handshake with lower-level execution support
	    bool measureTiming() const { return mMeasureTiming; }
	    // Stores timing in mTimingWithoutFencedExecutionCallback.
	    void reportTimingWithoutFencedExecutionCallback(Timing timing) {
	        mTimingWithoutFencedExecutionCallback = timing;
	    }

	    const CompilationBuilder* getCompilation() const { return mCompilation; }
	    const ModelBuilder* getModel() const { return mModel; }
	    const ModelBuilder* getSourceModel(uint32_t index) const;
	    // sourceOperandIndex is a (source model index, operand index) pair.  The result of
	    // getSourceModel() is dereferenced without a null check.
	    const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
	        return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
	    }

	    // This method will be called at the end of all computation paths to change the state
	    // of the execution object and update output shapes / memories.
	    int finishComputation(int result, const std::vector<OutputShape>& outputShapes);
	    // Convenience overload: converts error to a result code, delegates to the int overload,
	    // and converts the returned result code back to an ErrorStatus.
	    ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
	        const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes);
	        return convertResultCodeToErrorStatus(result);
	    }

	    // Returns a reference to mFencedExecutionCallback.  const-qualified because it only
	    // reads the member, so it can also be called through a const ExecutionBuilder.
	    const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() const {
	        return mFencedExecutionCallback;
	    }

	    // True while the state is State::COMPUTATION.  Locks mStateMutex for the read.
	    bool inFlight() const {
	        std::lock_guard<std::mutex> lock(mStateMutex);
	        return mState == State::COMPUTATION;
	    }

	    // Unchecked element access: index is not range-checked here.
	    const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
	    const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

	    // Forwards to getRunTimePoolInfo() of the entry of mMemories at poolIndex.
	    std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
	        return mMemories[poolIndex]->getRunTimePoolInfo();
	    }

	   protected:
	    // If a callback is provided, then this is asynchronous. If a callback is
	    // not provided (i.e., is nullptr), then this is synchronous.
	    //
	    // If burst is provided, then the burst path will be used. If a burst is not
	    // provided (i.e., is nullptr), then a synchronous execution will occur.
	    //
	    // Providing both synchronizationCallback and burstBuilder is an error.
	    int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
	                BurstBuilder* burstBuilder = nullptr);

	    // Subclass hook for compute().  The tuple holds an int result code, the output shapes,
	    // and the timing.
	    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
	            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

	    // Subclass hook for computeFenced().
	    // NOTE(review): the tuple presumably holds (result code, sync fence fd, callback) --
	    // confirm against the implementation.
	    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
	            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
	            const OptionalTimePoint& deadline) = 0;

	    // This method handles the common preparation and validation logic of compute and computeFenced.
	    // It will be called at the start of every computation.
	    int prepareForCompute(const char* name);

	    const CompilationBuilder* mCompilation;

	    // Update output dimensional information from OutputShape to ModelArgumentInfo.
	    bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

	    bool updateMemories();

	    const ModelBuilder* mModel;
	    const ExecutionPlan* mPlan;

	    // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
	    // from CompilationBuilder when the ExecutionBuilder is constructed.
	    bool mAllowCpuFallback;

	    // The information we'll send to the driver about the inputs and outputs.
	    // Note that we build this in two steps:
	    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
	    //    If set from a pointer, don't set the location in the Request::Argument but store it
	    //    instead in mInputBuffers or mOutputBuffers.
	    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
	    //    the m*Buffers entries.  Copy the input values into the shared memory.
	    // We do this to avoid creating a lot of shared memory objects if we have a lot of
	    // parameters specified via pointers.  We also avoid copying in the case where
	    // some of the nodes will be interpreted on the CPU anyway.
	    // NOTE(review): mInputBuffers / mOutputBuffers are not declared in this class; this
	    // description may be stale -- confirm.
	    std::vector<ModelArgumentInfo> mInputs;
	    std::vector<ModelArgumentInfo> mOutputs;
	    MemoryTracker mMemories;

	    // Do we ask the driver to measure timing?
	    bool mMeasureTiming = false;

	    // Timing reported from the driver.  This field is only used if
	    // mFencedExecutionCallback is nullptr.
	    Timing mTimingWithoutFencedExecutionCallback = {};

	    // Amount of time to complete or abort the execution.
	    std::optional<uint64_t> mTimeoutDuration;

	    // Amount of time to complete or abort a loop.
	    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

	    // The state of the execution.
	    // Properties can only be set when the execution is in the state State::PREPARATION.
	    // Timing and output shapes can only be queried when the execution is in the state
	    // State::COMPLETED.
	    enum class State { PREPARATION, COMPUTATION, COMPLETED };
	    State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
	    // True once the state has left State::PREPARATION.  Locks mStateMutex for the read.
	    bool computationStarted() const {
	        std::lock_guard<std::mutex> lock(mStateMutex);
	        return mState != State::PREPARATION;
	    }
	    // True when the state is State::COMPLETED.  Locks mStateMutex for the read.
	    bool completed() const {
	        std::lock_guard<std::mutex> lock(mStateMutex);
	        return mState == State::COMPLETED;
	    }

	    // Mutex to guard mState. Note that this is not strictly needed because we provide
	    // no thread-safety guarantee to the ANeuralNetworksExecution object.
	    // Declared mutable so that the const accessors above can lock it.
	    mutable std::mutex mStateMutex;

	    // Return false if the execution is in a bad state for starting computation.
	    // Otherwise, return true and set the state to State::COMPUTATION.
	    bool checkAndSetComputationState(const char* name);

	    // With what error status has execution completed?
	    enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
	    Completion mCompletion = Completion::OTHER_ERROR;
	    // May only be called once the execution has completed (enforced with CHECK).
	    Completion completedWith() const {
	        CHECK(completed());
	        return mCompletion;
	    }

	    // The result code of request validation.
	    // It is only evaluated once at the first time it's needed.
	    std::optional<int> mValidationResultCode;
	    int getValidationResultCode();

	    // Does every tensor output operand of the model have a fully specified shape?
	    // It is only evaluated once at the first time it's needed.
	    std::optional<bool> mOutputsFullySpecified;
	    bool areOutputsFullySpecified();

	    // The callback used to query execution related info in the case of fenced
	    // execution; otherwise, nullptr.  If the execution plan has multiple steps,
	    // this is the callback associated with the last step.  If the last step
	    // doesn't support fenced execution (e.g., the driver is too old), or if the
	    // launch of execution on the driver fails, then this callback will be
	    // nullptr.
	    ExecuteFencedInfoCallback mFencedExecutionCallback;

	    // Whether set{Input,Output}[FromMemory] can accept padded length or not.
	    bool mInputAndOutputPaddingEnabled = false;

	    // enableInputAndOutputPadding may only be called before any call of
	    // set{Input,Output}[FromMemory]
	    bool mHasCalledSetInputOutput = false;

	    // Can compute APIs be invoked multiple times on the execution object?
	    bool mReusable = false;
	};

	// ExecutionBuilder for an execution plan with a SIMPLE body, i.e. the whole model will be
	// executed on a single device.
	class SimpleExecutionBuilder : public ExecutionBuilder {
	   public:
	    // explicit, matching the base-class constructor, so that a CompilationBuilder pointer
	    // is never implicitly converted into an execution builder.
	    explicit SimpleExecutionBuilder(const CompilationBuilder* compilation);

	    // Implementations of the pure-virtual hooks declared by ExecutionBuilder.
	    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
	            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

	    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
	            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
	            const OptionalTimePoint& deadline) override;

	   private:
	    // NOTE(review): presumably the one StepExecutor that runs the whole model; where it is
	    // created is not visible in this header -- confirm.
	    std::shared_ptr<StepExecutor> mExecutor;
	};

	// For execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps.
	class CompoundExecutionBuilder : public ExecutionBuilder {
	   public:
	    // Explicit, like the base-class constructor, so that a
	    // "const CompilationBuilder*" is never implicitly converted into an
	    // execution builder.
	    explicit CompoundExecutionBuilder(const CompilationBuilder* compilation);

	    // Override of the base-class non-fenced computation hook.
	    // NOTE(review): the tuple is presumably (result code, output shapes,
	    // timing) -- confirm against the base-class declaration.
	    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
	            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

	    // Override of the base-class fenced computation hook.
	    // NOTE(review): the tuple is presumably (result code, sync fence fd,
	    // callback), mirroring StepExecutor::computeFenced -- confirm against the
	    // base-class declaration.
	    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
	            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
	            const OptionalTimePoint& deadline) override;
	};

	// class StepExecutor is used to execute a single "step" in a
	// potentially multiple step execution process. The graph associated
	// with that step is executed in its entirety on a single device (or
	// on the CPU).
	class StepExecutor {
	   public:
	    // executionBuilder
	    //     Describes the full (possibly multiple-"step") execution.
	    // model
	    //     The model to be executed by the executor. Possibly a single
	    //     "step" model of a multiple-"step" executionBuilder.
	    // device, preparedModel
	    //     The device on which to execute the "step", and the prepared
	    //     model to execute on that device. For non-fallback StepExecutor,
	    //     neither is nullptr; for fallback StepExecutor, both are ignored in
	    //     StepExecutor::computeOnCpuFallback and may be nullptr.
	    // reusable
	    //     If true, multiple StepExecutor::compute/computeFenced may be called on this
	    //     object; otherwise, only one StepExecutor::compute/computeFenced may be called.
	    //     reusable must be false if dynamicTemporaries != nullptr.
	    // step
	    //     Contains the output index mapping from the excerpted "step" model to
	    //     main model if the execution has multiple "steps". Must be nullptr
	    //     otherwise.
	    //     (step == nullptr) == (dynamicTemporaries == nullptr)
	    // dynamicTemporaries
	    //     If the execution has multiple "steps", describes the temporaries
	    //     of source models that do not have fully specified types and are outputs
	    //     of "step" models. Must be nullptr otherwise.
	    //     (step == nullptr) == (dynamicTemporaries == nullptr)
	    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
	                 std::shared_ptr<Device> device,
	                 std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
	                 const ExecutionStep* step = nullptr,
	                 DynamicTemporaries* dynamicTemporaries = nullptr);

	    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
	    // in the case where we have a single-"step" execution (i.e., the executor
	    // is executing the entire model from the ExecutionBuilder).
	    void mapInputsAndOutputsTrivially();

	    // Update output shapes with shapes returned from execution.
	    struct UpdateOutputShapes {
	        // These fields are meaningless unless updateOutputShapes() returns true.
	        // They are nevertheless initialized to false so that copying or printing
	        // a default-constructed value never reads an indeterminate bool.

	        // Did shape (dimensions, size) information change for at least one
	        // dynamic temporary?
	        bool updatedDynamicTemporary = false;

	        // Is at least one main model output written by this execution marked
	        // !isSufficient?
	        bool mainOutputInsufficient = false;

	        // Is at least one output of this execution step a zero-sized tensor
	        // that needs to be read by some other step of the same execution?
	        bool zeroSizedInput = false;
	    };
	    bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
	                            std::vector<OutputShape>* to, UpdateOutputShapes* update);

	    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
	    // one at a time. Note that these are input/output indexes, not
	    // operand indexes.
	    //
	    // For mapOutputToInput(), outputDimensions may be nullptr if the input
	    // operand has fully specified dimensions.
	    //
	    // Note that the indexes are used with unchecked operator[]; callers must
	    // pass in-range indexes.
	    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
	        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
	    }
	    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
	        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
	    }
	    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
	                          const Dimensions* outputDimensions) {
	        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
	                         outputDimensions);
	    }

	    // dimensions must either have zero rank or must be
	    // consistent with and at least as well specified as operand dimensions
	    // (i.e., either rank must match, or operand rank must be zero; and for each
	    // individual dimension, either dimension must match, or operand dimension
	    // must be zero).
	    //
	    // Both functions forward to setInputOrOutputFromMemory() and return its
	    // result. The argument slot is looked up with bounds-checked at().
	    int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
	                           uint32_t length, const Dimensions& dimensions = {}) {
	        return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
	                                          length, dimensions, &mInputs.at(inputIndex));
	    }
	    int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
	                            uint32_t length, const Dimensions& dimensions = {}) {
	        return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
	                                          length, dimensions, &mOutputs.at(outputIndex));
	    }

	    // Executes using the (device, preparedModel) specified at construction time.
	    std::tuple<int, std::vector<OutputShape>, Timing> compute(
	            const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

	    // Re-compiles and executes using the CPU, regardless of the (device,
	    // preparedModel) specified at construction time.
	    std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

	    bool isCpu() const;

	    // Perform fenced execution and return error_code, sync_fence_fd and a
	    // callback.
	    std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
	            const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
	            const OptionalTimePoint& deadline);

	    // Do the dynamic temporaries defined by this step have valid allocations?
	    // (true if there are no dynamic temporaries defined by this step.)
	    bool areDynamicTemporariesAllocated() const;

	   private:
	    // builderDimensions may be nullptr if executorInputOrOutput has fully
	    // specified dimensions.
	    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
	                          ModelArgumentInfo* executorInputOrOutput,
	                          const Dimensions* builderDimensions = nullptr);

	    // dimensions must either have zero rank or
	    // must be consistent with and at least as well specified as operand
	    // dimensions (i.e., either rank must match, or operand rank must be zero;
	    // and for each individual dimension, either dimension must match, or
	    // operand dimension must be zero).
	    int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
	                                   uint32_t offset, uint32_t length, const Dimensions& dimensions,
	                                   ModelArgumentInfo* inputOrOutputInfo);

	    // describes the full (possibly multiple-"step") execution
	    ExecutionBuilder* mExecutionBuilder;

	    // describes the single execution step
	    const ExecutionStep* mExecutionStep;

	    // describes the dynamic temporaries
	    DynamicTemporaries* mDynamicTemporaries;

	    // model to be executed on the executor, in both original and
	    // compiled forms; and device on which to execute it
	    const ModelBuilder* mModel;
	    std::shared_ptr<Device> mDevice;
	    std::shared_ptr<RuntimePreparedModel> mPreparedModel;

	    // The reusable execution to launch multiple computations.
	    // It is only created once at the first time it's needed.
	    std::shared_ptr<RuntimeExecution> mExecution;
	    // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
	    std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();

	    // The information we'll send to the driver about the inputs and outputs.
	    // Note that we build this in two steps:
	    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
	    //    If set from a pointer, don't set the location in the Request::Argument but store it
	    //    instead in mInputBuffers or mOutputBuffers.
	    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
	    //    the m*Buffers entries. Copy the input values into the shared memory.
	    // We do this to avoid creating a lot of shared memory objects if we have a lot of
	    // parameters specified via pointers. We also avoid copying in the case where
	    // some of the nodes will be interpreted on the CPU anyway.
	    std::vector<ModelArgumentInfo> mInputs;
	    std::vector<ModelArgumentInfo> mOutputs;
	    MemoryTracker mMemories;

	    // Whether compute/computeFenced may be invoked multiple times.
	    bool mReusable = false;
	};

	// Returns a string form of the given UpdateOutputShapes value, which is
	// taken by value.
	// NOTE(review): presumably intended for logging/debugging, and the exact
	// format is defined in the implementation file -- confirm there.
	std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

	} // namespace nn
	} // namespace android

	#endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H