| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H |
| #define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H |
| |
#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"
| |
| namespace android { |
| namespace nn { |
| |
// Forward declarations of runtime classes named by the declarations below
// (kept in alphabetical order).
class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeExecution;
class RuntimeMemory;
class RuntimePreparedModel;
class StepExecutor;
| |
| class ExecutionBuilder { |
| friend class StepExecutor; |
| |
| public: |
| explicit ExecutionBuilder(const CompilationBuilder* compilation); |
| virtual ~ExecutionBuilder() = default; |
| |
| int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer, |
| size_t length); |
| int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, |
| const RuntimeMemory* memory, size_t offset, size_t length); |
| int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer, |
| size_t length); |
| int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, |
| const RuntimeMemory* memory, size_t offset, size_t length); |
| |
| int setMeasureTiming(bool measure); |
| |
| int getDuration(int32_t durationCode, uint64_t* duration) const; |
| |
| int setTimeoutDuration(uint64_t duration); |
| |
| std::optional<uint64_t> getTimeoutDuration() const; |
| |
| int setLoopTimeout(uint64_t duration); |
| |
| uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; } |
| |
| int enableInputAndOutputPadding(bool enable); |
| |
| int setReusable(bool reusable); |
| |
| int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence, |
| int* sync_fence); |
| |
| int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) { |
| CHECK(synchronizationCallback != nullptr); |
| return compute(synchronizationCallback); |
| } |
| int computeSynchronously() { return compute(nullptr); } |
| int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); } |
| |
| // Initialize output dimensional information from ModelArgumentInfo. |
| std::vector<OutputShape> getInitialOutputShapes() const; |
| |
| int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions); |
| int getOutputOperandRank(uint32_t index, uint32_t* rank); |
| |
| // Handshake with lower-level execution support |
| bool measureTiming() const { return mMeasureTiming; } |
| void reportTimingWithoutFencedExecutionCallback(Timing timing) { |
| mTimingWithoutFencedExecutionCallback = timing; |
| } |
| |
| const CompilationBuilder* getCompilation() const { return mCompilation; } |
| const ModelBuilder* getModel() const { return mModel; } |
| const ModelBuilder* getSourceModel(uint32_t index) const; |
| const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const { |
| return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second); |
| } |
| |
| // This method will be called at the end of all computation paths to change the state |
| // of the execution object and update output shapes / memories. |
| int finishComputation(int result, const std::vector<OutputShape>& outputShapes); |
| ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) { |
| const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes); |
| return convertResultCodeToErrorStatus(result); |
| } |
| |
| const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() { |
| return mFencedExecutionCallback; |
| } |
| |
| bool inFlight() const { |
| std::lock_guard<std::mutex> lock(mStateMutex); |
| return mState == State::COMPUTATION; |
| } |
| |
| const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; } |
| const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; } |
| |
| std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const { |
| return mMemories[poolIndex]->getRunTimePoolInfo(); |
| } |
| |
| protected: |
| // If a callback is provided, then this is asynchronous. If a callback is |
| // not provided (i.e., is nullptr), then this is synchronous. |
| // |
| // If burst is provided, then the burst path will be used. If a burst is not |
| // provided (i.e., is nullptr), then a synchronous execution will occur. |
| // |
| // Providing both synchronizationCallback and burstBuilder is an error. |
| int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback, |
| BurstBuilder* burstBuilder = nullptr); |
| |
| virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal( |
| const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0; |
| |
| virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline) = 0; |
| |
| // This method handles the common preparation and validation logic of compute and computeFenced. |
| // It will be called at the start of every computation. |
| int prepareForCompute(const char* name); |
| |
| const CompilationBuilder* mCompilation; |
| |
| // Update output dimensional information from OutputShape to ModelArgumentInfo. |
| bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes); |
| |
| bool updateMemories(); |
| |
| const ModelBuilder* mModel; |
| const ExecutionPlan* mPlan; |
| |
| // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured |
| // from CompilationBuilder when the ExecutionBuilder is constructed. |
| bool mAllowCpuFallback; |
| |
| // The information we'll send to the driver about the inputs and outputs. |
| // Note that we build this in two steps: |
| // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element. |
| // If set from a pointer, don't set the location in the Request::Argument but store it |
| // instead in mInputBuffers or mOutputBuffers. |
| // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for |
| // the m*Buffers entries. Copy the input values into the shared memory. |
| // We do this to avoid creating a lot of shared memory objects if we have a lot of |
| // parameters specified via pointers. We also avoid copying in the case where |
| // some of the nodes will interpreted on the CPU anyway. |
| std::vector<ModelArgumentInfo> mInputs; |
| std::vector<ModelArgumentInfo> mOutputs; |
| MemoryTracker mMemories; |
| |
| // Do we ask the driver to measure timing? |
| bool mMeasureTiming = false; |
| |
| // Timing reported from the driver. This field is only used if |
| // mFencedExecutionCallback is nullptr. |
| Timing mTimingWithoutFencedExecutionCallback = {}; |
| |
| // Amount of time to complete or abort the execution. |
| std::optional<uint64_t> mTimeoutDuration; |
| |
| // Amount of time to complete or abort a loop. |
| uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault; |
| |
| // The state of the execution. |
| // Properties can only been set when the execution is in the state State::PREPARATION. |
| // Timing and output shapes can only be queried when the execution is in the state |
| // State::COMPLETED. |
| enum class State { PREPARATION, COMPUTATION, COMPLETED }; |
| State mState GUARDED_BY(mStateMutex) = State::PREPARATION; |
| bool computationStarted() const { |
| std::lock_guard<std::mutex> lock(mStateMutex); |
| return mState != State::PREPARATION; |
| } |
| bool completed() const { |
| std::lock_guard<std::mutex> lock(mStateMutex); |
| return mState == State::COMPLETED; |
| } |
| |
| // Mutex to guard mState. Note that this not strictly needed because we provide |
| // no thread-safety guarantee to the ANeuralNetworksExecution object. |
| mutable std::mutex mStateMutex; |
| |
| // Return false if the execution is in a bad state for starting computation. |
| // Otherwise, return true and set the state to State::COMPUTATION. |
| bool checkAndSetComputationState(const char* name); |
| |
| // With what error status has execution completed? |
| enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR }; |
| Completion mCompletion = Completion::OTHER_ERROR; |
| Completion completedWith() const { |
| CHECK(completed()); |
| return mCompletion; |
| } |
| |
| // The result code of request validation. |
| // It is only evaluated once at the first time it's needed. |
| std::optional<int> mValidationResultCode; |
| int getValidationResultCode(); |
| |
| // Does every tensor output operand of the model have a fully specified shape? |
| // It is only evaluated once at the first time it's needed. |
| std::optional<bool> mOutputsFullySpecified; |
| bool areOutputsFullySpecified(); |
| |
| // The callback used to query execution related info in the case of fenced |
| // execution; otherwise, nullptr. If the execution plan has multiple steps, |
| // this is the callback associated with the last step. If the last step |
| // doesn't support fenced execution (e.g., the driver is too old), or if the |
| // launch of execution on the driver fails, then this callback will be |
| // nullptr. |
| ExecuteFencedInfoCallback mFencedExecutionCallback; |
| |
| // Whether set{Input,Output}[FromMemory] can accept padded length or not. |
| bool mInputAndOutputPaddingEnabled = false; |
| |
| // enableInputAndOutputPadding may only be called before any call of |
| // set{Input,Output}[FromMemory] |
| bool mHasCalledSetInputOutput = false; |
| |
| // Can compute APIs be invoked multiple times on the execution object? |
| bool mReusable = false; |
| }; |
| |
| // For execution plan with a SIMPLE body, i.e. the whole model will be executed on a single device. |
| class SimpleExecutionBuilder : public ExecutionBuilder { |
| public: |
| SimpleExecutionBuilder(const CompilationBuilder* compilation); |
| |
| std::tuple<int, std::vector<OutputShape>, Timing> computeInternal( |
| const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override; |
| |
| std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline) override; |
| |
| private: |
| std::shared_ptr<StepExecutor> mExecutor; |
| }; |
| |
| // For execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps. |
| class CompoundExecutionBuilder : public ExecutionBuilder { |
| public: |
| CompoundExecutionBuilder(const CompilationBuilder* compilation); |
| |
| std::tuple<int, std::vector<OutputShape>, Timing> computeInternal( |
| const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override; |
| |
| std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal( |
| const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline) override; |
| }; |
| |
| // class StepExecutor is used to execute a single "step" in a |
| // potentially multiple step execution process. The graph associated |
| // with that step is executed in its entirety on a single device (or |
| // on the CPU). |
| class StepExecutor { |
| public: |
| // executionBuilder |
| // Describes the full (possibly multiple-"step") execution. |
| // model |
| // The model to be executed by the executor. Possibly a single |
| // "step" model of a multiple-"step" executionBuilder. |
| // driver, preparedModel |
| // The device on which to execute the "step", and the prepared |
| // model to execute on that device. For non-fallback StepExecutor, |
| // neither is nullptr; for fallback StepExecutor, both are ignored in |
| // StepExecutor::computeOnCpuFallback and may be nullptr. |
| // reusable |
| // If true, multiple StepExecutor::compute/computeFenced may be called on this |
| // object; otherwise, only one StepExecutor::compute/computeFenced may be called. |
| // reusable must be false if mDynamicTemporaries != nullptr. |
| // step |
| // Contains the output index mapping from the excerpted "step" model to |
| // main model if the execution has multiple "steps". Must be nullptr |
| // otherwise. |
| // (step == nullptr) == (dynamicTemporaries == nullptr) |
| // dynamicTemporaries |
| // If the execution has multiple "steps", describes the temporaries |
| // of source models that do not have fully specified types and are outputs |
| // of "step" models. Must be nullptr otherwise. |
| // (step == nullptr) == (dynamicTemporaries == nullptr) |
| StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model, |
| std::shared_ptr<Device> device, |
| std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable, |
| const ExecutionStep* step = nullptr, |
| DynamicTemporaries* dynamicTemporaries = nullptr); |
| |
| // Map inputs and outputs from ExecutionBuilder to StepExecutor, |
| // in the case where we have a single-"step" execution (i.e., the executor |
| // is executing the entire model from the ExecutionBuilder). |
| void mapInputsAndOutputsTrivially(); |
| |
| // Update output shapes with shapes returned from execution. |
| struct UpdateOutputShapes { |
| // These fields are meaningless unless updateOutputShapes() returns true |
| bool updatedDynamicTemporary; // did shape (dimensions, size) information change for at |
| // least one dynamic temporary? |
| bool mainOutputInsufficient; // is at least one main model output written by this execution |
| // marked !isSufficient? |
| bool zeroSizedInput; // is at least one output of this execution step a zero-sized tensor |
| // that needs to be read by some other step of the same execution? |
| }; |
| bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from, |
| std::vector<OutputShape>* to, UpdateOutputShapes* update); |
| |
| // Map inputs and outputs from ExecutionBuilder to StepExecutor, |
| // one at a time. Note that these are input/output indexes, not |
| // operand indexes. |
| // |
| // For mapOutputToInput(), outputDimensions may be nullptr if the input |
| // operand has fully specified dimensions. |
| void mapInput(uint32_t builderIndex, uint32_t executorIndex) { |
| mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]); |
| } |
| void mapOutput(uint32_t builderIndex, uint32_t executorIndex) { |
| mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]); |
| } |
| void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex, |
| const Dimensions* outputDimensions) { |
| mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex], |
| outputDimensions); |
| } |
| |
| // dimensions must either have zero rank or must be |
| // consistent with and at least as well specified as operand dimensions |
| // (i.e., either rank must match, or operand rank must be zero; and for each |
| // individual dimension, either dimension must match, or operand dimension |
| // must be zero). |
| int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset, |
| uint32_t length, const Dimensions& dimensions = {}) { |
| return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset, |
| length, dimensions, &mInputs.at(inputIndex)); |
| } |
| int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset, |
| uint32_t length, const Dimensions& dimensions = {}) { |
| return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset, |
| length, dimensions, &mOutputs.at(outputIndex)); |
| } |
| |
| // Executes using the (driver, preparedModel) specified at construction time. |
| std::tuple<int, std::vector<OutputShape>, Timing> compute( |
| const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr); |
| |
| // Re-compiles and executes using the CPU, regardless of the (driver, |
| // preparedModel) specified at construction time. |
| std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback(); |
| |
| bool isCpu() const; |
| |
| // Perform fenced execution and return error_code, sync_fence_fd and a |
| // callback. |
| std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced( |
| const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence, |
| const OptionalTimePoint& deadline); |
| |
| // Do the dynamic temporaries defined by this step have valid allocations? |
| // (true if there are no dynamic temporaries defined by this step.) |
| bool areDynamicTemporariesAllocated() const; |
| |
| private: |
| // builderDimensions may be nullptr if executorInputOrOutput has fully |
| // specified dimensions. |
| void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput, |
| ModelArgumentInfo* executorInputOrOutput, |
| const Dimensions* builderDimensions = nullptr); |
| |
| // dimensions must either have zero rank or |
| // must be consistent with and at least as well specified as operand |
| // dimensions (i.e., either rank must match, or operand rank must be zero; |
| // and for each individual dimension, either dimension must match, or |
| // operand dimension must be zero). |
| int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory, |
| uint32_t offset, uint32_t length, const Dimensions& dimensions, |
| ModelArgumentInfo* inputOrOutputInfo); |
| |
| // describes the full (possibly multiple-"step") execution |
| ExecutionBuilder* mExecutionBuilder; |
| |
| // describes the single execution step |
| const ExecutionStep* mExecutionStep; |
| |
| // describes the dynamic temporaries |
| DynamicTemporaries* mDynamicTemporaries; |
| |
| // model to be executed on the executor, in both original and |
| // compiled forms; and device on which to execute it |
| const ModelBuilder* mModel; |
| std::shared_ptr<Device> mDevice; |
| std::shared_ptr<RuntimePreparedModel> mPreparedModel; |
| |
| // The reusable execution to launch multiple computations. |
| // It is only created once at the first time it's needed. |
| std::shared_ptr<RuntimeExecution> mExecution; |
| // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure. |
| std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution(); |
| |
| // The information we'll send to the driver about the inputs and outputs. |
| // Note that we build this in two steps: |
| // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element. |
| // If set from a pointer, don't set the location in the Request::Argument but store it |
| // instead in mInputBuffers or mOutputBuffers. |
| // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for |
| // the m*Buffers entries. Copy the input values into the shared memory. |
| // We do this to avoid creating a lot of shared memory objects if we have a lot of |
| // parameters specified via pointers. We also avoid copying in the case where |
| // some of the nodes will interpreted on the CPU anyway. |
| std::vector<ModelArgumentInfo> mInputs; |
| std::vector<ModelArgumentInfo> mOutputs; |
| MemoryTracker mMemories; |
| |
| // Whether compute/computeFenced may be invoked multiple times. |
| bool mReusable = false; |
| }; |
| |
| std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes); |
| |
| } // namespace nn |
| } // namespace android |
| |
| #endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H |