nn/runtime/ExecutionPlan.cpp - platform/frameworks/ml - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define LOG_TAG "ExecutionPlan"

 #include "ExecutionPlan.h"

 #include "Callbacks.h"
 #include "CompilationBuilder.h"
 #include "ExecutionBuilder.h"
 #include "Manager.h"
 #include "ModelBuilder.h"
 #include "Tracing.h"
 #include "Utils.h"

 #include <functional>
 #include <map>
 #include <queue>
 #include <unordered_set>
 #include <utility>
 #include <vector>

 using ::android::hardware::neuralnetworks::V1_0::implementation::ExecutionCallback;
 using ::android::hardware::neuralnetworks::V1_0::implementation::PreparedModelCallback;

 namespace android {
 namespace nn {

 static int compile(std::shared_ptr<Device> device, const ModelBuilder* model,
                    int32_t executionPreference, sp<IPreparedModel>* preparedModel) {
     nnAssert(device != nullptr);  // nullptr indicates CPU
     // Compilation logic copied from ExecutionBuilder::startComputeOnDevice().
     Model hidlModel;
     model->setHidlModel(&hidlModel);

     sp<PreparedModelCallback> preparedModelCallback = new PreparedModelCallback();

     // Note that some work within VersionedIDevice will be subtracted from the
     // IPC layer
     NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
     Return<ErrorStatus> prepareLaunchStatus = device->getInterface()->prepareModel(
         hidlModel, static_cast<ExecutionPreference>(executionPreference), preparedModelCallback);
     if (!prepareLaunchStatus.isOk()) {
         LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed due to transport error: "
                    << prepareLaunchStatus.description();
         return ANEURALNETWORKS_OP_FAILED;
     }
     if (prepareLaunchStatus != ErrorStatus::NONE) {
         LOG(ERROR) << "ExecutionStep::finishSubModel compilation failed with error: "
                    << toString(static_cast<ErrorStatus>(prepareLaunchStatus));
         return ANEURALNETWORKS_OP_FAILED;
     }

     preparedModelCallback->wait();
     ErrorStatus prepareReturnStatus = preparedModelCallback->getStatus();
     *preparedModel = preparedModelCallback->getPreparedModel();
     if (prepareReturnStatus != ErrorStatus::NONE || *preparedModel == nullptr) {
         LOG(ERROR) << "ExecutionPlan compilation on " << device->getName() << " failed:"
                    << " prepareReturnStatus=" << toString(prepareReturnStatus)
                    << ", preparedModel=" << preparedModel->get();
         return ANEURALNETWORKS_OP_FAILED;
     }
     return ANEURALNETWORKS_NO_ERROR;
 }

 // Callback invoked with the index of an operation that has become ready to run.
 using OperationReadyCallback = std::function<void(uint32_t)>;

 // This class tracks whether we know the value of an operand as operations
 // are processed.
 //
 // An operation is reported as ready (through an OperationReadyCallback) once
 // none of its inputs is still waiting to be produced by another operation.
 class OperandTracker {
 public:
     // Creates the tracker for this model. Figure out which operations can be
     // executed right away and cb for each one of them.
     OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
     // Mark the specified operation as having been processed. The output
     // of the operation now being known, this may make new operations to be
     // able to run.  Call cb for each one of them.
     void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

 private:
     // The model whose operations and operands are being tracked (not owned).
     const ModelBuilder* mModel;
     // Maps an operand index to the index of each operation that reads that
     // operand as an input.  Only operands with lifetime TEMPORARY_VARIABLE or
     // MODEL_OUTPUT are recorded.
     std::multimap<uint32_t, uint32_t> mOperandToOperations;
     // Number of inputs whose value is not yet known.
     std::vector<uint32_t> mUnknownInputCount;  // For each operation
 };

 // Builds the operand -> consuming-operation table and the per-operation count
 // of not-yet-known inputs.  Operations that have no such inputs are reported
 // through |cb| immediately, in operation-index order.
 OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
         : mModel(model) {
     const auto& ops = mModel->getOperations();
     const size_t opCount = ops.size();
     mUnknownInputCount.resize(opCount);

     for (uint32_t opIdx = 0; opIdx < opCount; ++opIdx) {
         uint32_t pending = 0;
         for (uint32_t operand : ops[opIdx].inputs) {
             // Only values written by some operation start out unknown.
             switch (mModel->getOperand(operand).lifetime) {
                 case OperandLifeTime::TEMPORARY_VARIABLE:
                 case OperandLifeTime::MODEL_OUTPUT:
                     ++pending;
                     mOperandToOperations.insert(std::make_pair(operand, opIdx));
                     break;
                 default:
                     break;
             }
         }
         if (pending == 0) {
             cb(opIdx);
         }
         mUnknownInputCount[opIdx] = pending;
     }
 }

 void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
     // Mark all its outputs as known.
     const Operation& operation = mModel->getOperations()[operationIndex];
     for (uint32_t operandIndex : operation.outputs) {
         auto range = mOperandToOperations.equal_range(operandIndex);
         for (auto i = range.first; i != range.second; i++) {
             uint32_t& count = mUnknownInputCount[i->second];
             if (--count == 0) {
                 cb(i->second);
             }
         }
     }
 }

 // Creates step number |stepIndex| of |plan|, to be run on |device|.  Other
 // code in this file treats a null device as the CPU.  The submodel starts
 // out empty and is populated by addOperation().
 //
 // |device| is taken by value, so it is moved into the member to avoid an
 // extra reference-count increment/decrement.
 ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                              std::shared_ptr<Device> device)
         : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(std::move(device)) {}

 // Ensures that operand |fromOperandIndex| of |fromModel| exists in this
 // step's submodel and stores its submodel index in |*toOperandIndex|.
 //
 // If the operand was added earlier, the existing index is returned (this is
 // only expected for kind == INPUT).  Otherwise the operand is created in the
 // submodel, its value (if constant or omitted) is copied over, and it is
 // recorded in the bookkeeping list that matches its lifetime and |kind|.
 //
 // Returns ANEURALNETWORKS_NO_ERROR, or the error reported by the submodel.
 int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                               const ModelBuilder& fromModel, OperandKind kind) {
     // Already present in the submodel?
     const auto known = mOperandMap.find(fromOperandIndex);
     if (known != mOperandMap.end()) {
         nnAssert(kind == INPUT);
         *toOperandIndex = known->second;
         return ANEURALNETWORKS_NO_ERROR;
     }

     // New operand: it takes the next free index in the submodel.
     *toOperandIndex = mSubModel.operandCount();
     const std::pair<uint32_t, uint32_t> mapping(fromOperandIndex, *toOperandIndex);
     mOperandMap.insert(mapping);

     // Logs and reports whether a submodel call failed.
     const auto failed = [](int status) {
         if (status == ANEURALNETWORKS_NO_ERROR) {
             return false;
         }
         LOG(ERROR) << "Previous error occurred when partitioning the graph";
         return true;
     };

     // Create the operand in the submodel with the same type information.
     const Operand& operand = fromModel.getOperand(fromOperandIndex);
     const uint32_t rank = static_cast<uint32_t>(operand.dimensions.size());
     ANeuralNetworksOperandType type = {
         .type = static_cast<int32_t>(operand.type),
         .dimensionCount = rank,
         .dimensions = rank > 0 ? operand.dimensions.data() : nullptr,
         .scale = operand.scale,
         .zeroPoint = operand.zeroPoint
     };
     int n = mSubModel.addOperand(type);
     if (failed(n)) {
         return n;
     }

     // Transfer the value, or record how the operand crosses the step boundary.
     switch (operand.lifetime) {
         case OperandLifeTime::CONSTANT_COPY: {
             const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
             n = mSubModel.setOperandValue(mapping.second, data, operand.location.length);
             if (failed(n)) {
                 return n;
             }
             break;
         }
         case OperandLifeTime::CONSTANT_REFERENCE: {
             const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
             n = mSubModel.setOperandValueFromMemory(mapping.second, memory,
                                                      operand.location.offset,
                                                      operand.location.length);
             if (failed(n)) {
                 return n;
             }
             break;
         }
         case OperandLifeTime::NO_VALUE: {
             n = mSubModel.setOperandValue(mapping.second, nullptr, 0);
             if (failed(n)) {
                 return n;
             }
             break;
         }
         case OperandLifeTime::MODEL_INPUT:
             mModelInputs.push_back(mapping);
             break;
         case OperandLifeTime::TEMPORARY_VARIABLE:  // handled similarly to MODEL_OUTPUT
             if (kind != INPUT) {
                 // First sighting is as an output: a different partition may
                 // read it later, so remember which step defines it.
                 mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
             } else {
                 // First sighting is as an input: it must be defined by a
                 // different partition and is therefore an input to this one.
                 mTempsAsSubModelInputs.push_back(mapping);
             }
             break;
         case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
             if (kind != INPUT) {
                 // First sighting is as an output.
                 mModelOutputs.push_back(mapping);
             } else {
                 // First sighting is as an input: it must be defined by a
                 // different partition and is therefore an input to this one.
                 mOutputsAsSubModelInputs.push_back(mapping);
             }
             break;
         default:
             nnAssert(false);
             break;
     }

     return ANEURALNETWORKS_NO_ERROR;
 }

 // Copies operation |operationIndex| of |fromModel| into this step's submodel,
 // translating its operand indexes from the original model to the submodel.
 //
 // Operations are expected to arrive in topological order, which means:
 //
 // - An input may be new to this step if it is a model input, a constant, or
 //   an operand written by a different partition.
 //
 // - No output should have been seen before.
 //
 // Returns ANEURALNETWORKS_NO_ERROR, or the first error encountered.
 int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
     const Operation& operation = fromModel.getOperation(operationIndex);

     const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
     const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
     std::vector<uint32_t> inputs(inputCount);
     std::vector<uint32_t> outputs(outputCount);

     // Translates each original-model operand index into a submodel index,
     // adding the operand to the submodel when necessary.
     const auto translate = [this, &fromModel](const hidl_vec<uint32_t>& fromIndexes,
                                               std::vector<uint32_t>& toIndexes,
                                               OperandKind kind) -> int {
         uint32_t slot = 0;
         for (const uint32_t fromIndex : fromIndexes) {
             uint32_t toIndex = ~0U;
             const int status = addOperand(fromIndex, &toIndex, fromModel, kind);
             if (status != ANEURALNETWORKS_NO_ERROR) {
                 return status;
             }
             toIndexes[slot++] = toIndex;
         }
         return ANEURALNETWORKS_NO_ERROR;
     };

     int n = translate(operation.inputs, inputs, INPUT);
     if (n != ANEURALNETWORKS_NO_ERROR) {
         return n;
     }
     n = translate(operation.outputs, outputs, OUTPUT);
     if (n != ANEURALNETWORKS_NO_ERROR) {
         return n;
     }

     return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                    outputCount, outputs.data());
 }

 // Tells |stepExecutor| which input/output of the original model corresponds
 // to each of this step's submodel inputs/outputs, using the tables filled in
 // by finishSubModel().
 void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
     uint32_t subModelInput = 0;
     for (const uint32_t fromModelInput : mInputIndexSubModelToFromModel) {
         stepExecutor->mapInput(fromModelInput, subModelInput);
         ++subModelInput;
     }

     uint32_t subModelOutput = 0;
     for (const uint32_t fromModelOutput : mOutputIndexSubModelToFromModel) {
         stepExecutor->mapOutput(fromModelOutput, subModelOutput);
         ++subModelOutput;
     }
 }

 void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
     for (const auto& step : mSteps) {
         for (const auto& input : step->getTempsAsSubModelInputs()) {
             const uint32_t fromModelIndex = input.first;
             const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
             nnAssert(it != mTemporaryToDefiningStep.end());
             const uint32_t stepIndex = it->second;
             nnAssert(stepIndex < mSteps.size());
             mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
         }
     }
 }

 void ExecutionStep::logSubModel() const {
     VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;

     auto logRemapEntry = [](std::string &toLog, const std::pair<uint32_t, uint32_t>& e) {
         if (!toLog.empty()) {
             toLog += ", ";
         }
         toLog += "(";
         toLog += std::to_string(e.first);
         toLog += "->";
         toLog += std::to_string(e.second);
         toLog += ")";
     };

     auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
         std::string toLog;
         for (const auto& e : map) {
             logRemapEntry(toLog, e);
         }
         VLOG(COMPILATION) << name << ": " << toLog;
     };
     auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
         std::string toLog;
         for (const auto& e : set) {
             logRemapEntry(toLog, e);
         }
         VLOG(COMPILATION) << name << ": " << toLog;
     };

     logRemapVector("model inputs", mModelInputs);
     logRemapVector("model outputs", mModelOutputs);
     logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
     logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
     logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
 }

 // Shared helper for converting a step's model inputs (or model outputs).
 //
 // For each entry of |myModelInputsOrOutputs|, in order:
 //   - appends the submodel operand index (entry.second) to |inputsOrOutputs|;
 //   - appends to |inputOrOutputIndexSubModelToFromModel| the position that
 //     the original-model operand (entry.first) occupies in the original
 //     model's input (or output) list.
 static void convertModelInputsOrOutputs(
         // IN: mModel{Inputs|Outputs}
         const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
         // IN: fromModel->{input|output}Count()
         uint32_t                              fromModelInputOrOutputCount,
         // IN: fromModel->get{Input|Output}OperandIndex
         std::function<uint32_t(uint32_t)>     fromModelGetInputOrOutputOperandIndex,
         // OUT: for v : mModel{Inputs|Outputs} : v.second
         std::vector<uint32_t>*                inputsOrOutputs,
         // OUT: submodel input-or-output index to original model input-or-output index
         std::vector<uint32_t>*                inputOrOutputIndexSubModelToFromModel) {
     // Original-model operand index -> position in the input (or output) list.
     std::map<uint32_t, uint32_t> positionOfOperand;
     for (uint32_t position = 0; position < fromModelInputOrOutputCount; ++position) {
         positionOfOperand[fromModelGetInputOrOutputOperandIndex(position)] = position;
     }

     for (const auto& remap : myModelInputsOrOutputs) {
         inputsOrOutputs->push_back(remap.second);

         // An operand that is not in the table yields position 0, matching a
         // value-initialized map lookup.
         const auto found = positionOfOperand.find(remap.first);
         const uint32_t position = (found == positionOfOperand.end()) ? 0 : found->second;
         inputOrOutputIndexSubModelToFromModel->push_back(position);
     }
 }

 // Completes this step's submodel once all of its operations have been added.
 //
 // - Declares the submodel's inputs and outputs (in the orders documented
 //   below) and finishes the submodel.
 // - Fills in the tables that map submodel inputs/outputs back to the inputs
 //   and outputs of |fromModel|.
 // - Sets |*hasOutputOfUnknownSize| to true if any temporary exposed as a
 //   submodel output has a dimension equal to 0; the flag is never cleared here.
 // - If the step has a device (i.e. is not on the CPU), compiles the submodel
 //   for that device with |executionPreference|.
 //
 // Returns ANEURALNETWORKS_NO_ERROR on success, ANEURALNETWORKS_BAD_STATE if a
 // model output used as a submodel input cannot be found among the outputs of
 // |fromModel|, or the error reported by the submodel or by compilation.
 int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference) {
     if (VLOG_IS_ON(COMPILATION)) {
         logSubModel();
     }

     mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());

     // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
     // Output order: mModelOutputs, mTempsAsSubModelOutputs
     //
     // ExecutionPlan::next() depends on these orderings.

     std::vector<uint32_t> inputs;
     convertModelInputsOrOutputs(mModelInputs,
                                 fromModel->inputCount(),
                                 [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                 &inputs,
                                 &mInputIndexSubModelToFromModel);
     for (const auto& subModelInput : mTempsAsSubModelInputs) {
         inputs.push_back(subModelInput.second);
     }
     for (const auto& subModelInput : mOutputsAsSubModelInputs) {
         inputs.push_back(subModelInput.second);
     }

     std::vector<uint32_t> outputs;
     convertModelInputsOrOutputs(mModelOutputs,
                                 fromModel->outputCount(),
                                 [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                 &outputs,
                                 &mOutputIndexSubModelToFromModel);
     for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
         outputs.push_back(subModelOutput.second);
         const Operand& operand = mSubModel.getOperand(subModelOutput.second);
         // A dimension of 0 means the size of this output is not known.
         for (uint32_t dimension : operand.dimensions) {
             if (dimension == 0) {
                 *hasOutputOfUnknownSize = true;
                 VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                                 << " of original graph) has unknown size: "
                                 << toString(operand);
                 break;
             }
         }
     }

     {
         // Use data() rather than &v[0]: indexing element 0 of an empty vector
         // is undefined behavior, and either list may be empty.
         int n = mSubModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                    outputs.size(), outputs.data());
         if (n != ANEURALNETWORKS_NO_ERROR) {
             return n;
         }
         n = mSubModel.finish();
         if (n != ANEURALNETWORKS_NO_ERROR) {
             return n;
         }
     }

     {
         // Compute mOutputsAsSubModelInputsIndexToFromModel.

         std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
         for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
             fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
         }

         for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
             const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
             const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
             if (it == fromModelOperandIndexToOutputIndex.end()) {
                 LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
                            << " in main model output operand list";
                 return ANEURALNETWORKS_BAD_STATE;
             }
             mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
         }
     }

     // TODO: Move compilation elsewhere?

     // A null device means this step runs on the CPU; nothing to compile.
     if (mDevice == nullptr) {
         return ANEURALNETWORKS_NO_ERROR;
     }

     VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation";
     return compile(mDevice, &mSubModel, executionPreference, &mPreparedSubModel);
 }

 // Logs this step's index, target device and submodel when COMPILATION
 // verbose logging is enabled; otherwise does nothing.
 void ExecutionStep::dump() const {
     // Building the HIDL model is only useful for logging, so skip that work
     // entirely when logging is off.
     if (!VLOG_IS_ON(COMPILATION)) {
         return;
     }

     Model model;
     mSubModel.setHidlModel(&model);
     VLOG(COMPILATION) << "ExecutionStep#" << mIndex
                       << " for " << (mDevice == nullptr ? "CPU" : mDevice->getName());
     logModelToInfo(model);
 }

 // Finishes every step of a multi-step plan.
 //
 // Returns the first error reported by a step, ANEURALNETWORKS_OP_FAILED if
 // any step has a submodel output of unknown size, and otherwise marks the
 // body as successfully finished and returns ANEURALNETWORKS_NO_ERROR.
 int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel,
                                         int32_t executionPreference) {
     // Steps must know which temporaries they export before they are finished.
     findTempsAsSubModelOutputs();

     for (const auto& step : mSteps) {
         const int status = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize,
                                                 executionPreference);
         if (status != ANEURALNETWORKS_NO_ERROR) {
             VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed";
             return status;
         }
     }

     if (!mHasSubModelOutputOfUnknownSize) {
         mSuccessfulFinish = true;
         return ANEURALNETWORKS_NO_ERROR;
     }

     VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize";
     return ANEURALNETWORKS_OP_FAILED;
 }

 // Finishes a single-step plan.  With no device (CPU) there is nothing to
 // compile and the plan succeeds immediately; otherwise the whole model is
 // compiled for the device and success mirrors the compilation result.
 int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel,
                                       int32_t executionPreference) {
     int status = ANEURALNETWORKS_NO_ERROR;
     if (mDevice != nullptr) {
         VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
         status = compile(mDevice, mModel, executionPreference, &mPreparedModel);
     }
     mSuccessfulFinish = (status == ANEURALNETWORKS_NO_ERROR);
     return status;
 }

 // Finishes the plan by delegating to its body, which must already exist.
 int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) {
     nnAssert(mBody != nullptr);
     const int status = mBody->finish(fromModel, executionPreference);
     return status;
 }

 // Sets up the state for stepping through |plan| on behalf of
 // |executionBuilder|.  When |totalSizeOfTemporaries| is nonzero, a memory
 // region of that size is created for the temporaries; if that fails, the
 // controller is put into the bad-step state.
 ExecutionPlan::Controller::Controller(
     const ExecutionPlan* plan,
     const ExecutionBuilder* executionBuilder,
     std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs,
     uint32_t totalSizeOfTemporaries) :
         mPlan(plan), mExecutionBuilder(executionBuilder),
         mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) {
     if (totalSizeOfTemporaries == 0) {
         return;  // no temporaries to allocate
     }

     const int status = mTemporaries.create(totalSizeOfTemporaries);
     if (status != ANEURALNETWORKS_NO_ERROR) {
         LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
         mNextStepIndex = kBadStepIndex;
     }
 }

 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
     const ExecutionBuilder* executionBuilder) const {
     nnAssert((mState == EMPTY) == (mBody == nullptr));
     if (mBody && !mBody->mSuccessfulFinish) {
         VLOG(EXECUTION) << "ExecutionPlan::makeController -- unsuccessful finish";
         return std::shared_ptr<Controller>(nullptr);
     }

     // Create the layout for a Memory object big enough for to hold
     // every TEMPORARY in the original model that is live across
     // partition boundaries.
     //
     // TODO: Rethink this approach for managing temporaries.  Some
     // alternatives:
     //
     // 1) Adopt a memory layout scheme analogous to stack allocation,
     // where objects of non-overlapping lifetime can occupy the same
     // storage.  We would still have a single Memory object in this
     // case.
     //
     // 2) Do something like what CpuExecutor does, and do allocations
     // and deallocations on the fly (during execution) before first
     // reference and after last reference, respectively.  This would
     // mean having one Memory object per TEMPORARY; or, in a more
     // complicated implementation, one Memory object per set of
     // temporaries that have the same lifetime.  Note that the Android
     // system limits the number of shared memory objects, which are
     // what our Memory objects represent.
     //
     uint32_t totalSizeOfTemporaries = 0;
     std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
     if (mState == COMPOUND) {
         const ModelBuilder* fromModel = executionBuilder->getModel();
         for (const auto& step : compound()->mSteps) {
             for (const auto& output: step->getTempsAsSubModelOutputs()) {
                 const uint32_t fromModelOperandIndex = output.first;
                 const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                 if (subModelInputsAndOutputs == nullptr) {
                     subModelInputsAndOutputs =
                             std::make_shared<Controller::SubModelInputsAndOutputsType>();
                 }
                 const uint32_t size = sizeOfData(fromModelOperand);
                 totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                 subModelInputsAndOutputs->insert(std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                 totalSizeOfTemporaries += size;
             }
         }
         if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
             for (const auto& io : *subModelInputsAndOutputs) {
                 VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
                                 << ", offset = " << io.second;
             }
         }
     }

     return std::shared_ptr<Controller>(new Controller(this, executionBuilder,
                                                       subModelInputsAndOutputs,
                                                       totalSizeOfTemporaries));
 }


 // TODO: Find a better way to provide this functionality.
 int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                             std::shared_ptr<StepExecutor>* executor) const {
     *executor = nullptr;

     VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
                     << "): mNextStepIndex = " << controller->mNextStepIndex;

     if (controller->mNextStepIndex == 0) {
         // We haven't called next().
         return ANEURALNETWORKS_OP_FAILED;
     }

     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
         // The last call to next() did not produce an executor.
         return ANEURALNETWORKS_OP_FAILED;
     }

     --controller->mNextStepIndex;
     return next(controller, executor);
 }

 // Advances |controller| to the next step of the plan.
 //
 // On return, |*executor| is a new StepExecutor for that step, or null if
 // there are no more steps or an error occurred.  Reaching the end of the plan
 // returns ANEURALNETWORKS_NO_ERROR with a null executor and puts the
 // controller into the bad-step state, so that a further call returns
 // ANEURALNETWORKS_OP_FAILED.
 int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                         std::shared_ptr<StepExecutor>* executor) const {
     *executor = nullptr;

     VLOG(EXECUTION) << "ExecutionPlan::next("
                     << SHOW_IF_DEBUG(controller << ", " << executor)
                     << "): mNextStepIndex = " << controller->mNextStepIndex;

     // The controller is already past the end, or in an error state.
     if (controller->mNextStepIndex == Controller::kBadStepIndex) {
         return ANEURALNETWORKS_OP_FAILED;
     }

     // An empty plan has no steps: the first call is already the end.
     if (mState == EMPTY) {
         nnAssert(controller->mNextStepIndex == 0);  // end
         controller->mNextStepIndex = Controller::kBadStepIndex;
         return ANEURALNETWORKS_NO_ERROR;
     }

     if (mState == SIMPLE) {
         if (controller->mNextStepIndex == 0) {
             // First (and only) step.
             auto simpleBody = static_cast<const SimpleBody*>(mBody);
             *executor = std::make_shared<StepExecutor>(
                 controller->mExecutionBuilder,
                 simpleBody->mModel,
                 (simpleBody->mDevice == nullptr ? nullptr : simpleBody->mDevice->getInterface()),
                 simpleBody->mPreparedModel);
             (*executor)->mapInputsAndOutputsTrivially();
             controller->mNextStepIndex = 1;
             return ANEURALNETWORKS_NO_ERROR;
         }

         nnAssert(controller->mNextStepIndex == 1);  // end
         controller->mNextStepIndex = Controller::kBadStepIndex;
         return ANEURALNETWORKS_NO_ERROR;
     }

     // Remaining case: a compound (multi-step) plan.
     auto compoundBody = compound();

     if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
         // end
         controller->mNextStepIndex = Controller::kBadStepIndex;
         return ANEURALNETWORKS_NO_ERROR;
     }

     // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
     // Output order: model outputs, temps as submodel outputs
     //
     // ExecutionStep::finishSubModel() establishes these orderings.

     const auto step = compoundBody->mSteps[controller->mNextStepIndex];
     *executor = std::make_shared<StepExecutor>(
         controller->mExecutionBuilder,
         step->getSubModel(),
         (step->getDevice() == nullptr ? nullptr : step->getDevice()->getInterface()),
         step->getPreparedSubModel());
     step->mapInputsAndOutputs(*executor);
     // mSubModelInputsAndOutputs holds the offset of each temporary within
     // controller->mTemporaries; it is null when no temporary crosses a step
     // boundary.
     if (controller->mSubModelInputsAndOutputs != nullptr) {
         {
             // Tell executor about temps as submodel outputs.

             // Temps follow the model outputs in the submodel's output list.
             const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
             const auto& subModelOutputs = step->getTempsAsSubModelOutputs();

             uint32_t idx = 0;
             for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                 const uint32_t fromModelOperandIndex = I->first;
                 const uint32_t offsetOfTemporary =
                     controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                 int n = (*executor)->setOutputFromTemporaryMemory(
                     firstSubModelOutputIndex + idx,
                     &controller->mTemporaries,
                     offsetOfTemporary);
                 if (n != ANEURALNETWORKS_NO_ERROR) {
                     // Leave the controller unusable after a failure.
                     controller->mNextStepIndex = Controller::kBadStepIndex;
                     return n;
                 }
             }
         }
         {
             // Tell executor about temps as submodel inputs.

             // Temps follow the model inputs in the submodel's input list.
             const size_t firstSubModelInputIndex = step->getModelInputs().size();
             const auto& subModelInputs = step->getTempsAsSubModelInputs();

             uint32_t idx = 0;
             for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                 const uint32_t fromModelOperandIndex = I->first;
                 const uint32_t offsetOfTemporary =
                     controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                 int n = (*executor)->setInputFromTemporaryMemory(
                     firstSubModelInputIndex + idx,
                     &controller->mTemporaries,
                     offsetOfTemporary);
                 if (n != ANEURALNETWORKS_NO_ERROR) {
                     // Leave the controller unusable after a failure.
                     controller->mNextStepIndex = Controller::kBadStepIndex;
                     return n;
                 }
             }
         }
     }
     {
         // Tell executor about outputs as submodel inputs.

         // These come last in the submodel's input list, after the model
         // inputs and the temps.
         const size_t firstOutputsAsSubModelInputIndex =
                 step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
         const auto& outputsAsSubModelInputsIndexToFromModel =
                 step->getOutputsAsSubModelInputsIndexToFromModel();
         for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
             uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
             (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
         }
     }

     controller->mNextStepIndex++;
     return ANEURALNETWORKS_NO_ERROR;
 }

 // Appends a new step for |device| to a compound plan and returns it.  An
 // empty plan becomes compound first; the plan must not be a simple one.
 std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
     nnAssert(mState != SIMPLE);
     if (mState == EMPTY) {
         mBody = new CompoundBody();
         mState = COMPOUND;
     }

     auto& steps = compound()->mSteps;
     // The new step's index is its position in the list.
     steps.push_back(std::make_shared<ExecutionStep>(this, steps.size(), device));
     return steps.back();
 }

 void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                      const ModelBuilder* model) {
     nnAssert(mState == EMPTY);
     mBody = new SimpleBody(device, model);
     mState = SIMPLE;
 }

 // Logs a description of the plan: "EMPTY" when there is no body, otherwise
 // whatever the body's own dump() produces.
 void ExecutionPlan::dump() const {
     if (!mBody) {
         VLOG(COMPILATION) << "EMPTY";
         return;
     }
     mBody->dump();
 }

 // Test hook: reports the plan's kind.  A SIMPLE or COMPOUND plan whose body
 // did not finish successfully is reported as ERROR.
 ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
     if (mState == EMPTY) {
         return Kind::EMPTY;
     }

     if (mState == SIMPLE || mState == COMPOUND) {
         nnAssert(mBody);
         if (!mBody->mSuccessfulFinish) {
             return Kind::ERROR;
         }
         return (mState == SIMPLE) ? Kind::SIMPLE : Kind::COMPOUND;
     }

     nnAssert(!"unexpected state");
     return Kind::ERROR;
 }

 std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
     nnAssert(mState == SIMPLE);
     return static_cast<const SimpleBody*>(mBody)->mDevice;
 }

 // Test-only accessor: exposes the step list held by the body that compound() returns.
 const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
     const auto& stepList = compound()->mSteps;
     return stepList;
 }

 // Test-only accessor: forwards to the body's hasSubModelOutputsOfUnknownSize().
 // NOTE: mBody is dereferenced without a null check, so the caller must make
 // sure the plan actually has a body before calling this.
 bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
     const bool hasUnknownSize = mBody->hasSubModelOutputsOfUnknownSize();
     return hasUnknownSize;
 }

 // Logs which device this single-step plan targets; a null device is shown as "CPU".
 void ExecutionPlan::SimpleBody::dump() const {
     const bool onCpu = (mDevice == nullptr);
     VLOG(COMPILATION) << "SIMPLE for " << (onCpu ? "CPU" : mDevice->getName());
 }

 void ExecutionPlan::CompoundBody::dump() const {
     for (const auto& step : mSteps) {
         step->dump();
     }
 }

 // Builds |plan| for this model by deciding which device runs each operation.
 //
 // |devices| lists the non-CPU devices only; the CPU is an implicit extra
 // device whose index equals devices.size(). |preference| is forwarded to the
 // per-operation device selection and to plan->finish().
 //
 // Returns:
 //   - ANEURALNETWORKS_BAD_DATA if only the CPU is usable (or the model has no
 //     operations) and the model contains an OEM operation;
 //   - the error from findBestDeviceForEachOperation() or
 //     ExecutionStep::addOperation() if either fails;
 //   - otherwise the result of plan->finish().
 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                    uint32_t preference, ExecutionPlan* plan) const {
     // The partitioning below is heuristic. It was judged good enough for the
     // first release.

     // deviceCount counts the CPU as well, even though the CPU has no entry in
     // devices[]: it is the number of HAL devices plus one.
     const size_t nonCpuDeviceCount = devices.size();
     const size_t deviceCount = nonCpuDeviceCount + 1;
     const size_t operationCount = mOperations.size();

     VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                       << ", operationCount = " << operationCount;

     // With only the CPU available, or with an empty graph, there is nothing to partition.
     const bool nothingToPartition = (nonCpuDeviceCount == 0) || (operationCount == 0);
     if (nothingToPartition) {
         // An OEM operation cannot be handled on this path.
         if (mHasOEMOperation) {
             LOG(ERROR) << "No driver can do the OEM op";
             return ANEURALNETWORKS_BAD_DATA;
         }
         plan->becomeSingleStep(nullptr /* CPU */, this);
         return plan->finish(this, preference);
     }

     // Decide where each operation executes best. Each entry is an index into
     // devices[], with devices.size() standing for the CPU.
     std::vector<int> bestDeviceForOperation(operationCount);
     const int selectionStatus = findBestDeviceForEachOperation(preference, devices, deviceCount,
                                                                &bestDeviceForOperation);
     if (selectionStatus != ANEURALNETWORKS_NO_ERROR) {
         return selectionStatus;
     }

     // When every operation picked the same device, no split is needed.
     const auto firstMismatch =
             std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                                std::not_equal_to<int>());
     if (firstMismatch == bestDeviceForOperation.end()) {
         const int onlyDeviceIndex = bestDeviceForOperation[0];
         const bool runsOnCpu = (static_cast<size_t>(onlyDeviceIndex) == nonCpuDeviceCount);
         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                           << onlyDeviceIndex << " = "
                           << (runsOnCpu ? "CPU" : devices[onlyDeviceIndex]->getName());
         plan->becomeSingleStep(runsOnCpu ? nullptr : devices[onlyDeviceIndex], this);
         return plan->finish(this, preference);
     }

     // Otherwise the work really has to be split across devices.

     // One queue per device (CPU included) of operations that are ready to run there.
     std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

     // Puts a ready operation on the queue of the device chosen for it.
     auto enqueueReadyOperation = [&](uint32_t readyOperation) {
         int targetDevice = bestDeviceForOperation[readyOperation];
         perDeviceQueue[targetDevice].push(readyOperation);
         VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << readyOperation << " onto "
                           << targetDevice;
     };

     // Returns the index of a device with pending operations, or -1 if every
     // queue is empty. The scan starts at the CPU (the highest index): letting
     // the CPU run first gives it the chance to prepare more of the inputs the
     // other devices need, which tends to maximize the graph sent to them.
     auto pickDeviceWithPendingWork = [&]() -> int {
         int candidate = static_cast<int>(deviceCount - 1);
         while (candidate >= 0 && perDeviceQueue[candidate].empty()) {
             --candidate;
         }
         return candidate;
     };

     OperandTracker tracker(this, enqueueReadyOperation);
     // Every pass through this loop produces one execution step.
     for (;;) {
         // Choose the device this step is for.
         int deviceIndex = pickDeviceWithPendingWork();
         VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
         if (deviceIndex < 0) {
             break;
         }

         // A null device pointer stands for the CPU.
         std::shared_ptr<Device> device;
         if (static_cast<size_t>(deviceIndex) < nonCpuDeviceCount) {
             device = devices[deviceIndex];
         }

         // Drain this device's queue into the new step. Marking an operation
         // as processed may enqueue further operations, possibly on this same queue.
         std::shared_ptr<ExecutionStep> step = plan->createNewStep(device);
         auto& pending = perDeviceQueue[deviceIndex];
         while (!pending.empty()) {
             uint32_t operationIndex = pending.front();
             pending.pop();
             int addStatus = step->addOperation(operationIndex, *this);
             if (addStatus != ANEURALNETWORKS_NO_ERROR) {
                 LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                 return addStatus;
             }
             tracker.markProcessed(operationIndex, enqueueReadyOperation);
         }
     }

     int finishStatus = plan->finish(this, preference);
     if (VLOG_IS_ON(COMPILATION)) {
         // Verbose logging only: show the original model next to the resulting plan.
         Model model;
         setHidlModel(&model);
         VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
         logModelToInfo(model);
         plan->dump();
     }
     return finishStatus;
 }

 // Returns the performance figures |device| advertises for the operation at
 // |operationIndex|, chosen by the type of the operation's first input operand.
 //
 // Float operand types use the device's float32 performance, or its relaxed
 // float32-to-float16 performance when mRelaxComputationFloat32toFloat16 is
 // set. Every other listed type uses the quantized8 performance. An unlisted
 // type trips an assertion and also falls back to the quantized8 performance.
 PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                  uint32_t operationIndex) const {
     const Operation& op = getOperation(operationIndex);
     // TODO This assumes that the type is dictated by the first operand. This is
     // currently the case but is not a safe assumption to make in the long term.
     const OperandType firstInputType = mOperands[op.inputs[0]].type;
     switch (firstInputType) {
         case OperandType::FLOAT32:
         case OperandType::TENSOR_FLOAT16:
         case OperandType::TENSOR_FLOAT32:
             if (!mRelaxComputationFloat32toFloat16) {
                 return device->getFloat32Performance();
             }
             return device->getRelaxedFloat32toFloat16Performance();
         case OperandType::INT32:
         case OperandType::UINT32:
         case OperandType::BOOL:
         case OperandType::TENSOR_INT32:
         case OperandType::TENSOR_QUANT8_ASYMM:
         case OperandType::TENSOR_QUANT16_SYMM:
         // For OEM, the real selection will be made from who can run the operand.
         case OperandType::OEM:
         case OperandType::TENSOR_OEM_BYTE:
             break;
         default:
             // Unknown operand type: flag it, then fall back to the quantized figures.
             nnAssert(false);
             break;
     }
     return device->getQuantized8Performance();
 }

 namespace {
 // Records, for one device, which operations of a model that device reports
 // it can execute. Call initialize() once, then query with check().
 class CanDo {
 public:
     CanDo() = default;

     // Asks |device| which operations of |model| it supports and stores the
     // per-operation answer. The device is only borrowed for the duration of
     // the call, so it is taken by const reference to avoid a needless
     // shared_ptr copy.
     void initialize(const ModelBuilder* model, const std::shared_ptr<Device>& device) {
         Model hidlModel;
         model->setHidlModel(&hidlModel);
         device->getSupportedOperations(hidlModel, &mSupportsOperationByIndex);
     }

     // Returns the stored answer for the operation at |operationIndex|.
     // The index is not range-checked here.
     bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

 private:
     // Filled in by initialize(); indexed by operation index.
     hidl_vec<bool> mSupportsOperationByIndex;
 };
 }  // anonymous namespace

 int ModelBuilder::findBestDeviceForEachOperation(
         uint32_t preference,
         const std::vector<std::shared_ptr<Device>>& devices,
         const size_t deviceCount,
         std::vector<int>* bestDeviceForOperation) const {

     // Note that deviceCount includes CPU, which has no entry in devices[]
     const size_t nonCpuDeviceCount = deviceCount - 1;

     std::vector<CanDo> canDo(nonCpuDeviceCount);
     for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
         canDo[deviceIndex].initialize(this, devices[deviceIndex]);
     }

     // Figure out the best driver for each operation.
     const size_t operationCount = mOperations.size();
     for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
         // Find which non-CPU device gives the best performance for this operation.
         int bestChoice = -1;
         float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
         for (size_t deviceIndex = 0; deviceIndex < nonCpuDeviceCount; deviceIndex++) {
             const auto& device = devices[deviceIndex];
             if (canDo[deviceIndex].check(operationIndex)) {
                 const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                 const float perfVal =
                             (preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage
                                                                             : perf.execTime);
                 if (bestChoice < 0 || perfVal < bestPerfVal) {
                     bestChoice = deviceIndex;
                     bestPerfVal = perfVal;
                 }
             } else {
                 // Somewhat noisy logging, but only place where the user of
                 // NNAPI can get feedback on why an operation was not run on a
                 // specific device.
                 // Logs O(operationCount * nonCpuDeviceCount) times, but
                 // typically nonCpuDeviceCount is very small.
                 VLOG(COMPILATION) << "Device " << device->getName()
                                   << " can't do operation "
                                   << toString(getOperation(operationIndex).type);
             }
         }
         // If it's the OEM op, we'd better have a device able to do it.
         if (mOperations[operationIndex].type == OperationType::OEM_OPERATION) {
             if (bestChoice < 0) {
                 LOG(ERROR) << "No driver can do the OEM op";
                 return ANEURALNETWORKS_BAD_DATA;
             }
         } else {
             // If no driver has been found, or if the best driver is not better than the CPU,
             // prefer the CPU. Since the performance is a ratio compared to the CPU performance,
             // by definition the performance of the CPU is 1.0.
             if (bestChoice < 0 || bestPerfVal >= 1.0) {
                 bestChoice = nonCpuDeviceCount;  // The ID of the CPU.
             }
         }

         (*bestDeviceForOperation)[operationIndex] = bestChoice;
         VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                           << toString(getOperation(operationIndex).type)
                           << ") = "
                           << (*bestDeviceForOperation)[operationIndex];
     }
     return ANEURALNETWORKS_NO_ERROR;
 }

 } // namespace nn
 } // namespace android