Create a specialized path for execution plans with a simple body.
This CL creates a specialized and simplified execution path for an
execution plan with a SIMPLE body, i.e. a plan in which the whole model
is executed on a single device.
Having this simplified path can avoid some runtime overheads that only
apply to a COMPOUND execution plan, including:
- ExecutionPlan::makeController/next
- StepExecutor::updateOutputShapes
It also makes it simpler for subsequent CLs to apply optimizations
specific to SIMPLE execution plans.
Additionally, this CL removes the use of ExecutionCallback in the sync
execution path.
Bug: 184073769
Test: NNT_static
Change-Id: If57bd9b9ea7f01e6e489d0b7977ab040691b2397
diff --git a/runtime/CompilationBuilder.cpp b/runtime/CompilationBuilder.cpp
index 749cd22..79dedbd 100644
--- a/runtime/CompilationBuilder.cpp
+++ b/runtime/CompilationBuilder.cpp
@@ -291,7 +291,11 @@
*execution = nullptr;
return ANEURALNETWORKS_BAD_STATE;
}
- *execution = new (std::nothrow) ExecutionBuilder(this);
+ if (mPlan.isSimple()) {
+ *execution = new (std::nothrow) SimpleExecutionBuilder(this);
+ } else {
+ *execution = new (std::nothrow) CompoundExecutionBuilder(this);
+ }
return (*execution ? ANEURALNETWORKS_NO_ERROR : ANEURALNETWORKS_OUT_OF_MEMORY);
}
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 3985d12..f75a205 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -38,6 +38,7 @@
#include <utility>
#include <vector>
+#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
@@ -151,13 +152,23 @@
: mCompilation(compilation),
mModel(compilation->mModel),
mPlan(&compilation->mPlan),
- mPartitioning(compilation->mPartitioning),
+ mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
mInputs(mModel->inputCount()),
mOutputs(mModel->outputCount()) {
VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
<< " inputs and " << mOutputs.size() << " outputs";
}
+SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
+ : ExecutionBuilder(compilation) {
+ CHECK(mPlan->isSimple());
+}
+
+CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
+ : ExecutionBuilder(compilation) {
+ CHECK(mPlan->isCompound());
+}
+
const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
return mPlan->getSourceModels().getModel(index);
}
@@ -575,18 +586,46 @@
return {n2, std::move(outputShapes), timing, executor};
}
-static void asyncStartComputePartitioned(
- ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
- std::shared_ptr<ExecutionPlan::Controller> controller, bool allowCpuFallback,
- const OptionalTimePoint& deadline,
- const std::shared_ptr<ExecutionCallback>& executionCallback) {
- CHECK(executionBuilder != nullptr);
- VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)";
+std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal(
+ const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal");
+ VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal";
- std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
- Timing timing;
- // Disallow CPU fallback when the ExecutionPlan is simple on CPU.
- allowCpuFallback &= !plan.isSimpleCpu();
+ auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr;
+ auto executor = mPlan->makeStepExecutor(this);
+ auto [n, outputShapes, timing] = executor->compute(deadline, burstController);
+
+ if (n == ANEURALNETWORKS_NO_ERROR) {
+ return {n, std::move(outputShapes), timing};
+ }
+
+ // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable.
+ if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+ return {n, std::move(outputShapes), {}};
+ }
+
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!mAllowCpuFallback) {
+ return {n, {}, {}};
+ }
+
+ // If CPU execution was already attempted, do not perform CPU fallback.
+ if (executor->isCpu()) {
+ return {n, {}, {}};
+ }
+
+ // If the code has reached this point, a potentially recoverable error
+ // occurred during the execution. Do an execution fallback on the CPU.
+ return cpuFallbackFull(this);
+}
+
+std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal(
+ const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal");
+ VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)";
+
+ auto controller = mPlan->makeController(this, burstBuilder);
+ std::vector<OutputShape> outputShapes = getInitialOutputShapes();
// On this iteration, do I need to repeat the previous step because it
// reported insufficient size?
@@ -599,29 +638,27 @@
std::shared_ptr<StepExecutor> executor;
SharedBurst burstController;
int n = doInsufficientSizeFallback
- ? plan.fallback(controller, &executor, &burstController, &outputShapes)
- : plan.next(controller, &executor, &burstController, &outputShapes);
+ ? mPlan->fallback(controller, &executor, &burstController, &outputShapes)
+ : mPlan->next(controller, &executor, &burstController, &outputShapes);
doInsufficientSizeFallback = false;
if (n != ANEURALNETWORKS_NO_ERROR) {
// During the interpreted execution of control flow, a loop timeout
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowCpuFallback && !missedDeadline) break;
- executionCallback->notify(convertResultCodeToErrorStatus(n), {}, {});
- return;
+ if (mAllowCpuFallback && !missedDeadline) break;
+ return {n, {}, {}};
}
// If the code reached the end of the plan without error, then return
// with no error.
if (executor == nullptr) {
- executionCallback->notify(ErrorStatus::NONE, outputShapes, timing);
- return;
+ return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}};
}
const bool executorIsCpu = executor->isCpu();
// Attempt to execute a single step of the execution.
- auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
+ auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController);
// Update global outputs and dynamic temporaries.
StepExecutor::UpdateOutputShapes updateOutputShapes = {};
@@ -638,10 +675,6 @@
stepN = ANEURALNETWORKS_OP_FAILED;
} else {
CHECK(executor->areDynamicTemporariesAllocated());
- // We only support collection of timing information in the case
- // of a single step, so it's safe to just keep track of the last
- // step's timing information.
- timing = stepTiming;
continue;
}
}
@@ -654,9 +687,7 @@
// - At least one main model output is not of sufficient size; or
// - we didn't learn anything new about dynamic temporaries.
// Neither of these is recoverable, so end execution.
- const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
- executionCallback->notify(stepStatus, outputShapes, {});
- return;
+ return {stepN, outputShapes, {}};
}
// Every main model output is of sufficient size. This implies that
// at least one dynamic temporary is not of sufficient size. This
@@ -666,30 +697,24 @@
}
// If CPU fallback is not allowed and there was an error, end execution.
- if (!allowCpuFallback) {
- const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
- executionCallback->notify(stepStatus, {}, {});
- return;
+ if (!mAllowCpuFallback) {
+ return {stepN, {}, {}};
}
- // If CPU execution was already attempted, either:
- // (1) perform a full CPU fallback if the plan is not simple, or
- // (2) return from the function with an error
+ // If CPU execution was already attempted, perform a full CPU fallback.
if (executorIsCpu) {
- if (!plan.isSimple()) break;
- executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, {});
- return;
+ break;
}
// If the code reaches this point, attempt a partial fallback to CPU.
- CHECK(allowCpuFallback);
+ CHECK(mAllowCpuFallback);
if (updateOutputShapes.zeroSizedInput) {
// Do not attempt a partial fallback.
break;
}
while (true) {
- auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
- cpuFallbackPartial(plan, controller);
+ auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] =
+ cpuFallbackPartial(*mPlan, controller);
// Update global outputs and dynamic temporaries.
StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
@@ -708,10 +733,6 @@
break;
}
CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = fallbackTiming;
goto nextStep;
}
@@ -724,9 +745,7 @@
// - At least one main model output is not of sufficient size; or
// - we didn't learn anything new about dynamic temporaries.
// Neither of these is recoverable, so end execution.
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, outputShapes, {});
- return;
+ return {fallbackN, outputShapes, {}};
}
// Every main model output is of sufficient size. This implies
// that at least one dynamic temporary is not of sufficient
@@ -734,13 +753,6 @@
continue;
}
- // Do not fallback twice if the ExecutionPlan is simple.
- if (plan.isSimple()) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, {}, {});
- return;
- }
-
// If the code reaches this point, then there was an error with the
// fallback. In this case, attempt full fallback.
break;
@@ -758,20 +770,68 @@
// If the code has reached this point, a potentially recoverable error
// occurred during the step executions. Instead, do a full execution
// fallback on the CPU.
- auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
- const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
- executionCallback->notify(fullStatus, fullOutputShapes, fullTiming);
+ return cpuFallbackFull(this);
}
-// In case of partitioned execution, startComputeFenced call will return the sync
+static bool waitForSyncFences(const std::vector<int>& waitFor) {
+ for (int syncFd : waitFor) {
+ if (syncFd > 0) {
+ auto r = syncWait(syncFd, -1);
+ if (r != FenceState::SIGNALED) {
+ VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal(
+ const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+ const OptionalTimePoint& deadline) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal");
+ VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal";
+
+ auto executor = mPlan->makeStepExecutor(this);
+ auto [n, syncFd, callback] =
+ executor->computeFenced(waitFor, timeoutDurationAfterFence, deadline);
+
+ if (n == ANEURALNETWORKS_NO_ERROR) {
+ return {ANEURALNETWORKS_NO_ERROR, syncFd, callback};
+ }
+
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!mAllowCpuFallback) {
+ return {n, -1, nullptr};
+ }
+
+ // If CPU execution was already attempted, return from the function with an error.
+ if (executor->isCpu()) {
+ return {n, -1, nullptr};
+ }
+
+ // If the code has reached this point, a potentially recoverable error
+ // occurred during the step executions. Instead, do a full execution
+ // fallback on the CPU.
+ VLOG(EXECUTION) << "Performing full fallback on the CPU.";
+ if (!waitForSyncFences(waitFor)) {
+ return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
+ }
+ auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this);
+ reportTimingWithoutFencedExecutionCallback(fallbackTiming);
+ return {fallbackN, -1, nullptr};
+}
+
+// In the case of partitioned execution, computeFencedInternal returns the sync
+// fence and the fenced compute callback returned from the last partition.
-// Any failed partition will result in the whole execution fallback to CPU if
-// allowCpuFallback is set to true.
-static std::tuple<int, int, ExecuteFencedInfoCallback> startComputeFenced(
- ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
- std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
- uint64_t timeoutDurationAfterFence, const OptionalTimePoint& deadline,
- bool allowCpuFallback) {
+// Any failed partition will cause the whole execution to fall back to the CPU if
+// mAllowCpuFallback is set to true.
+std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal(
+ const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+ const OptionalTimePoint& deadline) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal");
+ VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)";
+
// We should have detected this earlier in the call chain and fallen back to
// non-fenced execution. This is an implementation limitation: In order to
// support dynamic temporarires in this code, we'd need to implement
@@ -779,35 +839,31 @@
// - If a partition has outputs of unknown size, execute that partition in a
// non fenced fashion, just as if it were scheduled on a driver that does
// not support fenced execution.
- // - Implement something similar to the code in asyncStartComputePartitioned()
+ // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal()
// that handles a step execution that fails with
// ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
- CHECK(!executionBuilder->getCompilation()->hasDynamicTemporaries());
-
- CHECK(executionBuilder != nullptr);
- VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
- // Disallow fallback when the ExecutionPlan is simple on CPU.
- allowCpuFallback &= !plan.isSimpleCpu();
+ CHECK(!mCompilation->hasDynamicTemporaries());
// Initiate waitForFds, syncFence for the first step.
std::vector<int> waitForFds = waitFor;
int syncFence = -1;
ExecuteFencedInfoCallback executeFencedInfoCallback;
+ std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
while (true) {
VLOG(EXECUTION) << "looking for next StepExecutor";
// Get the current step of the execution.
std::shared_ptr<StepExecutor> executor;
- int n = plan.next(controller, &executor, nullptr, nullptr, syncFence);
+ int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence);
if (n != ANEURALNETWORKS_NO_ERROR) {
// During the interpreted execution of control flow, a loop timeout
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowCpuFallback && !missedDeadline) break;
+ if (mAllowCpuFallback && !missedDeadline) break;
// Return -1 for the sync fence fd, and nullptr for the callback.
- return std::make_tuple(n, -1, nullptr);
+ return {n, -1, nullptr};
}
// If the code reached the end of the plan without error, then return
@@ -815,7 +871,6 @@
if (executor == nullptr) {
return {ANEURALNETWORKS_NO_ERROR, syncFence, executeFencedInfoCallback};
}
- const bool executorIsCpu = executor->isCpu();
// Attempt to execute a single step of the execution.
auto [stepN, syncFd, callback] =
@@ -834,17 +889,10 @@
continue;
}
// If CPU fallback is not allowed and there was an error, end execution.
- if (!allowCpuFallback) {
- return std::make_tuple(stepN, -1, nullptr);
+ if (!mAllowCpuFallback) {
+ return {stepN, -1, nullptr};
}
- // If CPU execution was already attempted, either:
- // (1) perform a full fallback if the plan is not simple, or
- // (2) return from the function with an error
- if (executorIsCpu) {
- if (!plan.isSimple()) break;
- return std::make_tuple(stepN, -1, nullptr);
- }
// If the code reaches this point, then there was an error with the
// fallback. In this case, attempt full fallback.
break;
@@ -854,19 +902,12 @@
// occurred during the step executions. Instead, do a full execution
// fallback on the CPU.
VLOG(EXECUTION) << "Performing full fallback on the CPU.";
- for (int syncFd : waitFor) {
- if (syncFd > 0) {
- auto r = syncWait(syncFd, -1);
- if (r != FenceState::SIGNALED) {
- VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
- return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr);
- }
- }
+ if (!waitForSyncFences(waitFor)) {
+ return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
}
- auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
+ auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this);
syncFence = -1;
- executionBuilder->reportTimingWithoutFencedExecutionCallback(fullTiming);
- return std::make_tuple(fullN, syncFence, nullptr);
+ return {fullN, syncFence, nullptr};
}
int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
@@ -913,13 +954,10 @@
// Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because
// fenced executions do not support dynamic output shape.
- const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
- std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
int result;
std::tie(result, mSyncFenceFd, mFencedExecutionCallback) =
- startComputeFenced(this, *mPlan, controller, waitFor, timeoutDurationAfterFence,
- deadline, allowCpuFallback);
+ computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline);
*syncFence = mSyncFenceFd;
// If there is an error, call finishComputation to mark the computation as completed.
// Otherwise, we will call finishComputation in SyncFenceEvent::wait().
@@ -973,32 +1011,22 @@
output.reset();
}
- auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
- return finishComputation(error, outputShapes);
- };
-
- // TODO: For asynchronous execution, entire plan-based-path should run in an
- // asynchronous thread -- take the asynchronous thread logic out of
- // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
- const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
- std::shared_ptr<ExecutionPlan::Controller> controller =
- mPlan->makeController(this, burstBuilder);
if (synchronous) {
if (burstBuilder) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
} else {
VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
}
- auto localSynchronizationCallback = std::make_shared<ExecutionCallback>();
- localSynchronizationCallback->setOnFinish(wrappedFinish);
- asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
- localSynchronizationCallback);
- localSynchronizationCallback->wait();
+ const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
if (mMeasureTiming) {
- mTimingWithoutFencedExecutionCallback = localSynchronizationCallback->getTiming();
+ mTimingWithoutFencedExecutionCallback = timing;
}
- return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus());
+ return finishComputation(n, outputShapes);
} else /* asynchronous */ {
+ // TODO: For asynchronous execution, entire plan-based-path should run in an
+ // asynchronous thread -- take the asynchronous thread logic out of
+ // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
+
// TODO: use a thread pool
// TODO(mikie): this could have NNTRACE so we could measure the overhead
// of spinning up a new thread.
@@ -1009,18 +1037,21 @@
// nullptr is returned. The executionCallback is
// abstracted in the NN API as an "event".
auto executionCallback = std::make_shared<ExecutionCallback>();
- executionCallback->setOnFinish(wrappedFinish);
+ executionCallback->setOnFinish(
+ [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
+ return finishComputation(error, outputShapes);
+ });
+ const auto asyncStartCompute = [this, deadline, executionCallback] {
+ const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr);
+ const auto status = convertResultCodeToErrorStatus(n);
+ executionCallback->notify(status, outputShapes, timing);
+ };
if (DeviceManager::get()->syncExecRuntime()) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
- asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
- executionCallback);
+ asyncStartCompute();
} else {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
- std::thread asyncExecution(
- [this, controller, allowCpuFallback, deadline, executionCallback] {
- asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback,
- deadline, executionCallback);
- });
+ std::thread asyncExecution(asyncStartCompute);
executionCallback->bindThread(std::move(asyncExecution));
}
*synchronizationCallback = executionCallback;
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index 910d076..08a8f53 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -55,6 +55,7 @@
public:
explicit ExecutionBuilder(const CompilationBuilder* compilation);
+ virtual ~ExecutionBuilder() = default;
int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
size_t length);
@@ -134,7 +135,7 @@
return mMemories[poolIndex]->getRunTimePoolInfo();
}
- private:
+ protected:
// If a callback is provided, then this is asynchronous. If a callback is
// not provided (i.e., is nullptr), then this is synchronous.
//
@@ -145,6 +146,13 @@
int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
BurstBuilder* burstBuilder = nullptr);
+ virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
+ const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;
+
+ virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
+ const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+ const OptionalTimePoint& deadline) = 0;
+
const CompilationBuilder* mCompilation;
// Update output dimensional information from OutputShape to ModelArgumentInfo.
@@ -157,9 +165,9 @@
const ModelBuilder* mModel;
const ExecutionPlan* mPlan;
- // This is a DeviceManager::kPartitioning* value captured from
- // CompilationBuilder when the ExecutionBuilder is constructed.
- uint32_t mPartitioning;
+ // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
+ // from CompilationBuilder when the ExecutionBuilder is constructed.
+ bool mAllowCpuFallback;
// The information we'll send to the driver about the inputs and outputs.
// Note that we build this in two steps:
@@ -242,6 +250,32 @@
bool mReusable = false;
};
+// For an execution plan with a SIMPLE body, i.e. the whole model is executed on a single device.
+class SimpleExecutionBuilder : public ExecutionBuilder {
+ public:
+ SimpleExecutionBuilder(const CompilationBuilder* compilation);
+
+ std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
+ const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;
+
+ std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
+ const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+ const OptionalTimePoint& deadline) override;
+};
+
+// For an execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps.
+class CompoundExecutionBuilder : public ExecutionBuilder {
+ public:
+ CompoundExecutionBuilder(const CompilationBuilder* compilation);
+
+ std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
+ const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;
+
+ std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
+ const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+ const OptionalTimePoint& deadline) override;
+};
+
// class StepExecutor is used to execute a single "step" in a
// potentially multiple step execution process. The graph associated
// with that step is executed in its entirety on a single device (or
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index c01777d..3277a39 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -1105,10 +1105,6 @@
simulateFailureResultCode);
}
-ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
- const BurstBuilder* burstBuilder)
- : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}, {}) {}
-
ExecutionPlan::Controller::Controller(
const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
@@ -1202,9 +1198,7 @@
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
CHECK(isValid());
- if (mState == SIMPLE) {
- return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
- }
+ CHECK(mState != SIMPLE);
const auto* body = compound();
// Create the layout for a RuntimeMemory object big enough to hold
// - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
@@ -1504,6 +1498,8 @@
std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
const std::vector<OutputShape>* mainModelOutputShapes,
int syncFdOfLastStep) const {
+ CHECK(mState == COMPOUND);
+
controller->mLastStepSyncFd = syncFdOfLastStep;
*executor = nullptr;
if (burstController != nullptr) {
@@ -1517,33 +1513,6 @@
return ANEURALNETWORKS_OP_FAILED;
}
- if (mState == EMPTY) {
- CHECK_EQ(controller->mNextStepIndex, 0u); // end
- controller->mNextStepIndex = Controller::kBadStepIndex;
- return ANEURALNETWORKS_NO_ERROR;
- }
-
- if (mState == SIMPLE) {
- if (controller->mNextStepIndex == 0) {
- // First (and only) step.
- auto simpleBody = simple();
- *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
- simpleBody->mModel, simpleBody->mDevice,
- simpleBody->mPreparedModel);
- (*executor)->mapInputsAndOutputsTrivially();
- if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
- *burstController = controller->mBurstBuilder->getControllerAt(0);
- }
- controller->mFallbackNextStepIndex = 0;
- controller->mNextStepIndex = 1;
- return ANEURALNETWORKS_NO_ERROR;
- }
-
- CHECK_EQ(controller->mNextStepIndex, 1u); // end
- controller->mNextStepIndex = Controller::kBadStepIndex;
- return ANEURALNETWORKS_NO_ERROR;
- }
-
return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
@@ -1853,6 +1822,15 @@
return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
+std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
+ ExecutionBuilder* executionBuilder) const {
+ auto simpleBody = simple();
+ auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
+ simpleBody->mDevice, simpleBody->mPreparedModel);
+ executor->mapInputsAndOutputsTrivially();
+ return executor;
+}
+
void ExecutionPlan::becomeCompoundIfEmpty() {
CHECK(mState != SIMPLE);
if (mState == EMPTY) {
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index 9a238c2..cebdd63 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -591,9 +591,6 @@
static const size_t kBadStepIndex = ~size_t(0);
- // A constructor for mState == SIMPLE.
- Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
- const BurstBuilder* burstBuilder);
// A constructor for mState == COMPOUND.
Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder,
@@ -672,6 +669,7 @@
std::vector<SharedBurst> makeBursts() const;
+ // Only legal to call when mState == COMPOUND.
std::shared_ptr<Controller> makeController(ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder) const;
@@ -682,6 +680,7 @@
// mainModelOutputShapes may be nullptr if the only main model outputs that are step model
// inputs are of fully specified shape.
// syncFdOfLastStep is the sync fence fd generated by the most recently processed step.
+ // Only legal to call when mState == COMPOUND.
int next(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
SharedBurst* burstController, const std::vector<OutputShape>* mainModelOutputShapes,
int syncFdOfLastStep = -1) const;
@@ -691,6 +690,9 @@
SharedBurst* burstController,
const std::vector<OutputShape>* mainModelOutputShapes) const;
+ // Only legal to call when mState == SIMPLE.
+ std::shared_ptr<StepExecutor> makeStepExecutor(ExecutionBuilder* executionBuilder) const;
+
ExecutionStep* createNewExecutionStep(uint32_t sourceModelIndex,
const std::shared_ptr<Device> device);
IfStep* createNewIfStep();
@@ -715,6 +717,7 @@
bool isValid() const { return mState != EMPTY && mBody != nullptr && mBody->mSuccessfulFinish; }
bool isSimple() const { return mState == SIMPLE; }
+ bool isCompound() const { return mState == COMPOUND; }
bool isSimpleCpu() const;
void setCaching(const std::string* cacheDir, const uint8_t* token) {