Create a specialized path for execution plans with a simple body.

This CL creates a specialized and simplified execution path for an
execution plan with a SIMPLE body, i.e. a plan in which the whole model
is executed on a single device.

Having this simplified path avoids some runtime overheads that only
apply to a COMPOUND execution plan, including:
- ExecutionPlan::makeController/next
- StepExecutor::updateOutputShapes

It also makes it simpler for subsequent CLs to apply optimizations
specific to SIMPLE execution plans.

Additionally, this CL removes the use of ExecutionCallback in the sync
execution path.

Bug: 184073769
Test: NNT_static
Change-Id: If57bd9b9ea7f01e6e489d0b7977ab040691b2397
diff --git a/runtime/CompilationBuilder.cpp b/runtime/CompilationBuilder.cpp
index 749cd22..79dedbd 100644
--- a/runtime/CompilationBuilder.cpp
+++ b/runtime/CompilationBuilder.cpp
@@ -291,7 +291,11 @@
         *execution = nullptr;
         return ANEURALNETWORKS_BAD_STATE;
     }
-    *execution = new (std::nothrow) ExecutionBuilder(this);
+    if (mPlan.isSimple()) {
+        *execution = new (std::nothrow) SimpleExecutionBuilder(this);
+    } else {
+        *execution = new (std::nothrow) CompoundExecutionBuilder(this);
+    }
     return (*execution ? ANEURALNETWORKS_NO_ERROR : ANEURALNETWORKS_OUT_OF_MEMORY);
 }
 
diff --git a/runtime/ExecutionBuilder.cpp b/runtime/ExecutionBuilder.cpp
index 3985d12..f75a205 100644
--- a/runtime/ExecutionBuilder.cpp
+++ b/runtime/ExecutionBuilder.cpp
@@ -38,6 +38,7 @@
 #include <utility>
 #include <vector>
 
+#include "BurstBuilder.h"
 #include "CompilationBuilder.h"
 #include "Manager.h"
 #include "ModelArgumentInfo.h"
@@ -151,13 +152,23 @@
     : mCompilation(compilation),
       mModel(compilation->mModel),
       mPlan(&compilation->mPlan),
-      mPartitioning(compilation->mPartitioning),
+      mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
       mInputs(mModel->inputCount()),
       mOutputs(mModel->outputCount()) {
     VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                     << " inputs and " << mOutputs.size() << " outputs";
 }
 
+SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
+    : ExecutionBuilder(compilation) {
+    CHECK(mPlan->isSimple());
+}
+
+CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
+    : ExecutionBuilder(compilation) {
+    CHECK(mPlan->isCompound());
+}
+
 const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
     return mPlan->getSourceModels().getModel(index);
 }
@@ -575,18 +586,46 @@
     return {n2, std::move(outputShapes), timing, executor};
 }
 
-static void asyncStartComputePartitioned(
-        ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
-        std::shared_ptr<ExecutionPlan::Controller> controller, bool allowCpuFallback,
-        const OptionalTimePoint& deadline,
-        const std::shared_ptr<ExecutionCallback>& executionCallback) {
-    CHECK(executionBuilder != nullptr);
-    VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)";
+std::tuple<int, std::vector<OutputShape>, Timing> SimpleExecutionBuilder::computeInternal(
+        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal");
+    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal";
 
-    std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
-    Timing timing;
-    // Disallow CPU fallback when the ExecutionPlan is simple on CPU.
-    allowCpuFallback &= !plan.isSimpleCpu();
+    auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr;
+    auto executor = mPlan->makeStepExecutor(this);
+    auto [n, outputShapes, timing] = executor->compute(deadline, burstController);
+
+    if (n == ANEURALNETWORKS_NO_ERROR) {
+        return {n, std::move(outputShapes), timing};
+    }
+
+    // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable.
+    if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+        return {n, std::move(outputShapes), {}};
+    }
+
+    // If CPU fallback is not allowed and there was an error, end execution.
+    if (!mAllowCpuFallback) {
+        return {n, {}, {}};
+    }
+
+    // If CPU execution was already attempted, do not perform CPU fallback.
+    if (executor->isCpu()) {
+        return {n, {}, {}};
+    }
+
+    // If the code has reached this point, a potentially recoverable error
+    // occurred during the execution. Do an execution fallback on the CPU.
+    return cpuFallbackFull(this);
+}
+
+std::tuple<int, std::vector<OutputShape>, Timing> CompoundExecutionBuilder::computeInternal(
+        const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal");
+    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)";
+
+    auto controller = mPlan->makeController(this, burstBuilder);
+    std::vector<OutputShape> outputShapes = getInitialOutputShapes();
 
     // On this iteration, do I need to repeat the previous step because it
     // reported insufficient size?
@@ -599,29 +638,27 @@
         std::shared_ptr<StepExecutor> executor;
         SharedBurst burstController;
         int n = doInsufficientSizeFallback
-                        ? plan.fallback(controller, &executor, &burstController, &outputShapes)
-                        : plan.next(controller, &executor, &burstController, &outputShapes);
+                        ? mPlan->fallback(controller, &executor, &burstController, &outputShapes)
+                        : mPlan->next(controller, &executor, &burstController, &outputShapes);
         doInsufficientSizeFallback = false;
         if (n != ANEURALNETWORKS_NO_ERROR) {
             // During the interpreted execution of control flow, a loop timeout
             // might occur in ExecutionPlan::next().
             bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                   n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
-            if (allowCpuFallback && !missedDeadline) break;
-            executionCallback->notify(convertResultCodeToErrorStatus(n), {}, {});
-            return;
+            if (mAllowCpuFallback && !missedDeadline) break;
+            return {n, {}, {}};
         }
 
         // If the code reached the end of the plan without error, then return
         // with no error.
         if (executor == nullptr) {
-            executionCallback->notify(ErrorStatus::NONE, outputShapes, timing);
-            return;
+            return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}};
         }
         const bool executorIsCpu = executor->isCpu();
 
         // Attempt to execute a single step of the execution.
-        auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
+        auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController);
 
         // Update global outputs and dynamic temporaries.
         StepExecutor::UpdateOutputShapes updateOutputShapes = {};
@@ -638,10 +675,6 @@
                 stepN = ANEURALNETWORKS_OP_FAILED;
             } else {
                 CHECK(executor->areDynamicTemporariesAllocated());
-                // We only support collection of timing information in the case
-                // of a single step, so it's safe to just keep track of the last
-                // step's timing information.
-                timing = stepTiming;
                 continue;
             }
         }
@@ -654,9 +687,7 @@
                 // - At least one main model output is not of sufficient size; or
                 // - we didn't learn anything new about dynamic temporaries.
                 // Neither of these is recoverable, so end execution.
-                const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
-                executionCallback->notify(stepStatus, outputShapes, {});
-                return;
+                return {stepN, outputShapes, {}};
             }
             // Every main model output is of sufficient size.  This implies that
             // at least one dynamic temporary is not of sufficient size.  This
@@ -666,30 +697,24 @@
         }
 
         // If CPU fallback is not allowed and there was an error, end execution.
-        if (!allowCpuFallback) {
-            const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
-            executionCallback->notify(stepStatus, {}, {});
-            return;
+        if (!mAllowCpuFallback) {
+            return {stepN, {}, {}};
         }
 
-        // If CPU execution was already attempted, either:
-        // (1) perform a full CPU fallback if the plan is not simple, or
-        // (2) return from the function with an error
+        // If CPU execution was already attempted, perform a full CPU fallback.
         if (executorIsCpu) {
-            if (!plan.isSimple()) break;
-            executionCallback->notify(convertResultCodeToErrorStatus(stepN), {}, {});
-            return;
+            break;
         }
 
         // If the code reaches this point, attempt a partial fallback to CPU.
-        CHECK(allowCpuFallback);
+        CHECK(mAllowCpuFallback);
         if (updateOutputShapes.zeroSizedInput) {
             // Do not attempt a partial fallback.
             break;
         }
         while (true) {
-            auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
-                    cpuFallbackPartial(plan, controller);
+            auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] =
+                    cpuFallbackPartial(*mPlan, controller);
 
             // Update global outputs and dynamic temporaries.
             StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
@@ -708,10 +733,6 @@
                     break;
                 }
                 CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
-                // We only support collection of timing information in the case of a
-                // single step, so it's safe to just keep track of the last step's
-                // timing information.
-                timing = fallbackTiming;
                 goto nextStep;
             }
 
@@ -724,9 +745,7 @@
                     // - At least one main model output is not of sufficient size; or
                     // - we didn't learn anything new about dynamic temporaries.
                     // Neither of these is recoverable, so end execution.
-                    const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
-                    executionCallback->notify(fallbackStatus, outputShapes, {});
-                    return;
+                    return {fallbackN, outputShapes, {}};
                 }
                 // Every main model output is of sufficient size.  This implies
                 // that at least one dynamic temporary is not of sufficient
@@ -734,13 +753,6 @@
                 continue;
             }
 
-            // Do not fallback twice if the ExecutionPlan is simple.
-            if (plan.isSimple()) {
-                const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
-                executionCallback->notify(fallbackStatus, {}, {});
-                return;
-            }
-
             // If the code reaches this point, then there was an error with the
             // fallback. In this case, attempt full fallback.
             break;
@@ -758,20 +770,68 @@
     // If the code has reached this point, a potentially recoverable error
     // occurred during the step executions. Instead, do a full execution
     // fallback on the CPU.
-    auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
-    const ErrorStatus fullStatus = convertResultCodeToErrorStatus(fullN);
-    executionCallback->notify(fullStatus, fullOutputShapes, fullTiming);
+    return cpuFallbackFull(this);
 }
 
-// In case of partitioned execution, startComputeFenced call will return the sync
+static bool waitForSyncFences(const std::vector<int>& waitFor) {
+    for (int syncFd : waitFor) {
+        if (syncFd > 0) {
+            auto r = syncWait(syncFd, -1);
+            if (r != FenceState::SIGNALED) {
+                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+std::tuple<int, int, ExecuteFencedInfoCallback> SimpleExecutionBuilder::computeFencedInternal(
+        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+        const OptionalTimePoint& deadline) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal");
+    VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal";
+
+    auto executor = mPlan->makeStepExecutor(this);
+    auto [n, syncFd, callback] =
+            executor->computeFenced(waitFor, timeoutDurationAfterFence, deadline);
+
+    if (n == ANEURALNETWORKS_NO_ERROR) {
+        return {ANEURALNETWORKS_NO_ERROR, syncFd, callback};
+    }
+
+    // If CPU fallback is not allowed and there was an error, end execution.
+    if (!mAllowCpuFallback) {
+        return {n, -1, nullptr};
+    }
+
+    // If CPU execution was already attempted, return from the function with an error.
+    if (executor->isCpu()) {
+        return {n, -1, nullptr};
+    }
+
+    // If the code has reached this point, a potentially recoverable error
+    // occurred during the step executions. Instead, do a full execution
+    // fallback on the CPU.
+    VLOG(EXECUTION) << "Performing full fallback on the CPU.";
+    if (!waitForSyncFences(waitFor)) {
+        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
+    }
+    auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this);
+    reportTimingWithoutFencedExecutionCallback(fallbackTiming);
+    return {fallbackN, -1, nullptr};
+}
+
+// In case of partitioned execution, computeFencedInternal call will return the sync
 // fence and the fenced compute callback returned from the last partition.
-// Any failed partition will result in the whole execution fallback to CPU if
-// allowCpuFallback is set to true.
-static std::tuple<int, int, ExecuteFencedInfoCallback> startComputeFenced(
-        ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
-        std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
-        uint64_t timeoutDurationAfterFence, const OptionalTimePoint& deadline,
-        bool allowCpuFallback) {
+// Any failed partition will cause the whole execution to fall back to the CPU
+// if mAllowCpuFallback is set to true.
+std::tuple<int, int, ExecuteFencedInfoCallback> CompoundExecutionBuilder::computeFencedInternal(
+        const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+        const OptionalTimePoint& deadline) {
+    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal");
+    VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)";
+
     // We should have detected this earlier in the call chain and fallen back to
     // non-fenced execution.  This is an implementation limitation: In order to
     // support dynamic temporarires in this code, we'd need to implement
@@ -779,35 +839,31 @@
     // - If a partition has outputs of unknown size, execute that partition in a
     //   non fenced fashion, just as if it were scheduled on a driver that does
     //   not support fenced execution.
-    // - Implement something similar to the code in asyncStartComputePartitioned()
+    // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal()
     //   that handles a step execution that fails with
     //   ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
-    CHECK(!executionBuilder->getCompilation()->hasDynamicTemporaries());
-
-    CHECK(executionBuilder != nullptr);
-    VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
-    // Disallow fallback when the ExecutionPlan is simple on CPU.
-    allowCpuFallback &= !plan.isSimpleCpu();
+    CHECK(!mCompilation->hasDynamicTemporaries());
 
     // Initiate waitForFds, syncFence for the first step.
     std::vector<int> waitForFds = waitFor;
     int syncFence = -1;
     ExecuteFencedInfoCallback executeFencedInfoCallback;
 
+    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
     while (true) {
         VLOG(EXECUTION) << "looking for next StepExecutor";
 
         // Get the current step of the execution.
         std::shared_ptr<StepExecutor> executor;
-        int n = plan.next(controller, &executor, nullptr, nullptr, syncFence);
+        int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence);
         if (n != ANEURALNETWORKS_NO_ERROR) {
             // During the interpreted execution of control flow, a loop timeout
             // might occur in ExecutionPlan::next().
             bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
                                   n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
-            if (allowCpuFallback && !missedDeadline) break;
+            if (mAllowCpuFallback && !missedDeadline) break;
             // Return -1 for the sync fence fd, and nullptr for the callback.
-            return std::make_tuple(n, -1, nullptr);
+            return {n, -1, nullptr};
         }
 
         // If the code reached the end of the plan without error, then return
@@ -815,7 +871,6 @@
         if (executor == nullptr) {
             return {ANEURALNETWORKS_NO_ERROR, syncFence, executeFencedInfoCallback};
         }
-        const bool executorIsCpu = executor->isCpu();
 
         // Attempt to execute a single step of the execution.
         auto [stepN, syncFd, callback] =
@@ -834,17 +889,10 @@
             continue;
         }
         // If CPU fallback is not allowed and there was an error, end execution.
-        if (!allowCpuFallback) {
-            return std::make_tuple(stepN, -1, nullptr);
+        if (!mAllowCpuFallback) {
+            return {stepN, -1, nullptr};
         }
 
-        // If CPU execution was already attempted, either:
-        // (1) perform a full fallback if the plan is not simple, or
-        // (2) return from the function with an error
-        if (executorIsCpu) {
-            if (!plan.isSimple()) break;
-            return std::make_tuple(stepN, -1, nullptr);
-        }
         // If the code reaches this point, then there was an error with the
         // fallback. In this case, attempt full fallback.
         break;
@@ -854,19 +902,12 @@
     // occurred during the step executions. Instead, do a full execution
     // fallback on the CPU.
     VLOG(EXECUTION) << "Performing full fallback on the CPU.";
-    for (int syncFd : waitFor) {
-        if (syncFd > 0) {
-            auto r = syncWait(syncFd, -1);
-            if (r != FenceState::SIGNALED) {
-                VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd;
-                return std::make_tuple(ANEURALNETWORKS_OP_FAILED, -1, nullptr);
-            }
-        }
+    if (!waitForSyncFences(waitFor)) {
+        return {ANEURALNETWORKS_OP_FAILED, -1, nullptr};
     }
-    auto [fullN, fullOutputShapes, fullTiming] = cpuFallbackFull(executionBuilder);
+    auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this);
     syncFence = -1;
-    executionBuilder->reportTimingWithoutFencedExecutionCallback(fullTiming);
-    return std::make_tuple(fullN, syncFence, nullptr);
+    return {fullN, syncFence, nullptr};
 }
 
 int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
@@ -913,13 +954,10 @@
     // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because
     // fenced executions do not support dynamic output shape.
 
-    const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
-    std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
     VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
     int result;
     std::tie(result, mSyncFenceFd, mFencedExecutionCallback) =
-            startComputeFenced(this, *mPlan, controller, waitFor, timeoutDurationAfterFence,
-                               deadline, allowCpuFallback);
+            computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline);
     *syncFence = mSyncFenceFd;
     // If there is an error, call finishComputation to mark the computation as completed.
     // Otherwise, we will call finishComputation in SyncFenceEvent::wait().
@@ -973,32 +1011,22 @@
         output.reset();
     }
 
-    auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
-        return finishComputation(error, outputShapes);
-    };
-
-    // TODO: For asynchronous execution, entire plan-based-path should run in an
-    // asynchronous thread -- take the asynchronous thread logic out of
-    // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
-    const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
-    std::shared_ptr<ExecutionPlan::Controller> controller =
-            mPlan->makeController(this, burstBuilder);
     if (synchronous) {
         if (burstBuilder) {
             VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
         } else {
             VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
         }
-        auto localSynchronizationCallback = std::make_shared<ExecutionCallback>();
-        localSynchronizationCallback->setOnFinish(wrappedFinish);
-        asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
-                                     localSynchronizationCallback);
-        localSynchronizationCallback->wait();
+        const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder);
         if (mMeasureTiming) {
-            mTimingWithoutFencedExecutionCallback = localSynchronizationCallback->getTiming();
+            mTimingWithoutFencedExecutionCallback = timing;
         }
-        return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus());
+        return finishComputation(n, outputShapes);
     } else /* asynchronous */ {
+        // TODO: For asynchronous execution, entire plan-based-path should run in an
+        // asynchronous thread -- take the asynchronous thread logic out of
+        // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
+
         // TODO: use a thread pool
         // TODO(mikie): this could have NNTRACE so we could measure the overhead
         //              of spinning up a new thread.
@@ -1009,18 +1037,21 @@
         // nullptr is returned.  The executionCallback is
         // abstracted in the NN API as an "event".
         auto executionCallback = std::make_shared<ExecutionCallback>();
-        executionCallback->setOnFinish(wrappedFinish);
+        executionCallback->setOnFinish(
+                [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
+                    return finishComputation(error, outputShapes);
+                });
+        const auto asyncStartCompute = [this, deadline, executionCallback] {
+            const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr);
+            const auto status = convertResultCodeToErrorStatus(n);
+            executionCallback->notify(status, outputShapes, timing);
+        };
         if (DeviceManager::get()->syncExecRuntime()) {
             VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
-            asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
-                                         executionCallback);
+            asyncStartCompute();
         } else {
             VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
-            std::thread asyncExecution(
-                    [this, controller, allowCpuFallback, deadline, executionCallback] {
-                        asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback,
-                                                     deadline, executionCallback);
-                    });
+            std::thread asyncExecution(asyncStartCompute);
             executionCallback->bindThread(std::move(asyncExecution));
         }
         *synchronizationCallback = executionCallback;
diff --git a/runtime/ExecutionBuilder.h b/runtime/ExecutionBuilder.h
index 910d076..08a8f53 100644
--- a/runtime/ExecutionBuilder.h
+++ b/runtime/ExecutionBuilder.h
@@ -55,6 +55,7 @@
 
    public:
     explicit ExecutionBuilder(const CompilationBuilder* compilation);
+    virtual ~ExecutionBuilder() = default;
 
     int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                  size_t length);
@@ -134,7 +135,7 @@
         return mMemories[poolIndex]->getRunTimePoolInfo();
     }
 
-   private:
+   protected:
     // If a callback is provided, then this is asynchronous. If a callback is
     // not provided (i.e., is nullptr), then this is synchronous.
     //
@@ -145,6 +146,13 @@
     int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                 BurstBuilder* burstBuilder = nullptr);
 
+    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
+            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;
+
+    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
+            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+            const OptionalTimePoint& deadline) = 0;
+
     const CompilationBuilder* mCompilation;
 
     // Update output dimensional information from OutputShape to ModelArgumentInfo.
@@ -157,9 +165,9 @@
     const ModelBuilder* mModel;
     const ExecutionPlan* mPlan;
 
-    // This is a DeviceManager::kPartitioning* value captured from
-    // CompilationBuilder when the ExecutionBuilder is constructed.
-    uint32_t mPartitioning;
+    // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
+    // from CompilationBuilder when the ExecutionBuilder is constructed.
+    bool mAllowCpuFallback;
 
     // The information we'll send to the driver about the inputs and outputs.
     // Note that we build this in two steps:
@@ -242,6 +250,32 @@
     bool mReusable = false;
 };
 
+// For execution plan with a SIMPLE body, i.e. the whole model will be executed on a single device.
+class SimpleExecutionBuilder : public ExecutionBuilder {
+   public:
+    SimpleExecutionBuilder(const CompilationBuilder* compilation);
+
+    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
+            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;
+
+    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
+            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+            const OptionalTimePoint& deadline) override;
+};
+
+// For execution plan with a COMPOUND body, i.e. partitioned execution with multiple steps.
+class CompoundExecutionBuilder : public ExecutionBuilder {
+   public:
+    CompoundExecutionBuilder(const CompilationBuilder* compilation);
+
+    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
+            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;
+
+    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
+            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
+            const OptionalTimePoint& deadline) override;
+};
+
 // class StepExecutor is used to execute a single "step" in a
 // potentially multiple step execution process.  The graph associated
 // with that step is executed in its entirety on a single device (or
diff --git a/runtime/ExecutionPlan.cpp b/runtime/ExecutionPlan.cpp
index c01777d..3277a39 100644
--- a/runtime/ExecutionPlan.cpp
+++ b/runtime/ExecutionPlan.cpp
@@ -1105,10 +1105,6 @@
                          simulateFailureResultCode);
 }
 
-ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
-                                      const BurstBuilder* burstBuilder)
-    : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}, {}) {}
-
 ExecutionPlan::Controller::Controller(
         const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
         const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
@@ -1202,9 +1198,7 @@
 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
         ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
     CHECK(isValid());
-    if (mState == SIMPLE) {
-        return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
-    }
+    CHECK(mState != SIMPLE);
     const auto* body = compound();
     // Create the layout for a RuntimeMemory object big enough to hold
     // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
@@ -1504,6 +1498,8 @@
                         std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
                         const std::vector<OutputShape>* mainModelOutputShapes,
                         int syncFdOfLastStep) const {
+    CHECK(mState == COMPOUND);
+
     controller->mLastStepSyncFd = syncFdOfLastStep;
     *executor = nullptr;
     if (burstController != nullptr) {
@@ -1517,33 +1513,6 @@
         return ANEURALNETWORKS_OP_FAILED;
     }
 
-    if (mState == EMPTY) {
-        CHECK_EQ(controller->mNextStepIndex, 0u);  // end
-        controller->mNextStepIndex = Controller::kBadStepIndex;
-        return ANEURALNETWORKS_NO_ERROR;
-    }
-
-    if (mState == SIMPLE) {
-        if (controller->mNextStepIndex == 0) {
-            // First (and only) step.
-            auto simpleBody = simple();
-            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
-                                                       simpleBody->mModel, simpleBody->mDevice,
-                                                       simpleBody->mPreparedModel);
-            (*executor)->mapInputsAndOutputsTrivially();
-            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
-                *burstController = controller->mBurstBuilder->getControllerAt(0);
-            }
-            controller->mFallbackNextStepIndex = 0;
-            controller->mNextStepIndex = 1;
-            return ANEURALNETWORKS_NO_ERROR;
-        }
-
-        CHECK_EQ(controller->mNextStepIndex, 1u);  // end
-        controller->mNextStepIndex = Controller::kBadStepIndex;
-        return ANEURALNETWORKS_NO_ERROR;
-    }
-
     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
 }
 
@@ -1853,6 +1822,15 @@
     return nextCompound(controller, executor, burstController, mainModelOutputShapes);
 }
 
+std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
+        ExecutionBuilder* executionBuilder) const {
+    auto simpleBody = simple();
+    auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
+                                                   simpleBody->mDevice, simpleBody->mPreparedModel);
+    executor->mapInputsAndOutputsTrivially();
+    return executor;
+}
+
 void ExecutionPlan::becomeCompoundIfEmpty() {
     CHECK(mState != SIMPLE);
     if (mState == EMPTY) {
diff --git a/runtime/ExecutionPlan.h b/runtime/ExecutionPlan.h
index 9a238c2..cebdd63 100644
--- a/runtime/ExecutionPlan.h
+++ b/runtime/ExecutionPlan.h
@@ -591,9 +591,6 @@
 
         static const size_t kBadStepIndex = ~size_t(0);
 
-        // A constructor for mState == SIMPLE.
-        Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
-                   const BurstBuilder* burstBuilder);
         // A constructor for mState == COMPOUND.
         Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
                    const BurstBuilder* burstBuilder,
@@ -672,6 +669,7 @@
 
     std::vector<SharedBurst> makeBursts() const;
 
+    // Only legal to call when mState == COMPOUND.
     std::shared_ptr<Controller> makeController(ExecutionBuilder* executionBuilder,
                                                const BurstBuilder* burstBuilder) const;
 
@@ -682,6 +680,7 @@
     // mainModelOutputShapes may be nullptr if the only main model outputs that are step model
     //     inputs are of fully specified shape.
     // syncFdOfLastStep is the sync fence fd generated by the most recently processed step.
+    // Only legal to call when mState == COMPOUND.
     int next(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
              SharedBurst* burstController, const std::vector<OutputShape>* mainModelOutputShapes,
              int syncFdOfLastStep = -1) const;
@@ -691,6 +690,9 @@
                  SharedBurst* burstController,
                  const std::vector<OutputShape>* mainModelOutputShapes) const;
 
+    // Only legal to call when mState == SIMPLE.
+    std::shared_ptr<StepExecutor> makeStepExecutor(ExecutionBuilder* executionBuilder) const;
+
     ExecutionStep* createNewExecutionStep(uint32_t sourceModelIndex,
                                           const std::shared_ptr<Device> device);
     IfStep* createNewIfStep();
@@ -715,6 +717,7 @@
 
     bool isValid() const { return mState != EMPTY && mBody != nullptr && mBody->mSuccessfulFinish; }
     bool isSimple() const { return mState == SIMPLE; }
+    bool isCompound() const { return mState == COMPOUND; }
     bool isSimpleCpu() const;
 
     void setCaching(const std::string* cacheDir, const uint8_t* token) {