/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <sys/mman.h>
#include <algorithm>
#include <array>
#include <cstdint>
#include <iterator>
#include <memory>
#include <numeric>
#include <ostream>
#include <unordered_set>
#include <vector>
#include <gtest/gtest.h>
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/test_util.h"
#include "tensorflow/lite/minimal_logging.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
#include "tensorflow/lite/nnapi/nnapi_implementation.h"
namespace tflite {
namespace {
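// Single-op model that applies a StatefulNnApiDelegate to its interpreter and
// records the status returned by ModifyGraphWithDelegate.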
class SingleOpModelWithNNAPI : public SingleOpModel {
public:
SingleOpModelWithNNAPI() = default;
void Init(const NnApi* nnapi,
tflite::StatefulNnApiDelegate::Options options) {
stateful_delegate_ = std::make_unique<StatefulNnApiDelegate>(nnapi, options);
auto* delegate = stateful_delegate_.get();
this->SetApplyDelegate([delegate, this](Interpreter* interpreter) {
compilation_status_ = interpreter->ModifyGraphWithDelegate(delegate);
});
}
StatefulNnApiDelegate* GetDelegate() { return stateful_delegate_.get(); }
void SetBufferHandle(int index, TfLiteBufferHandle handle) {
interpreter_->SetBufferHandle(index, handle, stateful_delegate_.get());
}
TfLiteStatus GetCompilationStatus() { return compilation_status_; }
private:
std::unique_ptr<StatefulNnApiDelegate> stateful_delegate_;
TfLiteStatus compilation_status_;
};
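// Model with a single float ADD operation, used to exercise the device
// selection options.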
class FloatAddOpModel : public SingleOpModelWithNNAPI {
public:
FloatAddOpModel() = default;
void Init(const NnApi* nnapi, tflite::StatefulNnApiDelegate::Options options,
const TensorData& input1, const TensorData& input2,
const TensorData& output, ActivationFunctionType activation_type,
bool allow_fp32_relax_to_fp16 = false) {
SingleOpModelWithNNAPI::Init(nnapi, options);
input1_ = AddInput(input1);
input2_ = AddInput(input2);
output_ = AddOutput(output);
SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
CreateAddOptions(builder_, activation_type).Union());
BuildInterpreter({GetShape(input1_), GetShape(input2_)},
allow_fp32_relax_to_fp16);
}
int input1() { return input1_; }
int input2() { return input2_; }
std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
protected:
int input1_;
int input2_;
int output_;
private:
};
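// Fixture stubbing an NNAPI implementation that exposes three devices
// ("dsp", "gpu" and "nnapi-reference") and reports the FloatAddOpModel's
// single ADD op as supported.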
struct NnApiDeviceSelectionTest
: ::tflite::delegate::nnapi::NnApiDelegateMockTest {
void SetUp() override {
::tflite::delegate::nnapi::NnApiDelegateMockTest::SetUp();
nnapi_mock_->GetDeviceCountReturnsCount<3>();
nnapi_mock_->StubGetDeviceWith(
[](uint32_t devIndex, ANeuralNetworksDevice** device) -> int {
*device = reinterpret_cast<ANeuralNetworksDevice*>(devIndex + 1);
return 0;
});
nnapi_mock_->StubGetDeviceNameWith(
[](const ANeuralNetworksDevice* device, const char** name) -> int {
if (device == reinterpret_cast<ANeuralNetworksDevice*>(1)) {
*name = "dsp";
} else if (device == reinterpret_cast<ANeuralNetworksDevice*>(2)) {
*name = "gpu";
} else {
*name = "nnapi-reference";
}
return ANEURALNETWORKS_NO_ERROR;
});
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
supportedOps[0] = true;
return ANEURALNETWORKS_NO_ERROR;
});
}
void InitWithOptions(tflite::StatefulNnApiDelegate::Options options) {
m.Init(nnapi_mock_->GetNnApi(), options, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}},
ActivationFunctionType_NONE);
m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
}
FloatAddOpModel m;
};
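// Without device-selection options, the delegate must not call
// ANeuralNetworksCompilation_createForDevices.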
TEST_F(NnApiDeviceSelectionTest, DoesntSetDevicesWithoutFlags) {
nnapi_mock_->StubCompilationCreateForDevicesWith(
[](ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
ANeuralNetworksCompilation** compilation) -> int {
EXPECT_TRUE(false) << "Should not call createForDevices";
return 1;
});
tflite::StatefulNnApiDelegate::Options options;
InitWithOptions(options);
m.Invoke();
EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk);
}
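// When accelerator_name is set, the compilation must be created only for the
// matching device.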
TEST_F(NnApiDeviceSelectionTest, SetsDeviceBasedOnOptions) {
nnapi_mock_->CompilationCreateReturns<1>();
nnapi_mock_->StubCompilationCreateForDevicesWith(
[](ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
ANeuralNetworksCompilation** compilation) -> int {
EXPECT_EQ(numDevices, 1);
EXPECT_EQ(devices[0], reinterpret_cast<ANeuralNetworksDevice*>(1));
if (numDevices != 1 ||
devices[0] != reinterpret_cast<ANeuralNetworksDevice*>(1)) {
return 1;
} else {
*compilation = reinterpret_cast<ANeuralNetworksCompilation*>(3);
return ANEURALNETWORKS_NO_ERROR;
}
});
tflite::StatefulNnApiDelegate::Options options;
options.accelerator_name = "dsp";
InitWithOptions(options);
m.Invoke();
EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk);
}
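// When disallow_nnapi_cpu is set, the compilation must be created for all
// devices except nnapi-reference.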
TEST_F(NnApiDeviceSelectionTest, DisallowsCPUBasedOnOptions) {
nnapi_mock_->CompilationCreateReturns<1>();
nnapi_mock_->StubCompilationCreateForDevicesWith(
[](ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
ANeuralNetworksCompilation** compilation) -> int {
EXPECT_EQ(numDevices, 2);
EXPECT_EQ(devices[0], reinterpret_cast<ANeuralNetworksDevice*>(1));
EXPECT_EQ(devices[1], reinterpret_cast<ANeuralNetworksDevice*>(2));
if (numDevices != 2 ||
devices[0] != reinterpret_cast<ANeuralNetworksDevice*>(1) ||
devices[1] != reinterpret_cast<ANeuralNetworksDevice*>(2)) {
return 1;
} else {
*compilation = reinterpret_cast<ANeuralNetworksCompilation*>(3);
return ANEURALNETWORKS_NO_ERROR;
}
});
tflite::StatefulNnApiDelegate::Options options;
options.disallow_nnapi_cpu = true;
InitWithOptions(options);
m.Invoke();
EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk);
}
TEST_F(NnApiDeviceSelectionTest,
DoesNotDelegateIfOnlyReferenceDeviceIsAvailable_CpuEnabled) {
// Only nnapi-reference is available on the device.
nnapi_mock_->GetDeviceCountReturnsCount<1>();
nnapi_mock_->GetDeviceNameReturnsName("nnapi-reference");
tflite::StatefulNnApiDelegate::Options options;
options.disallow_nnapi_cpu = false;
InitWithOptions(options);
m.Invoke();
EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk);
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 1);
}
TEST_F(NnApiDeviceSelectionTest,
DoesNotDelegateIfOnlyReferenceDeviceIsAvailable_CpuDisabled) {
// Only nnapi-reference is available on the device.
nnapi_mock_->GetDeviceCountReturnsCount<1>();
nnapi_mock_->GetDeviceNameReturnsName("nnapi-reference");
tflite::StatefulNnApiDelegate::Options options;
options.disallow_nnapi_cpu = true;
InitWithOptions(options);
m.Invoke();
EXPECT_EQ(m.GetCompilationStatus(), kTfLiteOk);
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 1);
}
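// Tests covering models containing operations that the target device does not
// support or that require a higher NNAPI feature level.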
struct UnsupportedOperationOnDeviceTest
: ::tflite::delegate::nnapi::NnApiDelegateMockTest {};
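// Mixin that owns a StatefulNnApiDelegate configured either for a named
// target accelerator or with the disallow_nnapi_cpu option.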
class AcceleratedModel {
public:
StatefulNnApiDelegate* GetDelegate() { return stateful_delegate_.get(); }
protected:
// Builds a delegate with a target accelerator name.
AcceleratedModel(const NnApi* nnapi, const std::string& accelerator_name,
int max_nnapi_partitions = 0) {
StatefulNnApiDelegate::Options options;
options.accelerator_name = accelerator_name.c_str();
options.max_number_delegated_partitions = max_nnapi_partitions;
stateful_delegate_ = std::make_unique<StatefulNnApiDelegate>(nnapi, options);
}
// Builds a delegate with no target accelerator name; the NNAPI CPU fallback
// implementation can be disabled with the disallow_nnapi_cpu flag.
AcceleratedModel(const NnApi* nnapi, bool disallow_nnapi_cpu,
int max_nnapi_partitions = 0) {
StatefulNnApiDelegate::Options options;
options.disallow_nnapi_cpu = disallow_nnapi_cpu;
options.max_number_delegated_partitions = max_nnapi_partitions;
stateful_delegate_ = std::make_unique<StatefulNnApiDelegate>(nnapi, options);
}
private:
std::unique_ptr<StatefulNnApiDelegate> stateful_delegate_;
};
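// ARG_MAX single-op model accelerated through the NNAPI delegate; it can be
// built either for a named accelerator or with the disallow_nnapi_cpu flag.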
class ArgMaxOpModel : public SingleOpModel, public AcceleratedModel {
public:
ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type,
int axis_value, TensorType output_type, const NnApi* nnapi,
const char* device_name)
: SingleOpModel(), AcceleratedModel(nnapi, device_name) {
Init(input_shape, input_type, axis_value, output_type);
}
ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type,
int axis_value, TensorType output_type, const NnApi* nnapi,
bool disallow_nnapi_cpu)
: SingleOpModel(), AcceleratedModel(nnapi, disallow_nnapi_cpu) {
Init(input_shape, input_type, axis_value, output_type);
}
int input() const { return input_; }
protected:
int input_;
int axis_;
int output_;
void Init(std::initializer_list<int> input_shape, TensorType input_type,
int axis_value, TensorType output_type) {
auto* delegate = GetDelegate();
this->SetApplyDelegate([delegate](Interpreter* interpreter) {
interpreter->ModifyGraphWithDelegate(delegate);
});
input_ = AddInput(input_type);
axis_ = AddConstInput(TensorType_INT32, {axis_value}, {1});
output_ = AddOutput(output_type);
SetBuiltinOp(BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions,
CreateArgMaxOptions(builder_, output_type).Union());
BuildInterpreter({input_shape, {1}});
}
};
TEST_F(UnsupportedOperationOnDeviceTest,
ShouldUseDeviceFeatureLevelWhenSpecifyingTargetDevice) {
nnapi_mock_->SetAndroidSdkVersion(29);
nnapi_mock_->SetNnapiSupportedDevice("test-device", /* feature_level=*/28);
// Stubbing the supported operations so that, in the first case, the delegate
// is not applied because the device feature level is too low rather than
// because the operations are unsupported by the device.
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
std::fill(supportedOps, supportedOps + 1, true);
return ANEURALNETWORKS_NO_ERROR;
});
ArgMaxOpModel m({1, 1, 1, 4}, TensorType_FLOAT32, /*axis_value=*/3,
TensorType_INT32, nnapi_mock_->GetNnApi(), "test-device");
m.PopulateTensor<float>(m.input(), {0.1, 0.9, 0.7, 0.3});
m.Invoke();
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 1)
<< "Expected Max not to be delegates since it not supported before NNAPI "
"1.2 and device declares to support only NNAPI 1.1.";
nnapi_mock_->SetNnapiSupportedDevice("test-device", /* feature_level=*/29);
ArgMaxOpModel m1({1, 1, 1, 4}, TensorType_FLOAT32, /*axis_value=*/3,
TensorType_INT32, nnapi_mock_->GetNnApi(), "test-device");
m1.PopulateTensor<float>(m1.input(), {0.1, 0.9, 0.7, 0.3});
m1.Invoke();
EXPECT_EQ(m1.CountOpsExecutedByCpuKernel(), 0)
<< "Expected Max op to be delegated since it is supported in NNAPI 1.2.";
}
TEST_F(UnsupportedOperationOnDeviceTest,
ShouldUseDeviceFeatureLevelWhenDisablingCPU) {
nnapi_mock_->SetAndroidSdkVersion(29);
nnapi_mock_->SetNnapiSupportedDevice("test-device", /* feature_level=*/28);
// Stubbing the supported operations so that, in the first case, the delegate
// is not applied because the device feature level is too low rather than
// because the operations are unsupported by the device.
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
std::fill(supportedOps, supportedOps + 1, true);
return ANEURALNETWORKS_NO_ERROR;
});
ArgMaxOpModel m({1, 1, 1, 4}, TensorType_FLOAT32, /*axis_value=*/3,
TensorType_INT32, nnapi_mock_->GetNnApi(),
/*disallow_nnapi_cpu=*/true);
m.PopulateTensor<float>(m.input(), {0.1, 0.9, 0.7, 0.3});
m.Invoke();
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 1)
<< "Expected Max not to be delegates since it not supported before NNAPI "
"1.2 and device declares to support only NNAPI 1.1.";
ArgMaxOpModel m1({1, 1, 1, 4}, TensorType_FLOAT32, /*axis_value=*/3,
TensorType_INT32, nnapi_mock_->GetNnApi(),
/*disallow_nnapi_cpu=*/false);
m1.PopulateTensor<float>(m1.input(), {0.1, 0.9, 0.7, 0.3});
m1.Invoke();
EXPECT_EQ(m1.CountOpsExecutedByCpuKernel(), 0)
<< "Expected Max op to be delegated since we enabled NNAPI CPU "
"implementation.";
nnapi_mock_->SetNnapiSupportedDevice("test-device", /* feature_level=*/29);
ArgMaxOpModel m2({1, 1, 1, 4}, TensorType_FLOAT32, /*axis_value=*/3,
TensorType_INT32, nnapi_mock_->GetNnApi(),
/*disallow_nnapi_cpu=*/true);
m2.PopulateTensor<float>(m2.input(), {0.1, 0.9, 0.7, 0.3});
m2.Invoke();
EXPECT_EQ(m2.CountOpsExecutedByCpuKernel(), 0)
<< "Expected Max op to be delegated since it is supported in NNAPI 1.2.";
}
// This is a model with two ops:
//
// input1 ---->
// ADD --
// input2 --> |
// -->
// SUB --> output
// input3 ---------------->
//
class AddSubOpsAcceleratedModel : public MultiOpModel, public AcceleratedModel {
public:
AddSubOpsAcceleratedModel(const TensorData& input1, const TensorData& input2,
const TensorData& input3, const TensorData& output,
ActivationFunctionType activation_type,
const NnApi* nnapi,
const std::string& accelerator_name,
bool allow_fp32_relax_to_fp16 = false)
: MultiOpModel(), AcceleratedModel(nnapi, accelerator_name) {
auto* delegate = GetDelegate();
this->SetApplyDelegate([delegate](Interpreter* interpreter) {
interpreter->ModifyGraphWithDelegate(delegate);
});
Init(input1, input2, input3, output, activation_type,
allow_fp32_relax_to_fp16);
}
int input1() { return input1_; }
int input2() { return input2_; }
int input3() { return input3_; }
std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
protected:
int input1_;
int input2_;
int input3_;
int output_;
private:
// Performs initialization logic shared across all constructors.
void Init(const TensorData& input1, const TensorData& input2,
const TensorData& input3, const TensorData& output,
ActivationFunctionType activation_type,
bool allow_fp32_relax_to_fp16 = false) {
input1_ = AddInput(input1);
input2_ = AddInput(input2);
input3_ = AddInput(input3);
const int add_output = AddInnerTensor<float>(output);
output_ = AddOutput(output);
AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
CreateAddOptions(builder_, activation_type).Union(),
{input1_, input2_}, {add_output});
AddBuiltinOp(BuiltinOperator_SUB, BuiltinOptions_SubOptions,
CreateSubOptions(builder_, activation_type).Union(),
{add_output, input3_}, {output_});
BuildInterpreter({GetShape(input1_), GetShape(input2_), GetShape(input3_)},
allow_fp32_relax_to_fp16);
}
};
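// Counters for the ShouldBuildModelWithOnlyDeviceSupportedOps test below;
// globals because the NNAPI stubs must be captureless.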
int should_build_model_with_sup_ops_compilation_model_create_count = 0;
int should_build_model_with_sup_ops_add_operation_count = 0;
TEST_F(UnsupportedOperationOnDeviceTest,
ShouldBuildModelWithOnlyDeviceSupportedOps) {
nnapi_mock_->SetNnapiSupportedDevice("test-device");
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
// Returning only the first op as supported; this exercises the assertion on
// model caching below.
supportedOps[0] = true;
supportedOps[1] = false;
return ANEURALNETWORKS_NO_ERROR;
});
nnapi_mock_->StubModelCreateWith([](ANeuralNetworksModel** model) -> int {
++should_build_model_with_sup_ops_compilation_model_create_count;
*model = reinterpret_cast<ANeuralNetworksModel*>(1);
return ANEURALNETWORKS_NO_ERROR;
});
nnapi_mock_->StubAddOperationWith(
[](ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
const uint32_t* outputs) -> int {
++should_build_model_with_sup_ops_add_operation_count;
return ANEURALNETWORKS_NO_ERROR;
});
AddSubOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}},
ActivationFunctionType_NONE, nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.PopulateTensor<float>(m.input3(), input2);
m.Invoke();
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 1);
ASSERT_EQ(should_build_model_with_sup_ops_compilation_model_create_count, 2)
<< "Model with unsupported operations has been cached";
EXPECT_EQ(should_build_model_with_sup_ops_add_operation_count, 3)
<< "The second model should contain only one operation";
}
TEST_F(UnsupportedOperationOnDeviceTest, ShouldRunOnCpuIfDeviceSupportsNoOps) {
nnapi_mock_->SetNnapiSupportedDevice("test-device");
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
std::fill(supportedOps, supportedOps + 2, false);
return ANEURALNETWORKS_NO_ERROR;
});
AddSubOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}},
ActivationFunctionType_NONE, nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.PopulateTensor<float>(m.input3(), input2);
m.Invoke();
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 2);
}
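// Counter for the ShouldCacheModelCompilation test below; a global because
// the model-creation stub must be captureless.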
int should_cache_model_compilation_model_create_count = 0;
TEST_F(UnsupportedOperationOnDeviceTest, ShouldCacheModelCompilation) {
nnapi_mock_->SetNnapiSupportedDevice("test-device");
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
std::fill(supportedOps, supportedOps + 2, true);
return ANEURALNETWORKS_NO_ERROR;
});
nnapi_mock_->StubModelCreateWith([](ANeuralNetworksModel** model) -> int {
++should_cache_model_compilation_model_create_count;
*model = reinterpret_cast<ANeuralNetworksModel*>(1);
return ANEURALNETWORKS_NO_ERROR;
});
AddSubOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}},
ActivationFunctionType_NONE, nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.PopulateTensor<float>(m.input3(), input2);
m.Invoke();
ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 0);
EXPECT_EQ(should_cache_model_compilation_model_create_count, 1);
}
TEST_F(UnsupportedOperationOnDeviceTest,
ShouldNotApplySupportedOperationsFilterBeforeAndroidSdk29) {
nnapi_mock_->SetAndroidSdkVersion(28, /*set_unsupported_ops_to_null=*/true);
nnapi_mock_->ModelCreateReturns<0>();
AddSubOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}},
ActivationFunctionType_NONE, nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.PopulateTensor<float>(m.input3(), input2);
m.Invoke();
// Delegation succeeded without failures and all nodes have been delegated.
ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 0);
}
// This is a model with two ops:
//
// input1 ----> HARD_SWISH ---->
// ADD --> output
// input2 ---------------------->
//
class HardSwishAddOpsAcceleratedModel : public MultiOpModel,
public AcceleratedModel {
public:
HardSwishAddOpsAcceleratedModel(const TensorData& input1,
const TensorData& input2,
const TensorData& output,
ActivationFunctionType activation_type,
const NnApi* nnapi,
const std::string& accelerator_name,
bool allow_fp32_relax_to_fp16 = false)
: MultiOpModel(), AcceleratedModel(nnapi, accelerator_name) {
auto* delegate = GetDelegate();
this->SetApplyDelegate([delegate](Interpreter* interpreter) {
interpreter->ModifyGraphWithDelegate(delegate);
});
Init(input1, input2, output, activation_type, allow_fp32_relax_to_fp16);
}
int input1() { return input1_; }
int input2() { return input2_; }
std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
protected:
int input1_;
int input2_;
int output_;
private:
// Performs initialization logic shared across all constructors.
void Init(const TensorData& input1, const TensorData& input2,
const TensorData& output, ActivationFunctionType activation_type,
bool allow_fp32_relax_to_fp16 = false) {
input1_ = AddInput(input1);
input2_ = AddInput(input2);
const int hard_swish_output = AddInnerTensor<float>(output);
output_ = AddOutput(output);
AddBuiltinOp(BuiltinOperator_HARD_SWISH, BuiltinOptions_HardSwishOptions,
CreateHardSwishOptions(builder_).Union(), {input1_},
{hard_swish_output});
AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
CreateAddOptions(builder_, activation_type).Union(),
{input1_, hard_swish_output}, {output_});
BuildInterpreter({GetShape(input1_), GetShape(input2_)},
allow_fp32_relax_to_fp16);
}
};
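// Tests for TFLite operations that are lowered to multiple NNAPI operations
// (HARD_SWISH in these tests).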
struct TfLiteOpMappedToMultipleNnApiOps
: ::tflite::delegate::nnapi::NnApiDelegateMockTest {};
TEST_F(TfLiteOpMappedToMultipleNnApiOps, AllConstituentOpsNotSupported) {
nnapi_mock_->ModelCreateReturns<0>();
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
// HardSwish is mapped to 4 NNAPI ops, none of which is supported.
std::fill(supportedOps, supportedOps + 4, false);
// After those comes the ADD op, which is supported.
supportedOps[4] = true;
return ANEURALNETWORKS_NO_ERROR;
});
HardSwishAddOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {}}, ActivationFunctionType_NONE,
nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.Invoke();
// Delegation succeeded without failures; HardSwish has not been delegated
// but Add has been correctly delegated.
ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 1);
}
TEST_F(TfLiteOpMappedToMultipleNnApiOps, NotAllConstituentOpsSupported) {
nnapi_mock_->ModelCreateReturns<0>();
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
// HardSwish is mapped to 4 NNAPI ops (the first 4 ones), so we have 5
// ops in the NNAPI model.
std::fill(supportedOps, supportedOps + 5, true);
// One of the NNAPI ops required by HardSwish is not supported.
supportedOps[2] = false;
return ANEURALNETWORKS_NO_ERROR;
});
HardSwishAddOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {}}, ActivationFunctionType_NONE,
nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.Invoke();
// Delegation succeeded without failures. HardSwish has not been delegated
// but Add is delegated.
ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 1);
}
TEST_F(TfLiteOpMappedToMultipleNnApiOps, AllConstituentOpsSupported) {
nnapi_mock_->ModelCreateReturns<0>();
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
// HardSwish is mapped to 4 NNAPI ops (the first 4 ones), so we have 5
// ops in the NNAPI model.
// All ops are supported by the accelerator.
std::fill(supportedOps, supportedOps + 5, true);
return ANEURALNETWORKS_NO_ERROR;
});
HardSwishAddOpsAcceleratedModel m(
{TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
{TensorType_FLOAT32, {}}, ActivationFunctionType_NONE,
nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device");
std::vector<float> input1{-2.0, 0.2, 0.7, 0.9};
std::vector<float> input2{0.1, 0.2, 0.3, 0.5};
m.PopulateTensor<float>(m.input1(), input1);
m.PopulateTensor<float>(m.input2(), input2);
m.Invoke();
// Delegation succeeded without failures and all nodes have been delegated.
ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 0);
}
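// CONV_2D model with float activations and uint8 quantized weights; the
// delegate is expected to add a Dequantize op for the quantized weights.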
class QuantizedWeightsConvolutionOpModel : public SingleOpModel,
public AcceleratedModel {
public:
QuantizedWeightsConvolutionOpModel(
const NnApi* nnapi, std::string accelerator_name, const TensorData& input,
const TensorData& filter, const TensorData& output, int stride_width = 2,
int stride_height = 2, enum Padding padding = Padding_VALID,
enum ActivationFunctionType activation = ActivationFunctionType_NONE,
int dilation_width_factor = 1, int dilation_height_factor = 1,
int num_threads = -1, std::initializer_list<uint8_t> filter_data = {})
: SingleOpModel(), AcceleratedModel(nnapi, accelerator_name) {
auto* delegate = GetDelegate();
this->SetApplyDelegate([delegate](Interpreter* interpreter) {
interpreter->ModifyGraphWithDelegate(delegate);
});
input_ = AddInput(input);
if (filter_data.size()) {
filter_ = AddConstInput(filter, filter_data);
} else {
filter_ = AddInput(filter);
}
int bias_size = GetShape(filter_)[0];
bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
output_ = AddOutput(output);
SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions,
CreateConv2DOptions(
builder_, padding, stride_width, stride_height, activation,
dilation_width_factor, dilation_height_factor)
.Union());
BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)},
num_threads);
}
void SetInput(std::initializer_list<float> data) {
PopulateTensor(input_, data);
}
void SetFilter(std::initializer_list<float> data) {
QuantizeAndPopulate<uint8_t>(filter_, data);
}
void SetBias(std::initializer_list<float> data) {
PopulateTensor(bias_, data);
}
std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
std::vector<float> GetDequantizedOutput() {
return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
GetScale(output_), GetZeroPoint(output_));
}
protected:
int input_;
int filter_;
int bias_;
int output_;
};
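// Counter for the AddedDequantizationsAreAccountedInModelOps test below; a
// global because the NNAPI stubs must be captureless.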
int quantized_conv2d_model_added_nnapi_ops_count = 0;
TEST_F(TfLiteOpMappedToMultipleNnApiOps,
AddedDequantizationsAreAccountedInModelOps) {
nnapi_mock_->ModelCreateReturns<0>();
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
bool* supportedOps) -> int {
std::fill(supportedOps,
supportedOps + quantized_conv2d_model_added_nnapi_ops_count,
true);
return ANEURALNETWORKS_NO_ERROR;
});
nnapi_mock_->StubAddOperationWith(
[](ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
const uint32_t* outputs) -> int {
++quantized_conv2d_model_added_nnapi_ops_count;
return ANEURALNETWORKS_NO_ERROR;
});
QuantizedWeightsConvolutionOpModel m(
nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device", {TensorType_FLOAT32, {2, 2, 4, 1}},
{TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, {TensorType_FLOAT32, {}});
m.SetInput({
// First batch
1, 1, 1, 1, // row = 1
2, 2, 2, 2, // row = 2
// Second batch
1, 2, 3, 4, // row = 1
1, 2, 3, 4, // row = 2
});
m.SetFilter({
1, 2, 3, 4, // first 2x2 filter
-1, 1, -1, 1, // second 2x2 filter
-1, -1, 1, 1, // third 2x2 filter
});
m.SetBias({1, 2, 3});
EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 0);
// When delegating quantized Conv2D, a Dequantize operation is added to the
// model for each quantized input.
// In our case one Dequantize op for the weights is expected, resulting in a
// two-op model.
EXPECT_EQ(quantized_conv2d_model_added_nnapi_ops_count, 2);
}
// Model with a chain of no-op operations (ADDs with a zero operand)
// interleaved with no-op custom nodes.
class LongIdentityModel : public MultiOpModel, public AcceleratedModel {
public:
LongIdentityModel(const std::vector<int>& input_shape, int graph_size,
const std::unordered_set<int>& custom_nodes_indexes,
const NnApi* nnapi, const std::string& accelerator_name,
int max_nnapi_partitions)
: MultiOpModel(),
AcceleratedModel(nnapi, accelerator_name, max_nnapi_partitions) {
Init(input_shape, graph_size, custom_nodes_indexes);
}
LongIdentityModel(const std::vector<int>& input_shape, int graph_size,
const std::unordered_set<int>& custom_nodes_indexes,
const NnApi* nnapi, int max_nnapi_partitions)
: MultiOpModel(), AcceleratedModel(nnapi, false, max_nnapi_partitions) {
Init(input_shape, graph_size, custom_nodes_indexes);
}
void SetInput(std::vector<float> value) { PopulateTensor(input_, value); }
int CountNnApiPartitions() {
return std::count_if(
std::begin(interpreter_->execution_plan()),
std::end(interpreter_->execution_plan()), [this](const int node_index) {
return interpreter_->node_and_registration(node_index)
->first.delegate != nullptr;
});
}
private:
void Init(const std::vector<int>& input_shape, int graph_size,
const std::unordered_set<int>& custom_nodes_indexes) {
auto* delegate = GetDelegate();
this->SetApplyDelegate([delegate](Interpreter* interpreter) {
interpreter->ModifyGraphWithDelegate(delegate);
});
const TensorData tensor_data{TensorType_FLOAT32, input_shape};
input_ = AddInput(tensor_data);
zero_input_ = AddInput(tensor_data);
std::vector<int> intermediate_outputs(graph_size - 1);
std::generate(
std::begin(intermediate_outputs), std::end(intermediate_outputs),
[this, &tensor_data]() { return AddInnerTensor<float>(tensor_data); });
output_ = AddOutput(tensor_data);
AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
CreateAddOptions(builder_).Union(), {input_, zero_input_},
{intermediate_outputs[0]});
for (int i = 0; i < intermediate_outputs.size() - 1; i++) {
if (custom_nodes_indexes.count(i + 1) == 1) {
AddCustomOp("custom_no_op", {}, [this]() { return CustomNoOpNode(); },
{intermediate_outputs[i]}, {intermediate_outputs[i + 1]});
} else {
AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
CreateAddOptions(builder_).Union(),
{intermediate_outputs[i], zero_input_},
{intermediate_outputs[i + 1]});
}
}
AddBuiltinOp(
BuiltinOperator_ADD, BuiltinOptions_AddOptions,
CreateAddOptions(builder_).Union(),
{intermediate_outputs[intermediate_outputs.size() - 1], zero_input_},
{output_});
BuildInterpreter({GetShape(input_), GetShape(zero_input_)});
std::vector<float> zero(GetTensorSize(input_), 0.0);
PopulateTensor(zero_input_, zero);
}
// Returns the registration of a custom node that simply copies its input to
// its output.
TfLiteRegistration* CustomNoOpNode() {
static TfLiteRegistration no_op = {
.init = [](TfLiteContext* context, const char* buffer,
size_t length) -> void* { return nullptr; },
.free = [](TfLiteContext* context, void* buffer) -> void {},
.prepare = [](TfLiteContext* context,
TfLiteNode* node) -> TfLiteStatus {
if (node->inputs->size != 1 || node->outputs->size != 1) {
return kTfLiteError;
}
return kTfLiteOk;
},
.invoke = [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
auto input_tensor = context->tensors[node->inputs->data[0]];
auto output_tensor = context->tensors[node->outputs->data[0]];
std::copy(input_tensor.data.raw,
input_tensor.data.raw + input_tensor.bytes,
output_tensor.data.raw);
return kTfLiteOk;
},
.profiling_string = nullptr,
.builtin_code = kTfLiteBuiltinDelegate,
.custom_name = "NoOpTestDelegate",
.version = 1,
};
return &no_op;
}
int input_;
int zero_input_;
int output_;
};
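// Marks the nodes at the configured indexes as unsupported when filling the
// supported_ops array passed to the stubbed getSupportedOperationsForDevices.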
class NodeFilter {
public:
void ConfigureSupportedNodes(
int graph_size, const std::unordered_set<int>& unsupported_indexes) {
graph_size_ = graph_size;
unsupported_indexes_ = unsupported_indexes;
}
void SetNodeSupport(bool* supported_ops) {
for (int i = 0; i < graph_size_; i++) {
supported_ops[i] = (unsupported_indexes_.count(i) == 0);
}
}
private:
int graph_size_;
std::unordered_set<int> unsupported_indexes_;
};
// The same node filter is used for all DelegatePartitionLimitTests because
// StubGetSupportedOperationsForDevicesWith requires a captureless function.
NodeFilter* DelegatePartitionLimitTestNodeFilter() {
static NodeFilter* node_filter = new NodeFilter();
return node_filter;
}
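// Tests for the max_number_delegated_partitions delegate option.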
class DelegatePartitionLimitTest
: public ::tflite::delegate::nnapi::NnApiDelegateMockTest {
protected:
// Configures the underlying graph to generate a set of NNAPI partitions
// with the sizes specified in nnapi_partition_sizes and the given
// input_shape.
void Init(int max_nnapi_partitions,
const std::vector<int>& nnapi_partition_sizes,
const std::vector<int>& input_shape,
bool specify_accelerator = true) {
// The total number of graph nodes is the sum of the nodes in the NNAPI
// partitions plus nnapi_partition_sizes.size() - 1 nodes that are not
// supported by NNAPI and therefore split the graph into the requested
// partitions.
graph_size_ = std::accumulate(std::begin(nnapi_partition_sizes),
std::end(nnapi_partition_sizes),
nnapi_partition_sizes.size() - 1);
std::unordered_set<int> unsupported_ops_idxs;
int partition_node_idx = -1;
for (int i = 0; i < nnapi_partition_sizes.size() - 1; i++) {
partition_node_idx += nnapi_partition_sizes[i] + 1;
unsupported_ops_idxs.insert(partition_node_idx);
}
if (specify_accelerator) {
// Building a model that initially contains a single partition and is then
// partitioned by checking which operations are supported by the target
// accelerator. This is needed because the stubbed
// GetSupportedOperationsForDevices API cannot know the size of each
// partition.
DelegatePartitionLimitTestNodeFilter()->ConfigureSupportedNodes(
graph_size_, unsupported_ops_idxs);
nnapi_mock_->StubGetSupportedOperationsForDevicesWith(
[](const ANeuralNetworksModel* model,
const ANeuralNetworksDevice* const* devices, uint32_t num_devices,
bool* supported_ops) -> int {
DelegatePartitionLimitTestNodeFilter()->SetNodeSupport(
supported_ops);
return ANEURALNETWORKS_NO_ERROR;
});
model_ = std::make_unique<LongIdentityModel>(
input_shape, graph_size_,
/*custom_nodes_indexes=*/std::unordered_set<int>(),
nnapi_mock_->GetNnApi(),
/*accelerator_name=*/"test-device", max_nnapi_partitions);
} else {
// Building a model containing custom nodes that won't be supported by the
// delegate and that will generate the partitions.
model_ = std::make_unique<LongIdentityModel>(
input_shape, graph_size_, unsupported_ops_idxs,
nnapi_mock_->GetNnApi(), max_nnapi_partitions);
}
}
std::unique_ptr<LongIdentityModel> model_;
int OriginalGraphSize() { return graph_size_; }
private:
int graph_size_;
};
TEST_F(DelegatePartitionLimitTest, ShouldDelegateOnePartitionOnly) {
Init(/*max_nnapi_partitions=*/1,
/*nnapi_partition_sizes=*/{3, 2},
/*input_shape=*/{1, 2, 2, 1});
EXPECT_EQ(model_->CountNnApiPartitions(), 1);
}
TEST_F(DelegatePartitionLimitTest,
ShouldDelegateAllPossiblePartitionsIfLimitIsZero) {
Init(/*max_nnapi_partitions=*/0,
/*nnapi_partition_sizes=*/{3, 2},
/*input_shape=*/{1, 2, 2, 1});
EXPECT_EQ(model_->CountNnApiPartitions(), 2);
}
TEST_F(DelegatePartitionLimitTest,
ShouldDelegateAllPossiblePartitionsIfLimitIsNegative) {
Init(/*max_nnapi_partitions=*/-1,
/*nnapi_partition_sizes=*/{3, 2},
/*input_shape=*/{1, 2, 2, 1});
EXPECT_EQ(model_->CountNnApiPartitions(), 2);
}
TEST_F(DelegatePartitionLimitTest,
ShouldDelegateAllPossiblePartitionsIfBelowLimit) {
Init(/*max_nnapi_partitions=*/3,
/*nnapi_partition_sizes=*/{3, 2},
/*input_shape=*/{1, 2, 2, 1});
EXPECT_EQ(model_->CountNnApiPartitions(), 2);
}
TEST_F(DelegatePartitionLimitTest, ShouldDelegatePartitionWithHigherNodeCount) {
int kLargestModelSize = 3;
Init(/*max_nnapi_partitions=*/1,
/*nnapi_partition_sizes=*/{3, 2},
/*input_shape=*/{1, 2, 2, 1});
EXPECT_EQ(model_->CountNnApiPartitions(), 1);
EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(),
OriginalGraphSize() - kLargestModelSize);
}
TEST_F(DelegatePartitionLimitTest,
ShouldDelegatePartitionsWithHigherNodeCount) {
int kLargestModelSize = 5;
int kSecondLargestModelSize = 4;
Init(/*max_nnapi_partitions=*/2,
/*nnapi_partition_sizes=*/
{1, kLargestModelSize, 2, kSecondLargestModelSize},
/*input_shape=*/{1, 2, 2, 1});
EXPECT_EQ(model_->CountNnApiPartitions(), 2);
EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(),
OriginalGraphSize() - (kLargestModelSize + kSecondLargestModelSize));
}
TEST_F(DelegatePartitionLimitTest,
ShouldLimitPartitionsEvenWithoutAcceleratorNameSpecified) {
int kLargestModelSize = 5;
int kSecondLargestModelSize = 4;
Init(/*max_nnapi_partitions=*/2,
/*nnapi_partition_sizes=*/
{1, kLargestModelSize, 2, kSecondLargestModelSize},
/*input_shape=*/{1, 2, 2, 1}, /*specify_accelerator=*/false);
EXPECT_EQ(model_->CountNnApiPartitions(), 2);
EXPECT_EQ(
model_->CountOpsExecutedByCpuKernel(),
OriginalGraphSize() - (kLargestModelSize + kSecondLargestModelSize));
}
} // namespace
} // namespace tflite
int main(int argc, char** argv) {
::tflite::LogToStderr();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}