NNAPI Delegate: Support sparse weights for conv2d

PiperOrigin-RevId: 379833048
Change-Id: Ia0b3603fce75f8ee437b61bd5445d6f09c12115f
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index 5097d44..5037846 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -33,7 +33,6 @@
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite:util",
-        "//tensorflow/lite/c:c_api_types",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/delegates:utils",
         "//tensorflow/lite/kernels:kernel_util",
@@ -41,8 +40,6 @@
         "//tensorflow/lite/nnapi:nnapi_lib",
         "//tensorflow/lite/nnapi:nnapi_util",
         "//tensorflow/lite/nnapi/sl:nnapi_support_library_headers",
-        "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/tools/optimize/sparsity:format_converter",
         "@FP16",
         "@farmhash_archive//:farmhash",
     ],
@@ -73,7 +70,6 @@
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite:util",
-        "//tensorflow/lite/c:c_api_types",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/delegates:utils",
         "//tensorflow/lite/kernels:kernel_util",
@@ -82,8 +78,10 @@
         "//tensorflow/lite/nnapi:nnapi_util",
         "//tensorflow/lite/nnapi/sl:nnapi_support_library_headers",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/tools/optimize/sparsity:format_converter",
         "@FP16",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@farmhash_archive//:farmhash",
     ],
 )
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 01efeff..83217f9 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -31,7 +31,6 @@
 #include <utility>
 #include <vector>
 
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
 #include "tensorflow/lite/nnapi/sl/public/NeuralNetworksSupportLibraryImpl.h"
 
@@ -59,7 +58,6 @@
 #include "tensorflow/lite/minimal_logging.h"
 #include "tensorflow/lite/nnapi/nnapi_implementation.h"
 #include "tensorflow/lite/nnapi/nnapi_util.h"
-#include "tensorflow/lite/tools/optimize/sparsity/format_converter.h"
 #include "tensorflow/lite/util.h"
 #ifdef NNAPI_VERBOSE_VALIDATION
 #include "tensorflow/lite/schema/schema_generated.h"
@@ -388,28 +386,6 @@
   }
 }
 
-bool IsDequantizeConstFloat16(TfLiteContext* context, const TfLiteNode* node,
-                              const TfLiteRegistration* registration) {
-  return registration->builtin_code == kTfLiteBuiltinDequantize &&
-         context->tensors[node->inputs->data[0]].type ==
-             TfLiteType::kTfLiteFloat16 &&
-         IsConstantTensor(&context->tensors[node->inputs->data[0]]);
-}
-
-bool IsDequantizeNonConstFloat16(TfLiteContext* context, const TfLiteNode* node,
-                                 const TfLiteRegistration* registration) {
-  return registration->builtin_code == kTfLiteBuiltinDequantize &&
-         context->tensors[node->inputs->data[0]].type ==
-             TfLiteType::kTfLiteFloat16 &&
-         !IsConstantTensor(&context->tensors[node->inputs->data[0]]);
-}
-
-bool IsDensifyConstTensor(TfLiteContext* context, const TfLiteNode* node,
-                          const TfLiteRegistration* registration) {
-  return registration->builtin_code == kTfLiteBuiltinDensify &&
-         IsConstantTensor(&context->tensors[node->inputs->data[0]]);
-}
-
 bool HasUnspecifiedDimension(const TfLiteTensor* tensor) {
   if (tensor->dims_signature) {
     for (int i : TfLiteIntArrayView(tensor->dims_signature)) {
@@ -1570,7 +1546,7 @@
         RETURN_TFLITE_ERROR_IF_NN_ERROR_FOR_TENSOR(
             context_,
             nnapi_->ANeuralNetworksModel_setOperandValue(
-                nn_model_, ann_tensor_index, tensor->data.data, tensor->bytes),
+                nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes),
             "setting new operand value", tensor, nnapi_errno_);
       }
     }
@@ -2178,13 +2154,6 @@
       }
     } break;
     case kTfLiteBuiltinDequantize: {
-      // Allow dequantizing fp16->fp32.
-      if (android_sdk_version >= kMinSdkVersionForNNAPI13 &&
-          context->tensors[node->inputs->data[0]].type == kTfLiteFloat16 &&
-          context->tensors[node->inputs->data[0]].allocation_type !=
-              kTfLiteMmapRo) {
-        return true;
-      }
       Expect(version == 1 || version == 2,
              NNAPIValidationFailureType::kUnsupportedOperatorVersion,
              "Supported op versions are 1 and 2 only", &val_ctx);
@@ -2206,15 +2175,6 @@
         }
       }
     } break;
-    case kTfLiteBuiltinDensify: {
-      // Allow densifying sparse weights.
-      if (android_sdk_version >= kMinSdkVersionForNNAPI13 &&
-          context->tensors[node->inputs->data[0]].allocation_type ==
-              kTfLiteMmapRo) {
-        return true;
-      }
-      return false;
-    } break;
     case kTfLiteBuiltinFloor: {
       ExpectOpVersion(version, 1, &val_ctx);
     } break;
@@ -3753,10 +3713,6 @@
     nodes_.push_back(node_index);
   }
 
-  // Initialize densify map and dequantize map.
-  densify_output_to_node_mapping_ = std::vector<int>(context->tensors_size, -1);
-  non_const_dequantize_output_to_node_mapping_ =
-      std::vector<int>(context->tensors_size, -1);
   const auto delegate_options =
       StatefulNnApiDelegate::GetOptions(params->delegate);
   if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI12 &&
@@ -3983,17 +3939,6 @@
     const auto tflite_op_index = nnapi_to_tflite_op_mapping_[nnapi_op_index];
     tflite_ops_support_status[tflite_op_index] &=
         nnapi_ops_support_flags[nnapi_op_index];
-    if (!tflite_ops_support_status[tflite_op_index]) {
-      if (std::count(non_const_dequantize_output_to_node_mapping_.begin(),
-                     non_const_dequantize_output_to_node_mapping_.end(), -1) <
-              non_const_dequantize_output_to_node_mapping_.size() ||
-          std::count(densify_output_to_node_mapping_.begin(),
-                     densify_output_to_node_mapping_.end(),
-                     -1) < densify_output_to_node_mapping_.size()) {
-        // Only allow full model delegation for sparse model.
-        return kTfLiteOk;
-      }
-    }
   }
 
   supported_nodes->clear();
@@ -4445,85 +4390,13 @@
   }
 }
 
-TfLiteStatus NNAPIDelegateKernel::DensifyAndDequantizeConstTensor(
-    TfLiteContext* context, int densify_node_id, bool should_dequantize,
-    NNAPIOpBuilder& builder) {
-  TfLiteNode* densify_node;
-  TfLiteRegistration* reg;
-  TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
-      context, densify_node_id, &densify_node, &reg));
-  int sparse_weight_tid = densify_node->inputs->data[0];
-  auto input_tensor = context->tensors[sparse_weight_tid];
-  auto output_tensor = context->tensors[densify_node->outputs->data[0]];
-  if (input_tensor.sparsity == nullptr) {
-    return kTfLiteError;
-  }
-  const int dims_count = output_tensor.dims->size;
-  std::vector<int> vector_shape(dims_count);
-  for (int i = 0; i < dims_count; i++) {
-    vector_shape[i] = output_tensor.dims->data[i];
-  }
-  size_t dense_size;
-  int new_tensor_index = -1;
-  switch (input_tensor.type) {
-    case kTfLiteFloat32: {
-      dense_size = output_tensor.bytes / sizeof(float);
-      std::vector<float> output_data(dense_size);
-      tflite::optimize::sparsity::FormatConverter<float> converter(
-          vector_shape, *input_tensor.sparsity);
-      converter.SparseToDense(static_cast<const float*>(input_tensor.data.data),
-                              dense_size, output_data.data(), context);
-      TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor<float>(
-          ANEURALNETWORKS_TENSOR_FLOAT32, kTfLiteFloat32, output_tensor.dims,
-          output_data, output_tensor.params, &new_tensor_index));
-      break;
-    }
-    case kTfLiteFloat16: {
-      dense_size = output_tensor.bytes / sizeof(Eigen::half);
-      std::vector<uint16_t> output_data(dense_size);
-      Eigen::half* unpacked_fp16_data =
-          reinterpret_cast<Eigen::half*>(output_data.data());
-      tflite::optimize::sparsity::FormatConverter<Eigen::half> converter(
-          vector_shape, *input_tensor.sparsity);
-      converter.SparseToDense(
-          static_cast<const Eigen::half*>(input_tensor.data.data), dense_size,
-          unpacked_fp16_data, context);
-      if (should_dequantize) {
-        // we need to dequantize the fp16 dense tensor
-        std::vector<float> float_dense_data(dense_size);
-        for (int i = 0; i < dense_size; ++i) {
-          float_dense_data[i] = fp16_ieee_to_fp32_value(
-              reinterpret_cast<uint16_t*>(output_data.data())[i]);
-        }
-        TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor<float>(
-            ANEURALNETWORKS_TENSOR_FLOAT32, kTfLiteFloat32, output_tensor.dims,
-            float_dense_data, output_tensor.params, &new_tensor_index));
-      } else {
-        TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor<uint16_t>(
-            ANEURALNETWORKS_TENSOR_FLOAT16, kTfLiteFloat16, output_tensor.dims,
-            output_data, output_tensor.params, &new_tensor_index));
-      }
-      break;
-    }
-    case kTfLiteInt8: {
-      dense_size = output_tensor.bytes / sizeof(int8_t);
-      std::vector<int8_t> output_data(dense_size);
-      tflite::optimize::sparsity::FormatConverter<int8_t> converter(
-          vector_shape, *input_tensor.sparsity);
-      converter.SparseToDense(
-          static_cast<const int8_t*>(input_tensor.data.data), dense_size,
-          output_data.data(), context);
-      TF_LITE_ENSURE_STATUS(builder.AddNewInputConstantTensor<int8_t>(
-          ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED, kTfLiteInt8,
-          output_tensor.dims, output_data, output_tensor.params,
-          &new_tensor_index));
-      break;
-    }
-    default: {
-      return kTfLiteError;
-    }
-  }
-  return kTfLiteOk;
+static bool IsDequantizeConstFloat16(TfLiteContext* context,
+                                     const TfLiteNode* node,
+                                     const TfLiteRegistration* registration) {
+  return registration->builtin_code == kTfLiteBuiltinDequantize &&
+         context->tensors[node->inputs->data[0]].type ==
+             TfLiteType::kTfLiteFloat16 &&
+         IsConstantTensor(&context->tensors[node->inputs->data[0]]);
 }
 
 TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(
@@ -4542,7 +4415,7 @@
     TF_LITE_ENSURE_STATUS(GetTargetFeatureLevel(
         context, nnapi_, nnapi_devices_, &target_feature_level_, nnapi_errno));
   }
-  // First path, handle const fp16->fp32 dequantize and densify if needed.
+  // First path, handle fp16->fp32 dequantize if needed.
   for (auto node_index : nodes_) {
     TfLiteNode* node = nullptr;
     TfLiteRegistration* registration = nullptr;
@@ -4552,13 +4425,6 @@
       builder.AddTensorInput(node->inputs->data[0], /*hybrid_op=*/false,
                              NN_TENSOR_FLAG_HALF_TO_FLOAT_CONVERSION);
     }
-    if (IsDensifyConstTensor(context, node, registration)) {
-      densify_output_to_node_mapping_[node->outputs->data[0]] = node_index;
-    }
-    if (IsDequantizeNonConstFloat16(context, node, registration)) {
-      non_const_dequantize_output_to_node_mapping_[node->outputs->data[0]] =
-          node_index;
-    }
   }
   // Clear the input and output lists for the dequantize path.
   builder.ClearInputOuputLists();
@@ -4570,11 +4436,6 @@
     TfLiteRegistration* reg;
     TF_LITE_ENSURE_STATUS(
         context->GetNodeAndRegistration(context, node_index, &node, &reg));
-    // skip DENSIFY -> DEQUANTIZE as they are handled elsewhere.
-    if (IsDensifyConstTensor(context, node, reg) ||
-        IsDequantizeNonConstFloat16(context, node, reg)) {
-      continue;
-    }
 
     // Fully quantized full LSTM.
     if (target_feature_level_ >= kMinSdkVersionForNNAPI13 &&
@@ -4715,30 +4576,6 @@
         continue;
       }
       const auto input_index = node->inputs->data[input_pos];
-      // handle sparse weights for Conv2d
-      if (reg->builtin_code == kTfLiteBuiltinConv2d && input_pos == 1) {
-        int densify_node_id = -1;
-        bool should_dequantize = false;
-        int dequantize_node_id =
-            non_const_dequantize_output_to_node_mapping_[input_index];
-        if (dequantize_node_id != -1) {
-          should_dequantize = true;
-          // Find densify->dequantize pattern.
-          TfLiteNode* dequant_node;
-          TfLiteRegistration* reg;
-          TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
-              context, dequantize_node_id, &dequant_node, &reg));
-          densify_node_id =
-              densify_output_to_node_mapping_[dequant_node->inputs->data[0]];
-        } else {
-          densify_node_id = densify_output_to_node_mapping_[input_index];
-        }
-        if (densify_node_id != -1) {
-          TF_LITE_ENSURE_STATUS(DensifyAndDequantizeConstTensor(
-              context, densify_node_id, should_dequantize, builder));
-          continue;
-        }
-      }
       if (need_int8_conversion &&
           (input_pos == 0 ||
            reg->builtin_code == kTfLiteBuiltinFullyConnected ||
@@ -5574,7 +5411,8 @@
     TfLiteRegistration* registration = nullptr;
     TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
         context, node_id, &node, &registration));
-    if (IsDequantizeConstFloat16(context, node, registration)) {
+    if (delegate::nnapi::IsDequantizeConstFloat16(context, node,
+                                                  registration)) {
       should_prune_fp16_dequantize = true;
       break;
     }
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h
index 208b26e..b45a91c 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h
@@ -371,11 +371,6 @@
   std::vector<uint8_t> nn_compilation_cache_token_;
 
   std::vector<int> nnapi_to_tflite_op_mapping_;
-  // Map of DENSIFY output tensor id to node id.
-  std::vector<int> densify_output_to_node_mapping_;
-  // Map of DEQUANTIZE output tensor id to node id.
-  // Only contains DEQUANTIZE nodes with non-const input.
-  std::vector<int> non_const_dequantize_output_to_node_mapping_;
 
   // Fully initialized in NNAPIDelegateKernel::AddOpsAndTensors
   int target_feature_level_ = 27;  // kMinSdkVersionForNNAPI10
@@ -384,11 +379,6 @@
       const TfLiteContext* context, int builtin_code, const TfLiteNode* node,
       int tflite_node_index, NNAPIOpBuilder* builder, int* nnapi_errno);
 
-  TfLiteStatus DensifyAndDequantizeConstTensor(TfLiteContext* context,
-                                               int densify_node_id,
-                                               bool should_dequantize,
-                                               NNAPIOpBuilder& builder);
-
   TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno,
                                 bool allow_dynamic_dimensions);