Longer fusion for Adreno in ThinPointwiseFuser. PiperOrigin-RevId: 467909285

commit: e23d648fb6abcba4f76ad0e3cdca525f53dbc0cd [log] [tgz]
author: Raman Sarokin <sorokin@google.com> Tue Aug 16 06:26:26 2022 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> Tue Aug 16 06:30:18 2022 -0700
tree: 21c194fd6fbc6de7d2ba5c060dbc42c4f7b5192d
parent: 2cd8079f27dba8cbfad9847e53615aed56b61ad6 [diff]
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.cc b/tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.cc
index 8a7241c..a8826d3 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/special/depthwise_conv_plus_1x1_conv.cc

@@ -62,6 +62,7 @@
   bool IsNodeSupported(const GpuInfo& gpu_info, Node* node) const;
   bool IsElementwiseNode(Node* node) const;
   bool IsConvNode(Node* node) const;
+  bool IsDwConvNode(Node* node) const;
   void AddNode(const GpuInfo& gpu_info, Node* node);
   void AddElementwiseNode(ElementwiseDescriptor&& op_desc);
   void AddConvNode(const GpuInfo& gpu_info,
@@ -73,7 +74,6 @@
   void AddConvData(const Convolution2DAttributes& conv_attr);
   void AddDepthwiseConvData(const DepthwiseConvolution2DAttributes& dw_attr);
   void CreateConstantsGpuBuffer(const GpuInfo& gpu_info);
-  bool HasConvNode() const;
   std::vector<Node*> nodes_;
   OperationDef op_def_;
   Arguments args_;
@@ -81,6 +81,7 @@
   std::vector<std::string> outputs_;
   std::vector<float> gpu_data_;
   int weights_counter_ = 0;
+  int buffer_size_ = 0;
   std::string op_name_;
   int link_counter_ = 0;
   uint64_t flops_ = 0;
@@ -270,6 +271,24 @@
     if (!good_conv) {
       return false;
     }
+    if (gpu_info.IsAdreno() && gpu_info.IsApiOpenCl()) {
+      int conv_src_ch_aligned = AlignByN(conv_attr->weights.shape.i, 4);
+      int conv_dst_ch_aligned = AlignByN(conv_attr->weights.shape.o, 4);
+      int conv_weights_count =
+          conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;
+
+      DataType data_type = op_def_.precision == CalculationsPrecision::F32
+                               ? DataType::FLOAT32
+                               : DataType::FLOAT16;
+      int weights_size = conv_weights_count * SizeOf(data_type);
+      if (convs_count_ >= 3 || buffer_size_ + weights_size > 1024 * 3) {
+        return false;
+      }
+    } else {
+      if (convs_count_ >= 1) {
+        return false;
+      }
+    }
     if (gpu_info.IsApple()) {
       if (op_def_.precision == CalculationsPrecision::F16) {
         return conv_shape.o <= 16 && conv_shape.i * conv_shape.o <= 16 * 16;
@@ -295,26 +314,42 @@
 }
 
 bool ThinPointwiseFuser::ReserveNode(const GpuInfo& gpu_info, Node* node) {
-  if (convs_count_ >= 1 || !IsNodeSupported(gpu_info, node)) {
+  if (!IsNodeSupported(gpu_info, node)) {  // the convs_count_ >= 1 test is no longer made here
     return false;
   }
   nodes_.push_back(node);
   if (IsConvNode(node)) {
     convs_count_++;
+    Convolution2DAttributes* conv_attr =  // pointer-form any_cast: nullptr on type mismatch
+        absl::any_cast<Convolution2DAttributes>(&node->operation.attributes);  // NOTE(review): result is dereferenced below without a null check
+
+    int conv_src_ch_aligned = AlignByN(conv_attr->weights.shape.i, 4);  // presumably rounds up to a multiple of 4 - confirm
+    int conv_dst_ch_aligned = AlignByN(conv_attr->weights.shape.o, 4);
+    int conv_weights_count =  // element count: dst + src * dst
+        conv_dst_ch_aligned + conv_src_ch_aligned * conv_dst_ch_aligned;  // presumably biases plus weights - confirm
+
+    DataType data_type = op_def_.precision == CalculationsPrecision::F32  // FLOAT32 only for F32 precision
+                             ? DataType::FLOAT32
+                             : DataType::FLOAT16;  // every other precision is sized as FLOAT16
+    buffer_size_ += conv_weights_count * SizeOf(data_type);  // running total for all reserved nodes (presumably bytes)
+  }
+  if (IsDwConvNode(node)) {  // depthwise convolutions contribute to buffer_size_ as well
+    DepthwiseConvolution2DAttributes* dw_attr =
+        absl::any_cast<DepthwiseConvolution2DAttributes>(
+            &node->operation.attributes);  // NOTE(review): also dereferenced without a null check
+
+    int dw_dst_ch_aligned = AlignByN(dw_attr->weights.shape.i, 4);  // NOTE(review): named "dst" but read from shape.i - confirm intended
+    int dw_weights_count = dw_dst_ch_aligned + dw_dst_ch_aligned *
+                                                   dw_attr->weights.shape.h *
+                                                   dw_attr->weights.shape.w;  // element count: ch + ch * h * w
+    DataType data_type = op_def_.precision == CalculationsPrecision::F32  // same precision-to-type choice as the conv branch
+                             ? DataType::FLOAT32
+                             : DataType::FLOAT16;
+    buffer_size_ += dw_weights_count * SizeOf(data_type);  // added to the same running total
   }
   return true;
 }
 
-bool ThinPointwiseFuser::HasConvNode() const {
-  for (auto& node : nodes_) {
-    if (OperationTypeFromString(node->operation.type) ==
-        OperationType::CONVOLUTION_2D) {
-      return true;
-    }
-  }
-  return false;
-}
-
 void ThinPointwiseFuser::AddNode(const GpuInfo& gpu_info, Node* node) {
   auto op_type = OperationTypeFromString(node->operation.type);
   if (op_type == OperationType::RELU) {
@@ -347,6 +382,11 @@
   return op_type == OperationType::CONVOLUTION_2D;
 }
 
+bool ThinPointwiseFuser::IsDwConvNode(Node* node) const {  // true iff node's operation type is DEPTHWISE_CONVOLUTION
+  auto op_type = OperationTypeFromString(node->operation.type);  // node->operation.type is converted to an OperationType first
+  return op_type == OperationType::DEPTHWISE_CONVOLUTION;
+}
+
 void ThinPointwiseFuser::AddDepthwiseConvNode(
     const GpuInfo& gpu_info, const DepthwiseConvolution2DAttributes& attr) {
   AddDepthwiseConvData(attr);
commit	e23d648fb6abcba4f76ad0e3cdca525f53dbc0cd	[log] [tgz]
author	Raman Sarokin <sorokin@google.com>	Tue Aug 16 06:26:26 2022 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	Tue Aug 16 06:30:18 2022 -0700
tree	21c194fd6fbc6de7d2ba5c060dbc42c4f7b5192d
parent	2cd8079f27dba8cbfad9847e53615aed56b61ad6 [diff]