| /* Copyright 2020 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/lite/delegates/gpu/common/selectors/operation_selector.h" |
| |
| #include <memory> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "absl/strings/str_cat.h" |
| #include "absl/types/any.h" |
| #include "tensorflow/lite/delegates/gpu/common/data_type.h" |
| #include "tensorflow/lite/delegates/gpu/common/flops_util.h" |
| #include "tensorflow/lite/delegates/gpu/common/gpu_info.h" |
| #include "tensorflow/lite/delegates/gpu/common/operations.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/convolution_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/convolution_transposed_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/default_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/dw_convolution_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/fully_connected_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/simple_selectors.h" |
| #include "tensorflow/lite/delegates/gpu/common/shape.h" |
| #include "tensorflow/lite/delegates/gpu/common/status.h" |
| #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h" |
| #include "tensorflow/lite/delegates/gpu/common/task/weights_conversion.h" |
| #include "tensorflow/lite/delegates/gpu/common/tasks/elementwise.h" |
| #include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h" |
| #include "tensorflow/lite/delegates/gpu/common/tasks/transpose.h" |
| #include "tensorflow/lite/delegates/gpu/common/tensor.h" |
| #include "tensorflow/lite/delegates/gpu/common/winograd_util.h" |
| |
| namespace tflite { |
| namespace gpu { |
| namespace { |
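| // Heuristic for when the Winograd F(4x4, 3x3) path is likely to beat a |
| // direct convolution: enough 4x4 output tiles and enough channel depth |
| // (channels grouped by 4), with vendor-specific floors. For example, a |
| // 64x64x64 -> 64x64x64 conv yields 16x16 = 256 tiles and src/dst depth |
| // 16, which passes the default thresholds but not Adreno's depth >= 32. |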
| bool IsRecommendedForWinograd4x4To6x6(const Convolution2DAttributes& attr, |
| const GpuInfo& gpu_info, |
| const BHWC& dst_shape) { |
| const int tiles_x = DivideRoundUp(dst_shape.w, 4); |
| const int tiles_y = DivideRoundUp(dst_shape.h, 4); |
| const int total_tiles = tiles_x * tiles_y; |
| const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); |
| const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); |
| int min_src_depth = 16; |
| int min_dst_depth = 16; |
| if (gpu_info.IsAdreno()) { |
| min_src_depth = 32; |
| min_dst_depth = 32; |
| } else if (gpu_info.IsAMD()) { |
| min_dst_depth = 8; |
| } |
| int min_tiles = 32; |
| if (gpu_info.IsAdreno()) { |
| if (gpu_info.adreno_info.IsAdreno6xx()) { |
| min_tiles = 128; |
| } else { |
| min_tiles = 64; |
| } |
| } |
| const bool recommended_channels = |
| src_depth >= min_src_depth && dst_depth >= min_dst_depth; |
| const bool recommended_hw = total_tiles >= min_tiles; |
| return recommended_channels && recommended_hw; |
| } |
| |
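| // Replaces a 3x3 convolution with a three-operation Winograd subgraph: |
| // 1) winograd_4x4_to_36: transform input patches to the Winograd domain, |
| // 2) convolution_winograd_4x4_6x6: per-coordinate convolution on the |
| // transformed data, |
| // 3) winograd_36_to_4x4: inverse transform plus bias. |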
| absl::Status WinogradFromNode(const GpuInfo& gpu_info, |
| const std::vector<Value*>& inputs, |
| const std::vector<Value*>& outputs, |
| const OperationDef& op_def, ModelHints hints, |
| const BHWC& input_shape, const BHWC& output_shape, |
| const Convolution2DAttributes& attr, |
| GPUOperationsSubgraph* gpu_subgraph) { |
| if (!IsSuitableForWinograd4x4To6x6(attr)) { |
| return absl::UnimplementedError("No implementation for this case."); |
| } |
| if (!IsRecommendedForWinograd4x4To6x6(attr, gpu_info, output_shape)) { |
| return absl::UnimplementedError("Not recommended for this case."); |
| } |
| |
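| // Transformed tensors keep batch and channels, store the 36 (6x6) |
| // Winograd coordinates in H, and the flattened tile index in W. |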
| const int tiles_x = DivideRoundUp(output_shape.w, 4); |
| const int tiles_y = DivideRoundUp(output_shape.h, 4); |
| const BHWC src_transformed_shape{input_shape.b, 36, tiles_x * tiles_y, |
| input_shape.c}; |
| const BHWC dst_transformed_shape{input_shape.b, 36, tiles_x * tiles_y, |
| output_shape.c}; |
| TensorDescriptor src_transformed_desc = op_def.src_tensors[0]; |
| RETURN_IF_ERROR(src_transformed_desc.UpdateToSupportedStorageType( |
| gpu_info, src_transformed_shape)); |
| TensorDescriptor dst_transformed_desc = op_def.src_tensors[0]; |
| RETURN_IF_ERROR(dst_transformed_desc.UpdateToSupportedStorageType( |
| gpu_info, dst_transformed_shape)); |
| const int src_transformed_id = |
| gpu_subgraph->AddTensor(src_transformed_shape, src_transformed_desc); |
| const int dst_transformed_id = |
| gpu_subgraph->AddTensor(dst_transformed_shape, dst_transformed_desc); |
| gpu_subgraph->operations.clear(); |
| gpu_subgraph->operations.resize(3); |
| |
| OperationDef winograd_up_def; |
| winograd_up_def.precision = op_def.precision; |
| winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]); |
| winograd_up_def.dst_tensors.push_back(src_transformed_desc); |
| auto& winograd_up = gpu_subgraph->operations[0]; |
| winograd_up.operation = |
| SelectWinograd4x4To36(gpu_info, attr.padding, winograd_up_def); |
| winograd_up.input_ids = {static_cast<int>(inputs[0]->id)}; |
| winograd_up.output_ids = {src_transformed_id}; |
| winograd_up.name = "winograd_4x4_to_36"; |
| |
| OperationDef conv_def; |
| conv_def.precision = op_def.precision; |
| conv_def.src_tensors.push_back(src_transformed_desc); |
| conv_def.dst_tensors.push_back(dst_transformed_desc); |
| auto& conv = gpu_subgraph->operations[1]; |
| conv.input_ids = {src_transformed_id}; |
| conv.output_ids = {dst_transformed_id}; |
| conv.operation = SelectConvolutionForWinograd(attr, input_shape, gpu_info, |
| conv_def, hints); |
| conv.name = "convolution_winograd_4x4_6x6"; |
| conv.operation->flops_ = |
| GetConvolutionWinograd4x4To6x6Flops(output_shape, attr.weights.shape); |
| |
| OperationDef winograd_down_def; |
| winograd_down_def.precision = op_def.precision; |
| winograd_down_def.src_tensors.push_back(dst_transformed_desc); |
| winograd_down_def.dst_tensors.push_back(op_def.dst_tensors[0]); |
| auto& winograd_down = gpu_subgraph->operations[2]; |
| winograd_down.input_ids = {dst_transformed_id}; |
| winograd_down.output_ids = {static_cast<int>(outputs[0]->id)}; |
| auto bias_copy = attr.bias; |
| if (bias_copy.shape.v < attr.weights.shape.o) { |
| bias_copy.shape = Linear(attr.weights.shape.o); |
| bias_copy.data.resize(attr.weights.shape.o); |
| } |
| winograd_down.operation = |
| SelectWinograd36To4x4(gpu_info, winograd_down_def, bias_copy); |
| winograd_down.name = "winograd_36_to_4x4"; |
| return absl::OkStatus(); |
| } |
| |
| // Supported operation types: |
| // 1) BATCHED_MATMUL |
| // 2) CONVOLUTION_2D |
| // 3) CONVOLUTION_TRANSPOSED |
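| // Builds a two-operation subgraph: a converter that rearranges the |
| // runtime BHWC weights tensor into the layout expected by the selected |
| // kernel (a single buffer or four 2D textures), followed by the |
| // convolution itself. `attr` is interpreted according to `op_type`. |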
| absl::Status AddDynamicConv(ModelHints hints, const GpuInfo& gpu_info, |
| const OperationDef& op_def, OperationType op_type, |
| const BHWC& src_shape, const OHWI& weights_shape, |
| const BHWC& dst_shape, int src_id, int weights_id, |
| int dst_id, GPUOperationsSubgraph* gpu_subgraph, |
| void* attr) { |
| gpu_subgraph->operations.reserve(gpu_subgraph->operations.size() + 2); |
| gpu_subgraph->operations.push_back({}); |
| auto& converter_op = gpu_subgraph->operations.back(); |
| gpu_subgraph->operations.push_back({}); |
| auto& conv_op = gpu_subgraph->operations.back(); |
| OperationDef conv_temp_def = op_def; |
| conv_temp_def.src_tensors[1] = {op_def.src_tensors[1].data_type, |
| TensorStorageType::BUFFER, Layout::HWC}; |
| WeightsDescription weights_desc; |
| const BHWC weights_shape_bhwc(weights_shape.o, weights_shape.h, |
| weights_shape.w, weights_shape.i); |
| conv_op.output_ids = {dst_id}; |
| if (op_type == OperationType::CONVOLUTION_2D) { |
| Convolution2DAttributes* conv_attr = |
| reinterpret_cast<Convolution2DAttributes*>(attr); |
| conv_op.operation = SelectConvolutionWithDynamicWeights( |
| *conv_attr, weights_shape_bhwc, dst_shape, gpu_info, conv_temp_def, |
| hints, &weights_desc); |
| conv_op.name = "convolution_dynamic"; |
| conv_op.operation->flops_ = GetConvolutionFlops(dst_shape, weights_shape); |
| } else if (op_type == OperationType::CONVOLUTION_TRANSPOSED) { |
| ConvolutionTransposedAttributes* conv_attr = |
| reinterpret_cast<ConvolutionTransposedAttributes*>(attr); |
| conv_op.operation = SelectConvolutionTransposedWithDynamicWeights( |
| *conv_attr, gpu_info, conv_temp_def, &weights_desc); |
| conv_op.name = "conv_transposed_dynamic"; |
| conv_op.operation->flops_ = |
| GetConvolutionTransposedFlops(src_shape, weights_shape); |
| } else if (op_type == OperationType::BATCHED_MATMUL) { |
| Convolution2DAttributes* conv_attr = |
| reinterpret_cast<Convolution2DAttributes*>(attr); |
| conv_op.operation = SelectConvolutionWithDynamicWeights( |
| *conv_attr, weights_shape_bhwc, dst_shape, gpu_info, conv_temp_def, |
| hints, &weights_desc); |
| conv_op.name = "mat_mul_as_convolution"; |
| conv_op.operation->flops_ = GetConvolutionFlops(dst_shape, weights_shape); |
| } else { |
| return absl::InternalError("No support of this operation type."); |
| } |
| conv_op.input_ids = {src_id}; |
| if (weights_desc.layout == WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4 || |
| weights_desc.layout == WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4) { |
| // weights are four 2D textures |
| uint2 tex_size = Get2dResourceSize(weights_desc, weights_shape); |
| for (int i = 0; i < 4; ++i) { |
| int tensor_id = gpu_subgraph->AddTensor( |
| BHWC(1, tex_size.y, tex_size.x, 4), |
| TensorDescriptor(weights_desc.type, TensorStorageType::TEXTURE_2D, |
| Layout::HWC)); |
| conv_op.input_ids.push_back(tensor_id); |
| converter_op.output_ids.push_back(tensor_id); |
| } |
| } else { |
| // weights are a single buffer |
| int tensor_id = gpu_subgraph->AddTensor( |
| BHWC(1, 1, 1, |
| GetTotalElementsCountForLayout(weights_desc, weights_shape)), |
| TensorDescriptor(weights_desc.type, TensorStorageType::BUFFER, |
| Layout::HWC)); |
| conv_op.input_ids.push_back(tensor_id); |
| converter_op.output_ids.push_back(tensor_id); |
| } |
| OperationDef conv_def = conv_op.operation->GetDefinition(); |
| OperationDef converter_def; |
| converter_def.precision = op_def.precision; |
| converter_def.src_tensors.push_back(op_def.src_tensors[1]); |
| for (int i = 1; i < conv_def.src_tensors.size(); ++i) { |
| converter_def.dst_tensors.push_back(conv_def.src_tensors[i]); |
| } |
| |
| converter_op.input_ids = {weights_id}; |
| converter_op.operation = |
| SelectConverterToConvWeights(weights_desc, converter_def, hints); |
| converter_op.name = "bhwc_tensor_to_conv_weights"; |
| return absl::OkStatus(); |
| } |
| |
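| // Deduplicates constant convolution weights shared between several |
| // convolutions: on the first encounter the weights are rearranged and |
| // uploaded as const tensors; later encounters with the same weights id |
| // and descriptor reuse the already created tensor ids. |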
| void AddConvSharedWeights( |
| const Convolution2DAttributes& attr, const WeightsDescription& weights_desc, |
| std::vector<SharedWeightsConvDesc>* shared_conv_weights, |
| GPUOperationsSubgraph* gpu_subgraph) { |
| SharedWeightsConvDesc shared_weights_desc; |
| shared_weights_desc.weights_id = attr.weights.id; |
| shared_weights_desc.desc = weights_desc; |
| int index = -1; |
| for (int i = 0; i < shared_conv_weights->size(); ++i) { |
| if ((*shared_conv_weights)[i] == shared_weights_desc) { |
| index = i; |
| break; |
| } |
| } |
| if (index != -1) { |
| const auto& new_ids = (*shared_conv_weights)[index].global_const_ids; |
| for (int i = 0; i < new_ids.size(); ++i) { |
| gpu_subgraph->operations[0].input_ids.push_back(new_ids[i]); |
| } |
| } else { |
| shared_conv_weights->push_back(shared_weights_desc); |
| if (weights_desc.layout == |
| WeightsLayout::k2DX4I4YIsSpatialIAndXIsOOGroupO4 || |
| weights_desc.layout == |
| WeightsLayout::k2DX4O4YIsSpatialIAndXIsOOGroupI4) { |
| // weights are four 2D textures |
| uint2 tex_size = Get2dResourceSize(weights_desc, attr.weights.shape); |
| const int flt_count = |
| GetTotalElementsCountForLayout(weights_desc, attr.weights.shape); |
| |
| std::vector<uint8_t> weights_data(flt_count * SizeOf(weights_desc.type)); |
| RearrangeWeights(attr.weights, weights_desc, |
| absl::MakeSpan(weights_data)); |
| int sub_size = SizeOf(weights_desc.type) * 4 * tex_size.x * tex_size.y; |
| for (int i = 0; i < 4; ++i) { |
| TensorDescriptor weights_tensor = TensorDescriptor( |
| weights_desc.type, TensorStorageType::TEXTURE_2D, Layout::HWC); |
| weights_tensor.SetBHWCShape(BHWC(1, tex_size.y, tex_size.x, 4)); |
| weights_tensor.SetData(std::vector<uint8_t>( |
| weights_data.data() + sub_size * i, |
| weights_data.data() + sub_size * i + sub_size)); |
| int tensor_id = gpu_subgraph->AddTensor(std::move(weights_tensor)); |
| gpu_subgraph->operations[0].input_ids.push_back(tensor_id); |
| shared_conv_weights->back().global_const_ids.push_back(tensor_id); |
| } |
| } else { |
| // weights are a single buffer |
| TensorDescriptor weights_tensor = TensorDescriptor( |
| weights_desc.type, TensorStorageType::BUFFER, Layout::HWC); |
| const int flt_count = |
| GetTotalElementsCountForLayout(weights_desc, attr.weights.shape); |
| weights_tensor.SetBHWCShape(BHWC(1, 1, 1, flt_count)); |
| std::vector<uint8_t> weights_data = |
| std::vector<uint8_t>(flt_count * SizeOf(weights_desc.type)); |
| RearrangeWeights(attr.weights, weights_desc, |
| absl::MakeSpan(weights_data)); |
| weights_tensor.SetData(std::move(weights_data)); |
| int tensor_id = gpu_subgraph->AddTensor(std::move(weights_tensor)); |
| gpu_subgraph->operations[0].input_ids.push_back(tensor_id); |
| shared_conv_weights->back().global_const_ids.push_back(tensor_id); |
| } |
| } |
| } |
| |
| } // namespace |
| |
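| // Maps a model Node to one or more GPUOperations. Most operations map |
| // 1:1 through InitSingleOpSubgraph; a few (Winograd convolution, |
| // BATCHED_MATMUL, wide CONCAT, convolutions with runtime weights) |
| // expand into multi-operation subgraphs with internal tensors. |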
| absl::Status GPUOperationFromNodePart0( |
| const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints, |
| const std::vector<Value*>& inputs, const std::vector<Value*>& outputs, |
| const Node& node, std::vector<SharedWeightsConvDesc>* shared_conv_weights, |
| GPUOperationsSubgraph* gpu_subgraph) { |
| std::unique_ptr<GPUOperation>* gpu_op = |
| InitSingleOpSubgraph(inputs, outputs, gpu_subgraph); |
| auto op_type = OperationTypeFromString(node.operation.type); |
| switch (op_type) { |
| case OperationType::ADD: { |
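| // Three paths: elementwise add of two tensors (the second may |
| // broadcast over channels), an N-input add fused into one kernel, |
| // and addition of a constant taken from the attributes. |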
| if (inputs.size() == 2 && |
| (inputs[0]->tensor.shape.c == inputs[1]->tensor.shape.c || |
| inputs[1]->tensor.shape.c == 1)) { |
| GPUOperation operation = |
| CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape); |
| *gpu_op = std::make_unique<GPUOperation>(std::move(operation)); |
| return absl::OkStatus(); |
| } else if (inputs.size() >= 2) { |
| auto output = outputs[0]; |
| std::vector<int> channels(inputs.size()); |
| for (int i = 0; i < inputs.size(); ++i) { |
| channels[i] = inputs[i]->tensor.shape.c; |
| } |
| SelectAdd(op_def, channels, output->tensor.shape.c, gpu_op); |
| return absl::OkStatus(); |
| } else if (inputs.size() == 1 && node.operation.attributes.has_value()) { |
| auto attr = |
| absl::any_cast<ElementwiseAttributes>(node.operation.attributes); |
| GPUOperation operation = |
| CreateElementwise(gpu_info, op_def, op_type, attr); |
| *gpu_op = std::make_unique<GPUOperation>(std::move(operation)); |
| return absl::OkStatus(); |
| } |
| return absl::UnimplementedError(absl::StrCat( |
| "No support of ", node.operation.type, " with this parameters")); |
| } |
| case OperationType::BATCHED_MATMUL: { |
| // Currently only batch = 1 is supported. |
| // MatMul is replaced with this sequence: |
| // 1) Transpose the second tensor (weights): (1, H, W, C) -> (C, 1, H, W). |
| // 2) Run convolution with the transposed tensor as runtime weights. |
| auto second_shape = inputs[1]->tensor.shape; |
| auto dst_shape = outputs[0]->tensor.shape; |
| if (dst_shape.b != 1) { |
| return absl::UnimplementedError( |
| "Currently only batch = 1 supported for BATCHED_MATMUL."); |
| } |
| const OHWI weights_shape(second_shape.c, 1, second_shape.h, |
| second_shape.w); |
| const BHWC weights_shape_bhwc(weights_shape.o, weights_shape.h, |
| weights_shape.w, weights_shape.i); |
| Convolution2DAttributes attr; |
| attr.strides = HW(1, 1); |
| attr.dilations = HW(1, 1); |
| attr.padding.appended = HW(0, 0); |
| attr.padding.prepended = HW(0, 0); |
| attr.bias.shape = Linear(weights_shape.o); |
| attr.bias.data.resize(weights_shape.o, 0.0f); |
| |
| gpu_subgraph->operations.clear(); |
| TensorDescriptor transposed_desc = {op_def.src_tensors[1].data_type, |
| op_def.src_tensors[1].storage_type, |
| Layout::BHWC}; |
| RETURN_IF_ERROR(transposed_desc.UpdateToSupportedStorageType( |
| gpu_info, weights_shape_bhwc)); |
| gpu_subgraph->operations.resize(1); |
| auto& transpose_op = gpu_subgraph->operations[0]; |
| OperationDef transpose_def; |
| transpose_def.precision = op_def.precision; |
| transpose_def.src_tensors.push_back(op_def.src_tensors[1]); |
| transpose_def.dst_tensors.push_back(transposed_desc); |
| |
| transpose_op.input_ids = {static_cast<int>(inputs[1]->id)}; |
| TransposeAttributes transpose_attr; |
| transpose_attr.perm = BHWC(3, 0, 1, 2); |
| transpose_op.operation = std::make_unique<GPUOperation>( |
| CreateTranspose(transpose_def, transpose_attr)); |
| transpose_op.name = "mat_mul_transpose_second_tensor"; |
| |
| const int transposed_id = |
| gpu_subgraph->AddTensor(weights_shape_bhwc, transposed_desc); |
| transpose_op.output_ids = {transposed_id}; |
| |
| OperationDef conv_def = op_def; |
| conv_def.src_tensors[1] = transposed_desc; |
| return AddDynamicConv(hints, gpu_info, conv_def, op_type, |
| inputs[0]->tensor.shape, weights_shape, dst_shape, |
| inputs[0]->id, transposed_id, outputs[0]->id, |
| gpu_subgraph, &attr); |
| } |
| case OperationType::CAST: |
| SelectCast(op_def, gpu_info, gpu_op); |
| return absl::OkStatus(); |
| case OperationType::CONCAT: { |
| auto attr = absl::any_cast<ConcatAttributes>(node.operation.attributes); |
| const int max_inputs = gpu_info.GetMaxImageArguments() - 8; |
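| // A kernel can bind only a limited number of image arguments, so a |
| // wide concat is split into chained groups; each group after the |
| // first reads the previous group's intermediate result (the negative |
| // ids below address tensors internal to the subgraph). |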
| if (inputs.size() >= max_inputs) { |
| int groups = DivideRoundUp(inputs.size(), max_inputs); |
| gpu_subgraph->operations.clear(); |
| gpu_subgraph->operations.resize(groups); |
| BHWC concatenated_shape = inputs[0]->tensor.shape; |
| concatenated_shape.set(attr.axis, 0); |
| for (int g = 0; g < groups; ++g) { |
| std::vector<int> channels; |
| auto& concat_op = gpu_subgraph->operations[g]; |
| OperationDef new_def; |
| new_def.precision = op_def.precision; |
| if (g != 0) { |
| // concatenated tensor from previous concats |
| new_def.src_tensors.push_back(op_def.dst_tensors[0]); |
| concat_op.input_ids = {-g}; |
| channels.push_back(concatenated_shape.c); |
| } |
| for (int i = 0; i < max_inputs; ++i) { |
| int src_index = g * max_inputs + i; |
| if (src_index >= op_def.src_tensors.size()) { |
| break; |
| } |
| new_def.src_tensors.push_back(op_def.src_tensors[src_index]); |
| concat_op.input_ids.push_back(inputs[src_index]->id); |
| channels.push_back(inputs[src_index]->tensor.shape.c); |
| int current_size = concatenated_shape.get(attr.axis); |
| concatenated_shape.set( |
| attr.axis, |
| current_size + inputs[src_index]->tensor.shape.get(attr.axis)); |
| } |
| new_def.dst_tensors.push_back(op_def.dst_tensors[0]); |
| if (g == groups - 1) { |
| // last concat |
| concat_op.output_ids = {static_cast<int>(outputs[0]->id)}; |
| } else { |
| // intermediate concat, create new tensor for it |
| int tensor_id = gpu_subgraph->AddTensor(concatenated_shape, |
| op_def.dst_tensors[0]); |
| concat_op.output_ids = {tensor_id}; |
| } |
| RETURN_IF_ERROR(SelectConcat(attr, channels, new_def, gpu_info, |
| &concat_op.operation)); |
| } |
| return absl::OkStatus(); |
| } else { |
| std::vector<int> channels(inputs.size()); |
| for (int i = 0; i < inputs.size(); ++i) { |
| channels[i] = inputs[i]->tensor.shape.c; |
| } |
| return SelectConcat(attr, channels, op_def, gpu_info, gpu_op); |
| } |
| } |
| case OperationType::CONVOLUTION_2D: { |
| auto attr = |
| absl::any_cast<Convolution2DAttributes>(node.operation.attributes); |
| auto input_shape = inputs[0]->tensor.shape; |
| auto output_shape = outputs[0]->tensor.shape; |
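| // A single input means constant weights: try the Winograd path first |
| // and fall back to a directly selected convolution when Winograd is |
| // not suitable or not recommended. |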
| if (inputs.size() == 1) { |
| if (!hints.Check(ModelHints::kNoWinogradOptimizations) && |
| WinogradFromNode(gpu_info, inputs, outputs, op_def, hints, |
| input_shape, output_shape, attr, gpu_subgraph) |
| .ok()) { |
| return absl::OkStatus(); |
| } else { |
| gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph); |
| if (attr.groups != 1) { |
| gpu_subgraph->operations[0].name = "convolution_2d_grouped"; |
| } |
| if (!shared_conv_weights || attr.weights.id == -1) { |
| *gpu_op = |
| SelectConvolution(attr, output_shape, gpu_info, op_def, hints); |
| } else { |
| // Using convolutions with shared weights |
| WeightsDescription weights_desc; |
| const BHWC weights_shape_bhwc( |
| attr.weights.shape.o, attr.weights.shape.h, |
| attr.weights.shape.w, attr.weights.shape.i); |
| OperationDef conv_temp_def = op_def; |
| conv_temp_def.src_tensors.push_back( |
| {op_def.src_tensors[0].data_type, TensorStorageType::BUFFER, |
| Layout::HWC}); |
| *gpu_op = SelectConvolutionWithDynamicWeights( |
| attr, weights_shape_bhwc, output_shape, gpu_info, conv_temp_def, |
| hints, &weights_desc); |
| AddConvSharedWeights(attr, weights_desc, shared_conv_weights, |
| gpu_subgraph); |
| } |
| (*gpu_op)->flops_ = |
| GetConvolutionFlops(output_shape, attr.weights.shape); |
| return absl::OkStatus(); |
| } |
| } else { |
| // CONVOLUTION_2D with runtime weights |
| const OHWI weights_shape = |
| OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h, |
| inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c); |
| if (weights_shape.i != inputs[0]->tensor.shape.c) { |
| return absl::UnimplementedError( |
| "No support of grouped convolution with runtime weights"); |
| } |
| if (attr.bias.data.empty()) { |
| attr.bias.shape = Linear(weights_shape.o); |
| attr.bias.data.resize(weights_shape.o, 0.0f); |
| } |
| gpu_subgraph->operations.clear(); |
| return AddDynamicConv(hints, gpu_info, op_def, op_type, input_shape, |
| weights_shape, output_shape, inputs[0]->id, |
| inputs[1]->id, outputs[0]->id, gpu_subgraph, |
| &attr); |
| } |
| } |
| case OperationType::CONVOLUTION_TRANSPOSED: { |
| auto attr = absl::any_cast<ConvolutionTransposedAttributes>( |
| node.operation.attributes); |
| if (inputs.size() == 1) { |
| *gpu_op = SelectConvolutionTransposed(attr, gpu_info, op_def); |
| (*gpu_op)->flops_ = GetConvolutionTransposedFlops( |
| inputs[0]->tensor.shape, attr.weights.shape); |
| return absl::OkStatus(); |
| } else { |
| // CONVOLUTION_TRANSPOSED with runtime weights |
| const OHWI weights_shape = |
| OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h, |
| inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c); |
| if (attr.bias.data.empty()) { |
| attr.bias.shape = Linear(weights_shape.o); |
| attr.bias.data.resize(weights_shape.o, 0.0f); |
| } |
| gpu_subgraph->operations.clear(); |
| return AddDynamicConv( |
| hints, gpu_info, op_def, op_type, inputs[0]->tensor.shape, |
| weights_shape, outputs[0]->tensor.shape, inputs[0]->id, |
| inputs[1]->id, outputs[0]->id, gpu_subgraph, &attr); |
| } |
| } |
| case OperationType::DEPTHWISE_CONVOLUTION: { |
| auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>( |
| node.operation.attributes); |
| if (inputs.size() == 1) { |
| *gpu_op = SelectDWConvolution(attr, gpu_info, op_def); |
| (*gpu_op)->flops_ = GetDepthwiseConvolutionFlops( |
| outputs[0]->tensor.shape, attr.weights.shape); |
| } else { |
| if (inputs[1]->tensor.shape.b != 1) { |
| return absl::UnimplementedError( |
| "No support of depthwise runtime weights with channel multiplier " |
| "!= 1"); |
| } |
| *gpu_op = SelectDWConvolutionDynamicWeights(attr, gpu_info, op_def); |
| (*gpu_op)->flops_ = GetDepthwiseConvolutionFlops( |
| outputs[0]->tensor.shape, |
| OHWI(inputs[1]->tensor.shape.b, inputs[1]->tensor.shape.h, |
| inputs[1]->tensor.shape.w, inputs[1]->tensor.shape.c)); |
| } |
| return absl::OkStatus(); |
| } |
| case OperationType::DEPTH_TO_SPACE: { |
| auto attr = |
| absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes); |
| SelectDepthToSpace(attr, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::FULLY_CONNECTED: { |
| auto attr = |
| absl::any_cast<FullyConnectedAttributes>(node.operation.attributes); |
| *gpu_op = SelectFullyConnected(attr, gpu_info, op_def, |
| inputs[0]->tensor.shape.b); |
| (*gpu_op)->flops_ = |
| GetFullyConnectedFlops(outputs[0]->tensor.shape, attr.weights.shape); |
| return absl::OkStatus(); |
| } |
| case OperationType::FULLY_CONNECTED_INT8: { |
| auto attr = absl::any_cast<FullyConnectedInt8Attributes>( |
| node.operation.attributes); |
| *gpu_op = SelectFullyConnected(attr, gpu_info, op_def); |
| return absl::OkStatus(); |
| } |
| case OperationType::GATHER: { |
| auto attr = absl::any_cast<GatherAttributes>(node.operation.attributes); |
| RETURN_IF_ERROR(SelectGather(attr, op_def, gpu_op)); |
| return absl::OkStatus(); |
| } |
| case OperationType::LSTM: { |
| *gpu_op = SelectLSTM(op_def, gpu_info); |
| return absl::OkStatus(); |
| } |
| case OperationType::MAX_UNPOOLING_2D: { |
| auto attr = |
| absl::any_cast<MaxUnpooling2DAttributes>(node.operation.attributes); |
| *gpu_op = SelectMaxUnpooling(attr, op_def); |
| return absl::OkStatus(); |
| } |
| case OperationType::MEAN: { |
| auto attr = absl::any_cast<MeanAttributes>(node.operation.attributes); |
| *gpu_op = SelectReduce(attr.dims, inputs[0]->tensor.shape, op_type, |
| op_def, gpu_info); |
| return absl::OkStatus(); |
| } |
| case OperationType::MEAN_STDDEV_NORMALIZATION: { |
| MeanStdDevNormalization operation = CreateMeanStdDevNormalization( |
| op_def, gpu_info, (inputs[0]->tensor.shape.c + 3) / 4); |
| *gpu_op = std::make_unique<MeanStdDevNormalization>(std::move(operation)); |
| return absl::OkStatus(); |
| } |
| case OperationType::PAD: { |
| auto attr = absl::any_cast<PadAttributes>(node.operation.attributes); |
| SelectPadding(attr, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::POOLING_2D: { |
| auto attr = |
| absl::any_cast<Pooling2DAttributes>(node.operation.attributes); |
| *gpu_op = SelectPooling(attr, gpu_info, op_def); |
| return absl::OkStatus(); |
| } |
| case OperationType::PRELU: { |
| auto attr = absl::any_cast<PReLUAttributes>(node.operation.attributes); |
| *gpu_op = SelectPReLU(attr, gpu_info, op_def); |
| return absl::OkStatus(); |
| } |
| case OperationType::QUANTIZE_AND_DEQUANTIZE: { |
| auto attr = absl::any_cast<QuantizeAndDequantizeAttributes>( |
| node.operation.attributes); |
| *gpu_op = SelectQuantizeAndDequantize(attr, op_def); |
| return absl::OkStatus(); |
| } |
| case OperationType::RELU: { |
| auto attr = absl::any_cast<ReLUAttributes>(node.operation.attributes); |
| *gpu_op = SelectReLU(attr, op_def); |
| return absl::OkStatus(); |
| } |
| case OperationType::RESAMPLER: { |
| *gpu_op = SelectResampler(op_def, gpu_info); |
| return absl::OkStatus(); |
| } |
| case OperationType::RESHAPE: { |
| const int src_channels = inputs[0]->tensor.shape.c; |
| auto attr = absl::any_cast<ReshapeAttributes>(node.operation.attributes); |
| SelectReshape(src_channels, attr.new_shape.c, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::RESIZE: { |
| auto attr = absl::any_cast<Resize2DAttributes>(node.operation.attributes); |
| return SelectResize(attr, op_def, gpu_op); |
| } |
| case OperationType::SLICE: { |
| auto attr = absl::any_cast<SliceAttributes>(node.operation.attributes); |
| SelectStridedSlice(attr, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::SOFTMAX: { |
| SelectSoftmax(inputs[0]->tensor.shape, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::SPACE_TO_DEPTH: { |
| auto attr = |
| absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes); |
| SelectSpaceToDepth(attr, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::SPLIT: { |
| std::vector<int> channels; |
| channels.reserve(outputs.size()); |
| for (const auto& output : outputs) { |
| channels.push_back(output->tensor.shape.c); |
| } |
| auto attr = absl::any_cast<SplitAttributes>(node.operation.attributes); |
| SelectSplit(attr, gpu_info, channels, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::TILE: { |
| *gpu_op = SelectTile(op_def, inputs[0]->tensor.shape); |
| return absl::OkStatus(); |
| } |
| case OperationType::TRANSPOSE: { |
| auto attr = |
| absl::any_cast<TransposeAttributes>(node.operation.attributes); |
| SelectTranspose(attr, op_def, gpu_op); |
| return absl::OkStatus(); |
| } |
| case OperationType::ABS: |
| case OperationType::COPY: |
| case OperationType::COS: |
| case OperationType::ELU: |
| case OperationType::EXP: |
| case OperationType::HARD_SWISH: |
| case OperationType::LOG: |
| case OperationType::NEG: |
| case OperationType::RSQRT: |
| case OperationType::SIGMOID: |
| case OperationType::SIN: |
| case OperationType::SQRT: |
| case OperationType::SQUARE: |
| case OperationType::TANH: { |
| GPUOperation operation = |
| CreateElementwiseOneInput(gpu_info, op_def, op_type); |
| *gpu_op = std::make_unique<GPUOperation>(std::move(operation)); |
| return absl::OkStatus(); |
| } |
| case OperationType::DIV: |
| case OperationType::EQUAL: |
| case OperationType::GREATER: |
| case OperationType::GREATER_EQUAL: |
| case OperationType::LESS: |
| case OperationType::LESS_EQUAL: |
| case OperationType::MAXIMUM: |
| case OperationType::MINIMUM: |
| case OperationType::MUL: |
| case OperationType::NOT_EQUAL: |
| case OperationType::POW: |
| case OperationType::SQUARED_DIFF: |
| case OperationType::SUB: { |
| if (inputs.size() == 2) { |
| GPUOperation operation = |
| CreateElementwiseTwoInput(op_def, op_type, inputs[1]->tensor.shape); |
| *gpu_op = std::make_unique<GPUOperation>(std::move(operation)); |
| return absl::OkStatus(); |
| } else if (inputs.size() == 1 && node.operation.attributes.has_value()) { |
| auto attr = |
| absl::any_cast<ElementwiseAttributes>(node.operation.attributes); |
| GPUOperation operation = |
| CreateElementwise(gpu_info, op_def, op_type, attr); |
| *gpu_op = std::make_unique<GPUOperation>(std::move(operation)); |
| return absl::OkStatus(); |
| } |
| return absl::UnimplementedError(absl::StrCat( |
| "No support of ", node.operation.type, " with this parameters")); |
| } |
| case OperationType::REDUCE_MAXIMUM: |
| case OperationType::REDUCE_MINIMUM: |
| case OperationType::REDUCE_PRODUCT: |
| case OperationType::REDUCE_SUM: { |
| auto attr = absl::any_cast<ReduceAttributes>(node.operation.attributes); |
| *gpu_op = SelectReduce(attr.dims, inputs[0]->tensor.shape, op_type, |
| op_def, gpu_info); |
| return absl::OkStatus(); |
| } |
| default: |
| return SelectDefault(gpu_info, op_def, hints, inputs, outputs, node, |
| gpu_subgraph); |
| } |
| } |
| |
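| // Minimal usage sketch (assumes gpu_info, op_def, hints, inputs, |
| // outputs, and node were prepared by the caller): |
| // GPUOperationsSubgraph subgraph; |
| // std::vector<SharedWeightsConvDesc> shared_weights; |
| // RETURN_IF_ERROR(GPUOperationFromNode(gpu_info, op_def, hints, inputs, |
| // outputs, node, &shared_weights, |
| // &subgraph)); |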
| absl::Status GPUOperationFromNode( |
| const GpuInfo& gpu_info, const OperationDef& op_def, ModelHints hints, |
| const std::vector<Value*>& inputs, const std::vector<Value*>& outputs, |
| const Node& node, std::vector<SharedWeightsConvDesc>* shared_conv_weights, |
| GPUOperationsSubgraph* gpu_subgraph) { |
| RETURN_IF_ERROR(GPUOperationFromNodePart0(gpu_info, op_def, hints, inputs, |
| outputs, node, shared_conv_weights, |
| gpu_subgraph)); |
| for (auto& gpu_op : gpu_subgraph->operations) { |
| if (gpu_op.name.empty()) { |
| gpu_op.name = node.operation.type + " " + std::to_string(node.id); |
| } else { |
| gpu_op.name += " " + std::to_string(node.id); |
| } |
| } |
| return absl::OkStatus(); |
| } |
| |
| } // namespace gpu |
| } // namespace tflite |