tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"

 #include <algorithm>
 #include <string>
 #include <utility>

 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"

 namespace tflite {
 namespace gpu {
 namespace cl {
 namespace {
 std::string GenerateUploadByThreads(const std::string& local_ptr_name,
                                     const std::string& global_ptr_name,
                                     const std::string& global_offset_name,
                                     const std::string& lid_name,
                                     int total_work_items,
                                     int elements_to_upload) {
   std::string c;
   std::string offset =
       global_offset_name.empty() ? "" : global_offset_name + " + ";
   const int groups = elements_to_upload / total_work_items;
   const int reminder = elements_to_upload % total_work_items;
   for (int i = 0; i < groups; ++i) {
     c += "    " + local_ptr_name + "[" + lid_name + " + " +
          std::to_string(total_work_items * i) + "] = " + global_ptr_name + "[" +
          offset + lid_name + " + " + std::to_string(total_work_items * i) +
          "];\n";
   }
   if (reminder != 0) {
     c += "    if (" + lid_name + " < " + std::to_string(reminder) + ") {\n";
     c += "      " + local_ptr_name + "[" + lid_name + " + " +
          std::to_string(total_work_items * groups) + "] = " + global_ptr_name +
          "[" + offset + lid_name + " + " +
          std::to_string(total_work_items * groups) + "];\n";
     c += "    }\n";
   }
   return c;
 }

 std::string GenerateAsyncUpload(const std::string& local_ptr_name,
                                 const std::string& global_ptr_name,
                                 const std::string& global_offset_name,
                                 int elements_to_upload) {
   std::string c;
   std::string offset =
       global_offset_name.empty() ? "" : " + " + global_offset_name;
   c += "    async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name +
        offset + ", " + std::to_string(elements_to_upload) + ", 0);\n";
   return c;
 }

 std::string GenerateBlockCoords(const int3& block_size,
                                 const int3& work_group_launch_order,
                                 bool linear_hw) {
   std::string c;
   int3 launch_remap;
   launch_remap[work_group_launch_order.x] = 0;
   launch_remap[work_group_launch_order.y] = 1;
   launch_remap[work_group_launch_order.z] = 2;
   if (linear_hw) {
     if (work_group_launch_order[0] == 0) {
       c += "  int linear_hw = get_global_id(0);\n";
     } else {
       c += "  int linear_hw = get_group_id(" + std::to_string(launch_remap[0]) +
            ") * get_local_size(0) + get_local_id(0);\n";
     }
     c += "  int Y = (linear_hw / args.task_size_x) * " +
          std::to_string(block_size.y) + ";\n";
     c += "  int X = (linear_hw % args.task_size_x) * " +
          std::to_string(block_size.x) + ";\n";
     if (work_group_launch_order[1] == 1) {
       c += "  int Z = get_global_id(1) * " + std::to_string(block_size.z) +
            ";\n";
     } else {
       c += "  int Z = (get_group_id(" + std::to_string(launch_remap[1]) +
            ") * get_local_size(1) + get_local_id(1)) * " +
            std::to_string(block_size.z) + ";\n";
     }
   } else {
     if (work_group_launch_order[0] == 0) {
       c += "  int X = get_global_id(0) * " + std::to_string(block_size.x) +
            ";\n";
     } else {
       c += "  int X = (get_group_id(" + std::to_string(launch_remap[0]) +
            ") * get_local_size(0) + get_local_id(0)) * " +
            std::to_string(block_size.x) + ";\n";
     }
     if (work_group_launch_order[1] == 1) {
       c += "  int Y = get_global_id(1) * " + std::to_string(block_size.y) +
            ";\n";
     } else {
       c += "  int Y = (get_group_id(" + std::to_string(launch_remap[1]) +
            ") * get_local_size(1) + get_local_id(1)) * " +
            std::to_string(block_size.y) + ";\n";
     }
     if (work_group_launch_order[2] == 2) {
       c += "  int Z = get_global_id(2) * " + std::to_string(block_size.z) +
            ";\n";
     } else {
       c += "  int Z = (get_group_id(" + std::to_string(launch_remap[2]) +
            ") * get_local_size(2) + get_local_id(2)) * " +
            std::to_string(block_size.z) + ";\n";
     }
   }

   return c;
 }
 }  // namespace

 ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                          const Convolution2DAttributes& attr,
                          const CLDevice& device, const BHWC* dst_shape)
     : GPUOperation(definition),
       stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
                       -attr.padding.prepended.h),
       kernel_dilation_(attr.weights.shape.w, attr.weights.shape.h,
                        attr.dilations.w, attr.dilations.h),
       conv_params_(GuessBestParams(device, definition, attr, dst_shape)) {}

 ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                          const Convolution2DAttributes& attr,
                          const BHWC& weights_shape, const CLDevice& device,
                          const BHWC* dst_shape)
     : GPUOperation(definition),
       stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
                       -attr.padding.prepended.h),
       kernel_dilation_(weights_shape.w, weights_shape.h, attr.dilations.w,
                        attr.dilations.h),
       conv_params_(GuessBestParams(device, definition, attr, weights_shape,
                                    dst_shape)) {}

 ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                          const FullyConnectedAttributes& attr,
                          const CLDevice& device, const BHWC* dst_shape)
     : GPUOperation(definition),
       stride_padding_(1, 1, 0, 0),
       kernel_dilation_(1, 1, 1, 1),
       conv_params_(GuessBestParams(device, definition, attr, dst_shape)) {}

 ConvPowerVR::ConvPowerVR(const OperationDef& definition)
     : GPUOperation(definition),
       stride_padding_(1, 1, 0, 0),
       kernel_dilation_(1, 1, 1, 1) {}

 ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
     : GPUOperation(std::move(operation)),
       stride_padding_(operation.stride_padding_),
       kernel_dilation_(operation.kernel_dilation_),
       conv_params_(operation.conv_params_) {}

 ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
   if (this != &operation) {
     std::swap(stride_padding_, operation.stride_padding_);
     std::swap(kernel_dilation_, operation.kernel_dilation_);
     std::swap(conv_params_, operation.conv_params_);
     GPUOperation::operator=(std::move(operation));
   }
   return *this;
 }

 absl::Status ConvPowerVR::Compile(const CreationContext& creation_context) {
   const bool stride_correction =
       definition_.IsBatchSupported() && stride_padding_.x != 1;
   std::string code = GenerateConv(*creation_context.device, definition_,
                                   stride_correction, conv_params_, &args_);
   work_group_size_ = conv_params_.work_group_size;
   std::string element_wise_code;
   RETURN_IF_ERROR(
       MergeOperations(linked_operations_, &args_, &element_wise_code));
   RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
                                           {{"dst_tensor", element_wise_code}},
                                           &code));
   std::vector<CompilerOptions> options;
   if (definition_.precision == CalculationsPrecision::F16 &&
       creation_context.device->IsPowerVR()) {
     options.push_back(CompilerOptions::POWERVR_FP16);
   }
   if (conv_params_.IsPrivateMemBroadcast()) {
     options.push_back(CompilerOptions::CL_2_0);
   }
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", options, *creation_context.context,
       *creation_context.device, &kernel_);
 }

 absl::Status ConvPowerVR::BindArguments() {
   if (definition_.src_tensors.size() == 2) {
     RETURN_IF_ERROR(args_.SetObjectRef("weights", src_[1]));
   }
   if (!conv_params_.x_kernel_is_1 || !conv_params_.y_kernel_is_1) {
     RETURN_IF_ERROR(args_.SetInt("stride_x", stride_padding_.x));
     RETURN_IF_ERROR(args_.SetInt("stride_y", stride_padding_.y));
     RETURN_IF_ERROR(
         args_.SetInt("padding_x", stride_padding_.z * src_[0]->Batch()));
     RETURN_IF_ERROR(args_.SetInt("padding_y", stride_padding_.w));
     RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_dilation_.x));
     RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_dilation_.y));
     RETURN_IF_ERROR(
         args_.SetInt("dilation_x", kernel_dilation_.z * src_[0]->Batch()));
     RETURN_IF_ERROR(args_.SetInt("dilation_y", kernel_dilation_.w));
   }
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
   if (conv_params_.linear_hw) {
     const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                      conv_params_.block_size.x);
     RETURN_IF_ERROR(args_.SetInt("task_size_x", grid_x));
   }
   return absl::OkStatus();
 }

 int3 ConvPowerVR::GetGridSize() const {
   const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                    conv_params_.block_size.x);
   const int grid_y =
       DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
   const int grid_z =
       DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
   int3 wg;

   if (conv_params_.linear_hw) {
     wg.x = DivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x);
     wg.y = DivideRoundUp(grid_z, conv_params_.work_group_size.y);
     return int3(wg[conv_params_.work_group_launch_order[0]] *
                     conv_params_.work_group_size.x,
                 wg[conv_params_.work_group_launch_order[1]] *
                     conv_params_.work_group_size.y,
                 1);
   } else {
     wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x);
     wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y);
     wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z);
     return int3(wg[conv_params_.work_group_launch_order[0]] *
                     conv_params_.work_group_size.x,
                 wg[conv_params_.work_group_launch_order[1]] *
                     conv_params_.work_group_size.y,
                 wg[conv_params_.work_group_launch_order[2]] *
                     conv_params_.work_group_size.z);
   }
 }

 absl::Status ConvPowerVR::Tune(const TuningParameters& params) {
   if (conv_params_.weights_upload_type ==
           WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
       conv_params_.weights_upload_type ==
           WeightsUploadType::LOCAL_MEM_BY_THREADS ||
       conv_params_.fixed_work_group_size) {
     return absl::OkStatus();
   }
   if (conv_params_.work_group_launch_order[0] == 0 &&
       conv_params_.work_group_launch_order[1] == 1 &&
       conv_params_.work_group_launch_order[2] == 2) {
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
     RETURN_IF_ERROR(GetBestWorkGroupConv(params, kernel_, grid_size_,
                                          &conv_params_.work_group_size));
     work_group_size_ = conv_params_.work_group_size;
   }
   return absl::OkStatus();
 }

 std::string GenerateConv(const CLDevice& device, const OperationDef& op_def,
                          bool stride_correction,
                          const ConvPowerVR::ConvParams& conv_params,
                          Arguments* args) {
   auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]);
   src_desc->SetTextureAddressMode(TextureAddressMode::ZERO);
   if (op_def.IsBatchSupported()) {
     src_desc->SetStateVar("BatchedWidth", "true");
   }
   args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
   auto dst_desc = absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]);
   if (op_def.IsBatchSupported()) {
     dst_desc->SetStateVar("BatchedWidth", "true");
   }
   args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
   const bool is1x1 = conv_params.x_kernel_is_1 && conv_params.y_kernel_is_1;
   if (!is1x1) {
     args->AddInt("stride_x");
     args->AddInt("stride_y");
     args->AddInt("padding_x");
     args->AddInt("padding_y");
     args->AddInt("kernel_size_x");
     args->AddInt("kernel_size_y");
     args->AddInt("dilation_x");
     args->AddInt("dilation_y");
   }
   if (conv_params.linear_hw) {
     args->AddInt("task_size_x");
   }

   const auto src_tensor_type = op_def.src_tensors[0].storage_type;
   const bool buffer_type = src_tensor_type == TensorStorageType::BUFFER ||
                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
   const bool manual_clamp = buffer_type && !is1x1;

   const bool need_local_mem =
       conv_params.weights_upload_type ==
           ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
       conv_params.weights_upload_type ==
           ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;

   const int local_mem_size =
       conv_params.block_size.z * 4 * conv_params.src_depth_loop_size;

   const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
   const int simd_size = conv_params.GetSimdSize();

   const bool late_oob_check = need_local_mem || use_simd_broadcast;

   const std::string weights_space =
       conv_params.weights_upload_type ==
               ConvPowerVR::WeightsUploadType::CONSTANT_MEM
           ? "__constant"
           : "__global";

   const std::string weights_data_type =
       conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4";

   const std::string weights_global_ptr =
       weights_space + " " + weights_data_type + "*";

   std::string c = GetCommonDefines(op_def.precision);
   if (use_simd_broadcast) {
     if (device.cl_version() == OpenCLVersion::CL_2_0) {
       c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
     }
   }

   const int3 work_group_size = conv_params.work_group_size;
   const int3 block_size = conv_params.block_size;
   if (conv_params.fixed_work_group_size) {
     c += "__attribute__((reqd_work_group_size(" +
          std::to_string(work_group_size.x) + ", " +
          std::to_string(work_group_size.y) + ", " +
          std::to_string(work_group_size.z) + ")))\n";
   }
   if (use_simd_broadcast && device.IsIntel()) {
     c += "__attribute__((intel_reqd_sub_group_size(" +
          std::to_string(simd_size) + ")))\n";
   }
   c += "__kernel void main_function(\n";
   c += "$0) {\n";
   c += GenerateBlockCoords(conv_params.block_size,
                            conv_params.work_group_launch_order,
                            conv_params.linear_hw);
   std::vector<std::string> dst_x(conv_params.block_size.x);
   for (int x = 0; x < conv_params.block_size.x; ++x) {
     dst_x[x] = "(X + " + std::to_string(x) + ")";
   }
   std::vector<std::string> dst_y(conv_params.block_size.y);
   for (int y = 0; y < conv_params.block_size.y; ++y) {
     dst_y[y] = "(Y + " + std::to_string(y) + ")";
   }
   if (!late_oob_check) {
     c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
          "|| Z >= args.dst_tensor.Slices()) {\n";
     c += "    return;\n";
     c += "  }\n";
   }
   if (conv_params.weights_upload_type ==
       ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
     if (conv_params.linear_hw) {
       c += "  int lid = get_local_id(0);\n";
     } else {
       c += "  int lid = get_local_id(1) * " +
            std::to_string(work_group_size.x) + " + get_local_id(0);\n";
     }
   }
   if (use_simd_broadcast) {
     c += "  int simd_id = get_sub_group_local_id();\n";
   }
   for (int z = 0; z < block_size.z; ++z) {
     for (int y = 0; y < block_size.y; ++y) {
       for (int x = 0; x < block_size.x; ++x) {
         c += "  ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
              std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
       }
     }
   }
   if (!is1x1) {
     for (int x = 0; x < block_size.x; ++x) {
       if (stride_correction) {
         c += "  int xc" + std::to_string(x) + " = " +
              GetXStrideCorrected(dst_x[x], "args.src_tensor.Batch()",
                                  "args.stride_x", "args.padding_x") +
              ";\n";
       } else {
         c += "  int xc" + std::to_string(x) + " = " + dst_x[x] +
              " * args.stride_x + args.padding_x;\n";
       }
     }
     for (int y = 0; y < block_size.y; ++y) {
       c += "  int yc" + std::to_string(y) + " = " + dst_y[y] +
            " * args.stride_y + args.padding_y;\n";
     }
   }
   if (need_local_mem) {
     c += "  __local " + weights_data_type + " weights_cache[" +
          std::to_string(local_mem_size) + "];\n";
   } else {
     c += "    " + weights_global_ptr + " weights_cache;\n";
   }
   if (is1x1) {
     if (conv_params.different_weights_for_height) {
       c += "  " + weights_global_ptr +
            " filters_loc = args.weights.GetPtr() + (Z * "
            "args.src_tensor.Height() + Y * " +
            std::to_string(block_size.z) + ") * 4 * args.src_tensor.Slices();\n";
     } else {
       c += "  " + weights_global_ptr +
            " filters_loc = args.weights.GetPtr() + Z * 4 * "
            "args.src_tensor.Slices();\n";
     }
   } else {
     c += "  " + weights_global_ptr +
          " filters_loc = args.weights.GetPtr() + Z * 4 * "
          "args.src_tensor.Slices() *args.kernel_size_x * args.kernel_size_y;\n";
   }
   if (buffer_type) {
     c += "  const int src_layer_offset = args.src_tensor.SliceStride();\n";
   }
   if (!is1x1) {
     c += "  for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
     for (int y = 0; y < block_size.y; ++y) {
       const std::string yck = "yck" + std::to_string(y);
       c += "  int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) +
            ";\n";
       if (manual_clamp) {
         c += "  bool my" + std::to_string(y) + " = " + yck + " >= 0 && " + yck +
              " < args.src_tensor.Height();\n";
         c += "  " + yck + " = clamp(" + yck +
              ", 0, args.src_tensor.Height() - 1);\n";
       }
     }
     c += "  for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
     for (int x = 0; x < block_size.x; ++x) {
       const std::string xck = "xck" + std::to_string(x);
       c += "  int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" +
            std::to_string(x) + ";\n";
       if (manual_clamp) {
         c += "  bool mx" + std::to_string(x) + " = " + xck + " >= 0 && " + xck +
              " < args.src_tensor.Width();\n";
         c += "  " + xck + " = clamp(" + xck +
              ", 0, args.src_tensor.Width() - 1);\n";
       }
     }
   }
   if (buffer_type) {
     for (int y = 0; y < block_size.y; ++y) {
       const std::string yck = "yck" + std::to_string(y);
       for (int x = 0; x < block_size.x; ++x) {
         const std::string xck = "xck" + std::to_string(x);
         std::string xc =
             is1x1 ? "min(" + dst_x[x] + ", args.src_tensor.Width() - 1)" : xck;
         std::string yc =
             is1x1 ? "min(" + dst_y[y] + ", args.src_tensor.Height() - 1)" : yck;
         std::string id = std::to_string(y) + std::to_string(x);
         c += "  int src_a_" + id + " = " + yc +
              " * args.src_tensor.Width() + " + xc + ";\n";
       }
     }
   }

   auto declare_src = [&]() {
     for (int y = 0; y < block_size.y; ++y) {
       for (int x = 0; x < block_size.x; ++x) {
         const std::string id = std::to_string(y) + std::to_string(x);
         c += "    " + weights_data_type + " src" + id + ";\n";
       }
     }
   };
   const bool conditional_read = device.IsMali();
   auto read_src = [&]() {
     const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
     for (int y = 0; y < block_size.y; ++y) {
       for (int x = 0; x < block_size.x; ++x) {
         if (buffer_type) {
           std::string id = std::to_string(y) + std::to_string(x);
           if (is1x1) {
             c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
                  ">(src_a_" + id + ");\n";
           } else {
             std::string condition =
                 "mx" + std::to_string(x) + " && my" + std::to_string(y);
             if (conditional_read) {
               c += "    src" + id + " = " + condition +
                    " ? args.src_tensor.Read<" + cl_type + ">(src_a_" + id +
                    ") : (FLT4)(0.0f);\n";
             } else {
               c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
                    ">(src_a_" + id + ") * (FLT)(" + condition + ");\n";
             }
           }
           c += "    src_a_" + id + " += src_layer_offset;\n";
         } else {
           std::string id = std::to_string(y) + std::to_string(x);
           const std::string xc = is1x1 ? dst_x[x] : "xck" + std::to_string(x);
           const std::string yc = is1x1 ? dst_y[y] : "yck" + std::to_string(y);
           c += "    src" + id + " = args.src_tensor.Read<" + cl_type + ">(" +
                xc + ", " + yc + ", s);\n";
         }
       }
     }
   };
   const bool weights_type_as_accum_type =
       !(op_def.precision == CalculationsPrecision::F32_F16 &&
         conv_params.weights_data_type == DataType::FLOAT16);
   auto conv_core = [&](int shared_offset) {
     const std::string channels[] = {"x", "y", "z", "w"};
     for (int z = 0; z < block_size.z; ++z) {
       if (weights_type_as_accum_type) {
         for (int ch = 0; ch < 4; ++ch) {
           for (int y = 0; y < block_size.y; ++y) {
             for (int x = 0; x < block_size.x; ++x) {
               std::string id = std::to_string(y) + std::to_string(x);
               if (use_simd_broadcast) {
                 int simd_id = (z * 4 + ch + shared_offset) / simd_size;
                 int thread_id = (z * 4 + ch + shared_offset) % simd_size;
                 std::string w_val_x = "sub_group_broadcast(simd_w" +
                                       std::to_string(simd_id) + ".x, " +
                                       std::to_string(thread_id) + "u)";
                 std::string w_val_y = "sub_group_broadcast(simd_w" +
                                       std::to_string(simd_id) + ".y, " +
                                       std::to_string(thread_id) + "u)";
                 std::string w_val_z = "sub_group_broadcast(simd_w" +
                                       std::to_string(simd_id) + ".z, " +
                                       std::to_string(thread_id) + "u)";
                 std::string w_val_w = "sub_group_broadcast(simd_w" +
                                       std::to_string(simd_id) + ".w, " +
                                       std::to_string(thread_id) + "u)";
                 c += "    r" + std::to_string(z) + id + ".x += " + w_val_x +
                      " * src" + id + "." + channels[ch] + ";\n";
                 c += "    r" + std::to_string(z) + id + ".y += " + w_val_y +
                      " * src" + id + "." + channels[ch] + ";\n";
                 c += "    r" + std::to_string(z) + id + ".z += " + w_val_z +
                      " * src" + id + "." + channels[ch] + ";\n";
                 c += "    r" + std::to_string(z) + id + ".w += " + w_val_w +
                      " * src" + id + "." + channels[ch] + ";\n";
               } else {
                 std::string w_val = "weights_cache[" +
                                     std::to_string(z * 4 + ch + shared_offset) +
                                     "]";
                 c += "    r" + std::to_string(z) + id + " += " + w_val +
                      " * src" + id + "." + channels[ch] + ";\n";
               }
             }
           }
         }
       } else {  // F32_F16 precision and weights type is float16
         for (int y = 0; y < block_size.y; ++y) {
           for (int x = 0; x < block_size.x; ++x) {
             std::string id = std::to_string(y) + std::to_string(x);
             std::string R = "r" + std::to_string(z) + id;
             std::string S = "src" + id;
             const int dz = z * 4 + shared_offset;
             std::string f0 = "weights_cache[" + std::to_string(dz + 0) + "]";
             std::string f1 = "weights_cache[" + std::to_string(dz + 1) + "]";
             std::string f2 = "weights_cache[" + std::to_string(dz + 2) + "]";
             std::string f3 = "weights_cache[" + std::to_string(dz + 3) + "]";
             c += "    " + R + " += convert_float4(" + S + ".x * " + f0 + " + " +
                  S + ".y * " + f1 + " + " + S + ".z * " + f2 + " + " + S +
                  ".w * " + f3 + ");\n";
           }
         }
       }
     }
   };

   c += "  int s = 0;\n";
   c += "  do {\n";
   declare_src();
   const int total_work_items =
       work_group_size.x * work_group_size.y * work_group_size.z;
   if (conv_params.weights_upload_type ==
       ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
     c += GenerateAsyncUpload("weights_cache", "filters_loc",
                              /*global_offset_name*/ "", local_mem_size);
   } else if (conv_params.weights_upload_type ==
              ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
     c += "    barrier(CLK_LOCAL_MEM_FENCE);\n";
     c += GenerateUploadByThreads("weights_cache", "filters_loc",
                                  /*global_offset_name*/ "", "lid",
                                  total_work_items, local_mem_size);
   } else if (use_simd_broadcast) {
     int parts = local_mem_size / simd_size;
     int reminder = local_mem_size % simd_size;
     for (int i = 0; i < parts; ++i) {
       c += "    FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " +
            std::to_string(i * simd_size) + "];\n";
     }
     if (reminder) {
       c += "    FLT4 simd_w" + std::to_string(parts) + ";\n";
       c += "    if (simd_id < " + std::to_string(reminder) + ") {\n";
       c += "      simd_w" + std::to_string(parts) +
            " = filters_loc[simd_id + " + std::to_string(parts * simd_size) +
            "];\n";
       c += "    }\n";
     }
   } else {  // GLOBAL_MEM/CONSTANT_MEM
     c += "    weights_cache = filters_loc;\n";
   }
   read_src();
   c += "    s += 1;\n";
   if (conv_params.weights_upload_type ==
       ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
     c += "    barrier(CLK_LOCAL_MEM_FENCE);\n";
   }
   conv_core(0);
   for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
     read_src();
     conv_core(i * block_size.z * 4);
     c += "    s += 1;\n";
   }
   c += "    filters_loc += " + std::to_string(local_mem_size) + ";\n";
   c += "  } while (s < args.src_tensor.Slices());\n";
   if (!is1x1) {
     c += "  };\n";
     c += "  };\n";
   }
   if (conv_params.weights_upload_type ==
       ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
     c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "Z",
                              block_size.z);
   } else if (conv_params.weights_upload_type ==
              ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
     c += "    barrier(CLK_LOCAL_MEM_FENCE);\n";
     c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", "Z",
                                  "lid", total_work_items, block_size.z);
     c += "    barrier(CLK_LOCAL_MEM_FENCE);\n";
   } else {
     c += "    weights_cache = args.biases.GetPtr() + Z;\n";
   }
   if (late_oob_check) {
     c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
          "|| Z >= args.dst_tensor.Slices()) {\n";
     c += "    return;\n";
     c += "  }\n";
   }
   for (int z = 0; z < block_size.z; ++z) {
     const std::string sz = std::to_string(z);
     c += "  if (Z + " + sz + " >= args.dst_tensor.Slices()) return;\n";
     c += "  {\n";
     c += "    FLT4 bias_val = TO_FLT4(weights_cache[" + sz + "]);\n";
     for (int y = 0; y < block_size.y; ++y) {
       for (int x = 0; x < block_size.x; ++x) {
         const std::string xs = dst_x[x];
         const std::string ys = dst_y[y];
         const std::string zs = "Z + " + sz;
         const std::string r_id = sz + std::to_string(y) + std::to_string(x);
         bool need_x_check = x != 0;
         bool need_y_check = y != 0;
         if (need_x_check && need_y_check) {
           c += "  if (" + xs + " < args.dst_tensor.Width() && " + ys +
                " < args.dst_tensor.Height()) {\n";
         } else if (need_x_check && !need_y_check) {
           c += "  if (" + xs + " < args.dst_tensor.Width()) {\n";
         } else if (!need_x_check && need_y_check) {
           c += "  if (" + ys + " < args.dst_tensor.Height()) {\n";
         } else {
           c += "  {\n";
         }
         c += "    FLT4 res = TO_FLT4(r" + r_id + ") + bias_val;\n";
         c += "    args.dst_tensor.Write(res, " + xs + ", " + ys + ", " + zs +
              ");\n";
         c += "  }\n";
       }
     }
     c += "  }\n";
   }
   c += "}\n";
   return c;
 }

 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition, int src_depth,
     int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
     bool different_weights_for_height, const BHWC* dst_shape) const {
   ConvParams conv_params;
   conv_params.linear_hw = false;
   conv_params.weights_data_type =
       DeduceDataTypeFromPrecision(definition.precision);
   conv_params.x_kernel_is_1 = x_kernel_is_1;
   conv_params.y_kernel_is_1 = y_kernel_is_1;
   conv_params.different_weights_for_height = different_weights_for_height;
   if (device.IsNvidia()) {
     if (different_weights_for_height) {
       conv_params.work_group_size = int3(32, 1, 1);
       conv_params.work_group_launch_order = int3(2, 0, 1);
       conv_params.fixed_work_group_size = true;
     } else {
       conv_params.linear_hw = true;
       conv_params.work_group_size = int3(32, 1, 1);
       conv_params.work_group_launch_order = int3(1, 0, 2);
       conv_params.fixed_work_group_size = true;
     }
     conv_params.block_size = int3(1, 1, 4);
     conv_params.src_depth_loop_size = 1;
     conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
     if (dst_depth % 4 == 0 || dst_depth >= 8) {
       conv_params.block_size.z = 4;
     } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
       conv_params.block_size.z = 2;
     } else {
       conv_params.block_size.z = dst_depth;
     }
     if (src_depth % 2 == 0) {
       conv_params.src_depth_loop_size = 2;
     }
     if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
       conv_params.src_depth_loop_size = 4;
     }
   } else if (device.IsPowerVR()) {
     if (different_weights_for_height) {
       conv_params.work_group_size = int3(32, 1, 1);
       conv_params.work_group_launch_order = int3(2, 0, 1);
       conv_params.fixed_work_group_size = true;
     } else {
       conv_params.linear_hw = true;
       conv_params.work_group_size = int3(32, 1, 1);
       conv_params.work_group_launch_order = int3(1, 0, 2);
       conv_params.fixed_work_group_size = true;
     }
     conv_params.weights_data_type =
         definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16
                                                            : DataType::FLOAT32;
     conv_params.block_size = int3(1, 1, 4);
     conv_params.src_depth_loop_size = 1;
     conv_params.weights_upload_type =
         WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
     if (dst_depth % 8 == 0 || dst_depth >= 32) {
       conv_params.block_size.z = 8;
     } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
       conv_params.block_size.z = 4;
     } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
       conv_params.block_size.z = 2;
     } else {
       conv_params.block_size.z = dst_depth;
     }
     if (definition.precision == CalculationsPrecision::F16) {
       conv_params.block_size.z = std::min(4, conv_params.block_size.z);
       if (src_depth % 2 == 0) {
         conv_params.src_depth_loop_size = 2;
       }
       if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
         conv_params.src_depth_loop_size = 4;
       }
       if (conv_params.block_size.z == 1) {
         if (src_depth % 2 == 0) {
           conv_params.src_depth_loop_size = 2;
         }
         if (src_depth % 4 == 0) {
           conv_params.src_depth_loop_size = 4;
         }
         if (src_depth <= 8) {
           conv_params.src_depth_loop_size = src_depth;
         }
       }
       conv_params.block_size.x = 2;
     }
   } else if (device.IsAMD()) {
     if (different_weights_for_height) {
       conv_params.work_group_size = int3(32, 1, 1);
       conv_params.work_group_launch_order = int3(2, 0, 1);
       conv_params.fixed_work_group_size = true;
     } else {
       conv_params.work_group_size = int3(8, 4, 1);
       conv_params.work_group_launch_order = int3(2, 0, 1);
       conv_params.fixed_work_group_size = true;
     }

     conv_params.block_size = int3(2, 1, 1);
     if (x_kernel_is_1 && y_kernel_is_1) {
       conv_params.block_size.y = 2;
     }
     conv_params.src_depth_loop_size = 1;
     conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
     if (dst_depth % 8 == 0 || dst_depth >= 32) {
       conv_params.block_size.z = 8;
     } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
       conv_params.block_size.z = 4;
     } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
       conv_params.block_size.z = 2;
     } else {
       conv_params.block_size.z = 1;
     }
     if (src_depth % 2 == 0 && src_depth >= 16) {
       conv_params.src_depth_loop_size = 2;
     }
   } else if (device.IsMali()) {
     int block_size = 2;
     if (dst_shape) {
       int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
       block_size = GetRecommendedBlockSizeForConv(device, definition.precision,
                                                   task_size);
     }
     if (!x_kernel_is_1 || !y_kernel_is_1) {
       block_size = std::min(block_size, 4);
     }
     if (block_size == 8) {
       if (dst_depth == 1 || dst_depth == 3) {
         conv_params.block_size = int3(2, 2, 1);
       } else {
         conv_params.block_size = int3(2, 2, 2);
       }
     } else if (block_size == 4) {
       if (dst_depth == 1 || dst_depth == 3) {
         conv_params.block_size = int3(2, 2, 1);
       } else {
         conv_params.block_size = int3(2, 1, 2);
       }
     } else if (block_size == 2) {
       conv_params.block_size = int3(2, 1, 1);
     } else {
       conv_params.block_size = int3(1, 1, 1);
     }
     conv_params.src_depth_loop_size = 1;
     MaliInfo mali_info = device.GetInfo().mali_info;
     if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) {
       conv_params.src_depth_loop_size = 2;
     }
     if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() &&
         definition.precision == CalculationsPrecision::F16) {
       conv_params.src_depth_loop_size = 4;
     }
     conv_params.work_group_size = int3(4, 4, 1);
     conv_params.work_group_launch_order = int3(0, 1, 2);
     conv_params.fixed_work_group_size = false;
     conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
   } else if (device.IsAdreno()) {
     conv_params.block_size = int3(2, 2, 1);
     conv_params.work_group_size = int3(8, 2, 1);
     conv_params.work_group_launch_order = int3(0, 1, 2);
     conv_params.fixed_work_group_size = false;
     conv_params.src_depth_loop_size = 1;
     conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
   } else if (device.IsIntel()) {
     if (different_weights_for_height) {
       conv_params.work_group_size = int3(16, 1, 1);
       conv_params.work_group_launch_order = int3(0, 1, 2);
       conv_params.fixed_work_group_size = true;
     } else {
       conv_params.linear_hw = true;
       conv_params.work_group_size = int3(16, 1, 1);
       conv_params.work_group_launch_order = int3(0, 1, 2);
       conv_params.fixed_work_group_size = true;
     }
     conv_params.block_size = int3(1, 1, 4);
     conv_params.src_depth_loop_size = 1;
     if (definition.precision != CalculationsPrecision::F32_F16 &&
         device.SupportsExtension("cl_khr_subgroups") &&
         device.SupportsExtension("cl_intel_required_subgroup_size") &&
         device.IsCL20OrHigher() && device.SupportsSubGroupWithSize(16)) {
       conv_params.weights_upload_type =
           WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST;
     } else {
       conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
     }
     if (dst_depth % 4 == 0 || dst_depth >= 8) {
       conv_params.block_size.z = 4;
     } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
       conv_params.block_size.z = 2;
     } else {
       conv_params.block_size.z = dst_depth;
     }
     if (src_depth % 2 == 0) {
       conv_params.src_depth_loop_size = 2;
     }
     if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
       conv_params.src_depth_loop_size = 4;
     }
   } else {
     conv_params.block_size = int3(1, 1, 4);
     conv_params.work_group_size = int3(8, 2, 1);
     conv_params.work_group_launch_order = int3(0, 1, 2);
     conv_params.fixed_work_group_size = false;
     conv_params.src_depth_loop_size = 1;
     conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
     if (dst_depth % 4 == 0 || dst_depth >= 8) {
       conv_params.block_size.z = 4;
     } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
       conv_params.block_size.z = 2;
     } else {
       conv_params.block_size.z = dst_depth;
     }
     if (src_depth % 2 == 0) {
       conv_params.src_depth_loop_size = 2;
     }
     if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
       conv_params.src_depth_loop_size = 4;
     }
   }

   return conv_params;
 }

 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
                              attr.dilations.w == 1 &&
                              attr.padding.prepended.w == 0 &&
                              attr.padding.appended.w == 0;
   const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
                              attr.dilations.h == 1 &&
                              attr.padding.prepended.h == 0 &&
                              attr.padding.appended.h == 0;
   return GuessBestParams(device, definition, src_depth, dst_depth,
                          x_kernel_is_1, y_kernel_is_1, false, dst_shape);
 }

 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC& weights_shape,
     const BHWC* dst_shape) const {
   const int dst_depth = DivideRoundUp(weights_shape.b, 4);
   const int src_depth = DivideRoundUp(weights_shape.c, 4);
   const bool x_kernel_is_1 =
       weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 &&
       attr.padding.prepended.w == 0 && attr.padding.appended.w == 0;
   const bool y_kernel_is_1 =
       weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
       attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
   return GuessBestParams(device, definition, src_depth, dst_depth,
                          x_kernel_is_1, y_kernel_is_1, false, dst_shape);
 }

 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
     const CLDevice& device, const OperationDef& definition,
     const FullyConnectedAttributes& attr, const BHWC* dst_shape) const {
   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvPowerVR::ConvParams params = GuessBestParams(
       device, definition, src_depth, dst_depth, true, true, false, dst_shape);
   params.work_group_size.x *= params.work_group_size.y;
   params.work_group_size.y = 1;
   params.block_size.x *= params.block_size.y;
   params.block_size.y = 1;
   return params;
 }

 ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
     const CLDevice& device, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC* dst_shape) const {
   const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
   const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
   ConvPowerVR::ConvParams params = GuessBestParams(
       device, definition, src_depth, dst_depth, true, true, true, dst_shape);
   params.block_size.x *= params.block_size.y;
   params.block_size.y = 1;
   return params;
 }

 absl::Status CreateConvPowerVR(const CreationContext& creation_context,
                                const OperationDef& definition,
                                const Convolution2DAttributes& attr,
                                ConvPowerVR* result, const BHWC* dst_shape) {
   *result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
   return result->UploadData(attr.weights, attr.bias, creation_context.context);
 }

 absl::Status CreateConvPowerVR(const CreationContext& creation_context,
                                const OperationDef& definition,
                                const FullyConnectedAttributes& attr,
                                ConvPowerVR* result, const BHWC* dst_shape) {
   *result = ConvPowerVR(definition, attr, *creation_context.device, dst_shape);
   return result->UploadData(attr.weights, attr.bias, creation_context.context);
 }

 absl::Status CreateConvPowerVRDynamicWeights(
     const CreationContext& creation_context, const OperationDef& definition,
     const Convolution2DAttributes& attr, const BHWC& weights_shape,
     ConvPowerVR* result, const BHWC* dst_shape) {
   *result = ConvPowerVR(definition, attr, weights_shape,
                         *creation_context.device, dst_shape);
   BufferDescriptor desc;
   desc.element_type = definition.src_tensors[1].data_type;
   desc.element_size = 4;
   desc.memory_type = result->conv_params_.weights_upload_type ==
                              ConvPowerVR::WeightsUploadType::CONSTANT_MEM
                          ? MemoryType::CONSTANT
                          : MemoryType::GLOBAL;

   result->args_.AddObjectRef("weights", AccessType::READ,
                              absl::make_unique<BufferDescriptor>(desc));
   return result->UploadBias(attr.bias, creation_context.context);
 }

 absl::Status CreateConvPowerVRWino4x4To6x6(
     const CreationContext& creation_context, const OperationDef& definition,
     const Convolution2DAttributes& attr, ConvPowerVR* result,
     const BHWC* dst_shape) {
   *result = ConvPowerVR(definition);
   result->conv_params_ = result->GuessBestParamsWinograd(
       *creation_context.device, definition, attr, dst_shape);
   return result->UploadDataForWinograd4x4To6x6(
       attr.weights, *creation_context.device, creation_context.context);
 }

 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite