tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h"

 #include <string>
 #include <utility>

 #include "absl/strings/substitute.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"

 namespace tflite {
 namespace gpu {
 namespace cl {
 namespace {

 std::string GenerateConvolutionTransposed3DCode(const OperationDef& op_def,
                                                 const CLDevice& device,
                                                 bool weights_are_buffer,
                                                 const int4& block_size,
                                                 Arguments* args) {
   auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]);
   src_desc->SetTextureAddressMode(GetFastestZeroMode(device));
   args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
   args->AddObjectRef(
       "dst_tensor", AccessType::WRITE,
       absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]));
   args->AddInt("stride_x");
   args->AddInt("stride_y");
   args->AddInt("stride_z");
   args->AddInt("padding_x");
   args->AddInt("padding_y");
   args->AddInt("padding_z");
   args->AddInt("kernel_size_x");
   args->AddInt("kernel_size_y");
   args->AddInt("kernel_size_z");
   args->AddInt("grid_size_s");

   const auto src_tensor_type = op_def.src_tensors[0].storage_type;
   bool image_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER;
   bool manual_clamp =
       image_buffer || src_tensor_type == TensorStorageType::BUFFER;

   std::string c = GetCommonDefines(op_def.precision);

   for (int s = 0; s < block_size.w; ++s) {
     const std::string f0 =
         weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s0123"
                            : "f" + std::to_string(s * 4 + 0);
     const std::string f1 =
         weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s4567"
                            : "f" + std::to_string(s * 4 + 1);
     const std::string f2 =
         weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s89ab"
                            : "f" + std::to_string(s * 4 + 2);
     const std::string f3 =
         weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].scdef"
                            : "f" + std::to_string(s * 4 + 3);
     switch (op_def.precision) {
       case CalculationsPrecision::F32:
       case CalculationsPrecision::F16:
         c += "#define CONV" + std::to_string(s) + "(R, S)    \\\n";
         c += "R += S.x * " + f0 + "; \\\n";
         c += "R += S.y * " + f1 + "; \\\n";
         c += "R += S.z * " + f2 + "; \\\n";
         c += "R += S.w * " + f3 + ";   \n";
         break;
       case CalculationsPrecision::F32_F16:
         c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
         c += "R += convert_float4(S.x * " + f0 + " + S.y * " + f1 +
              " + S.z * " + f2 + " + S.w * " + f3 + ");\n";
         break;
     }
   }

   switch (op_def.precision) {
     case CalculationsPrecision::F32:
       c += "#define FLT16 float16\n";
       break;
     case CalculationsPrecision::F32_F16:
     case CalculationsPrecision::F16:
       c += "#define FLT16 half16\n";
       break;
   }

   c += "__kernel void main_function(\n";
   c += "$0) {\n";
   if (op_def.IsBatchSupported()) {
     c += "  int linear_id = get_global_id(0);\n";
     c += "  int dst_x = (linear_id / args.dst_tensor.Batch());\n";
     c += "  int B = linear_id % args.dst_tensor.Batch();\n";
     c += "  args.dst_tensor.SetBatchRef(B);\n";
     c += "  args.src_tensor.SetBatchRef(B);\n";
   } else {
     c += "  int dst_x = get_global_id(0);\n";
   }
   c += "  int rem_x = dst_x % args.stride_x;\n";
   c += "  int ceil_x = dst_x / args.stride_x;\n";
   c += "  dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) +
        " + rem_x;\n";
   c += "  int dst_y = get_global_id(1);\n";
   c += "  int rem_y = dst_y % args.stride_y;\n";
   c += "  int ceil_y = dst_y / args.stride_y;\n";
   c += "  dst_y = ceil_y * args.stride_y * " + std::to_string(block_size.y) +
        " + rem_y;\n";
   c += "  int linear_id_z = get_global_id(2);\n";
   c += "  int S = (linear_id_z % args.grid_size_s) * " +
        std::to_string(block_size.w) + ";\n";
   c += "  int dst_z = linear_id_z / args.grid_size_s;\n";
   c += "  int rem_z = dst_z % args.stride_z;\n";
   c += "  int ceil_z = dst_z / args.stride_z;\n";
   c += "  dst_z = ceil_z * args.stride_z * " + std::to_string(block_size.z) +
        " + rem_z;\n";
   c += "  if (dst_x >= args.dst_tensor.Width() || dst_y >= "
        "args.dst_tensor.Height() || dst_z >= "
        "args.dst_tensor.Depth()) return;\n";
   if (weights_are_buffer) {
     c += "  int f_base = S * args.src_tensor.Slices() * args.kernel_size_x * "
          "args.kernel_size_y * "
          "args.kernel_size_z;\n";
   }
   for (int i = 0; i < block_size.x * block_size.y * block_size.z * block_size.w;
        ++i) {
     c += "  ACCUM_FLT4 r" + std::to_string(i) +
          " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
   }
   c += "  int kernel_first_dst_x = dst_x + args.padding_x;\n";
   c += "  int kernel_first_dst_y = dst_y + args.padding_y;\n";
   c += "  int kernel_first_dst_z = dst_z + args.padding_z;\n";
   c += "  int kernel_last_dst_x = kernel_first_dst_x - args.kernel_size_x;\n";
   c += "  int kernel_last_dst_y = kernel_first_dst_y - args.kernel_size_y;\n";
   c += "  int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n";
   c += "  int offset_x = abs(args.padding_x);\n";
   c += "  int offset_x_strided = offset_x * args.stride_x;\n";
   c +=
       "  int src_x = (kernel_first_dst_x + offset_x_strided) / args.stride_x - "
       "offset_x;\n";
   c += "  int offset_y = abs(args.padding_y);\n";
   c += "  int offset_y_strided = offset_y * args.stride_y;\n";
   c +=
       "  int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - "
       "offset_y;\n";
   c += "  int offset_z = abs(args.padding_z);\n";
   c += "  int offset_z_strided = offset_z * args.stride_z;\n";
   c +=
       "  int src_z = (kernel_first_dst_z + offset_z_strided) / args.stride_z - "
       "offset_z;\n";
   c += "  int src_as_dst_z = src_z * args.stride_z;\n";
   c += "  for (;src_as_dst_z > kernel_last_dst_z; src_z -= 1, src_as_dst_z -= "
        "args.stride_z) {\n";
   for (int z = 0; z < block_size.z; ++z) {
     const std::string zindex = std::to_string(z);
     c += "    int sz" + zindex + " = src_z + " + zindex + ";\n";
     if (src_tensor_type != TensorStorageType::TEXTURE_3D) {
       c += "    bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" +
            zindex + " < args.src_tensor.Depth();\n";
     }
   }
   if (block_size.z == 1 && (src_tensor_type != TensorStorageType::TEXTURE_3D)) {
     c += "    if (!in_z0) continue;\n";
   }
   c += "    int kernel_z = kernel_first_dst_z - src_as_dst_z;\n";
   c += "    int src_as_dst_y = src_y * args.stride_y;\n";
   c += "    int src_y_copy = src_y;\n";
   c += "    for (;src_as_dst_y > kernel_last_dst_y; src_y_copy -= 1, "
        "src_as_dst_y -= "
        "args.stride_y) {\n";
   for (int y = 0; y < block_size.y; ++y) {
     const std::string yindex = std::to_string(y);
     c += "      int sy" + yindex + " = src_y_copy + " + yindex + ";\n";
     if (manual_clamp) {
       c += "      bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" +
            yindex + " < args.src_tensor.Height();\n";
       if (!image_buffer) {
         c += "      sy" + yindex + " = clamp(sy" + yindex +
              ", 0, args.src_tensor.Height() - 1);\n";
       }
     }
   }
   c += "      int kernel_y = kernel_first_dst_y - src_as_dst_y;\n";
   c += "      int src_as_dst_x = src_x * args.stride_x;\n";
   c += "      int src_x_copy = src_x;\n";
   c += "      for (;src_as_dst_x > kernel_last_dst_x; src_x_copy -= 1, "
        "src_as_dst_x "
        "-= args.stride_x) {\n";
   for (int x = 0; x < block_size.x; ++x) {
     const std::string xindex = std::to_string(x);
     c += "        int sx" + xindex + " = src_x_copy + " + xindex + ";\n";
     if (manual_clamp) {
       c += "        bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" +
            xindex + " < args.src_tensor.Width();\n";
       if (!image_buffer) {
         c += "        sx" + xindex + " = clamp(sx" + xindex +
              ", 0, args.src_tensor.Width() - 1);\n";
       }
     }
   }
   const std::string layer_offset = "args.src_tensor.SliceStride()";
   for (int z = 0; z < block_size.z; ++z) {
     const std::string zindex = std::to_string(z);
     for (int y = 0; y < block_size.y; ++y) {
       const std::string yindex = std::to_string(y);
       for (int x = 0; x < block_size.x; ++x) {
         const std::string xindex = std::to_string(x);
         const std::string id =
             std::to_string((z * block_size.y + y) * block_size.x + x);
         c += "        args.src_tensor.GetAddress(addr_" + id + ", sx" + xindex +
              ", sy" + yindex + ", sz" + zindex + ", 0);";
         if (image_buffer) {
           c += "        addr_" + id + " = select(-1, addr_" + id + ", (in_x" +
                xindex + " && in_y" + yindex + "));\n";
           c += absl::Substitute(
               "        int dz_$0 = select(0, $3, (in_x$1 && "
               "in_y$2));\n",
               id, x, y, layer_offset);
         }
       }
     }
   }
   if (src_tensor_type == TensorStorageType::BUFFER) {
     c += "        int dz = " + layer_offset + ";\n";
   }
   if (block_size.x == 1 && block_size.y == 1 && manual_clamp) {
     c += "        if (!in_x0 || !in_y0) continue;\n";
   }
   c += "        int kernel_x = kernel_first_dst_x - src_as_dst_x;\n";
   c += "        int kernel_index =(kernel_z * args.kernel_size_y + kernel_y) * "
        "args.kernel_size_x + kernel_x;\n";
   if (weights_are_buffer) {
     c += "        int f_offset = f_base + kernel_index * "
          "args.src_tensor.Slices() * " +
          std::to_string(block_size.w) + ";\n";
   } else {
     c += "        int x_c = kernel_index * args.src_tensor.Slices();\n";
   }
   c += "        for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
   for (int y = 0; y < block_size.y; ++y) {
     const std::string yindex = std::to_string(y);
     for (int x = 0; x < block_size.x; ++x) {
       const std::string xindex = std::to_string(x);
       const std::string id = std::to_string(y * block_size.x + x);
       if (image_buffer) {
         c += "          FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
              "); addr_" + id + " += dz_" + id + ";\n";
       } else if (manual_clamp) {
         c += "          FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
              ") * (FLT)(in_x" + xindex + " && in_y" + yindex + "); addr_" + id +
              " += dz;\n";
       } else {
         c += "          FLT4 src" + id + " = args.src_tensor.Read(sx" + xindex +
              ", sy" + yindex + ", sz0, s);\n";
       }
     }
   }
   if (weights_are_buffer) {
     c += "          __global FLT16* weights_cache = "
          "args.weights.GetPtr(f_offset);\n";
     c += "          f_offset += " + std::to_string(block_size.w) + ";\n";
   } else {
     for (int z = 0; z < block_size.w; ++z) {
       c += absl::Substitute(
           R"(          FLT4 f$1 = args.weights0.Read(S + $0, x_c);
           FLT4 f$2 = args.weights1.Read(S + $0, x_c);
           FLT4 f$3 = args.weights2.Read(S + $0, x_c);
           FLT4 f$4 = args.weights3.Read(S + $0, x_c);
 )",
           z, z * 4 + 0, z * 4 + 1, z * 4 + 2, z * 4 + 3);
     }
     c += "          x_c++;\n";
   }
   for (int z = 0; z < block_size.w; ++z) {
     for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
       c += "          CONV" + std::to_string(z) + "(r" +
            std::to_string(i + z * block_size.x * block_size.y * block_size.z) +
            ", src" + std::to_string(i) + ");\n";
     }
   }
   c += "        }\n";
   c += "      }\n";
   c += "    }\n";
   c += "  }\n";
   for (int s = 0; s < block_size.w; ++s) {
     c += "  if (S < args.dst_tensor.Slices()) {\n";
     c += "    FLT4 bias_val = args.biases.Read(S);\n";
     for (int z = 0; z < block_size.z; ++z) {
       for (int y = 0; y < block_size.y; ++y) {
         for (int x = 0; x < block_size.x; ++x) {
           const std::string id = std::to_string(
               ((s * block_size.z + z) * block_size.y + y) * block_size.x + x);
           c += "    {\n";
           c += "      int xc = dst_x + args.stride_x * " + std::to_string(x) +
                ";\n";
           c += "      int yc = dst_y + args.stride_y * " + std::to_string(y) +
                ";\n";
           c += "      int zc = dst_z + args.stride_z * " + std::to_string(z) +
                ";\n";
           c += "      if (xc < args.dst_tensor.Width() && yc < "
                "args.dst_tensor.Height() && zc < args.dst_tensor.Depth()) {\n";
           c += "        FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
           c += "        args.dst_tensor.Write(res, xc, yc, zc, S)\n";
           c += "      }\n";
           c += "    }\n";
         }
       }
     }
     c += "  }\n";
     c += "  S++;\n";
   }
   c += "}\n";
   return c;
 }
 }  // namespace

 ConvolutionTransposed3D::ConvolutionTransposed3D(
     const OperationDef& definition,
     const ConvolutionTransposed3DAttributes& attr, const CLDevice& device)
     : GPUOperation(definition),
       weights_are_buffer_(device.IsMali()),
       kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
                    attr.weights.shape.d),
       stride_(attr.stride.w, attr.stride.h, attr.stride.d),
       padding_(attr.padding.prepended.w, attr.padding.prepended.h,
                attr.padding.prepended.d),
       block_size_(2, 2, 1, 2) {}

 ConvolutionTransposed3D::ConvolutionTransposed3D(
     ConvolutionTransposed3D&& operation)
     : GPUOperation(std::move(operation)),
       weights_are_buffer_(operation.weights_are_buffer_),
       kernel_size_(operation.kernel_size_),
       stride_(operation.stride_),
       padding_(operation.padding_),
       block_size_(operation.block_size_) {}

 ConvolutionTransposed3D& ConvolutionTransposed3D::operator=(
     ConvolutionTransposed3D&& operation) {
   if (this != &operation) {
     std::swap(weights_are_buffer_, operation.weights_are_buffer_);
     std::swap(kernel_size_, operation.kernel_size_);
     std::swap(stride_, operation.stride_);
     std::swap(padding_, operation.padding_);
     std::swap(block_size_, operation.block_size_);
     GPUOperation::operator=(std::move(operation));
   }
   return *this;
 }

 absl::Status ConvolutionTransposed3D::Compile(
     const CreationContext& creation_context) {
   std::string code = GenerateConvolutionTransposed3DCode(
       definition_, *creation_context.device, weights_are_buffer_, block_size_,
       &args_);
   std::string element_wise_code;
   RETURN_IF_ERROR(
       MergeOperations(linked_operations_, &args_, &element_wise_code));
   RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
                                           {{"dst_tensor", element_wise_code}},
                                           &code));

   std::vector<CompilerOptions> options;
   if (creation_context.device->IsPowerVR() && block_size_.y != 1) {
     bool is_texture3d = definition_.src_tensors[0].storage_type ==
                         TensorStorageType::TEXTURE_3D;
     bool is_texture_array = definition_.src_tensors[0].storage_type ==
                             TensorStorageType::TEXTURE_ARRAY;
     if (is_texture3d || is_texture_array) {
       options.push_back(CompilerOptions::CL_OPT_DISABLE);
     }
   }
   return creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", options, *creation_context.context,
       *creation_context.device, &kernel_);
 }

 absl::Status ConvolutionTransposed3D::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
   RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
   RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
   RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z));
   RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x));
   RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
   RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z));
   RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
   RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
   RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z));
   return args_.SetInt("grid_size_s",
                       DivideRoundUp(dst_[0]->Slices(), block_size_.w));
 }

 int3 ConvolutionTransposed3D::GetGridSize() const {
   const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
   const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
   const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
   const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
   const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
   const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) *
                      DivideRoundUp(aligned_d, block_size_.z);
   return int3(grid_x, grid_y, grid_z);
 }

 absl::Status ConvolutionTransposed3D::Tune(const TuningParameters& params) {
   RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
   return GetBestWorkGroupConv(params, kernel_, grid_size_, &work_group_size_);
 }

 absl::Status CreateConvolutionTransposed3D(
     const CreationContext& creation_context, const OperationDef& definition,
     const ConvolutionTransposed3DAttributes& attr,
     ConvolutionTransposed3D* result) {
   *result = ConvolutionTransposed3D(definition, attr, *creation_context.device);
   RETURN_IF_ERROR(
       result->UploadWeights(attr.weights, creation_context.context));

   TensorLinearDescriptor desc;
   desc.storage_type =
       DeduceLinearStorageType(definition.GetPrimaryStorageType());
   desc.element_type = definition.GetDataType();

   LinearStorage lt;
   RETURN_IF_ERROR(
       CreateLinearStorage(desc, attr.bias, creation_context.context, &lt));
   result->args_.AddObject("biases", AccessType::READ,
                           absl::make_unique<LinearStorage>(std::move(lt)),
                           absl::make_unique<TensorLinearDescriptor>(desc));
   return absl::OkStatus();
 }

 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
	/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h"

	#include <string>
	#include <utility>

	#include "absl/strings/substitute.h"
	#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
	#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
	#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
	#include "tensorflow/lite/delegates/gpu/common/status.h"

	namespace tflite {
	namespace gpu {
	namespace cl {
	namespace {

	std::string GenerateConvolutionTransposed3DCode(const OperationDef& op_def,
	const CLDevice& device,
	bool weights_are_buffer,
	const int4& block_size,
	Arguments* args) {
	auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]);
	src_desc->SetTextureAddressMode(GetFastestZeroMode(device));
	args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
	args->AddObjectRef(
	"dst_tensor", AccessType::WRITE,
	absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]));
	args->AddInt("stride_x");
	args->AddInt("stride_y");
	args->AddInt("stride_z");
	args->AddInt("padding_x");
	args->AddInt("padding_y");
	args->AddInt("padding_z");
	args->AddInt("kernel_size_x");
	args->AddInt("kernel_size_y");
	args->AddInt("kernel_size_z");
	args->AddInt("grid_size_s");

	const auto src_tensor_type = op_def.src_tensors[0].storage_type;
	bool image_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER;
	bool manual_clamp =
	image_buffer \|\| src_tensor_type == TensorStorageType::BUFFER;

	std::string c = GetCommonDefines(op_def.precision);

	for (int s = 0; s < block_size.w; ++s) {
	const std::string f0 =
	weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s0123"
	: "f" + std::to_string(s * 4 + 0);
	const std::string f1 =
	weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s4567"
	: "f" + std::to_string(s * 4 + 1);
	const std::string f2 =
	weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s89ab"
	: "f" + std::to_string(s * 4 + 2);
	const std::string f3 =
	weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].scdef"
	: "f" + std::to_string(s * 4 + 3);
	switch (op_def.precision) {
	case CalculationsPrecision::F32:
	case CalculationsPrecision::F16:
	c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
	c += "R += S.x * " + f0 + "; \\\n";
	c += "R += S.y * " + f1 + "; \\\n";
	c += "R += S.z * " + f2 + "; \\\n";
	c += "R += S.w * " + f3 + "; \n";
	break;
	case CalculationsPrecision::F32_F16:
	c += "#define CONV" + std::to_string(s) + "(R, S) \\\n";
	c += "R += convert_float4(S.x * " + f0 + " + S.y * " + f1 +
	" + S.z * " + f2 + " + S.w * " + f3 + ");\n";
	break;
	}
	}

	switch (op_def.precision) {
	case CalculationsPrecision::F32:
	c += "#define FLT16 float16\n";
	break;
	case CalculationsPrecision::F32_F16:
	case CalculationsPrecision::F16:
	c += "#define FLT16 half16\n";
	break;
	}

	c += "__kernel void main_function(\n";
	c += "$0) {\n";
	if (op_def.IsBatchSupported()) {
	c += " int linear_id = get_global_id(0);\n";
	c += " int dst_x = (linear_id / args.dst_tensor.Batch());\n";
	c += " int B = linear_id % args.dst_tensor.Batch();\n";
	c += " args.dst_tensor.SetBatchRef(B);\n";
	c += " args.src_tensor.SetBatchRef(B);\n";
	} else {
	c += " int dst_x = get_global_id(0);\n";
	}
	c += " int rem_x = dst_x % args.stride_x;\n";
	c += " int ceil_x = dst_x / args.stride_x;\n";
	c += " dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) +
	" + rem_x;\n";
	c += " int dst_y = get_global_id(1);\n";
	c += " int rem_y = dst_y % args.stride_y;\n";
	c += " int ceil_y = dst_y / args.stride_y;\n";
	c += " dst_y = ceil_y * args.stride_y * " + std::to_string(block_size.y) +
	" + rem_y;\n";
	c += " int linear_id_z = get_global_id(2);\n";
	c += " int S = (linear_id_z % args.grid_size_s) * " +
	std::to_string(block_size.w) + ";\n";
	c += " int dst_z = linear_id_z / args.grid_size_s;\n";
	c += " int rem_z = dst_z % args.stride_z;\n";
	c += " int ceil_z = dst_z / args.stride_z;\n";
	c += " dst_z = ceil_z * args.stride_z * " + std::to_string(block_size.z) +
	" + rem_z;\n";
	c += " if (dst_x >= args.dst_tensor.Width() \|\| dst_y >= "
	"args.dst_tensor.Height() \|\| dst_z >= "
	"args.dst_tensor.Depth()) return;\n";
	if (weights_are_buffer) {
	c += " int f_base = S * args.src_tensor.Slices() * args.kernel_size_x * "
	"args.kernel_size_y * "
	"args.kernel_size_z;\n";
	}
	for (int i = 0; i < block_size.x * block_size.y * block_size.z * block_size.w;
	++i) {
	c += " ACCUM_FLT4 r" + std::to_string(i) +
	" = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
	}
	c += " int kernel_first_dst_x = dst_x + args.padding_x;\n";
	c += " int kernel_first_dst_y = dst_y + args.padding_y;\n";
	c += " int kernel_first_dst_z = dst_z + args.padding_z;\n";
	c += " int kernel_last_dst_x = kernel_first_dst_x - args.kernel_size_x;\n";
	c += " int kernel_last_dst_y = kernel_first_dst_y - args.kernel_size_y;\n";
	c += " int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n";
	c += " int offset_x = abs(args.padding_x);\n";
	c += " int offset_x_strided = offset_x * args.stride_x;\n";
	c +=
	" int src_x = (kernel_first_dst_x + offset_x_strided) / args.stride_x - "
	"offset_x;\n";
	c += " int offset_y = abs(args.padding_y);\n";
	c += " int offset_y_strided = offset_y * args.stride_y;\n";
	c +=
	" int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - "
	"offset_y;\n";
	c += " int offset_z = abs(args.padding_z);\n";
	c += " int offset_z_strided = offset_z * args.stride_z;\n";
	c +=
	" int src_z = (kernel_first_dst_z + offset_z_strided) / args.stride_z - "
	"offset_z;\n";
	c += " int src_as_dst_z = src_z * args.stride_z;\n";
	c += " for (;src_as_dst_z > kernel_last_dst_z; src_z -= 1, src_as_dst_z -= "
	"args.stride_z) {\n";
	for (int z = 0; z < block_size.z; ++z) {
	const std::string zindex = std::to_string(z);
	c += " int sz" + zindex + " = src_z + " + zindex + ";\n";
	if (src_tensor_type != TensorStorageType::TEXTURE_3D) {
	c += " bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" +
	zindex + " < args.src_tensor.Depth();\n";
	}
	}
	if (block_size.z == 1 && (src_tensor_type != TensorStorageType::TEXTURE_3D)) {
	c += " if (!in_z0) continue;\n";
	}
	c += " int kernel_z = kernel_first_dst_z - src_as_dst_z;\n";
	c += " int src_as_dst_y = src_y * args.stride_y;\n";
	c += " int src_y_copy = src_y;\n";
	c += " for (;src_as_dst_y > kernel_last_dst_y; src_y_copy -= 1, "
	"src_as_dst_y -= "
	"args.stride_y) {\n";
	for (int y = 0; y < block_size.y; ++y) {
	const std::string yindex = std::to_string(y);
	c += " int sy" + yindex + " = src_y_copy + " + yindex + ";\n";
	if (manual_clamp) {
	c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" +
	yindex + " < args.src_tensor.Height();\n";
	if (!image_buffer) {
	c += " sy" + yindex + " = clamp(sy" + yindex +
	", 0, args.src_tensor.Height() - 1);\n";
	}
	}
	}
	c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n";
	c += " int src_as_dst_x = src_x * args.stride_x;\n";
	c += " int src_x_copy = src_x;\n";
	c += " for (;src_as_dst_x > kernel_last_dst_x; src_x_copy -= 1, "
	"src_as_dst_x "
	"-= args.stride_x) {\n";
	for (int x = 0; x < block_size.x; ++x) {
	const std::string xindex = std::to_string(x);
	c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n";
	if (manual_clamp) {
	c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" +
	xindex + " < args.src_tensor.Width();\n";
	if (!image_buffer) {
	c += " sx" + xindex + " = clamp(sx" + xindex +
	", 0, args.src_tensor.Width() - 1);\n";
	}
	}
	}
	const std::string layer_offset = "args.src_tensor.SliceStride()";
	for (int z = 0; z < block_size.z; ++z) {
	const std::string zindex = std::to_string(z);
	for (int y = 0; y < block_size.y; ++y) {
	const std::string yindex = std::to_string(y);
	for (int x = 0; x < block_size.x; ++x) {
	const std::string xindex = std::to_string(x);
	const std::string id =
	std::to_string((z * block_size.y + y) * block_size.x + x);
	c += " args.src_tensor.GetAddress(addr_" + id + ", sx" + xindex +
	", sy" + yindex + ", sz" + zindex + ", 0);";
	if (image_buffer) {
	c += " addr_" + id + " = select(-1, addr_" + id + ", (in_x" +
	xindex + " && in_y" + yindex + "));\n";
	c += absl::Substitute(
	" int dz_$0 = select(0, $3, (in_x$1 && "
	"in_y$2));\n",
	id, x, y, layer_offset);
	}
	}
	}
	}
	if (src_tensor_type == TensorStorageType::BUFFER) {
	c += " int dz = " + layer_offset + ";\n";
	}
	if (block_size.x == 1 && block_size.y == 1 && manual_clamp) {
	c += " if (!in_x0 \|\| !in_y0) continue;\n";
	}
	c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n";
	c += " int kernel_index =(kernel_z * args.kernel_size_y + kernel_y) * "
	"args.kernel_size_x + kernel_x;\n";
	if (weights_are_buffer) {
	c += " int f_offset = f_base + kernel_index * "
	"args.src_tensor.Slices() * " +
	std::to_string(block_size.w) + ";\n";
	} else {
	c += " int x_c = kernel_index * args.src_tensor.Slices();\n";
	}
	c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
	for (int y = 0; y < block_size.y; ++y) {
	const std::string yindex = std::to_string(y);
	for (int x = 0; x < block_size.x; ++x) {
	const std::string xindex = std::to_string(x);
	const std::string id = std::to_string(y * block_size.x + x);
	if (image_buffer) {
	c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
	"); addr_" + id + " += dz_" + id + ";\n";
	} else if (manual_clamp) {
	c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
	") * (FLT)(in_x" + xindex + " && in_y" + yindex + "); addr_" + id +
	" += dz;\n";
	} else {
	c += " FLT4 src" + id + " = args.src_tensor.Read(sx" + xindex +
	", sy" + yindex + ", sz0, s);\n";
	}
	}
	}
	if (weights_are_buffer) {
	c += " __global FLT16* weights_cache = "
	"args.weights.GetPtr(f_offset);\n";
	c += " f_offset += " + std::to_string(block_size.w) + ";\n";
	} else {
	for (int z = 0; z < block_size.w; ++z) {
	c += absl::Substitute(
	R"( FLT4 f$1 = args.weights0.Read(S + $0, x_c);
	FLT4 f$2 = args.weights1.Read(S + $0, x_c);
	FLT4 f$3 = args.weights2.Read(S + $0, x_c);
	FLT4 f$4 = args.weights3.Read(S + $0, x_c);
	)",
	z, z * 4 + 0, z * 4 + 1, z * 4 + 2, z * 4 + 3);
	}
	c += " x_c++;\n";
	}
	for (int z = 0; z < block_size.w; ++z) {
	for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
	c += " CONV" + std::to_string(z) + "(r" +
	std::to_string(i + z * block_size.x * block_size.y * block_size.z) +
	", src" + std::to_string(i) + ");\n";
	}
	}
	c += " }\n";
	c += " }\n";
	c += " }\n";
	c += " }\n";
	for (int s = 0; s < block_size.w; ++s) {
	c += " if (S < args.dst_tensor.Slices()) {\n";
	c += " FLT4 bias_val = args.biases.Read(S);\n";
	for (int z = 0; z < block_size.z; ++z) {
	for (int y = 0; y < block_size.y; ++y) {
	for (int x = 0; x < block_size.x; ++x) {
	const std::string id = std::to_string(
	((s * block_size.z + z) * block_size.y + y) * block_size.x + x);
	c += " {\n";
	c += " int xc = dst_x + args.stride_x * " + std::to_string(x) +
	";\n";
	c += " int yc = dst_y + args.stride_y * " + std::to_string(y) +
	";\n";
	c += " int zc = dst_z + args.stride_z * " + std::to_string(z) +
	";\n";
	c += " if (xc < args.dst_tensor.Width() && yc < "
	"args.dst_tensor.Height() && zc < args.dst_tensor.Depth()) {\n";
	c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
	c += " args.dst_tensor.Write(res, xc, yc, zc, S)\n";
	c += " }\n";
	c += " }\n";
	}
	}
	}
	c += " }\n";
	c += " S++;\n";
	}
	c += "}\n";
	return c;
	}
	} // namespace

	ConvolutionTransposed3D::ConvolutionTransposed3D(
	const OperationDef& definition,
	const ConvolutionTransposed3DAttributes& attr, const CLDevice& device)
	: GPUOperation(definition),
	weights_are_buffer_(device.IsMali()),
	kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
	attr.weights.shape.d),
	stride_(attr.stride.w, attr.stride.h, attr.stride.d),
	padding_(attr.padding.prepended.w, attr.padding.prepended.h,
	attr.padding.prepended.d),
	block_size_(2, 2, 1, 2) {}

	ConvolutionTransposed3D::ConvolutionTransposed3D(
	ConvolutionTransposed3D&& operation)
	: GPUOperation(std::move(operation)),
	weights_are_buffer_(operation.weights_are_buffer_),
	kernel_size_(operation.kernel_size_),
	stride_(operation.stride_),
	padding_(operation.padding_),
	block_size_(operation.block_size_) {}

	ConvolutionTransposed3D& ConvolutionTransposed3D::operator=(
	ConvolutionTransposed3D&& operation) {
	if (this != &operation) {
	std::swap(weights_are_buffer_, operation.weights_are_buffer_);
	std::swap(kernel_size_, operation.kernel_size_);
	std::swap(stride_, operation.stride_);
	std::swap(padding_, operation.padding_);
	std::swap(block_size_, operation.block_size_);
	GPUOperation::operator=(std::move(operation));
	}
	return *this;
	}

	absl::Status ConvolutionTransposed3D::Compile(
	const CreationContext& creation_context) {
	std::string code = GenerateConvolutionTransposed3DCode(
	definition_, *creation_context.device, weights_are_buffer_, block_size_,
	&args_);
	std::string element_wise_code;
	RETURN_IF_ERROR(
	MergeOperations(linked_operations_, &args_, &element_wise_code));
	RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
	{{"dst_tensor", element_wise_code}},
	&code));

	std::vector<CompilerOptions> options;
	if (creation_context.device->IsPowerVR() && block_size_.y != 1) {
	bool is_texture3d = definition_.src_tensors[0].storage_type ==
	TensorStorageType::TEXTURE_3D;
	bool is_texture_array = definition_.src_tensors[0].storage_type ==
	TensorStorageType::TEXTURE_ARRAY;
	if (is_texture3d \|\| is_texture_array) {
	options.push_back(CompilerOptions::CL_OPT_DISABLE);
	}
	}
	return creation_context.cache->GetOrCreateCLKernel(
	code, "main_function", options, *creation_context.context,
	*creation_context.device, &kernel_);
	}

	absl::Status ConvolutionTransposed3D::BindArguments() {
	RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
	RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
	RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
	RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
	RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z));
	RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x));
	RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
	RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z));
	RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
	RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
	RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z));
	return args_.SetInt("grid_size_s",
	DivideRoundUp(dst_[0]->Slices(), block_size_.w));
	}

	int3 ConvolutionTransposed3D::GetGridSize() const {
	const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x);
	const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y);
	const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z);
	const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch();
	const int grid_y = DivideRoundUp(aligned_h, block_size_.y);
	const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) *
	DivideRoundUp(aligned_d, block_size_.z);
	return int3(grid_x, grid_y, grid_z);
	}

	absl::Status ConvolutionTransposed3D::Tune(const TuningParameters& params) {
	RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
	return GetBestWorkGroupConv(params, kernel_, grid_size_, &work_group_size_);
	}

	absl::Status CreateConvolutionTransposed3D(
	const CreationContext& creation_context, const OperationDef& definition,
	const ConvolutionTransposed3DAttributes& attr,
	ConvolutionTransposed3D* result) {
	result = ConvolutionTransposed3D(definition, attr, creation_context.device);
	RETURN_IF_ERROR(
	result->UploadWeights(attr.weights, creation_context.context));

	TensorLinearDescriptor desc;
	desc.storage_type =
	DeduceLinearStorageType(definition.GetPrimaryStorageType());
	desc.element_type = definition.GetDataType();

	LinearStorage lt;
	RETURN_IF_ERROR(
	CreateLinearStorage(desc, attr.bias, creation_context.context, &lt));
	result->args_.AddObject("biases", AccessType::READ,
	absl::make_unique<LinearStorage>(std::move(lt)),
	absl::make_unique<TensorLinearDescriptor>(desc));
	return absl::OkStatus();
	}

	} // namespace cl
	} // namespace gpu
	} // namespace tflite