/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3d.h"

#include <string>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"

namespace tflite {
namespace gpu {
namespace cl {
namespace {

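// Channel multipliers of 1, 2 and 4 get specialized generated code: the
// source value is unpacked with plain vector swizzles, and no
// channel_multiplier argument needs to be passed to the kernel.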
bool IsSpecializedCase(int channel_multiplier) {
  return channel_multiplier == 1 || channel_multiplier == 2 ||
         channel_multiplier == 4;
}

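// Emits OpenCL code that reads the source value for output slice S at
// (x_c, y_c, z_c) and expands it for the given channel multiplier: output
// slice S reads input slice S / channel_multiplier, and output lane i takes
// input lane (4 * (S % channel_multiplier) + i) / channel_multiplier.
// Multipliers 1, 2 and 4 are emitted as swizzles; the generic case indexes a
// small temporary array.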
std::string GetSrcValue(const TensorCodeGenerator& src_tensor,
                        int channel_multiplier,
                        TextureAddressMode address_mode) {
  std::string c;
  if (channel_multiplier == 1) {
    c += " FLT4 src_final = " +
         src_tensor.ReadWHDS("x_c", "y_c", "z_c", "S", address_mode) + ";\n";
  } else if (channel_multiplier == 2) {
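    // Two consecutive output slices share one input slice: take the matching
    // half of the source vector and duplicate each value across two lanes.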
| c += " int z_layer = S / 2;\n"; |
| c += " FLT4 src =" + |
| src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) + |
| ";\n"; |
| c += " FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;\n"; |
| c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n"; |
| } else if (channel_multiplier == 4) { |
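    // Four output slices share one input slice: every output lane in slice S
    // is a broadcast of input lane S % 4 of slice S / 4.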
| c += " int z_layer = S / 4;\n"; |
| c += " FLT4 src =" + |
| src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) + |
| ";\n"; |
| c += " FLT t0 = src.x;\n"; |
| c += " int reminder = S % 4;\n"; |
| c += " if (reminder == 1) t0 = src.y;\n"; |
| c += " if (reminder == 2) t0 = src.z;\n"; |
| c += " if (reminder == 3) t0 = src.w;\n"; |
| c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n"; |
| } else { |
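    // Generic multiplier: output lane i reads input lane
    // (4 * (S % channel_multiplier) + i) / channel_multiplier through a small
    // indexed array, since the lane mapping is not a fixed swizzle.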
| c += " int z_layer = S / channel_multiplier;\n"; |
| c += " FLT4 src =" + |
| src_tensor.ReadWHDS("x_c", "y_c", "z_c", "z_layer", address_mode) + |
| ";\n"; |
| c += " int z_offset = (S % channel_multiplier) * 4;\n"; |
| c += " FLT4 src_final;\n"; |
| c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n"; |
| c += " src_final.x = temp_arr[(z_offset + 0) / " |
| "channel_multiplier];\n"; |
| c += " src_final.y = temp_arr[(z_offset + 1) / " |
| "channel_multiplier];\n"; |
| c += " src_final.z = temp_arr[(z_offset + 2) / " |
| "channel_multiplier];\n"; |
| c += " src_final.w = temp_arr[(z_offset + 3) / " |
| "channel_multiplier];\n"; |
| } |
| |
| return c; |
| } |
| |
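// Generates the OpenCL source for a 3D depthwise convolution: one work item
// per output (X, Y, Z/slice) position, accumulating over the
// kernel_size.x/y/z window.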
std::string GenerateDepthwiseConvolution3DCode(
    const OperationDef& op_def, bool stride_correction,
    const LinearStorage& biases, int channel_multiplier,
    bool weights_are_buffer,
    const std::vector<ElementwiseOperation*>& linked_operations,
    const CLDevice& device) {
  TensorCodeGenerator src_tensor(
      "src_data",
      WHDSPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"},
      op_def.src_tensors[0]);
  TensorCodeGenerator dst_tensor(
      "dst_data",
      WHDSPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"},
      op_def.dst_tensors[0]);
  const auto src_tensor_type = op_def.src_tensors[0].storage_type;

  std::string c = GetCommonDefines(op_def.precision);

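  // Buffer-backed tensors have no hardware out-of-bounds handling, so the
  // generated code must clamp reads explicitly; texture reads can rely on the
  // sampler's address mode instead.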
  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;

  c += "__kernel void main_function(\n";
  c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
  if (weights_are_buffer) {
    c += " __global FLT4* filters, \n";
  } else {
    c += " __read_only image2d_t filters, \n";
  }
  c += biases.GetDeclaration();
  c += GetArgsDeclaration(linked_operations);
  c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
  c += " int4 kernel_size, \n";
  c += " int4 stride, \n";
  c += " int4 padding, \n";
  c += " int4 dilation, \n";
  if (!IsSpecializedCase(channel_multiplier)) {
    c += " int channel_multiplier, \n";
  }
  if (op_def.IsBatchSupported()) {
    c += " int batch_size, \n";
  }
  c += " int4 src_size, \n";
  c += " int4 dst_size \n";
  c += ") {\n";
  c += " int X = get_global_id(0);\n";
  c += " int Y = get_global_id(1);\n";
  c += " int linear_id_z = get_global_id(2);\n";
  c += " int S = linear_id_z % dst_size.w;\n";
  c += " int Z = linear_id_z / dst_size.w;\n";
  c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n";
  c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
  if (stride_correction) {
    c += " int x_offset = " +
         GetXStrideCorrected("X", "batch_size", "stride.x", "padding.x") +
         ";\n";
  } else {
    c += " int x_offset = X * stride.x + padding.x;\n";
  }
  c += " int y_offset = Y * stride.y + padding.y;\n";
  c += " int z_offset = Z * stride.z + padding.z;\n";
  if (weights_are_buffer) {
    c += " int fx_c = S * kernel_size.x * kernel_size.y * kernel_size.z;\n";
  } else {
    c += " int fx_c = 0;\n";
  }

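  // BUFFER/IMAGE_BUFFER path: skip taps whose source coordinate falls outside
  // the tensor, but keep fx_c advancing so the filter index stays in step with
  // the window position.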
  if (manual_clamp) {
    c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n";
    c += " int z_c = z_offset + kz * dilation.z;\n";
    c += " bool outside_z = z_c < 0 || z_c >= src_size.z;\n";
    c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
    c += " int y_c = y_offset + ky * dilation.y;\n";
    c += " bool outside_y = y_c < 0 || y_c >= src_size.y;\n";
    c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
    c += " int x_c = x_offset + kx * dilation.x;\n";
    c += " bool outside_x = x_c < 0 || x_c >= src_size.x;\n";
    c += " if (!outside_x && !outside_y && !outside_z) {\n";
    if (weights_are_buffer) {
      c += " FLT4 f = filters[fx_c];\n";
    } else {
      c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, S));\n";
    }
    c += GetSrcValue(src_tensor, channel_multiplier,
                     TextureAddressMode::DONT_CARE);
    c += " r += TO_ACCUM_TYPE(src_final * f);\n";
    c += " }\n";
    c += " fx_c++;\n";
    c += " }\n";
    c += " }\n";
    c += " }\n";
  } else {  // Texture types with ZERO clamping
    c += " for (int kz = 0; kz < kernel_size.z; ++kz) {\n";
    c += " int z_c = z_offset + kz * dilation.z;\n";
    if (src_tensor_type != TensorStorageType::TEXTURE_3D) {
      // Only TEXTURE_3D supports clamping in the DEPTH dimension.
      c += " if (z_c < 0 || z_c >= src_size.z) {\n";
      c += " fx_c += kernel_size.y * kernel_size.x;\n";
      c += " continue;\n";
      c += " }\n";
    }
    c += " for (int ky = 0; ky < kernel_size.y; ++ky) {\n";
    c += " int y_c = y_offset + ky * dilation.y;\n";
    c += " for (int kx = 0; kx < kernel_size.x; ++kx) {\n";
    c += " int x_c = x_offset + kx * dilation.x;\n";
    const auto access_mode = GetFastestZeroMode(device);
    c += GetSrcValue(src_tensor, channel_multiplier, access_mode);
    if (weights_are_buffer) {
      c += " FLT4 f = filters[fx_c];\n";
    } else {
      c += " FLT4 f = READ_IMAGE(filters, smp_none, (int2)(fx_c, S));\n";
    }
    c += " fx_c++;\n";
    c += " r += TO_ACCUM_TYPE(src_final * f);\n";
    c += " }\n";
    c += " }\n";
    c += " }\n";
  }
| c += " FLT4 bias_val = " + biases.ReadLinearFLT4("S") + ";\n"; |
| c += " FLT4 res0 = TO_FLT4(r) + bias_val;\n"; |
| const LinkingContext context{"res0", "X", "Y", "S"}; |
| c += PostProcess(linked_operations, context); |
| c += " " + dst_tensor.WriteWHDS("res0", "X", "Y", "Z", "S") + "\n"; |
| c += "}\n"; |
| return c; |
| } |
| } // namespace |
| |
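// Weights are kept in a buffer on Mali, where buffer reads tend to be faster
// than image reads. Padding is stored negated so the generated kernel can
// compute source coordinates as X * stride + padding directly.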
DepthwiseConvolution3D::DepthwiseConvolution3D(
    const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr, const CLDevice& device)
    : GPUOperation(definition),
      weights_are_buffer_(device.IsMali()),
      kernel_size_(attr.weights.shape.w, attr.weights.shape.h,
                   attr.weights.shape.d),
      stride_(attr.strides.w, attr.strides.h, attr.strides.d),
      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h,
               -attr.padding.prepended.d),
      dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d),
      channel_multiplier_(attr.weights.shape.o),
      work_group_size_(8, 8, 1) {}

DepthwiseConvolution3D::DepthwiseConvolution3D(
    DepthwiseConvolution3D&& operation)
    : GPUOperation(std::move(operation)),
      weights_tex2d_(std::move(operation.weights_tex2d_)),
      weights_buf_(std::move(operation.weights_buf_)),
      weights_are_buffer_(operation.weights_are_buffer_),
      biases_(std::move(operation.biases_)),
      kernel_size_(operation.kernel_size_),
      stride_(operation.stride_),
      padding_(operation.padding_),
      dilation_(operation.dilation_),
      channel_multiplier_(operation.channel_multiplier_),
      kernel_(std::move(operation.kernel_)),
      work_group_size_(operation.work_group_size_) {}

DepthwiseConvolution3D& DepthwiseConvolution3D::operator=(
    DepthwiseConvolution3D&& operation) {
  if (this != &operation) {
    weights_tex2d_ = std::move(operation.weights_tex2d_);
    weights_buf_ = std::move(operation.weights_buf_);
    std::swap(weights_are_buffer_, operation.weights_are_buffer_);
    biases_ = std::move(operation.biases_);
    std::swap(kernel_size_, operation.kernel_size_);
    std::swap(stride_, operation.stride_);
    std::swap(padding_, operation.padding_);
    std::swap(dilation_, operation.dilation_);
    std::swap(channel_multiplier_, operation.channel_multiplier_);
    kernel_ = std::move(operation.kernel_);
    std::swap(work_group_size_, operation.work_group_size_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}

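// The X stride needs correction when the batch is folded into the width axis
// and stride.x != 1, since consecutive X positions then interleave batches.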
absl::Status DepthwiseConvolution3D::Compile(
    const CreationContext& creation_context) {
  const bool stride_correction =
      definition_.IsBatchSupported() && stride_.x != 1;
  const auto code = GenerateDepthwiseConvolution3DCode(
      definition_, stride_correction, biases_, channel_multiplier_,
      weights_are_buffer_, linked_operations_, *creation_context.device);
  return creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_);
}

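// Arguments must be set in exactly the order the kernel signature is emitted
// in GenerateDepthwiseConvolution3DCode.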
absl::Status DepthwiseConvolution3D::BindArguments() {
  kernel_.ResetBindingCounter();
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
  if (weights_are_buffer_) {
    RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_buf_.GetMemoryPtr()));
  } else {
    RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_tex2d_.GetMemoryPtr()));
  }
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr()));
  RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
  RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(
      int4(kernel_size_.x, kernel_size_.y, kernel_size_.z, 1)));
  RETURN_IF_ERROR(
      kernel_.SetBytesAuto(int4(stride_.x, stride_.y, stride_.z, 1)));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(
      int4(padding_.x * src_[0]->Batch(), padding_.y, padding_.z, 1)));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(
      int4(dilation_.x * src_[0]->Batch(), dilation_.y, dilation_.z, 1)));
  if (!IsSpecializedCase(channel_multiplier_)) {
    RETURN_IF_ERROR(kernel_.SetBytesAuto(int32_t(channel_multiplier_)));
  }
  if (definition_.IsBatchSupported()) {
    RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Batch()));
  }
  RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHDS()));
  RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHDS()));
  return absl::OkStatus();
}

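// Grid layout mirrors the kernel's global-id decode: X covers batched width,
// Y the height, and Z the slice/depth product split by dst_size.w.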
int3 DepthwiseConvolution3D::GetGridSize() const {
  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
  const int grid_y = dst_[0]->Height();
  const int grid_z = dst_[0]->Slices() * dst_[0]->Depth();
  return int3(grid_x, grid_y, grid_z);
}

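// Re-binds arguments, then lets the tuner pick the fastest work group size
// for the current grid.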
absl::Status DepthwiseConvolution3D::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(BindArguments());
  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
}

absl::Status DepthwiseConvolution3D::AddToQueue(CLCommandQueue* queue) {
  RETURN_IF_ERROR(BindArguments());
  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
}

absl::Status CreateDepthwiseConvolution3D(
    const CreationContext& creation_context, const OperationDef& definition,
    const DepthwiseConvolution3DAttributes& attr,
    DepthwiseConvolution3D* result) {
  *result = DepthwiseConvolution3D(definition, attr, *creation_context.device);
  RETURN_IF_ERROR(
      result->UploadWeights(attr.weights, creation_context.context));
  LinearStorageCreateInfo create_info;
  create_info.storage_type =
      DeduceLinearStorageType(definition.GetPrimaryStorageType());
  create_info.data_type = definition.GetDataType();
  create_info.name = "biases";
  create_info.aligned_size = attr.weights.shape.o * attr.weights.shape.i;
  RETURN_IF_ERROR(CreateLinearStorage(
      create_info, attr.bias, creation_context.context, &result->biases_));
  return absl::OkStatus();
}
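
// Usage sketch (illustrative only; `def`, `attr`, `tuning_params`, and
// `queue` are assumed to be set up by the caller, and source/destination
// tensors must be attached through the GPUOperation interface before
// dispatch):
//
//   DepthwiseConvolution3D op;
//   RETURN_IF_ERROR(
//       CreateDepthwiseConvolution3D(creation_context, def, attr, &op));
//   RETURN_IF_ERROR(op.Compile(creation_context));
//   RETURN_IF_ERROR(op.Tune(tuning_params));  // optional
//   RETURN_IF_ERROR(op.AddToQueue(queue));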

}  // namespace cl
}  // namespace gpu
}  // namespace tflite