| /* Copyright 2020 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h" |
| |
| #include <algorithm> |
| #include <string> |
| #include <utility> |
| |
| #include "absl/strings/substitute.h" |
| #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" |
| #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" |
| #include "tensorflow/lite/delegates/gpu/cl/precision.h" |
| #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" |
| #include "tensorflow/lite/delegates/gpu/common/data_type.h" |
| #include "tensorflow/lite/delegates/gpu/common/status.h" |
| |
| namespace tflite { |
| namespace gpu { |
| namespace cl { |
| |
| Conv3D::Conv3D(const OperationDef& definition, |
| const Convolution3DAttributes& attr, const CLDevice& device) |
| : GPUOperation(definition), |
| stride_(attr.strides.w, attr.strides.h, attr.strides.d), |
| padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, |
| -attr.padding.prepended.d), |
| kernel_size_(attr.weights.shape.w, attr.weights.shape.h, |
| attr.weights.shape.d), |
| dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d), |
| conv_params_(GuessBestParams(device, definition, attr)) {} |
| |
| Conv3D::Conv3D(Conv3D&& operation) |
| : GPUOperation(std::move(operation)), |
| stride_(operation.stride_), |
| padding_(operation.padding_), |
| kernel_size_(operation.kernel_size_), |
| dilation_(operation.dilation_), |
| conv_params_(operation.conv_params_) {} |
| |
| Conv3D& Conv3D::operator=(Conv3D&& operation) { |
| if (this != &operation) { |
| std::swap(stride_, operation.stride_); |
| std::swap(padding_, operation.padding_); |
| std::swap(kernel_size_, operation.kernel_size_); |
| std::swap(dilation_, operation.dilation_); |
| std::swap(conv_params_, operation.conv_params_); |
| GPUOperation::operator=(std::move(operation)); |
| } |
| return *this; |
| } |
| |
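| // Generates the OpenCL source for this convolution, merges any linked |
| // element-wise operations into the destination write, and compiles the |
| // kernel. PowerVR devices additionally get the FP16 compiler option when |
| // running in F16 precision. |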
| absl::Status Conv3D::Compile(const CreationContext& creation_context) { |
| const bool stride_correction = |
| definition_.IsBatchSupported() && stride_.x != 1; |
| std::string code = |
| GenerateConv3D(definition_, stride_correction, conv_params_, &args_); |
| work_group_size_ = conv_params_.work_group_size; |
| std::string element_wise_code; |
| RETURN_IF_ERROR( |
| MergeOperations(linked_operations_, &args_, &element_wise_code)); |
| RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), |
| {{"dst_tensor", element_wise_code}}, |
| &code)); |
| |
| std::vector<CompilerOptions> options; |
| if (definition_.precision == CalculationsPrecision::F16 && |
| creation_context.device->IsPowerVR()) { |
| options.push_back(CompilerOptions::POWERVR_FP16); |
| } |
| return creation_context.cache->GetOrCreateCLKernel( |
| code, "main_function", options, *creation_context.context, |
| *creation_context.device, &kernel_); |
| } |
| |
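| // Binds the tensors and scalar arguments used by the generated code. |
| // Per-axis stride/padding/kernel/dilation values are bound only for axes |
| // whose kernel is not 1; padding_x and dilation_x are pre-multiplied by the |
| // batch size because the width dimension is stored batched. grid_size_s is |
| // the number of slice blocks and is used to unpack DST_S and DST_Z from the |
| // linear Z id. |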
| absl::Status Conv3D::BindArguments() { |
| RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); |
| RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); |
| if (!conv_params_.x_kernel_is_1) { |
| RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); |
| RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); |
| RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x)); |
| RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); |
| } |
| if (!conv_params_.y_kernel_is_1) { |
| RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); |
| RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); |
| RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y)); |
| RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y)); |
| } |
| if (!conv_params_.z_kernel_is_1) { |
| RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z)); |
| RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z)); |
| RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z)); |
| RETURN_IF_ERROR(args_.SetInt("dilation_z", dilation_.z)); |
| } |
| return args_.SetInt("grid_size_s", DivideRoundUp(dst_[0]->Slices(), |
| conv_params_.block_size.w)); |
| } |
| |
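| // Computes the dispatch grid: X covers width * batch, Y covers height, and |
| // Z packs both the slice and depth blocks. Each extent is rounded up to |
| // whole work groups and the result is permuted by work_group_launch_order. |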
| int3 Conv3D::GetGridSize() const { |
| const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), |
| conv_params_.block_size.x); |
| const int grid_y = |
| DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y); |
| const int grid_z = |
| DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) * |
| DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z); |
| int3 wg; |
| wg.x = DivideRoundUp(grid_x, conv_params_.work_group_size.x); |
| wg.y = DivideRoundUp(grid_y, conv_params_.work_group_size.y); |
| wg.z = DivideRoundUp(grid_z, conv_params_.work_group_size.z); |
| return int3(wg[conv_params_.work_group_launch_order[0]] * |
| conv_params_.work_group_size.x, |
| wg[conv_params_.work_group_launch_order[1]] * |
| conv_params_.work_group_size.y, |
| wg[conv_params_.work_group_launch_order[2]] * |
| conv_params_.work_group_size.z); |
| } |
| |
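| // Work-group tuning is skipped for the local-memory upload modes, which |
| // depend on the fixed work-group size baked into the generated code, and is |
| // only performed when the launch order is the identity (0, 1, 2). |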
| absl::Status Conv3D::Tune(const TuningParameters& params) { |
| if (conv_params_.weights_upload_type == |
| WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP || |
| conv_params_.weights_upload_type == |
| WeightsUploadType::LOCAL_MEM_BY_THREADS) { |
| return absl::OkStatus(); |
| } |
| if (conv_params_.work_group_launch_order[0] == 0 && |
| conv_params_.work_group_launch_order[1] == 1 && |
| conv_params_.work_group_launch_order[2] == 2) { |
| RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); |
| RETURN_IF_ERROR(GetBestWorkGroupConv(params, kernel_, grid_size_, |
| &conv_params_.work_group_size)); |
| work_group_size_ = conv_params_.work_group_size; |
| } |
| return absl::OkStatus(); |
| } |
| |
| namespace { |
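| // Emits code that copies 'elements_to_upload' values from global to local |
| // memory, spreading the copies evenly over 'total_work_items' work items; |
| // any remainder is handled by the first threads of the group. |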
| std::string GenerateUploadByThreads(const std::string& local_ptr_name, |
| const std::string& global_ptr_name, |
| const std::string& global_offset_name, |
| const std::string& lid_name, |
| int total_work_items, |
| int elements_to_upload) { |
| std::string c; |
| std::string offset = |
| global_offset_name.empty() ? "" : global_offset_name + " + "; |
| const int groups = elements_to_upload / total_work_items; |
| const int remainder = elements_to_upload % total_work_items; |
| for (int i = 0; i < groups; ++i) { |
| c += " " + local_ptr_name + "[" + lid_name + " + " + |
| std::to_string(total_work_items * i) + "] = " + global_ptr_name + "[" + |
| offset + lid_name + " + " + std::to_string(total_work_items * i) + |
| "];\n"; |
| } |
| if (remainder != 0) { |
| c += " if (" + lid_name + " < " + std::to_string(remainder) + ") {\n"; |
| c += " " + local_ptr_name + "[" + lid_name + " + " + |
| std::to_string(total_work_items * groups) + "] = " + global_ptr_name + |
| "[" + offset + lid_name + " + " + |
| std::to_string(total_work_items * groups) + "];\n"; |
| c += " }\n"; |
| } |
| return c; |
| } |
| |
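| // Emits an async_work_group_copy of 'elements_to_upload' values from global |
| // to local memory, optionally offset by 'global_offset_name'. |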
| std::string GenerateAsyncUpload(const std::string& local_ptr_name, |
| const std::string& global_ptr_name, |
| const std::string& global_offset_name, |
| int elements_to_upload) { |
| std::string c; |
| std::string offset = |
| global_offset_name.empty() ? "" : " + " + global_offset_name; |
| c += " async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name + |
| offset + ", " + std::to_string(elements_to_upload) + ", 0);\n"; |
| return c; |
| } |
| |
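| // Emits the computation of the destination coordinates DST_X, DST_Y, DST_Z |
| // and DST_S from the global/group ids, undoing the permutation introduced |
| // by work_group_launch_order and scaling by the per-axis block sizes. For |
| // example, with block_size.x == 2 and an unpermuted X axis the emitted line |
| // is "int DST_X = get_global_id(0) * 2;". |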
| std::string GenerateGlobalCoordinates(const int4& block_size, |
| const int3& work_group_launch_order) { |
| std::string c; |
| int3 launch_remap; |
| launch_remap[work_group_launch_order.x] = 0; |
| launch_remap[work_group_launch_order.y] = 1; |
| launch_remap[work_group_launch_order.z] = 2; |
| if (work_group_launch_order[0] == 0) { |
| c += " int DST_X = get_global_id(0) * " + std::to_string(block_size.x) + |
| ";\n"; |
| } else { |
| c += " int DST_X = (get_group_id(" + std::to_string(launch_remap[0]) + |
| ") * get_local_size(0) + get_local_id(0)) * " + |
| std::to_string(block_size.x) + ";\n"; |
| } |
| if (work_group_launch_order[1] == 1) { |
| c += " int DST_Y = get_global_id(1) * " + std::to_string(block_size.y) + |
| ";\n"; |
| } else { |
| c += " int DST_Y = (get_group_id(" + std::to_string(launch_remap[1]) + |
| ") * get_local_size(1) + get_local_id(1)) * " + |
| std::to_string(block_size.y) + ";\n"; |
| } |
| if (work_group_launch_order[2] == 2) { |
| c += " int linear_id_z = get_global_id(2);\n"; |
| } else { |
| c += " int linear_id_z = get_group_id(" + std::to_string(launch_remap[2]) + |
| ") * get_local_size(2) + get_local_id(2);\n"; |
| } |
| c += " int DST_S = (linear_id_z % args.grid_size_s) * " + |
| std::to_string(block_size.w) + ";\n"; |
| c += " int DST_Z = (linear_id_z / args.grid_size_s) * " + |
| std::to_string(block_size.z) + ";\n"; |
| return c; |
| } |
| |
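| // Emits the inner multiply-accumulate code for one block of destination |
| // slices. In F32/F16 every source channel is multiplied by its weight |
| // vector directly; in F32_F16 the four products are summed in half |
| // precision and then converted to float before accumulation. |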
| std::string GenerateConv(CalculationsPrecision precision, |
| const int4& block_size, int offset, |
| bool weights_are_buffer) { |
| std::string c; |
| const std::string channels[] = {"x", "y", "z", "w"}; |
| for (int s = 0; s < block_size.w; ++s) { |
| switch (precision) { |
| case CalculationsPrecision::F32: |
| case CalculationsPrecision::F16: |
| for (int ch = 0; ch < 4; ++ch) { |
| const std::string weight_id = std::to_string(s * 4 + ch + offset); |
| std::string weight_name; |
| if (weights_are_buffer) { |
| weight_name = "weights_cache[" + weight_id + "]"; |
| } else { |
| weight_name = "f" + weight_id; |
| } |
| for (int z = 0; z < block_size.z; ++z) { |
| for (int y = 0; y < block_size.y; ++y) { |
| for (int x = 0; x < block_size.x; ++x) { |
| std::string id = |
| std::to_string(z) + std::to_string(y) + std::to_string(x); |
| c += " r" + std::to_string(s) + id + " += " + weight_name + |
| " * src" + id + "." + channels[ch] + ";\n"; |
| } |
| } |
| } |
| } |
| break; |
| case CalculationsPrecision::F32_F16: |
| for (int z = 0; z < block_size.z; ++z) { |
| for (int y = 0; y < block_size.y; ++y) { |
| for (int x = 0; x < block_size.x; ++x) { |
| std::string id = |
| std::to_string(z) + std::to_string(y) + std::to_string(x); |
| std::vector<std::string> weight_names(4); |
| for (int i = 0; i < 4; ++i) { |
| std::string weight_id = std::to_string(s * 4 + i + offset); |
| if (weights_are_buffer) { |
| weight_names[i] = "weights_cache[" + weight_id + "]"; |
| } else { |
| weight_names[i] = "f" + weight_id; |
| } |
| } |
| c += absl::Substitute( |
| " $0 += convert_float4($1.x * $2 + $1.y * $3 + $1.z * " |
| "$4 + $1.w * $5);\n", |
| "r" + std::to_string(s) + id, "src" + id, weight_names[0], |
| weight_names[1], weight_names[2], weight_names[3]); |
| } |
| } |
| } |
| break; |
| } |
| } |
| return c; |
| } |
| } // namespace |
| |
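| // Assembles the full OpenCL kernel: argument declarations, destination |
| // coordinates, accumulator initialization, loops over the spatial kernel |
| // (omitted for axes with a 1x1x1 kernel), the loop over source slices with |
| // the selected weights-upload strategy, and finally bias addition and the |
| // bounds-checked writes of the result block. |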
| std::string GenerateConv3D(const OperationDef& op_def, bool stride_correction, |
| const Conv3D::ConvParams& conv_params, |
| Arguments* args) { |
| auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]); |
| src_desc->SetTextureAddressMode(TextureAddressMode::ZERO); |
| if (op_def.IsBatchSupported()) { |
| src_desc->SetStateVar("BatchedWidth", "true"); |
| } |
| args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); |
| auto dst_desc = absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]); |
| if (op_def.IsBatchSupported()) { |
| dst_desc->SetStateVar("BatchedWidth", "true"); |
| } |
| args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); |
| if (!conv_params.x_kernel_is_1) { |
| args->AddInt("stride_x"); |
| args->AddInt("padding_x"); |
| args->AddInt("kernel_size_x"); |
| args->AddInt("dilation_x"); |
| } |
| if (!conv_params.y_kernel_is_1) { |
| args->AddInt("stride_y"); |
| args->AddInt("padding_y"); |
| args->AddInt("kernel_size_y"); |
| args->AddInt("dilation_y"); |
| } |
| if (!conv_params.z_kernel_is_1) { |
| args->AddInt("stride_z"); |
| args->AddInt("padding_z"); |
| args->AddInt("kernel_size_z"); |
| args->AddInt("dilation_z"); |
| } |
| args->AddInt("grid_size_s"); |
| |
| const auto src_tensor_type = op_def.src_tensors[0].storage_type; |
| const bool buffer_type = src_tensor_type == TensorStorageType::BUFFER || |
| src_tensor_type == TensorStorageType::IMAGE_BUFFER; |
| |
| const bool manual_clamp_x = buffer_type && !conv_params.x_kernel_is_1; |
| const bool manual_clamp_y = buffer_type && !conv_params.y_kernel_is_1; |
| const bool manual_clamp_z = |
| src_tensor_type != TensorStorageType::TEXTURE_3D && |
| !conv_params.z_kernel_is_1; |
| |
| const bool can_read_out_of_x = !buffer_type; |
| const bool can_read_out_of_y = !buffer_type; |
| const bool can_read_out_of_z = |
| src_tensor_type == TensorStorageType::TEXTURE_3D || |
| src_tensor_type == TensorStorageType::TEXTURE_2D || |
| src_tensor_type == TensorStorageType::SINGLE_TEXTURE_2D; |
| |
| const bool is1x1x1 = conv_params.x_kernel_is_1 && conv_params.y_kernel_is_1 && |
| conv_params.z_kernel_is_1; |
| |
| const bool need_local_mem = |
| conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS || |
| conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; |
| |
| const int3 work_group_size = conv_params.work_group_size; |
| const int4 block_size = conv_params.block_size; |
| std::string c = GetCommonDefines(op_def.precision); |
| if (need_local_mem) {  // a fixed work-group size is required when using local memory |
| c += "__attribute__((reqd_work_group_size(" + |
| std::to_string(work_group_size.x) + ", " + |
| std::to_string(work_group_size.y) + ", " + |
| std::to_string(work_group_size.z) + ")))\n"; |
| } |
| c += "__kernel void main_function(\n"; |
| c += "$0) {\n"; |
| c += GenerateGlobalCoordinates(block_size, |
| conv_params.work_group_launch_order); |
| if (!need_local_mem) { |
| c += " if (DST_X >= args.dst_tensor.Width() || DST_Y >= " |
| "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Depth()) " |
| "return;\n"; |
| } |
| if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { |
| c += " int lid = get_local_id(1) * " + std::to_string(work_group_size.x) + |
| " + get_local_id(0);\n"; |
| } |
| for (int s = 0; s < block_size.w; ++s) { |
| for (int z = 0; z < block_size.z; ++z) { |
| for (int y = 0; y < block_size.y; ++y) { |
| for (int x = 0; x < block_size.x; ++x) { |
| c += " ACCUM_FLT4 r" + std::to_string(s) + std::to_string(z) + |
| std::to_string(y) + std::to_string(x) + |
| " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; |
| } |
| } |
| } |
| } |
| if (!conv_params.x_kernel_is_1) { |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string xc = "(DST_X + " + std::to_string(x) + ")"; |
| if (stride_correction) { |
| c += " int xc" + std::to_string(x) + " = " + |
| GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x", |
| "args.padding_x") + |
| ";\n"; |
| } else { |
| c += " int xc" + std::to_string(x) + " = " + xc + |
| " * args.stride_x + args.padding_x;\n"; |
| } |
| } |
| } else if (!can_read_out_of_x) { |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string xc = "(DST_X + " + std::to_string(x) + ")"; |
| c += " int xc" + std::to_string(x) + " = clamp(" + xc + |
| ", 0, args.src_tensor.Width() - 1);\n"; |
| } |
| } |
| if (!conv_params.y_kernel_is_1) { |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string yc = "(DST_Y + " + std::to_string(y) + ")"; |
| c += " int yc" + std::to_string(y) + " = " + yc + |
| " * args.stride_y + args.padding_y;\n"; |
| } |
| } else if (!can_read_out_of_y) { |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string yc = "(DST_Y + " + std::to_string(y) + ")"; |
| c += " int yc" + std::to_string(y) + " = clamp(" + yc + |
| ", 0, args.src_tensor.Height() - 1);\n"; |
| } |
| } |
| if (!conv_params.z_kernel_is_1) { |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string zc = "(DST_Z + " + std::to_string(z) + ")"; |
| c += " int zc" + std::to_string(z) + " = " + zc + |
| " * args.stride_z + args.padding_z;\n"; |
| } |
| } else if (!can_read_out_of_z) { |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string zc = "(DST_Z + " + std::to_string(z) + ")"; |
| c += " int zc" + std::to_string(z) + " = clamp(" + zc + |
| ", 0, args.src_tensor.Depth() - 1);\n"; |
| } |
| } |
| if (need_local_mem) { |
| c += " __local FLT4 weights_cache[" + |
| std::to_string(block_size.w * 4 * conv_params.src_depth_loop_size) + |
| "];\n"; |
| } |
| if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::GLOBAL_MEM) { |
| c += " __global FLT4* weights_cache;\n"; |
| } |
| std::string kernel_size; |
| kernel_size += conv_params.x_kernel_is_1 ? "" : " * args.kernel_size_x"; |
| kernel_size += conv_params.y_kernel_is_1 ? "" : " * args.kernel_size_y"; |
| kernel_size += conv_params.z_kernel_is_1 ? "" : " * args.kernel_size_z"; |
| if (conv_params.AreWeightsBuffer()) { |
| c += " __global FLT4* filters_loc = args.weights.GetPtr() + DST_S * 4 * " |
| "args.src_tensor.Slices()" + |
| kernel_size + ";\n"; |
| } |
| if (buffer_type) { |
| c += " const int src_layer_offset = args.src_tensor.SliceStride();\n"; |
| } |
| if (!is1x1x1) { |
| c += " int filter_offset = 0;\n"; |
| } |
| if (!conv_params.z_kernel_is_1) { |
| c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string zck = "zck" + std::to_string(z); |
| c += " int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" + |
| std::to_string(z) + ";\n"; |
| if (manual_clamp_z) { |
| c += " bool mz" + std::to_string(z) + " = " + zck + " >= 0 && " + zck + |
| " < args.src_tensor.Depth();\n"; |
| c += " " + zck + " = clamp(" + zck + |
| ", 0, args.src_tensor.Depth() - 1);\n"; |
| } |
| } |
| } |
| if (!conv_params.y_kernel_is_1) { |
| c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string yck = "yck" + std::to_string(y); |
| c += " int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) + |
| ";\n"; |
| if (manual_clamp_y) { |
| c += " bool my" + std::to_string(y) + " = " + yck + " >= 0 && " + yck + |
| " < args.src_tensor.Height();\n"; |
| c += " " + yck + " = clamp(" + yck + |
| ", 0, args.src_tensor.Height() - 1);\n"; |
| } |
| } |
| } |
| if (!conv_params.x_kernel_is_1) { |
| c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string xck = "xck" + std::to_string(x); |
| c += " int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" + |
| std::to_string(x) + ";\n"; |
| if (manual_clamp_x) { |
| c += " bool mx" + std::to_string(x) + " = " + xck + " >= 0 && " + xck + |
| " < args.src_tensor.Width();\n"; |
| c += " " + xck + " = clamp(" + xck + |
| ", 0, args.src_tensor.Width() - 1);\n"; |
| } |
| } |
| } |
| |
| auto get_src_x_coord = [&](int id) { |
| std::string xs = std::to_string(id); |
| std::string xc = "xck" + xs; |
| if (conv_params.x_kernel_is_1) { |
| if (can_read_out_of_x) { |
| xc = "DST_X + " + xs; |
| } else { |
| xc = "xc" + xs; |
| } |
| } |
| return xc; |
| }; |
| auto get_src_y_coord = [&](int id) { |
| std::string ys = std::to_string(id); |
| std::string yc = "yck" + ys; |
| if (conv_params.y_kernel_is_1) { |
| if (can_read_out_of_y) { |
| yc = "DST_Y + " + ys; |
| } else { |
| yc = "yc" + ys; |
| } |
| } |
| return yc; |
| }; |
| auto get_src_z_coord = [&](int id) { |
| std::string zs = std::to_string(id); |
| std::string zc = "zck" + zs; |
| if (conv_params.z_kernel_is_1) { |
| if (can_read_out_of_z) { |
| zc = "DST_Z + " + zs; |
| } else { |
| zc = "zc" + zs; |
| } |
| } |
| return zc; |
| }; |
| |
| if (buffer_type) { |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string zs = std::to_string(z); |
| const std::string zc = get_src_z_coord(z); |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string ys = std::to_string(y); |
| const std::string yc = get_src_y_coord(y); |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string xs = std::to_string(x); |
| const std::string xc = get_src_x_coord(x); |
| const std::string id = zs + ys + xs; |
| c += " args.src_tensor.GetAddress(src_a_" + id + ", " + xc + ", " + |
| yc + ", " + zc + ", 0);\n"; |
| if (!is1x1x1 && src_tensor_type == TensorStorageType::IMAGE_BUFFER) { |
| std::string condition; |
| if (manual_clamp_x) { |
| if (!condition.empty()) { |
| condition += " && "; |
| } |
| condition += "mx" + xs; |
| } |
| if (manual_clamp_y) { |
| if (!condition.empty()) { |
| condition += " && "; |
| } |
| condition += "my" + ys; |
| } |
| if (manual_clamp_z) { |
| if (!condition.empty()) { |
| condition += " && "; |
| } |
| condition += "mz" + zs; |
| } |
| c += " src_a_" + id + " = select(-1, src_a_" + id + ", " + |
| condition + ");\n"; |
| c += " int dz_" + id + " = select(0, src_layer_offset, " + |
| condition + ");\n"; |
| } |
| } |
| } |
| } |
| } |
| |
| auto declare_src = [&]() { |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string zs = std::to_string(z); |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string ys = std::to_string(y); |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string xs = std::to_string(x); |
| const std::string id = zs + ys + xs; |
| c += " FLT4 src" + id + ";\n"; |
| } |
| } |
| } |
| }; |
| |
| auto read_src = [&]() { |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string zs = std::to_string(z); |
| const std::string zc = get_src_z_coord(z); |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string ys = std::to_string(y); |
| const std::string yc = get_src_y_coord(y); |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string xs = std::to_string(x); |
| const std::string xc = get_src_x_coord(x); |
| std::string multiplier; |
| multiplier += manual_clamp_x ? " * (FLT)(mx" + xs + ")" : ""; |
| multiplier += manual_clamp_y ? " * (FLT)(my" + ys + ")" : ""; |
| multiplier += manual_clamp_z ? " * (FLT)(mz" + zs + ")" : ""; |
| const std::string id = zs + ys + xs; |
| if (buffer_type) { |
| if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { |
| multiplier = ""; |
| } |
| c += " src" + id + " = args.src_tensor.Read(src_a_" + id + ")" + |
| multiplier + ";\n"; |
| if (!is1x1x1 && |
| src_tensor_type == TensorStorageType::IMAGE_BUFFER) { |
| c += " src_a_" + id + " += dz_" + id + ";\n"; |
| } else { |
| c += " src_a_" + id + " += src_layer_offset;\n"; |
| } |
| } else { |
| c += " src" + id + " = args.src_tensor.Read(" + xc + ", " + yc + |
| ", " + zc + ", s)" + multiplier + ";\n"; |
| } |
| } |
| } |
| } |
| }; |
| c += " int s = 0;\n"; |
| declare_src(); |
| c += " do {\n"; |
| const int total_work_items = |
| work_group_size.x * work_group_size.y * work_group_size.z; |
| if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { |
| c += |
| GenerateAsyncUpload("weights_cache", "filters_loc", |
| /*global_offset_name*/ "", |
| block_size.w * 4 * conv_params.src_depth_loop_size); |
| } else if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { |
| c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
| c += GenerateUploadByThreads( |
| "weights_cache", "filters_loc", |
| /*global_offset_name*/ "", "lid", total_work_items, |
| block_size.w * 4 * conv_params.src_depth_loop_size); |
| } else if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::GLOBAL_MEM) { |
| c += " weights_cache = filters_loc;\n"; |
| } else { // TEXTURES_MEM |
| for (int dst_s = 0; dst_s < block_size.w; ++dst_s) { |
| const std::string f_y = is1x1x1 ? "s" : "filter_offset"; |
| c += absl::Substitute( |
| R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1); |
| FLT4 f$3 = args.weights1.Read(DST_S + $0, $1); |
| FLT4 f$4 = args.weights2.Read(DST_S + $0, $1); |
| FLT4 f$5 = args.weights3.Read(DST_S + $0, $1); |
| )", |
| dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2, |
| dst_s * 4 + 3); |
| } |
| if (!is1x1x1) { |
| c += " filter_offset++;\n"; |
| } |
| } |
| read_src(); |
| c += " s += 1;\n"; |
| if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { |
| c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
| } |
| c += GenerateConv(op_def.precision, block_size, 0, |
| conv_params.AreWeightsBuffer()); |
| for (int i = 1; i < conv_params.src_depth_loop_size; ++i) { |
| read_src(); |
| c += GenerateConv(op_def.precision, block_size, i * block_size.w * 4, |
| conv_params.AreWeightsBuffer()); |
| c += " s += 1;\n"; |
| } |
| if (conv_params.AreWeightsBuffer()) { |
| c += " filters_loc += " + |
| std::to_string(block_size.w * 4 * conv_params.src_depth_loop_size) + |
| ";\n"; |
| } |
| c += " } while (s < args.src_tensor.Slices());\n"; |
| if (!conv_params.z_kernel_is_1) { |
| c += " }\n"; |
| } |
| if (!conv_params.y_kernel_is_1) { |
| c += " }\n"; |
| } |
| if (!conv_params.x_kernel_is_1) { |
| c += " }\n"; |
| } |
| if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { |
| c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S", |
| block_size.w); |
| } else if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { |
| c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
| c += |
| GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", |
| "DST_S", "lid", total_work_items, block_size.w); |
| c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
| } else if (conv_params.weights_upload_type == |
| Conv3D::WeightsUploadType::GLOBAL_MEM) { |
| c += " weights_cache = args.biases.GetPtr() + DST_S;\n"; |
| } |
| if (need_local_mem) { |
| c += " if (DST_X >= args.dst_tensor.Width() || DST_Y >= " |
| "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Depth()) " |
| "return;\n"; |
| } |
| for (int s = 0; s < block_size.w; ++s) { |
| const std::string dsts = |
| "DST_S" + (s == 0 ? "" : " + " + std::to_string(s)); |
| c += " if (" + dsts + " >= args.dst_tensor.Slices()) return;\n"; |
| for (int z = 0; z < block_size.z; ++z) { |
| const std::string dstz = |
| "DST_Z" + (z == 0 ? "" : " + " + std::to_string(z)); |
| for (int y = 0; y < block_size.y; ++y) { |
| const std::string dsty = |
| "DST_Y" + (y == 0 ? "" : " + " + std::to_string(y)); |
| for (int x = 0; x < block_size.x; ++x) { |
| const std::string dstx = |
| "DST_X" + (x == 0 ? "" : " + " + std::to_string(x)); |
| const std::string r_id = std::to_string(s) + std::to_string(z) + |
| std::to_string(y) + std::to_string(x); |
| c += " if (" + dstx + " < args.dst_tensor.Width() && " + dsty + |
| " < args.dst_tensor.Height() && " + dstz + |
| " < args.dst_tensor.Depth()) {\n"; |
| if (conv_params.AreWeightsBuffer()) { |
| c += " FLT4 res = TO_FLT4(r" + r_id + ") + weights_cache[" + |
| std::to_string(s) + "];\n"; |
| } else { |
| c += " FLT4 res = TO_FLT4(r" + r_id + ") + args.biases.Read(" + |
| dsts + ");\n"; |
| } |
| c += " args.dst_tensor.Write(res, " + dstx + ", " + dsty + ", " + |
| dstz + ", " + dsts + ");\n"; |
| c += " }\n"; |
| } |
| } |
| } |
| } |
| c += "}\n"; |
| return c; |
| } |
| |
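| // Per-vendor heuristics for block size, work-group size, launch order, |
| // source-slice loop unrolling and the weights-upload strategy: local memory |
| // on Nvidia and PowerVR, textures on Adreno and by default, global memory |
| // on Mali. |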
| Conv3D::ConvParams Conv3D::GuessBestParams(const CLDevice& device, |
| const OperationDef& definition, |
| int src_slices, int dst_slices, |
| bool x_kernel_is_1, |
| bool y_kernel_is_1, |
| bool z_kernel_is_1) const { |
| ConvParams conv_params; |
| conv_params.x_kernel_is_1 = x_kernel_is_1; |
| conv_params.y_kernel_is_1 = y_kernel_is_1; |
| conv_params.z_kernel_is_1 = z_kernel_is_1; |
| if (device.IsNvidia()) { |
| conv_params.block_size = int4(1, 1, 1, 4); |
| conv_params.work_group_size = int3(8, 4, 1); |
| conv_params.work_group_launch_order = int3(2, 0, 1); |
| conv_params.src_depth_loop_size = 1; |
| conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; |
| if (dst_slices % 4 == 0 || dst_slices >= 8) { |
| conv_params.block_size.w = 4; |
| } else if (dst_slices % 2 == 0 || dst_slices >= 4) { |
| conv_params.block_size.w = 2; |
| } else { |
| conv_params.block_size.w = dst_slices; |
| } |
| if (src_slices % 2 == 0) { |
| conv_params.src_depth_loop_size = 2; |
| } |
| if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) { |
| conv_params.src_depth_loop_size = 4; |
| } |
| } else if (device.IsPowerVR()) { |
| conv_params.block_size = int4(1, 1, 1, 4); |
| conv_params.work_group_size = int3(8, 4, 1); |
| conv_params.work_group_launch_order = int3(2, 0, 1); |
| conv_params.src_depth_loop_size = 1; |
| conv_params.weights_upload_type = |
| WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; |
| if (dst_slices % 8 == 0 || dst_slices >= 32) { |
| conv_params.block_size.w = 8; |
| } else if (dst_slices % 4 == 0 || dst_slices >= 8) { |
| conv_params.block_size.w = 4; |
| } else if (dst_slices % 2 == 0 || dst_slices >= 4) { |
| conv_params.block_size.w = 2; |
| } else { |
| conv_params.block_size.w = dst_slices; |
| } |
| if (definition.precision == CalculationsPrecision::F16) { |
| conv_params.block_size.w = std::min(4, conv_params.block_size.w); |
| if (src_slices % 2 == 0) { |
| conv_params.src_depth_loop_size = 2; |
| } |
| if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) { |
| conv_params.src_depth_loop_size = 4; |
| } |
| if (conv_params.block_size.w == 1) { |
| if (src_slices % 2 == 0) { |
| conv_params.src_depth_loop_size = 2; |
| } |
| if (src_slices % 4 == 0) { |
| conv_params.src_depth_loop_size = 4; |
| } |
| if (src_slices <= 8) { |
| conv_params.src_depth_loop_size = src_slices; |
| } |
| } |
| conv_params.block_size.x = 2; |
| conv_params.work_group_size = int3(4, 8, 1); |
| } |
| } else if (device.IsAdreno()) { |
| conv_params.block_size = int4(2, 2, 1, 2); |
| conv_params.work_group_size = int3(8, 4, 1); |
| conv_params.work_group_launch_order = int3(0, 1, 2); |
| conv_params.src_depth_loop_size = 1; |
| conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM; |
| } else if (device.IsMali()) { |
| conv_params.block_size = int4(1, 1, 1, 4); |
| conv_params.work_group_size = int3(8, 4, 1); |
| conv_params.work_group_launch_order = int3(0, 1, 2); |
| conv_params.src_depth_loop_size = 1; |
| conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; |
| if (dst_slices % 4 == 0 || dst_slices >= 8) { |
| conv_params.block_size.w = 4; |
| } else if (dst_slices % 2 == 0 || dst_slices >= 4) { |
| conv_params.block_size.w = 2; |
| } else { |
| conv_params.block_size.w = dst_slices; |
| } |
| if (src_slices % 2 == 0) { |
| conv_params.src_depth_loop_size = 2; |
| } |
| if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) { |
| conv_params.src_depth_loop_size = 4; |
| } |
| } else { |
| conv_params.block_size = int4(2, 2, 1, 2); |
| conv_params.work_group_size = int3(8, 4, 1); |
| conv_params.work_group_launch_order = int3(0, 1, 2); |
| conv_params.src_depth_loop_size = 1; |
| conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM; |
| } |
| |
| return conv_params; |
| } |
| |
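| // Derives the per-axis "kernel is 1" flags (kernel size, stride and |
| // dilation of 1 with no prepended or appended padding) and the number of |
| // source/destination slices, then delegates to the device-specific |
| // heuristic above. |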
| Conv3D::ConvParams Conv3D::GuessBestParams( |
| const CLDevice& device, const OperationDef& definition, |
| const Convolution3DAttributes& attr) const { |
| const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4); |
| const int src_slices = DivideRoundUp(attr.weights.shape.i, 4); |
| const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 && |
| attr.dilations.w == 1 && |
| attr.padding.prepended.w == 0 && |
| attr.padding.appended.w == 0; |
| const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 && |
| attr.dilations.h == 1 && |
| attr.padding.prepended.h == 0 && |
| attr.padding.appended.h == 0; |
| const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 && |
| attr.dilations.d == 1 && |
| attr.padding.prepended.d == 0 && |
| attr.padding.appended.d == 0; |
| return GuessBestParams(device, definition, src_slices, dst_slices, |
| x_kernel_is_1, y_kernel_is_1, z_kernel_is_1); |
| } |
| |
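| // Constructs the Conv3D operation and uploads its weights and biases to GPU |
| // memory. |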
| absl::Status CreateConv3D(const CreationContext& creation_context, |
| const OperationDef& definition, |
| const Convolution3DAttributes& attr, Conv3D* result) { |
| *result = Conv3D(definition, attr, *creation_context.device); |
| return result->UploadData(attr.weights, attr.bias, creation_context.context); |
| } |
| |
| } // namespace cl |
| } // namespace gpu |
| } // namespace tflite |