/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"

#include <array>
#include <string>
#include <utility>

#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"

namespace tflite {
namespace gpu {
namespace cl {
namespace {
// This function generates code for the arithmetic (multiply-accumulate) part
// of the convolution.
// element_size must be 1, 2 or 4:
//   1 - FLT4
//   2 - FLT8
//   4 - FLT16
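// For example, with block_size = (1, 1, 1), element_size = 1 (FLT4) and
// F32 precision, the emitted fragment is:
//     FLT16 W0 = weights_cache[0];
//     r000 += W0.s0123 * s0.s0;
//     r000 += W0.s4567 * s0.s1;
//     r000 += W0.s89ab * s0.s2;
//     r000 += W0.scdef * s0.s3;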
std::string GetComputationPart(const int3& block_size, int element_size,
                               CalculationsPrecision precision) {
  const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7",
                                 "8", "9", "a", "b", "c", "d", "e", "f"};
  std::string c;
  for (int z = 0; z < block_size.z; ++z) {
    const std::string z_s = std::to_string(z);
    c += "    FLT16 W" + z_s + " = weights_cache[" + z_s + "];\n";
    for (int y = 0; y < block_size.y; ++y) {
      for (int x = 0; x < block_size.x; ++x) {
        std::string s_index = std::to_string(y * block_size.x + x);
        for (int e = 0; e < element_size; ++e) {
          std::string r_index =
              z_s + std::to_string(y) + std::to_string(x * element_size + e);
          const std::string f0 = "W" + z_s + ".s0123";
          const std::string f1 = "W" + z_s + ".s4567";
          const std::string f2 = "W" + z_s + ".s89ab";
          const std::string f3 = "W" + z_s + ".scdef";
          switch (precision) {
            case CalculationsPrecision::F32:
            case CalculationsPrecision::F16:
              c += "    r" + r_index + " += " + f0 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 0] + ";\n";
              c += "    r" + r_index + " += " + f1 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 1] + ";\n";
              c += "    r" + r_index + " += " + f2 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 2] + ";\n";
              c += "    r" + r_index + " += " + f3 + " * s" + s_index + ".s" +
                   hexes[e * 4 + 3] + ";\n";
              break;
            case CalculationsPrecision::F32_F16:
              c += "    r" + r_index + " += convert_float4(" + f0 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 0] + " + " + f1 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 1] + " + " + f2 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 2] + " + " + f3 + " * s" +
                   s_index + ".s" + hexes[e * 4 + 3] + ");\n";
              break;
          }
        }
      }
    }
  }
  return c;
}

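// Generates the OpenCL source for the 1x1 convolution. Each work item
// computes block_size.x * (conv_params.element_size / 4) output columns,
// block_size.y output rows and block_size.z output slices, starting from the
// bias and accumulating dot products over all source slices.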
std::string GenerateConvBuffer1x1(const OperationDef& op_def,
                                  const ConvBuffer1x1::ConvParams& conv_params,
                                  Arguments* args) {
  auto src_desc = absl::make_unique<TensorDescriptor>(op_def.src_tensors[0]);
  if (op_def.IsBatchSupported()) {
    src_desc->SetStateVar("BatchedWidth", "true");
  }
  if (conv_params.element_size == 8) {
    src_desc->SetStateVar("ElementsX2", "true");
  } else if (conv_params.element_size == 16) {
    src_desc->SetStateVar("ElementsX4", "true");
  }
  args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
  auto dst_desc = absl::make_unique<TensorDescriptor>(op_def.dst_tensors[0]);
  if (op_def.IsBatchSupported()) {
    dst_desc->SetStateVar("BatchedWidth", "true");
  }
  args->AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));

  std::string c = GetCommonDefines(op_def.precision);
  switch (op_def.precision) {
    case CalculationsPrecision::F32:
      c += "#define FLT8 float8\n";
      c += "#define FLT16 float16\n";
      break;
    case CalculationsPrecision::F32_F16:
    case CalculationsPrecision::F16:
      c += "#define FLT8 half8\n";
      c += "#define FLT16 half16\n";
      break;
  }

  const int3 block_size = conv_params.block_size;
  const int element_size = conv_params.element_size / 4;

  c += "__kernel void main_function(\n";
  c += "$0) {\n";
  c += "  int X = get_global_id(0) * " +
       std::to_string(block_size.x * element_size) + ";\n";
  c += "  int X_SRC = get_global_id(0) * " + std::to_string(block_size.x) +
       ";\n";
  c += "  int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
  c += "  int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
       "|| Z >= args.dst_tensor.Slices()) return;\n";
  if (conv_params.different_weights_for_height) {
    c += "  __global FLT16* weights_cache = args.weights.GetPtr() + (Z * "
         "args.src_tensor.Height() + Y * " +
         std::to_string(block_size.z) + ") * args.src_tensor.Slices();\n";
  } else {
    c += "  __global FLT16* weights_cache = args.weights.GetPtr() + Z * "
         "args.src_tensor.Slices();\n";
  }
  for (int z = 0; z < block_size.z; ++z) {
    const std::string z_s = std::to_string(z);
    c += "  ACCUM_FLT4 bias_val_" + z_s +
         " = TO_ACCUM_TYPE(args.biases.Read(Z + " + z_s + "));\n";
    for (int y = 0; y < block_size.y; ++y) {
      for (int x = 0; x < block_size.x * element_size; ++x) {
        c += "  ACCUM_FLT4 r" + z_s + std::to_string(y) + std::to_string(x) +
             " = bias_val_" + z_s + ";\n";
      }
    }
  }
  for (int x = 0; x < block_size.x; ++x) {
    std::string x_s = std::to_string(x);
    c += "  int xc" + x_s + " = min(X_SRC + " + std::to_string(x) +
         ", args.src_tensor.Width() - 1);\n";
  }
  for (int y = 0; y < block_size.y; ++y) {
    std::string y_s = std::to_string(y);
    c += "  int yc" + y_s + " = min(Y + " + y_s +
         ", args.src_tensor.Height() - 1);\n";
  }
  for (int y = 0; y < block_size.y; ++y) {
    std::string y_s = std::to_string(y);
    for (int x = 0; x < block_size.x; ++x) {
      std::string x_s = std::to_string(x);
      std::string i_s = std::to_string(y * block_size.x + x);
      c += "  int src_addr_" + i_s + " = (yc" + y_s +
           ") * args.src_tensor.Width() + (xc" + x_s + ");\n";
    }
  }
  c += "  for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
  for (int y = 0; y < block_size.y; ++y) {
    std::string y_s = std::to_string(y);
    for (int x = 0; x < block_size.x; ++x) {
      std::string x_s = std::to_string(x);
      std::string i_s = std::to_string(y * block_size.x + x);
      c += "    FLT" + std::to_string(element_size * 4) + " s" + i_s +
           " = args.src_tensor.Read(src_addr_" + i_s + ");\n";
    }
  }
  c += GetComputationPart(block_size, element_size, op_def.precision);
  for (int i = 0; i < block_size.x * block_size.y; ++i) {
    std::string i_s = std::to_string(i);
    c += "    src_addr_" + i_s + " += args.src_tensor.SliceStride();\n";
  }
  c += "    weights_cache += " + std::to_string(block_size.z) + ";\n";
  c += "  }\n";  // SRC_SLICES

  for (int z = 0; z < block_size.z; ++z) {
    const std::string z_s = std::to_string(z);
    if (z != 0) {
      c += "  if (Z + " + z_s + " >= args.dst_tensor.Slices()) return;\n";
    }
    for (int y = 0; y < block_size.y; ++y) {
      const std::string y_s = std::to_string(y);
      for (int x = 0; x < block_size.x * element_size; ++x) {
        const std::string x_s = std::to_string(x);
        c += "  if (X + " + x_s + " < args.dst_tensor.Width() && Y + " + y_s +
             " < args.dst_tensor.Height()) {\n";
        c += "    FLT4 res = TO_FLT4(r" + z_s + y_s + x_s + ");\n";
        c += "    args.dst_tensor.Write(res, X + " + x_s + ", Y + " + y_s +
             ", Z + " + z_s + ");\n";
        c += "  }\n";
      }
    }
  }
  c += "}\n";
  return c;
}

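// Heuristic selection of element_size and block_size for a known output
// shape. Only Mali devices are specialized here; FLT8 reads require an even
// batched width (w * b) and a precision other than full F32.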
ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
                                        const OperationDef& definition,
                                        const BHWC& shape, int src_depth,
                                        int dst_depth) {
  ConvBuffer1x1::ConvParams conv_params;
  conv_params.element_size = 4;
  conv_params.block_size = int3(1, 1, 1);
  if (!device.IsMali()) {
    return conv_params;
  }
  bool can_use_flt8 = (shape.w * shape.b) % 2 == 0 &&
                      definition.precision != CalculationsPrecision::F32;
  bool is_midgard = device.IsMali() && device.GetInfo().mali_info.IsMidgard();
  if (is_midgard) {
    if (can_use_flt8) {
      conv_params.element_size = 8;
    }
    if (definition.precision == CalculationsPrecision::F16 || !can_use_flt8) {
      conv_params.block_size.x = 2;
    }
    return conv_params;
  }

  int task_size = shape.w * shape.b * shape.h * dst_depth;
  int block_size =
      GetRecommendedBlockSizeForConv(device, definition.precision, task_size);

  if (!can_use_flt8 && block_size > 4) {
    block_size = 4;
  }

  if (can_use_flt8 && block_size >= 2) {
    conv_params.element_size = 8;
    block_size /= 2;
  }
  if (block_size == 4) {
    conv_params.block_size.x = 2;
    if (definition.precision == CalculationsPrecision::F32 && dst_depth < 32) {
      conv_params.block_size.y = 2;
    } else {
      conv_params.block_size.z = 2;
    }
  } else if (block_size == 2) {
    if (dst_depth >= 32) {
      conv_params.block_size.z = 2;
    } else {
      conv_params.block_size.x = 2;
    }
  }

  return conv_params;
}

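// Fallback selection when the output shape is not known at creation time:
// default FLT4 reads and a 1x1x1 block, widened along X only for small
// Mali devices (<= 4 compute units) running in F16.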
ConvBuffer1x1::ConvParams GetBestParams(const CLDevice& device,
                                        const OperationDef& definition,
                                        int src_depth, int dst_depth) {
  ConvBuffer1x1::ConvParams conv_params;
  conv_params.element_size = 4;
  conv_params.block_size = int3(1, 1, 1);
  if (device.IsMali() && definition.precision == CalculationsPrecision::F16 &&
      device.GetInfo().compute_units_count <= 4) {
    conv_params.block_size.x *= 2;
  }
  return conv_params;
}

}  // namespace

ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition,
                             const ConvParams& conv_params)
    : GPUOperation(definition), conv_params_(conv_params) {}

ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation)
    : GPUOperation(std::move(operation)),
      conv_params_(std::move(operation.conv_params_)) {}

ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) {
  if (this != &operation) {
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
  return *this;
}

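// Generates the kernel source, merges the code of any linked elementwise
// operations into the dst_tensor write, resolves the $0 argument list and
// the args.* accessors to concrete CL code, and compiles the kernel.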
absl::Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
  std::string code = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
  work_group_size_ = conv_params_.work_group_size;
  std::string element_wise_code;
  RETURN_IF_ERROR(
      MergeOperations(linked_operations_, &args_, &element_wise_code));
  RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
                                          {{"dst_tensor", element_wise_code}},
                                          &code));
  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
      code, "main_function", *creation_context.context,
      *creation_context.device, &kernel_));
  return absl::OkStatus();
}

absl::Status ConvBuffer1x1::BindArguments() {
  if (definition_.src_tensors.size() == 2) {
    RETURN_IF_ERROR(args_.SetObjectRef("weights", src_[1]));
  }
  RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
  return args_.SetObjectRef("dst_tensor", dst_[0]);
}

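// grid.x is counted in element_size-wide columns: with element_size == 8 one
// work item covers two FLT4 output elements along width. With illustrative
// numbers, Width() * Batch() = 100 and element_size = 8 give
// dst_width_elements = 50; block_size.x = 2 then gives grid_x = 25.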
int3 ConvBuffer1x1::GetGridSize() const {
  const int dst_width_elements = DivideRoundUp(
      dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
  const int grid_x =
      DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
  const int grid_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int grid_z =
      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
  return int3(grid_x, grid_y, grid_z);
}

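// Picks the best work group size for the current grid via the generic
// convolution work-group tuner and caches it in conv_params_.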
absl::Status ConvBuffer1x1::Tune(const TuningParameters& params) {
  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
  RETURN_IF_ERROR(GetBestWorkGroupConv(params, kernel_, grid_size_,
                                       &conv_params_.work_group_size));
  work_group_size_ = conv_params_.work_group_size;
  return absl::OkStatus();
}

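// The buffer-based 1x1 kernel applies only when the convolution is
// effectively a per-pixel matrix multiply: BUFFER storage, 1x1 weights,
// unit strides and dilations, and zero padding.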
bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const Convolution2DAttributes& attr) {
  auto src_storage_type = definition.src_tensors[0].storage_type;
  return src_storage_type == TensorStorageType::BUFFER &&
         attr.weights.shape.w == 1 && attr.weights.shape.h == 1 &&
         attr.dilations.w == 1 && attr.dilations.h == 1 &&
         attr.strides.w == 1 && attr.strides.h == 1 &&
         attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
         attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const BHWC& weights_shape,
                              const Convolution2DAttributes& attr) {
  auto src_storage_type = definition.src_tensors[0].storage_type;
  return src_storage_type == TensorStorageType::BUFFER &&
         weights_shape.w == 1 && weights_shape.h == 1 &&
         attr.dilations.w == 1 && attr.dilations.h == 1 &&
         attr.strides.w == 1 && attr.strides.h == 1 &&
         attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 &&
         attr.padding.appended.w == 0 && attr.padding.appended.h == 0;
}

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr,
                                 ConvBuffer1x1* result, const BHWC* shape) {
  if (!IsConvBuffer1x1Supported(definition, attr)) {
    return absl::InvalidArgumentError("ConvBuffer1x1 is not supported.");
  }
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (shape) {
    conv_params = GetBestParams(*creation_context.device, definition, *shape,
                                src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(*creation_context.device, definition,
                                src_depth, dst_depth);
  }
  *result = ConvBuffer1x1(definition, conv_params);
  return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

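// A fully connected layer is a 1x1 convolution over a single spatial
// position, so any Y blocking is folded into X below.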
absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
                                 const OperationDef& definition,
                                 const FullyConnectedAttributes& attr,
                                 ConvBuffer1x1* result, const BHWC* shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (shape) {
    conv_params = GetBestParams(*creation_context.device, definition, *shape,
                                src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(*creation_context.device, definition,
                                src_depth, dst_depth);
  }
  conv_params.block_size.x *= conv_params.block_size.y;
  conv_params.block_size.y = 1;
  *result = ConvBuffer1x1(definition, conv_params);
  return result->UploadData(attr.weights, attr.bias, creation_context.context);
}

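// Variant for the Winograd 4x4->6x6 transform: different_weights_for_height
// makes the generated kernel select a weights slice per output row (Y), and
// the Y block is folded into X since rows within one block could not share
// the same weights_cache pointer.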
absl::Status CreateConvBuffer1x1Wino4x4To6x6(
    const CreationContext& creation_context, const OperationDef& definition,
    const Convolution2DAttributes& attr, ConvBuffer1x1* result,
    const BHWC* shape) {
  const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (shape) {
    conv_params = GetBestParams(*creation_context.device, definition, *shape,
                                src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(*creation_context.device, definition,
                                src_depth, dst_depth);
  }
  conv_params.block_size.x *= conv_params.block_size.y;
  conv_params.block_size.y = 1;
  conv_params.different_weights_for_height = true;
  *result = ConvBuffer1x1(definition, conv_params);
  return result->UploadDataForWinograd4x4To6x6(
      attr.weights, *creation_context.device, creation_context.context);
}

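// Variant with weights supplied at runtime as the second source tensor:
// only the biases are uploaded here, while "weights" is registered as a
// READ buffer reference that BindArguments() later binds to src_[1].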
absl::Status CreateConvBuffer1x1DynamicWeights(
    const CreationContext& creation_context, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    ConvBuffer1x1* result, const BHWC* dst_shape) {
  const int dst_depth = DivideRoundUp(weights_shape.b, 4);
  const int src_depth = DivideRoundUp(weights_shape.c, 4);
  ConvBuffer1x1::ConvParams conv_params;
  if (dst_shape) {
    conv_params = GetBestParams(*creation_context.device, definition,
                                *dst_shape, src_depth, dst_depth);
  } else {
    conv_params = GetBestParams(*creation_context.device, definition,
                                src_depth, dst_depth);
  }
  *result = ConvBuffer1x1(definition, conv_params);
  BufferDescriptor desc;
  desc.element_type = definition.src_tensors[1].data_type;
  desc.element_size = 16;
  desc.memory_type = MemoryType::GLOBAL;
  result->args_.AddObjectRef("weights", AccessType::READ,
                             absl::make_unique<BufferDescriptor>(desc));
  return result->UploadBiases(attr.bias, creation_context.context);
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite