/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

| #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h" |
| |
| #include "absl/strings/substitute.h" |
| #include "tensorflow/lite/delegates/gpu/common/access_type.h" |
| #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h" |

namespace tflite {
namespace gpu {
namespace {
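// Generates the source of a generic elementwise kernel: one work item per
// (X, Y, Z) element of dst_tensor that reads the corresponding value from
// src_tensor and writes it back. The "$0" placeholder is later expanded with
// the kernel argument declarations. When check_src_slices is true, reads with
// Z outside the source slice range yield zeros instead.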
std::string GetElementWiseCode(const OperationDef& op_def,
                               bool check_src_slices) {
  std::string c;
  c += "MAIN_FUNCTION(\n";
  c += "$0) {\n";
  c += "  int X = GLOBAL_ID_0;\n";
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  int Z = GLOBAL_ID_2;\n";
  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() ||"
       " Z >= args.dst_tensor.Slices()) return;\n";
  if (check_src_slices) {
    c += "  FLT4 src = INIT_FLT4(0.0f);\n";
    c += "  if (Z < args.src_tensor.Slices()) {\n";
    c += "    src = args.src_tensor.Read(X, Y, Z);\n";
    c += "  }\n";
  } else {
    c += "  FLT4 src = args.src_tensor.Read(X, Y, Z);\n";
  }
  c += "  args.dst_tensor.Write(src, X, Y, Z);\n";
  c += "}\n";
  return c;
}

}  // namespace

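// The computation data type is derived from the requested precision
// (e.g. F32 vs F16).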
DataType OperationDef::GetDataType() const {
  return DeduceDataTypeFromPrecision(precision);
}

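// The primary data type and storage type come from the first source tensor.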
DataType OperationDef::GetPrimaryDataType() const {
  return src_tensors[0].data_type;
}

TensorStorageType OperationDef::GetPrimaryStorageType() const {
  return src_tensors[0].storage_type;
}

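// Batch execution is supported if any input or output tensor layout contains
// the BATCH axis.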
bool OperationDef::IsBatchSupported() const {
  for (const auto& src : src_tensors) {
    if (HasAxis(src.layout, Axis::BATCH)) {
      return true;
    }
  }
  for (const auto& dst : dst_tensors) {
    if (HasAxis(dst.layout, Axis::BATCH)) {
      return true;
    }
  }
  return false;
}

GPUOperation::GPUOperation(const OperationDef& definition)
    : definition_(definition) {}

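// Binds the runtime tensor used as input `index`, growing the input list with
// null entries if needed.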
void GPUOperation::SetSrc(GpuSpatialTensor* ptr, int index) {
  if (index >= src_.size()) {
    src_.resize(index + 1, nullptr);
  }
  src_[index] = ptr;
}

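// Binds the runtime tensor used as output `index`.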
void GPUOperation::SetDst(GpuSpatialTensor* ptr, int index) {
  if (index >= dst_.size()) {
    dst_.resize(index + 1, nullptr);
  }
  dst_[index] = ptr;
}

GPUOperation::GPUOperation(GPUOperation&& operation)
    : args_(std::move(operation.args_)),
      code_(std::move(operation.code_)),
      work_group_size_(operation.work_group_size_),
      compiler_options_(std::move(operation.compiler_options_)),
      tensor_to_grid_(operation.tensor_to_grid_),
      elementwise_(operation.elementwise_),
      linkable_(operation.linkable_),
      check_src_channels_size_(operation.check_src_channels_size_),
      definition_(std::move(operation.definition_)),
      src_(std::move(operation.src_)),
      dst_(std::move(operation.dst_)),
      grid_dimension_(operation.grid_dimension_),
      work_group_launch_order_(operation.work_group_launch_order_),
      grid_size_(operation.grid_size_),
      src_tensors_names_(std::move(operation.src_tensors_names_)),
      dst_tensors_names_(std::move(operation.dst_tensors_names_)),
      work_groups_count_(operation.work_groups_count_),
      linkable_count_(operation.linkable_count_),
      elementwise_code_(std::move(operation.elementwise_code_)) {}

GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
  if (this != &operation) {
    args_ = std::move(operation.args_);
    code_ = std::move(operation.code_);
    std::swap(work_group_size_, operation.work_group_size_);
    compiler_options_ = std::move(operation.compiler_options_);
    tensor_to_grid_ = operation.tensor_to_grid_;
    elementwise_ = operation.elementwise_;
    linkable_ = operation.linkable_;
    check_src_channels_size_ = operation.check_src_channels_size_;
    definition_ = std::move(operation.definition_);
    src_ = std::move(operation.src_);
    dst_ = std::move(operation.dst_);
    std::swap(grid_dimension_, operation.grid_dimension_);
    std::swap(work_group_launch_order_, operation.work_group_launch_order_);
    std::swap(grid_size_, operation.grid_size_);
    src_tensors_names_ = std::move(operation.src_tensors_names_);
    dst_tensors_names_ = std::move(operation.dst_tensors_names_);
    std::swap(work_groups_count_, operation.work_groups_count_);
    std::swap(linkable_count_, operation.linkable_count_);
    elementwise_code_ = std::move(operation.elementwise_code_);
  }
  return *this;
}

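// Links another elementwise operation into this one: its code is appended to
// elementwise_code_, its arguments and tensor names are merged under a unique
// "_link<N>" postfix, and its auxiliary source tensors (definition
// src_tensors[1..]) become additional inputs of this operation.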
absl::Status GPUOperation::AddOperation(GPUOperation* operation) {
  linkable_count_ += 1;
  std::string code = operation->code_;
  std::string unique_postfix = absl::StrCat("_link", linkable_count_);
  operation->args_.RenameArgs(unique_postfix, &code);
  elementwise_code_ += "{\n" + code + "\n}\n";
  RETURN_IF_ERROR(args_.Merge(std::move(operation->args_), unique_postfix));
  for (int i = 0; i < operation->src_tensors_names_.size(); ++i) {
    definition_.src_tensors.push_back(
        operation->definition_.src_tensors[i + 1]);
    src_tensors_names_.push_back(operation->src_tensors_names_[i] +
                                 unique_postfix);
  }
  for (int i = 0; i < operation->dst_tensors_names_.size(); ++i) {
    dst_tensors_names_.push_back(operation->dst_tensors_names_[i] +
                                 unique_postfix);
  }
  return absl::OkStatus();
}

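// The helpers below register named objects with args_ so that generated code
// can refer to them as args.<name>: AddSrcTensor/AddSrcBuffer/AddSrcTexture2D
// add read-only inputs, AddDstTensor adds a writable output.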
void GPUOperation::AddSrcTensor(const std::string& tensor_name,
                                const TensorDescriptor& desc) {
  src_tensors_names_.push_back(tensor_name);
  auto desc_new = absl::make_unique<TensorDescriptor>(desc);
  args_.AddObjectRef(tensor_name, AccessType::READ, std::move(desc_new));
}

void GPUOperation::AddSrcBuffer(const std::string& buffer_name,
                                const BufferDescriptor& desc) {
  src_tensors_names_.push_back(buffer_name);
  auto desc_new = absl::make_unique<BufferDescriptor>(desc);
  args_.AddObjectRef(buffer_name, AccessType::READ, std::move(desc_new));
}

void GPUOperation::AddSrcTexture2D(const std::string& texture_name,
                                   const Texture2DDescriptor& desc) {
  src_tensors_names_.push_back(texture_name);
  auto desc_new = absl::make_unique<Texture2DDescriptor>(desc);
  args_.AddObjectRef(texture_name, AccessType::READ, std::move(desc_new));
}

void GPUOperation::AddDstTensor(const std::string& tensor_name,
                                const TensorDescriptor& desc) {
  dst_tensors_names_.push_back(tensor_name);
  auto desc_new = absl::make_unique<TensorDescriptor>(desc);
  args_.AddObjectRef(tensor_name, AccessType::WRITE, std::move(desc_new));
}

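// For elementwise operations, registers the implicit "src_tensor" and
// "dst_tensor" arguments, prepends this operation's own code_ to any already
// linked elementwise code, and replaces code_ with the generic kernel produced
// by GetElementWiseCode. Non-elementwise operations keep their code_ as is.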
void GPUOperation::AssembleCode(const GpuInfo& gpu_info) {
  if (elementwise_) {
    auto src_desc =
        absl::make_unique<TensorDescriptor>(definition_.src_tensors[0]);
    if (definition_.IsBatchSupported()) {
      src_desc->SetStateVar("BatchedWidth", "true");
    }
    src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor");
    args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));

    auto dst_desc =
        absl::make_unique<TensorDescriptor>(definition_.dst_tensors[0]);
    if (definition_.IsBatchSupported()) {
      dst_desc->SetStateVar("BatchedWidth", "true");
    }
    dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor");
    args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));

    elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
    code_ = GetElementWiseCode(definition_, check_src_channels_size_);
  }
}

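// Delegates to GetPossibleWorkGroups() from work_group_picking.h, which
// suggests candidate work-group sizes for grid_size_ based on the tuning type
// and the device/kernel limits.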
void GPUOperation::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  GetPossibleWorkGroups(tuning_type, gpu_info, kernel_info, grid_size_,
                        work_groups);
}

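// Maps the destination tensor shape to a dispatch grid according to
// tensor_to_grid_. Elementwise operations always use the
// (Width * Batch, Height * Depth, Slices) mapping; operations that compute the
// grid themselves fall through to the explicitly set grid_size_.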
int3 GPUOperation::GetGridSize() const {
  if (elementwise_ || tensor_to_grid_ == TensorToGrid::kWBToX_HDToY_SToZ) {
    const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
    const int grid_y = dst_[0]->Height() * dst_[0]->Depth();
    const int grid_z = dst_[0]->Slices();
    return int3(grid_x, grid_y, grid_z);
  }
  if (tensor_to_grid_ == TensorToGrid::kWBToX_HDToY_ZIs1) {
    const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
    const int grid_y = dst_[0]->Height() * dst_[0]->Depth();
    const int grid_z = 1;
    return int3(grid_x, grid_y, grid_z);
  }
  if (tensor_to_grid_ == TensorToGrid::kWBToX_HToY_DToZ) {
    const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
    const int grid_y = dst_[0]->Height();
    const int grid_z = dst_[0]->Depth();
    return int3(grid_x, grid_y, grid_z);
  }
  if (tensor_to_grid_ == TensorToGrid::kBToX_YIs1_ZIs1) {
    const int grid_x = dst_[0]->Batch();
    const int grid_y = 1;
    const int grid_z = 1;
    return int3(grid_x, grid_y, grid_z);
  }
  return grid_size_;
}

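// Appends a postfix to all source and destination tensor argument names, e.g.
// to keep them unique when several operations are fused together.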
void GPUOperation::AddUniquePostfix(const std::string& unique_postfix) {
  for (int i = 0; i < src_tensors_names_.size(); ++i) {
    src_tensors_names_[i] += unique_postfix;
  }
  for (int i = 0; i < dst_tensors_names_.size(); ++i) {
    dst_tensors_names_[i] += unique_postfix;
  }
}

}  // namespace gpu
}  // namespace tflite