| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/lite/delegates/gpu/cl/inference_context.h" |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <cstdint> |
| #include <functional> |
| #include <limits> |
| #include <map> |
| #include <memory> |
| #include <numeric> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "absl/container/flat_hash_map.h" |
| #include "absl/container/flat_hash_set.h" |
| #include "tensorflow/lite/delegates/gpu/cl/buffer.h" |
| #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" |
| #include "tensorflow/lite/delegates/gpu/cl/serialization.h" |
| #include "tensorflow/lite/delegates/gpu/common/data_type.h" |
| #include "tensorflow/lite/delegates/gpu/common/memory_management.h" |
| #include "tensorflow/lite/delegates/gpu/common/model.h" |
| #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" |
| #include "tensorflow/lite/delegates/gpu/common/operations.h" |
| #include "tensorflow/lite/delegates/gpu/common/precision.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/operation_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/selectors/special_selector.h" |
| #include "tensorflow/lite/delegates/gpu/common/shape.h" |
| #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h" |
| #include "tensorflow/lite/delegates/gpu/common/task/storage_type_util.h" |
| #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h" |
| #include "tensorflow/lite/delegates/gpu/common/types.h" |
| #include "tensorflow/lite/delegates/gpu/common/util.h" |
| |
| namespace tflite { |
| namespace gpu { |
| namespace cl { |
| |
| namespace { |
| |
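// Collects (value id, tensor descriptor) pairs for all inputs and outputs of a
// CL node, pairing each value id with the corresponding descriptor from the
// node's OperationDef.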
| std::vector<std::pair<ValueId, TensorDescriptor>> GetCLNodeTensors( |
| const CLNode& node) { |
| std::vector<std::pair<ValueId, TensorDescriptor>> result; |
| result.reserve(node.inputs.size() + node.outputs.size()); |
| const OperationDef op_def = node.cl_operation.GetDefinition(); |
| for (int j = 0; j < node.inputs.size(); ++j) { |
| result.push_back({node.inputs[j], op_def.src_tensors[j]}); |
| } |
| for (int j = 0; j < node.outputs.size(); ++j) { |
| result.push_back({node.outputs[j], op_def.dst_tensors[j]}); |
| } |
| |
| return result; |
| } |
| |
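// Records the usage interval of a tensor: the first call stores task_index as
// both the first (x) and last (y) use; later calls extend the last use.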
| void AddUsage(ValueId id, int task_index, |
| std::map<ValueId, int2>* usage_records) { |
| auto it = usage_records->find(id); |
| if (it == usage_records->end()) { |
| (*usage_records)[id].x = task_index; |
| (*usage_records)[id].y = task_index; |
| } else { |
    it->second.y = task_index;
| } |
| } |
| |
// Returns true if the actual memory for this storage type is allocated with
// clCreateBuffer.
| bool IsBufferBased(const GpuInfo& gpu_info, const TensorStorageType& type) { |
| const bool image2d_based_buffer = |
| (type == TensorStorageType::TEXTURE_2D || |
| type == TensorStorageType::SINGLE_TEXTURE_2D) && |
| gpu_info.opencl_info.IsImage2dFromBufferSupported(); |
| return type == TensorStorageType::BUFFER || |
| type == TensorStorageType::IMAGE_BUFFER || image2d_based_buffer; |
| } |
| |
| // Calculates the total size of the assignment. |
| size_t TotalSize(const ObjectsAssignment<size_t>& assignment) { |
| return std::accumulate(assignment.object_sizes.begin(), |
| assignment.object_sizes.end(), static_cast<size_t>(0)); |
| } |
| |
| } // namespace |
| |
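// Vendor-specific queue hints: Mali needs explicit flushes (and manual event
// release on pre-Valhall GPUs), PowerVR benefits from periodic flushes.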
| void InferenceContext::ExecutionHints::Init(const GpuInfo& gpu_info) { |
| if (gpu_info.IsMali()) { |
| need_flush = true; |
    need_manual_release = !gpu_info.mali_info.IsValhall();
| |
| flush_periodically = true; |
| flush_period = 24; |
| } |
| if (gpu_info.IsPowerVR()) { |
| need_flush = true; |
| flush_periodically = true; |
| flush_period = 16; |
| } |
| } |
| |
| absl::Status InferenceContext::InitFromGraph( |
| const CreateGpuModelInfo& create_info, const GraphFloat32& graph, |
| Environment* env, std::vector<uint8_t>* serialized_model) { |
| GpuModel gpu_model; |
| RETURN_IF_ERROR(GraphToGpuModel(graph, create_info, |
| env->GetDevicePtr()->GetInfo(), &gpu_model)); |
| flatbuffers::FlatBufferBuilder builder; |
| flatbuffers::Offset<tflite::gpu::data::GpuModel> gpu_model_fb; |
| if (serialized_model) { |
| gpu_model_fb = Encode(gpu_model, &builder); |
| } |
| CopyFromGpuModel(&gpu_model); |
| |
| CreationContext creation_context; |
| creation_context.device = env->GetDevicePtr(); |
| creation_context.context = &env->context(); |
| creation_context.queue = env->queue(); |
| creation_context.cache = env->program_cache(); |
| for (const auto& external_tensor : create_info.external_immutable_tensors) { |
| auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second); |
| if (!cl_spatial_tensor) { |
| return absl::InvalidArgumentError("Expected CLSpatialTensor."); |
| } |
| external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor; |
| } |
| std::map<ValueId, Tensor> temp_external_tensors; |
| for (const auto& external_tensor : create_info.external_mutable_tensors) { |
| RETURN_IF_ERROR(CreateTensor( |
| env->context(), tensors_descs_[external_tensor.first].shape, |
| tensors_descs_[external_tensor.first], |
| &temp_external_tensors[external_tensor.first])); |
| external_mutable_tensors_[external_tensor.first] = |
| &temp_external_tensors[external_tensor.first]; |
| } |
| PrepareExternal(); |
| execution_hints_.Init(env->device().GetInfo()); |
| RETURN_IF_ERROR( |
| AllocateMemory(creation_context.GetGpuInfo(), creation_context.context)); |
| BindMemoryToOperations(); |
| RETURN_IF_ERROR(Compile(creation_context)); |
| RETURN_IF_ERROR(UpdateParams()); |
| |
| TuningType tuning_type = TuningType::kExhaustive; |
| if (create_info.hints.Check(ModelHints::kFastTuning)) { |
| tuning_type = TuningType::kFast; |
| } |
| if (env->device().GetInfo().IsMali()) { |
| const MaliInfo& info = env->device().GetInfo().mali_info; |
| if (info.IsMaliT6xx()) { |
      // Mali T628 hangs forever in clFinish when the profiling queue is used.
      // TuningType::kFast does not use the profiling queue.
| tuning_type = TuningType::kFast; |
| } |
| } |
| RETURN_IF_ERROR( |
| Tune(tuning_type, env->device().GetInfo(), env->profiling_queue())); |
| if (external_mutable_tensors_.empty()) { |
    // Use the recordable queue only when there are no mutable external tensors.
| InitRecordableQueue(env); |
| } |
| |
| for (auto& external_tensor : external_mutable_tensors_) { |
| external_tensor.second = nullptr; |
| } |
| |
| gpu_info_ = env->device().GetInfo(); |
| |
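  // Serialize the initialized context together with the encoded GPU model and
  // the program cache.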
| if (serialized_model) { |
| auto encoded_fb = Encode(*env->GetDevicePtr(), *this, *env->program_cache(), |
| gpu_model_fb, &builder); |
| data::FinishInferenceContextBuffer(builder, encoded_fb); |
| serialized_model->resize(builder.GetSize()); |
| std::memcpy(serialized_model->data(), builder.GetBufferPointer(), |
| builder.GetSize()); |
| } |
| ReleaseCPURepresentation(); |
| return absl::OkStatus(); |
| } |
| |
| absl::Status InferenceContext::RestoreDeserialized( |
| const absl::Span<const uint8_t> serialized_model, Environment* env, |
| CreateGpuModelInfo* create_info) { |
| flatbuffers::Verifier verifier(serialized_model.data(), |
| serialized_model.size()); |
| if (!data::VerifyInferenceContextBuffer(verifier)) { |
| return absl::DataLossError("Deserialization failed."); |
| } |
| auto decoded_fb = data::GetInferenceContext(serialized_model.data()); |
| RETURN_IF_ERROR(Decode(env->context(), *env->GetDevicePtr(), |
| env->program_cache(), decoded_fb, this)); |
| |
| CreationContext creation_context; |
| creation_context.device = env->GetDevicePtr(); |
| creation_context.context = &env->context(); |
| creation_context.queue = env->queue(); |
| creation_context.cache = env->program_cache(); |
| std::map<ValueId, Tensor> temp_external_tensors; |
| if (create_info) { |
| for (const auto& external_tensor : |
| create_info->external_immutable_tensors) { |
| auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second); |
| if (!cl_spatial_tensor) { |
| return absl::InvalidArgumentError("Expected CLSpatialTensor."); |
| } |
| external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor; |
| } |
| for (const auto& external_tensor : create_info->external_mutable_tensors) { |
| RETURN_IF_ERROR(CreateTensor( |
| env->context(), tensors_descs_[external_tensor.first].shape, |
| tensors_descs_[external_tensor.first], |
| &temp_external_tensors[external_tensor.first])); |
| external_mutable_tensors_[external_tensor.first] = |
| &temp_external_tensors[external_tensor.first]; |
| } |
| } |
| PrepareExternal(); |
| |
| execution_hints_.Init(env->device().GetInfo()); |
| |
| RETURN_IF_ERROR( |
| AllocateMemory(creation_context.GetGpuInfo(), creation_context.context)); |
| BindMemoryToOperations(); |
| for (auto& node : nodes_) { |
| RETURN_IF_ERROR(node.cl_operation.RestoreDeserialized(creation_context)); |
| } |
| RETURN_IF_ERROR(UpdateParams()); |
| if (external_mutable_tensors_.empty()) { |
    // Use the recordable queue only when there are no mutable external tensors.
| InitRecordableQueue(env); |
| } |
| for (auto& external_tensor : external_mutable_tensors_) { |
| external_tensor.second = nullptr; |
| } |
| ReleaseCPURepresentation(); |
| return absl::OkStatus(); |
| } |
| |
| void InferenceContext::CopyFromGpuModel(GpuModel* gpu_model) { |
| for (const auto& input : gpu_model->input_ids_and_refs) { |
| input_ids_.push_back(input.first); |
| } |
| for (const auto& variable_input : gpu_model->variable_ids_and_refs) { |
| variable_ids_and_refs_[variable_input.first] = variable_input.second; |
| } |
| for (const auto& output : gpu_model->output_ids_and_refs) { |
| output_ids_.push_back(output.first); |
| } |
| nodes_.resize(gpu_model->nodes.size()); |
| for (int i = 0; i < gpu_model->nodes.size(); ++i) { |
| nodes_[i].cl_operation.Init(std::move(gpu_model->nodes[i].gpu_operation)); |
| nodes_[i].inputs = gpu_model->nodes[i].inputs; |
| nodes_[i].outputs = gpu_model->nodes[i].outputs; |
| nodes_[i].name = gpu_model->nodes[i].name; |
| } |
| const_tensors_descs_ = std::move(gpu_model->const_tensors); |
| tensors_descs_ = std::move(gpu_model->tensors); |
| } |
| |
| void InferenceContext::InitRecordableQueue(Environment* env) { |
| std::vector<ClOperation*> ops(nodes_.size()); |
| for (int i = 0; i < nodes_.size(); ++i) { |
| ops[i] = &nodes_[i].cl_operation; |
| } |
| recordable_queue_ = CreateRecordableQueue(ops, env->device(), env->context()); |
| } |
| |
| absl::Status InferenceContext::InitFromGraphWithTransforms( |
| const CreateGpuModelInfo& create_info, GraphFloat32* graph, |
| Environment* env, std::vector<uint8_t>* serialized_model) { |
| RETURN_IF_ERROR(RunGraphTransformsForGpuModel(graph)); |
| RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env, serialized_model)); |
| return absl::OkStatus(); |
| } |
| |
| void InferenceContext::GetUsages(const std::function<bool(ValueId)>& functor, |
| std::map<ValueId, int2>* usages) { |
| for (ValueId in_id : input_ids_) { |
| if (functor(in_id)) { |
| AddUsage(in_id, 0, usages); |
| } |
| } |
| for (int op_index = 0; op_index < nodes_.size(); ++op_index) { |
| auto tensors = GetCLNodeTensors(nodes_[op_index]); |
| for (auto& tensor : tensors) { |
| if (functor(tensor.first)) { |
| AddUsage(tensor.first, op_index, usages); |
| } |
| } |
| } |
| for (ValueId out_id : output_ids_) { |
| if (functor(out_id)) { |
| AddUsage(out_id, nodes_.size(), usages); |
| } |
| } |
| } |
| |
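// Resolves where the memory for a tensor lives: external tensors take
// precedence, then constants, then variables, then buffer-based storage, and
// the rest are strong-shape tensors.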
| InferenceContext::TensorMemoryType InferenceContext::GetTensorMemoryType( |
| const GpuInfo& gpu_info, ValueId id) { |
| if (external_immutable_tensors_.find(id) != |
| external_immutable_tensors_.end()) { |
| return TensorMemoryType::kExternal; |
| } else if (external_mutable_tensors_.find(id) != |
| external_mutable_tensors_.end()) { |
| return TensorMemoryType::kExternal; |
| } else if (const_tensors_.find(id) != const_tensors_.end()) { |
| return TensorMemoryType::kConst; |
| } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) { |
| return TensorMemoryType::kVariable; |
| } else if (IsBufferBased(gpu_info, tensors_descs_[id].storage_type)) { |
| return TensorMemoryType::kBuffer; |
| } else { |
| return TensorMemoryType::kStrongShape; |
| } |
| } |
| |
| absl::Status InferenceContext::AllocateMemory(const GpuInfo& gpu_info, |
| CLContext* context) { |
| RETURN_IF_ERROR(AllocateMemoryForConstTensors(context)); |
| RETURN_IF_ERROR(AllocateMemoryForVariableTensors(context)); |
| RETURN_IF_ERROR(AllocateMemoryForBuffers(gpu_info, context)); |
| RETURN_IF_ERROR(AllocateMemoryForStrongShapes(gpu_info, context)); |
| return absl::OkStatus(); |
| } |
| |
| absl::Status InferenceContext::AllocateMemoryForConstTensors( |
| CLContext* context) { |
| for (auto& description : const_tensors_descs_) { |
| RETURN_IF_ERROR(const_tensors_[description.first].CreateFromDescriptor( |
| description.second, context)); |
| } |
| return absl::OkStatus(); |
| } |
| |
| absl::Status InferenceContext::AllocateMemoryForVariableTensors( |
| CLContext* context) { |
| std::map<ValueId, int> ref_value_to_tensor_index; |
| |
| for (auto value_and_ref_value : variable_ids_and_refs_) { |
| if (ref_value_to_tensor_index.find(value_and_ref_value.second) == |
| ref_value_to_tensor_index.end()) { |
| const auto& t = tensors_descs_[value_and_ref_value.first]; |
| const auto& shape = t.shape; |
| const auto& descriptor = t; |
| |
| RETURN_IF_ERROR( |
| CreateTensor(*context, shape, descriptor, |
| &variable_tensors_[value_and_ref_value.second])); |
| } |
| } |
| return absl::OkStatus(); |
| } |
| |
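// Allocates shared memory for all buffer-based intermediate tensors: computes
// per-tensor sizes, runs greedy assignment of tensors to buffers, and creates
// tensor views over the resulting buffers (or sub-buffers of one parent
// buffer).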
| absl::Status InferenceContext::AllocateMemoryForBuffers(const GpuInfo& gpu_info, |
| CLContext* context) { |
| std::map<ValueId, int2> buffer_usages; |
| GetUsages( |
| [this, &gpu_info](ValueId id) { |
| return GetTensorMemoryType(gpu_info, id) == TensorMemoryType::kBuffer; |
| }, |
| &buffer_usages); |
| |
| std::vector<TensorUsageRecord<size_t>> buffer_usage_records; |
| bool has_buffer_based_images = false; |
| for (auto& usage : buffer_usages) { |
| const auto& t = tensors_descs_[usage.first]; |
| const auto& shape = t.shape; |
| const auto& descriptor = t; |
| const size_t element_size = SizeOf(descriptor.data_type); |
| size_t buffer_size; |
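    // Image2d-from-buffer tensors must respect the device's image pitch
    // alignment, so the row width is padded before computing the byte size.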
| if (descriptor.storage_type == TensorStorageType::TEXTURE_2D || |
| descriptor.storage_type == TensorStorageType::SINGLE_TEXTURE_2D) { |
| has_buffer_based_images = true; |
| const size_t bytes_per_pixel = |
| element_size * |
| (descriptor.storage_type == TensorStorageType::TEXTURE_2D ? 4 |
| : shape.c); |
| const size_t width = shape.b * shape.w; |
| const size_t height = shape.h * DivideRoundUp(shape.c, 4); |
| size_t width_pixel_alignment = gpu_info.opencl_info.image_pitch_alignment; |
| if (gpu_info.IsAdreno() && width_pixel_alignment % bytes_per_pixel == 0) { |
| width_pixel_alignment /= bytes_per_pixel; |
| } |
| const size_t width_aligned = AlignByN(width, width_pixel_alignment); |
| buffer_size = width_aligned * bytes_per_pixel * height; |
| } else { |
| if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER) { |
| has_buffer_based_images = true; |
| } |
| buffer_size = |
| shape.b * shape.w * shape.h * AlignByN(shape.c, 4) * element_size; |
| } |
| graph_ids_to_shared_buffer_tensors_[usage.first] = |
| buffer_usage_records.size(); |
| buffer_usage_records.push_back({buffer_size, |
| static_cast<TaskId>(usage.second.x), |
| static_cast<TaskId>(usage.second.y)}); |
| } |
| |
| ObjectsAssignment<size_t> buffer_assignment; |
| RETURN_IF_ERROR(AssignObjectsToTensors( |
| buffer_usage_records, MemoryStrategy::GREEDY_BEST, &buffer_assignment)); |
| |
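  // Alternatively, pack all tensors into one parent buffer at fixed offsets
  // (exposed to tensors as sub-buffers). This path is taken only when it needs
  // less memory than separate buffers and the parent fits the device's maximum
  // buffer size.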
| bool use_offset_assignment = false; |
| |
| OffsetsAssignment offset_assignment; |
| if ((!has_buffer_based_images && gpu_info.IsCL11OrHigher()) || |
| CanUseSubBufferForImage2d(gpu_info)) { |
| const size_t base_align_bytes = |
| std::max<size_t>(gpu_info.opencl_info.base_addr_align_in_bits >> 3, 1); |
| RETURN_IF_ERROR(AssignOffsetsToTensors( |
| buffer_usage_records, MemoryStrategy::GREEDY_BY_SIZE, |
| &offset_assignment, base_align_bytes)); |
| if (offset_assignment.total_size < TotalSize(buffer_assignment) && |
| offset_assignment.total_size <= gpu_info.GetMaxBufferSize()) { |
| use_offset_assignment = true; |
| } |
| } |
| |
| if (use_offset_assignment) { |
| shared_buffers_.resize(offset_assignment.offsets.size()); |
| RETURN_IF_ERROR(CreateReadWriteBuffer(offset_assignment.total_size, context, |
| &shared_buffers_parent_)); |
| for (int i = 0; i < offset_assignment.offsets.size(); ++i) { |
| RETURN_IF_ERROR(CreateReadWriteSubBuffer( |
| shared_buffers_parent_, offset_assignment.offsets[i], |
| buffer_usage_records[i].tensor_size, context, &shared_buffers_[i])); |
| } |
| } else { |
| shared_buffers_.resize(buffer_assignment.object_sizes.size()); |
| for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) { |
| RETURN_IF_ERROR(CreateReadWriteBuffer(buffer_assignment.object_sizes[i], |
| context, &shared_buffers_[i])); |
| } |
| } |
| |
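  // Create tensor views on top of the assigned buffers; each shared tensor is
  // created only once even if multiple nodes reference it.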
| std::vector<bool> created_tensors(buffer_usage_records.size(), false); |
| shared_buffer_tensors_.resize(buffer_usage_records.size()); |
| for (auto& node : nodes_) { |
| auto tensors = GetCLNodeTensors(node); |
| for (auto& t : tensors) { |
| if (GetTensorMemoryType(gpu_info, t.first) != TensorMemoryType::kBuffer) |
| continue; |
| const int tensor_index = graph_ids_to_shared_buffer_tensors_[t.first]; |
| if (created_tensors[tensor_index]) continue; |
| const auto& shape_5d = tensors_descs_[t.first].shape; |
| const auto shape = BHWC(shape_5d.b, shape_5d.h, shape_5d.w, shape_5d.c); |
| const int buffer_index = use_offset_assignment |
| ? tensor_index |
| : buffer_assignment.object_ids[tensor_index]; |
| if (t.second.storage_type == TensorStorageType::TEXTURE_2D || |
| t.second.storage_type == TensorStorageType::SINGLE_TEXTURE_2D) { |
| const size_t bytes_per_pixel = |
| SizeOf(t.second.data_type) * |
| (t.second.storage_type == TensorStorageType::TEXTURE_2D ? 4 |
| : shape.c); |
| size_t width_pixel_alignment = |
| gpu_info.opencl_info.image_pitch_alignment; |
| if (gpu_info.IsAdreno() && |
| width_pixel_alignment % bytes_per_pixel == 0) { |
| width_pixel_alignment /= bytes_per_pixel; |
| } |
| RETURN_IF_ERROR(CreateSharedImage2DBufferTensor( |
| *context, shared_buffers_[buffer_index].GetMemoryPtr(), shape, |
| t.second, width_pixel_alignment, |
| &shared_buffer_tensors_[tensor_index])); |
| } else { |
| RETURN_IF_ERROR(CreateSharedTensor( |
| *context, shared_buffers_[buffer_index].GetMemoryPtr(), shape, |
| t.second, &shared_buffer_tensors_[tensor_index])); |
| } |
| created_tensors[tensor_index] = true; |
| } |
| } |
| return absl::OkStatus(); |
| } |
| |
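// Allocates intermediate tensors that are not buffer-based; such tensors can
// only be reused when their descriptors (data type, storage type, layout,
// shape) match exactly, hence the EQUALITY memory strategy.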
| absl::Status InferenceContext::AllocateMemoryForStrongShapes( |
| const GpuInfo& gpu_info, CLContext* context) { |
| std::map<ValueId, int2> usages; |
| GetUsages( |
| [this, &gpu_info](ValueId id) { |
| return GetTensorMemoryType(gpu_info, id) == |
| TensorMemoryType::kStrongShape; |
| }, |
| &usages); |
| |
| struct TensorDescComparator { |
| TensorDescriptor tensor_desc; |
| |
| bool operator==(const TensorDescComparator& t) const { |
| return tensor_desc.data_type == t.tensor_desc.data_type && |
| tensor_desc.storage_type == t.tensor_desc.storage_type && |
| tensor_desc.layout == t.tensor_desc.layout && |
| tensor_desc.shape == t.tensor_desc.shape; |
| } |
| }; |
| |
| std::vector<TensorUsageRecord<TensorDescComparator>> usage_records; |
| std::map<ValueId, ValueId> remap_from_graph_ids; |
| for (auto& usage : usages) { |
| remap_from_graph_ids[usage.first] = usage_records.size(); |
| usage_records.push_back({{tensors_descs_[usage.first]}, |
| static_cast<TaskId>(usage.second.x), |
| static_cast<TaskId>(usage.second.y)}); |
| } |
| |
| ObjectsAssignment<TensorDescComparator> assignment; |
| RETURN_IF_ERROR(AssignObjectsToTensors( |
| usage_records, MemoryStrategy::EQUALITY, &assignment)); |
| |
| for (auto& node : nodes_) { |
| auto tensors = GetCLNodeTensors(node); |
| for (auto& t : tensors) { |
| if (GetTensorMemoryType(gpu_info, t.first) != |
| TensorMemoryType::kStrongShape) { |
| continue; |
| } |
| const auto& shape = tensors_descs_[t.first].shape; |
| const auto id = assignment.object_ids[remap_from_graph_ids[t.first]]; |
| graph_ids_to_strong_shape_tensors_[t.first] = id; |
| const auto& it = strong_shape_tensors_.find(id); |
| if (it == strong_shape_tensors_.end()) { |
| RETURN_IF_ERROR(CreateTensor(*context, shape, t.second, |
| &strong_shape_tensors_[id])); |
| } |
| } |
| } |
| return absl::OkStatus(); |
| } |
| |
| void InferenceContext::BindMemoryToOperations() { |
| for (auto& node : nodes_) { |
| for (int i = 0; i < node.inputs.size(); ++i) { |
| node.cl_operation.GetGpuOperation().SetSrc(GetTensor(node.inputs[i]), i); |
| } |
| for (int i = 0; i < node.outputs.size(); ++i) { |
| node.cl_operation.GetGpuOperation().SetDst(GetTensor(node.outputs[i]), i); |
| } |
| } |
| } |
| |
| absl::Status InferenceContext::Compile( |
| const CreationContext& creation_context) { |
| for (auto& node : nodes_) { |
| RETURN_IF_ERROR(node.cl_operation.Compile(creation_context)); |
| } |
| return absl::OkStatus(); |
| } |
| |
| absl::Status InferenceContext::Tune(TuningType tuning_type, |
| const GpuInfo& gpu_info, |
| ProfilingCommandQueue* profiling_queue) { |
| for (auto& node : nodes_) { |
| RETURN_IF_ERROR( |
| node.cl_operation.Tune(tuning_type, gpu_info, profiling_queue)); |
| } |
| return absl::OkStatus(); |
| } |
| |
| absl::Status InferenceContext::UpdateParams() { |
| for (auto& node : nodes_) { |
| RETURN_IF_ERROR(node.cl_operation.UpdateParams()); |
| } |
| return absl::OkStatus(); |
| } |
| |
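// Rebinds an external mutable tensor and updates every operation that reads or
// writes it.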
| absl::Status InferenceContext::SetTensor(const ValueId& tensor_id, |
| Tensor* tensor_ptr) { |
| auto it = external_mutable_tensors_.find(tensor_id); |
| if (it == external_mutable_tensors_.end()) { |
| return absl::InvalidArgumentError("No external tensor with this id."); |
| } |
| external_mutable_tensors_[tensor_id] = tensor_ptr; |
| for (int node_index : external_tensor_to_nodes_[tensor_id]) { |
| auto& node = nodes_[node_index]; |
| for (int i = 0; i < node.inputs.size(); ++i) { |
| if (node.inputs[i] == tensor_id) { |
| RETURN_IF_ERROR(node.cl_operation.SetSrcTensor(i, tensor_ptr)); |
| } |
| } |
| for (int i = 0; i < node.outputs.size(); ++i) { |
| if (node.outputs[i] == tensor_id) { |
| RETURN_IF_ERROR(node.cl_operation.SetDstTensor(i, tensor_ptr)); |
| } |
| } |
| } |
| return absl::OkStatus(); |
| } |
| |
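// Builds the mapping from each external mutable tensor to the nodes that
// consume or produce it, so SetTensor can rebind only the affected operations.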
| void InferenceContext::PrepareExternal() { |
| for (auto& external : external_mutable_tensors_) { |
| for (int i = 0; i < nodes_.size(); ++i) { |
| bool has_tensor = false; |
| const auto& src_ids = nodes_[i].inputs; |
      for (int j = 0; j < src_ids.size(); ++j) {
        if (src_ids[j] == external.first) {
| has_tensor = true; |
| } |
| } |
| const auto& dst_ids = nodes_[i].outputs; |
      for (int j = 0; j < dst_ids.size(); ++j) {
        if (dst_ids[j] == external.first) {
| has_tensor = true; |
| } |
| } |
| if (has_tensor) { |
| external_tensor_to_nodes_[external.first].push_back(i); |
| } |
| } |
| } |
| } |
| |
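// Enqueues all operations. Uses the recordable queue when supported; otherwise
// applies the vendor execution hints (manual event release, periodic flushes).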
| absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) { |
| if (recordable_queue_ && recordable_queue_->IsSupported()) { |
| return recordable_queue_->Execute(queue); |
| } |
| if (execution_hints_.need_manual_release) { |
| if (execution_hints_.prev_enqueue_start_point.is_valid()) { |
| execution_hints_.prev_enqueue_start_point.Wait(); |
| } |
| RETURN_IF_ERROR( |
| queue->EnqueueEvent(&execution_hints_.prev_enqueue_start_point)); |
| } |
| int counter = 0; |
| for (auto& node : nodes_) { |
| RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue)); |
| counter++; |
| if (execution_hints_.flush_periodically && |
| counter % execution_hints_.flush_period == 0) { |
| clFlush(queue->queue()); |
| } |
| } |
| if (execution_hints_.need_flush) { |
| clFlush(queue->queue()); |
| } |
| return absl::OkStatus(); |
| } |
| |
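// Measures per-dispatch execution time. On Mali and PowerVR each operation is
// re-enqueued multiple times (scaled by its first measured duration) to reduce
// measurement noise.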
| absl::Status InferenceContext::ProfileTime(ProfilingCommandQueue* queue, |
| ProfilingInfo* result) { |
| queue->ResetMeasurements(); |
| for (auto& node : nodes_) { |
| queue->SetEventsLabel(node.name); |
| RETURN_IF_ERROR(node.cl_operation.AddToQueue(queue)); |
| } |
| RETURN_IF_ERROR(queue->WaitForCompletion()); |
| *result = queue->GetProfilingInfo(); |
| |
| if (!(gpu_info_.IsMali() || gpu_info_.IsPowerVR())) { |
| return absl::OkStatus(); |
| } |
| |
| if (gpu_info_.IsMali()) { |
| queue->ResetMeasurements(); |
| for (int i = 0; i < nodes_.size(); ++i) { |
| queue->SetEventsLabel(nodes_[i].name); |
| const double times = |
| 16.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration); |
| const int n = std::min(256.0, std::max(2.0, times)); |
| RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n)); |
| } |
| RETURN_IF_ERROR(queue->WaitForCompletion()); |
| *result = queue->GetProfilingInfo(); |
| return absl::OkStatus(); |
| } |
| |
| if (gpu_info_.IsPowerVR()) { |
| queue->ResetMeasurements(); |
| for (int i = 0; i < nodes_.size(); ++i) { |
| queue->SetEventsLabel(nodes_[i].name); |
| const double times = |
| 32.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration); |
| const int n = std::min(64.0, std::max(4.0, times)); |
| RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n)); |
| } |
| RETURN_IF_ERROR(queue->WaitForCompletion()); |
| *result = queue->GetProfilingInfo(); |
| |
| queue->ResetMeasurements(); |
| for (int i = 0; i < nodes_.size(); ++i) { |
| queue->SetEventsLabel(nodes_[i].name); |
| const double times = |
| 128.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration); |
| const int n = std::min(1024.0, std::max(4.0, times)); |
| RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n)); |
| } |
| RETURN_IF_ERROR(queue->WaitForCompletion()); |
| *result = queue->GetProfilingInfo(); |
| return absl::OkStatus(); |
| } |
| |
| return absl::OkStatus(); |
| } |
| |
| absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue, |
| ProfilingInfo* result) { |
| RETURN_IF_ERROR(ProfileTime(queue, result)); |
| for (int i = 0; i < nodes_.size(); ++i) { |
| uint64_t read_size = 0; |
| for (auto& src_id : nodes_[i].inputs) { |
| read_size += GetTensor(src_id)->GetMemorySizeInBytes(); |
| } |
| const auto& gpu_op = nodes_[i].cl_operation.GetGpuOperation(); |
| read_size += gpu_op.const_args_size_; |
| uint64_t write_size = 0; |
| for (auto& dst_id : nodes_[i].outputs) { |
| write_size += GetTensor(dst_id)->GetMemorySizeInBytes(); |
| } |
| result->dispatches[i].flops = gpu_op.flops_; |
| result->dispatches[i].read_mem_size = read_size; |
| result->dispatches[i].write_mem_size = write_size; |
| } |
| |
| return absl::OkStatus(); |
| } |
| |
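// Total GPU memory allocated for intermediate tensors; sub-buffers are skipped
// because their memory is owned by the shared parent buffer.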
| uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors() |
| const { |
| uint64_t total_memory = 0; |
| for (const auto& t : strong_shape_tensors_) { |
| total_memory += t.second.GetMemorySizeInBytes(); |
| } |
| for (const auto& b : shared_buffers_) { |
| // Sub-buffers do not allocate memory. Count the size of the parent buffer |
| // object instead. |
| if (!b.IsSubBuffer()) { |
| total_memory += b.GetMemorySizeInBytes(); |
| } |
| } |
| for (const auto& t : variable_tensors_) { |
| total_memory += t.second.GetMemorySizeInBytes(); |
| } |
| total_memory += shared_buffers_parent_.GetMemorySizeInBytes(); |
| |
| return total_memory; |
| } |
| |
| uint64_t InferenceContext::GetConstantTensorsSize() const { |
| uint64_t total_size = 0; |
| for (const auto& node : nodes_) { |
| total_size += node.cl_operation.GetGpuOperation().const_args_size_; |
| } |
| return total_size; |
| } |
| |
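// Maps a graph value id to its concrete GPU tensor, checking external,
// constant, variable, shared-buffer and strong-shape storages in that order.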
| Tensor* InferenceContext::GetTensor(ValueId id) { |
| if (external_immutable_tensors_.find(id) != |
| external_immutable_tensors_.end()) { |
| return external_immutable_tensors_[id]; |
| } else if (external_mutable_tensors_.find(id) != |
| external_mutable_tensors_.end()) { |
| return external_mutable_tensors_[id]; |
| } else if (const_tensors_.find(id) != const_tensors_.end()) { |
| return &const_tensors_[id]; |
| } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) { |
| return &variable_tensors_[variable_ids_and_refs_[id]]; |
| } else if (graph_ids_to_shared_buffer_tensors_.find(id) != |
| graph_ids_to_shared_buffer_tensors_.end()) { |
| return &shared_buffer_tensors_[graph_ids_to_shared_buffer_tensors_[id]]; |
| } else { |
| return &strong_shape_tensors_[graph_ids_to_strong_shape_tensors_[id]]; |
| } |
| } |
| |
| absl::Status InferenceContext::SetInputTensor(ValueId id, |
| const TensorFloat32& tensor, |
| CLCommandQueue* queue) { |
| return GetTensor(id)->WriteData(queue, tensor); |
| } |
| |
| absl::Status InferenceContext::GetOutputTensor(ValueId id, |
| CLCommandQueue* queue, |
| TensorFloat32* result) { |
| const auto& gpu_tensor = *GetTensor(id); |
| const auto dst_shape = BHWC(gpu_tensor.Batch(), gpu_tensor.Height(), |
| gpu_tensor.Width(), gpu_tensor.Channels()); |
| result->id = id; |
| result->shape = dst_shape; |
| result->data.resize(dst_shape.DimensionsProduct()); |
| return gpu_tensor.ReadData(queue, result); |
| } |
| |
| void InferenceContext::ReleaseCPURepresentation() { |
| for (auto& node : nodes_) { |
| node.cl_operation.GetGpuOperation().args_.ReleaseCPURepresentation(); |
| } |
| const_tensors_descs_.clear(); |
| } |
| |
| } // namespace cl |
| } // namespace gpu |
| } // namespace tflite |