/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.h"
#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/memory_management.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace cl {

namespace {
bool IsReady(const std::unordered_set<ValueId>& ready_tensors,
             const CLNode& node) {
  for (const ValueId in_id : node.inputs) {
    if (ready_tensors.find(in_id) == ready_tensors.end()) {
      return false;
    }
  }
  return true;
}

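// Collects (ValueId, TensorDescriptor) pairs for every input and output of a
// CLNode. For linked operations (indices >= 1) the inputs map to src_tensors
// starting at index 1, since src tensor 0 of a linked operation is the result
// of the operation it is linked to.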
std::vector<std::pair<ValueId, TensorDescriptor>> GetCLNodeTensors(
    const CLNode& node) {
  std::vector<std::pair<ValueId, TensorDescriptor>> result;
  const OperationDef main_def = node.operations[0]->GetDefinition();
  const auto& first_range = node.ranges[0];
  for (int k = first_range.x; k < first_range.y; ++k) {
    result.push_back({node.inputs[k], main_def.src_tensors[k - first_range.x]});
  }
  for (int j = 1; j < node.ranges.size(); ++j) {
    const auto& range = node.ranges[j];
    const OperationDef op_def = node.operations[j]->GetDefinition();
    for (int k = range.x; k < range.y; ++k) {
      result.push_back({node.inputs[k], op_def.src_tensors[k - range.x + 1]});
    }
  }
  for (int j = 0; j < node.outputs.size(); ++j) {
    result.push_back({node.outputs[j], main_def.dst_tensors[j]});
  }

  return result;
}

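// Merges src into dst: appends the inputs of src except the first one (the
// linking tensor produced by dst), appends its ranges with adjusted offsets,
// moves its operations over, and makes the output of src the output of dst.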
void MergeCLNodes(CLNode* src, CLNode* dst) {
  int offset = dst->inputs.size();
  for (int j = 1; j < src->inputs.size(); ++j) {
    dst->inputs.push_back(src->inputs[j]);
  }
  auto first_range = src->ranges[0];
  dst->ranges.push_back(
      int2(first_range.x + offset, first_range.y - 1 + offset));
  for (int i = 1; i < src->ranges.size(); ++i) {
    auto range = src->ranges[i];
    dst->ranges.push_back(int2(range.x + offset, range.y + offset));
  }
  dst->outputs[0] = src->outputs[0];
  for (int i = 0; i < src->operations.size(); ++i) {
    dst->operations.push_back(std::move(src->operations[i]));
  }
  dst->name += " linked : " + src->name;
}

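// Records the usage interval of a tensor: x is the first task index that
// touches it, y is the last one.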
void AddUsage(ValueId id, int task_index,
              std::map<ValueId, int2>* usage_records) {
  auto it = usage_records->find(id);
  if (it == usage_records->end()) {
    (*usage_records)[id].x = task_index;
    (*usage_records)[id].y = task_index;
  } else {
    (*usage_records)[id].y = task_index;
  }
}

// Returns true if the actual memory for this storage type is allocated with
// clCreateBuffer.
bool IsBufferBased(const TensorStorageType& type) {
  return type == TensorStorageType::BUFFER ||
         type == TensorStorageType::IMAGE_BUFFER;
}

// A generic add is an add with several runtime inputs, none of which are
// broadcast, i.e. a pointwise add of N tensors where N > 1.
bool IsGenericAdd(const Node& node, const std::vector<Value*>& inputs,
                  const std::vector<Value*>& outputs) {
  if (inputs.size() == 1) {
    return false;
  }
  const OperationType op_type = OperationTypeFromString(node.operation.type);
  if (op_type != OperationType::ADD) {
    return false;
  }

  const auto dst_shape = outputs[0]->tensor.shape;
  for (int i = 0; i < inputs.size(); ++i) {
    const auto src_shape = inputs[i]->tensor.shape;
    if (dst_shape.b != src_shape.b && src_shape.b == 1) {
      return false;
    }
    if (dst_shape.h != src_shape.h && src_shape.h == 1) {
      return false;
    }
    if (dst_shape.w != src_shape.w && src_shape.w == 1) {
      return false;
    }
    if (dst_shape.c != src_shape.c && src_shape.c == 1) {
      return false;
    }
  }
  return true;
}

}  // namespace


CLNode::CLNode(CLNode&& node)
    : operations(std::move(node.operations)),
      inputs(std::move(node.inputs)),
      outputs(std::move(node.outputs)),
      ranges(std::move(node.ranges)),
      name(std::move(node.name)) {}

CLNode& CLNode::operator=(CLNode&& node) {
  if (this != &node) {
    operations = std::move(node.operations);
    inputs = std::move(node.inputs);
    outputs = std::move(node.outputs);
    ranges = std::move(node.ranges);
    name = std::move(node.name);
  }
  return *this;
}

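// Builds the full execution pipeline from the graph: reserve tensor
// descriptors, convert graph nodes to GPU operations, fuse linkable
// elementwise operations, allocate and bind memory, then compile and tune the
// kernels.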
absl::Status InferenceContext::InitFromGraph(
    const CreateInferenceInfo& create_info, const GraphFloat32& graph,
    Environment* env) {
  CreationContext creation_context;
  creation_context.device = env->GetDevicePtr();
  creation_context.context = &env->context();
  creation_context.queue = env->queue();
  creation_context.cache = env->program_cache();

  ReserveGraphTensors(create_info, creation_context, graph);
  precision_ = create_info.precision;
  storage_type_ = create_info.storage_type;
  if (env->device().IsMali()) {
    need_flush_ = true;
    need_manual_release_ = true;

    flush_periodically_ = true;
    flush_period_ = 24;
  }
  if (env->device().IsPowerVR()) {
    need_flush_ = true;
  }
  CopyInAndOutIds(graph);
  RETURN_IF_ERROR(
      ConvertOperations(creation_context, graph, create_info.hints));
  Merge();
  RETURN_IF_ERROR(AllocateMemory(env->device(), creation_context.context));
  BindMemoryToOperations();
  RETURN_IF_ERROR(Compile(creation_context));
  RETURN_IF_ERROR(UpdateParams());

  TuningParameters tuning_parameters;
  tuning_parameters.queue = env->profiling_queue();
  tuning_parameters.info = env->device().GetInfoPtr();
  if (create_info.hints.Check(ModelHints::kFastTuning)) {
    tuning_parameters.tuning_type = TuningType::FAST;
  }
  RETURN_IF_ERROR(Tune(tuning_parameters));
  return absl::OkStatus();
}

absl::Status InferenceContext::InitFromGraphWithTransforms(
    const CreateInferenceInfo& create_info, GraphFloat32* graph,
    Environment* env) {
  RETURN_IF_ERROR(RunGraphTransforms(graph));
  RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env));
  return absl::OkStatus();
}

void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) {
  const auto inputs = graph.inputs();
  for (const auto& input : inputs) {
    input_ids_.push_back(input->id);
  }

  const auto outputs = graph.outputs();
  for (const auto& output : outputs) {
    output_ids_.push_back(output->id);
  }
}

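// Picks a storage type and layout for every tensor in the graph and registers
// them in tensor_reserver_. Graph inputs/outputs with fewer than 4 channels
// may be promoted to SINGLE_TEXTURE_2D when the device supports it for the
// given shape.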
void InferenceContext::ReserveGraphTensors(
    const CreateInferenceInfo& create_info,
    const CreationContext& creation_context, const GraphFloat32& graph) {
  ValueId max_id = 0;
  auto tensors = graph.values();
  auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
  for (auto& t : tensors) {
    TensorStorageType storage_type = create_info.storage_type;
    const auto shape = graph.GetValue(t->id)->tensor.shape;
    Layout layout = shape.b == 1 ? Layout::HWC : Layout::BHWC;
    if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
      if (shape.c < 4 &&
          CanCreateTensorWithShape(
              *creation_context.context, *creation_context.device, shape,
              TensorDescriptor{data_type, TensorStorageType::SINGLE_TEXTURE_2D,
                               layout})) {
        storage_type = TensorStorageType::SINGLE_TEXTURE_2D;
      }
    }
    storage_type = SelectBestStorageType(*creation_context.context,
                                         *creation_context.device, shape,
                                         storage_type, data_type, layout);
    tensor_reserver_.Add(
        t->id, {shape, TensorDescriptor{data_type, storage_type, layout}});
    max_id = std::max(max_id, t->id);
  }
  tensor_reserver_.SetNext(max_id + 1);
}

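// Translates every graph node into one or more GPU operations (a node can be
// expanded into a small subgraph with additional intermediate tensors) and
// records them as CLNodes in execution order.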
absl::Status InferenceContext::ConvertOperations(
    const CreationContext& creation_context, const GraphFloat32& graph,
    ModelHints hints) {
  std::vector<Node*> graph_nodes = graph.nodes();
  std::map<ValueId, int> tensor_usages;  // keeps the index of the last
                                         // operation that wrote each tensor
  for (const auto& input_id : input_ids_) {
    tensor_usages[input_id] = -1;  // graph inputs are "written" before
                                   // operation 0, so mark them with -1
  }
  for (int i = 0; i < graph_nodes.size(); ++i) {
    const Node& node = *graph_nodes[i];
    auto inputs = graph.FindInputs(node.id);
    auto outputs = graph.FindOutputs(node.id);

    // Reorder the input ids and update the temporary tensor_usages map.
    // This stage is necessary because the OperationDef we build relies on the
    // order of input ids, and the input in the first position is the one that
    // can potentially become a "linking" tensor and be eliminated (unused).
    // We apply the reordering only to ADD operations, because ADD is
    // associative and can be linked. In the current approach the "linking"
    // tensor can only be the most recently written tensor (in linear execution
    // order) among the inputs.
    if (IsGenericAdd(node, inputs, outputs)) {
      int latest_written_tensor_index = 0;
      int last_usage = tensor_usages[inputs[0]->id];
      for (int j = 1; j < inputs.size(); ++j) {
        if (tensor_usages[inputs[j]->id] > last_usage) {
          last_usage = tensor_usages[inputs[j]->id];
          latest_written_tensor_index = j;
        }
      }
      std::swap(inputs[0], inputs[latest_written_tensor_index]);
    }
    for (const auto& out_id : outputs) {
      tensor_usages[out_id->id] = i;
    }

    OperationDef op_def;
    op_def.precision = precision_;
    for (int j = 0; j < inputs.size(); ++j) {
      op_def.src_tensors.push_back(
          tensor_reserver_.Get(inputs[j]->id).descriptor);
    }
    for (int j = 0; j < outputs.size(); ++j) {
      op_def.dst_tensors.push_back(
          tensor_reserver_.Get(outputs[j]->id).descriptor);
    }
    GPUOperationsSubgraph gpu_subgraph;
    RETURN_IF_ERROR(GPUOperationFromNode(creation_context, op_def, hints,
                                         inputs, outputs, node, &gpu_subgraph));
    std::unordered_map<int, ValueId> mapping_to_global_ids;
    for (int j = 0; j < gpu_subgraph.new_tensors.size(); ++j) {
      const auto& t = gpu_subgraph.new_tensors[j];
      auto global_id = tensor_reserver_.Add({t.first, t.second});
      mapping_to_global_ids[j] = global_id;
    }
    for (auto& gpu_op : gpu_subgraph.operations) {
      CLNode cl_node;
      cl_node.operations.push_back(std::move(gpu_op.operation));
      cl_node.ranges.push_back(
          int2(0, static_cast<int>(gpu_op.input_ids.size())));
      cl_node.inputs.resize(gpu_op.input_ids.size());
      for (int j = 0; j < gpu_op.input_ids.size(); ++j) {
        int id = gpu_op.input_ids[j];
        if (id >= 0) {
          // Non-negative ids reference tensors of the original graph.
          cl_node.inputs[j] = inputs[id]->id;
        } else {
          // Negative ids reference new tensors created for this subgraph.
          cl_node.inputs[j] = mapping_to_global_ids[-(id + 1)];
        }
      }
      cl_node.outputs.resize(gpu_op.output_ids.size());
      for (int j = 0; j < gpu_op.output_ids.size(); ++j) {
        int id = gpu_op.output_ids[j];
        if (id >= 0) {
          cl_node.outputs[j] = outputs[id]->id;
        } else {
          cl_node.outputs[j] = mapping_to_global_ids[-(id + 1)];
        }
      }
      cl_node.name = node.operation.type + " " + std::to_string(node.id);
      nodes_.push_back(std::move(cl_node));
    }
  }

  return absl::OkStatus();
}

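// Fuses a linkable elementwise operation into its producer when the producer
// has a single output that is consumed only by that operation (as its first
// input) and both destination descriptors match. Afterwards, all linked
// operations of a CLNode are attached to its root operation.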
void InferenceContext::Merge() {
  std::unordered_set<ValueId> ready_tensors;
  for (const auto& input_id : input_ids_) {
    ready_tensors.insert(input_id);
  }
  for (int i = 0; i < nodes_.size(); ++i) {
    auto& node = nodes_[i];
    for (const auto& out_id : node.outputs) {
      ready_tensors.insert(out_id);
    }
    if (node.outputs.size() != 1) {
      continue;
    }
    std::vector<int> next_nodes;
    int link_index = 0;
    for (int j = i + 1; j < nodes_.size(); ++j) {
      for (int k = 0; k < nodes_[j].inputs.size(); ++k) {
        if (nodes_[j].inputs[k] == node.outputs[0]) {
          next_nodes.push_back(j);
          link_index = k;
        }
      }
    }
    if (next_nodes.size() != 1 || link_index != 0) {
      continue;
    }
    auto& linkable_node = nodes_[next_nodes[0]];
    auto* elementwise =
        dynamic_cast<ElementwiseOperation*>(linkable_node.operations[0].get());
    if (!elementwise || !elementwise->IsLinkable() ||
        linkable_node.outputs.size() != 1 ||
        !IsReady(ready_tensors, linkable_node)) {
      continue;
    }
    const auto& original_dst_def =
        node.operations[0]->GetDefinition().dst_tensors[0];
    const auto& link_dst_def =
        linkable_node.operations[0]->GetDefinition().dst_tensors[0];
    if (original_dst_def != link_dst_def) {
      continue;
    }
    MergeCLNodes(&linkable_node, &node);
    nodes_.erase(nodes_.begin() + next_nodes[0]);
    i -= 1;
  }
  for (auto& node : nodes_) {
    for (int j = 1; j < node.operations.size(); ++j) {
      auto* elementwise =
          dynamic_cast<ElementwiseOperation*>(node.operations[j].get());
      node.operations[0]->AddOperation(elementwise);
    }
  }
}

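// Builds usage intervals for all tensors whose descriptors match the given
// predicate. Graph inputs count as used by task 0 and graph outputs by the
// task index right after the last node.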
void InferenceContext::GetUsages(
    const std::function<bool(const TensorDescriptor&)>& functor,
    std::map<ValueId, int2>* usages) {
  for (ValueId in_id : input_ids_) {
    const auto& desc = tensor_reserver_.Get(in_id).descriptor;
    if (functor(desc)) {
      AddUsage(in_id, 0, usages);
    }
  }
  for (int op_index = 0; op_index < nodes_.size(); ++op_index) {
    auto tensors = GetCLNodeTensors(nodes_[op_index]);
    for (auto& tensor : tensors) {
      if (functor(tensor.second)) {
        AddUsage(tensor.first, op_index, usages);
      }
    }
  }
  for (ValueId out_id : output_ids_) {
    const auto& desc = tensor_reserver_.Get(out_id).descriptor;
    if (functor(desc)) {
      AddUsage(out_id, nodes_.size(), usages);
    }
  }
}

absl::Status InferenceContext::AllocateMemory(const CLDevice& device,
                                              CLContext* context) {
  RETURN_IF_ERROR(AllocateMemoryForBuffers(device, context));
  RETURN_IF_ERROR(AllocateMemoryForStrongShapes(device, context));
  return absl::OkStatus();
}

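// Allocates memory for all buffer-based tensors. Usage intervals feed the
// GREEDY_BEST assignment so tensors with non-overlapping lifetimes can share
// the same underlying buffer; shared tensors are then created on top of the
// assigned buffers.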
absl::Status InferenceContext::AllocateMemoryForBuffers(const CLDevice& device,
                                                        CLContext* context) {
  std::map<ValueId, int2> buffer_usages;
  GetUsages(
      [](const TensorDescriptor& t) { return IsBufferBased(t.storage_type); },
      &buffer_usages);

  std::vector<TensorUsageRecord<size_t>> buffer_usage_records;
  for (auto& usage : buffer_usages) {
    const auto& t = tensor_reserver_.Get(usage.first);
    const auto& shape = t.shape;
    const auto& descriptor = t.descriptor;
    const size_t element_size =
        descriptor.data_type == DataType::FLOAT32 ? 4 : 2;
    const size_t buffer_size =
        shape.b * shape.w * shape.h * AlignByN(shape.c, 4) * element_size;
    graph_ids_to_shared_buffer_tensors_[usage.first] =
        buffer_usage_records.size();
    buffer_usage_records.push_back({buffer_size,
                                    static_cast<TaskId>(usage.second.x),
                                    static_cast<TaskId>(usage.second.y)});
  }

  ObjectsAssignment<size_t> buffer_assignment;
  RETURN_IF_ERROR(AssignObjectsToTensors(
      buffer_usage_records, MemoryStrategy::GREEDY_BEST, &buffer_assignment));

  shared_buffers_.resize(buffer_assignment.object_sizes.size());
  for (int i = 0; i < buffer_assignment.object_sizes.size(); ++i) {
    RETURN_IF_ERROR(CreateReadWriteBuffer(buffer_assignment.object_sizes[i],
                                          context, &shared_buffers_[i]));
  }

  std::vector<bool> created_tensors(buffer_usage_records.size(), false);
  shared_buffer_tensors_.resize(buffer_usage_records.size());
  for (auto& node : nodes_) {
    auto tensors = GetCLNodeTensors(node);
    for (auto& t : tensors) {
      if (!IsBufferBased(t.second.storage_type)) continue;
      const int tensor_index = graph_ids_to_shared_buffer_tensors_[t.first];
      if (created_tensors[tensor_index]) continue;
      const auto& shape = tensor_reserver_.Get(t.first).shape;
      const int buffer_index = buffer_assignment.object_ids[tensor_index];
      RETURN_IF_ERROR(CreateSharedTensor(
          *context, shared_buffers_[buffer_index].GetMemoryPtr(), shape,
          t.second, &shared_buffer_tensors_[tensor_index]));
      created_tensors[tensor_index] = true;
    }
  }
  return absl::OkStatus();
}

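// Allocates the remaining (non buffer-based) tensors. With the EQUALITY
// strategy a tensor object is reused only by tensors with exactly the same
// shape and descriptor.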
absl::Status InferenceContext::AllocateMemoryForStrongShapes(
    const CLDevice& device, CLContext* context) {
  std::map<ValueId, int2> usages;
  GetUsages(
      [](const TensorDescriptor& t) { return !IsBufferBased(t.storage_type); },
      &usages);

  std::vector<TensorUsageRecord<DummyTensor>> usage_records;
  std::map<ValueId, ValueId> remap_from_graph_ids;
  for (auto& usage : usages) {
    remap_from_graph_ids[usage.first] = usage_records.size();
    usage_records.push_back({tensor_reserver_.Get(usage.first),
                             static_cast<TaskId>(usage.second.x),
                             static_cast<TaskId>(usage.second.y)});
  }

  ObjectsAssignment<DummyTensor> assignment;
  RETURN_IF_ERROR(AssignObjectsToTensors(
      usage_records, MemoryStrategy::EQUALITY, &assignment));

  for (auto& node : nodes_) {
    auto tensors = GetCLNodeTensors(node);
    for (auto& t : tensors) {
      if (IsBufferBased(t.second.storage_type)) continue;
      const auto& shape = tensor_reserver_.Get(t.first).shape;
      const auto id = assignment.object_ids[remap_from_graph_ids[t.first]];
      graph_ids_to_strong_shape_tensors_[t.first] = id;
      const auto& it = strong_shape_tensors_.find(id);
      if (it == strong_shape_tensors_.end()) {
        RETURN_IF_ERROR(CreateTensor(*context, device, shape, t.second,
                                     &strong_shape_tensors_[id]));
      }
    }
  }
  return absl::OkStatus();
}

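// Resolves every recorded ValueId to its allocated tensor and passes it to the
// corresponding operation. For linked operations the +1 offset skips src
// index 0, which is the linking tensor.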
void InferenceContext::BindMemoryToOperations() {
  for (auto& node : nodes_) {
    const auto& first_range = node.ranges[0];
    for (int k = first_range.x; k < first_range.y; ++k) {
      node.operations[0]->SetSrc(GetTensor(node.inputs[k]), k - first_range.x);
    }
    for (int i = 1; i < node.ranges.size(); ++i) {
      const auto& range = node.ranges[i];
      for (int k = range.x; k < range.y; ++k) {
        node.operations[i]->SetSrc(GetTensor(node.inputs[k]), k - range.x + 1);
      }
    }

    for (int i = 0; i < node.outputs.size(); ++i) {
      node.operations[0]->SetDst(GetTensor(node.outputs[i]), i);
    }
  }
}

absl::Status InferenceContext::Compile(
    const CreationContext& creation_context) {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.operations[0]->Compile(creation_context));
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::Tune(
    const TuningParameters& tuning_parameters) {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.operations[0]->Tune(tuning_parameters));
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::UpdateParams() {
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.operations[0]->UpdateParams());
  }
  return absl::OkStatus();
}

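// Enqueues all operations. When manual release is needed (e.g. on Mali), the
// previously enqueued start point is waited on first; depending on the flags
// set in InitFromGraph, the queue may also be flushed periodically and/or
// after the last operation.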
absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
  if (need_manual_release_) {
    if (prev_enqueue_start_point_.is_valid()) {
      prev_enqueue_start_point_.Wait();
    }
    RETURN_IF_ERROR(queue->EnqueueEvent(&prev_enqueue_start_point_));
  }
  int counter = 0;
  for (auto& node : nodes_) {
    RETURN_IF_ERROR(node.operations[0]->AddToQueue(queue));
    counter++;
    if (flush_periodically_ && counter % flush_period_ == 0) {
      clFlush(queue->queue());
    }
  }
  if (need_flush_) {
    clFlush(queue->queue());
  }
  return absl::OkStatus();
}

absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
                                       ProfilingInfo* result) {
  queue->ResetMeasurements();
  for (auto& node : nodes_) {
    queue->SetEventsLabel(node.name);
    RETURN_IF_ERROR(node.operations[0]->AddToQueue(queue));
  }
  RETURN_IF_ERROR(queue->WaitForCompletion());
  *result = queue->GetProfilingInfo();
  return absl::OkStatus();
}

uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
    const {
  uint64_t total_memory = 0;
  for (const auto& t : strong_shape_tensors_) {
    total_memory += t.second.GetMemorySizeInBytes();
  }
  for (const auto& b : shared_buffers_) {
    total_memory += b.GetMemorySizeInBytes();
  }

  return total_memory;
}

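// Returns the runtime tensor for a graph ValueId, looking first among tensors
// placed in shared buffers and then among strong-shape tensors.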
Tensor* InferenceContext::GetTensor(ValueId id) {
  if (graph_ids_to_shared_buffer_tensors_.find(id) !=
      graph_ids_to_shared_buffer_tensors_.end()) {
    return &shared_buffer_tensors_[graph_ids_to_shared_buffer_tensors_[id]];
  } else {
    return &strong_shape_tensors_[graph_ids_to_strong_shape_tensors_[id]];
  }
}

absl::Status InferenceContext::SetInputTensor(ValueId id,
                                              const TensorFloat32& tensor,
                                              CLCommandQueue* queue) {
  return GetTensor(id)->WriteData(queue, tensor);
}

absl::Status InferenceContext::GetOutputTensor(ValueId id,
                                               CLCommandQueue* queue,
                                               TensorFloat32* result) {
  const auto& gpu_tensor = *GetTensor(id);
  const auto dst_shape = BHWC(gpu_tensor.Batch(), gpu_tensor.Height(),
                              gpu_tensor.Width(), gpu_tensor.Channels());
  result->id = id;
  result->shape = dst_shape;
  result->data.resize(dst_shape.DimensionsProduct());
  return gpu_tensor.ReadData(queue, result);
}

absl::Status RunGraphTransforms(GraphFloat32* graph) {
  auto merge_padding_transform = NewMergePaddingWithAdd();
  auto add_bias_transform = NewAddBias();
  ModelTransformer transformer(graph, /*reporter=*/nullptr);
  if (!transformer.Apply("add_bias", add_bias_transform.get())) {
    return absl::InternalError("Invalid add_bias transform");
  }
  if (!transformer.Apply("merge_padding", merge_padding_transform.get())) {
    return absl::InternalError("Invalid merge_padding transform");
  }
  return absl::OkStatus();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite