| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/lite/delegates/gpu/metal/inference_context.h" |
| |
| #include <map> |
| #include <vector> |
| |
| #include "absl/strings/substitute.h" |
| #include "tensorflow/lite/delegates/gpu/common/memory_management.h" |
| #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" |
| #include "tensorflow/lite/delegates/gpu/common/model.h" |
| #include "tensorflow/lite/delegates/gpu/common/precision.h" |
| #include "tensorflow/lite/delegates/gpu/common/shape.h" |
| #include "tensorflow/lite/delegates/gpu/common/status.h" |
| #include "tensorflow/lite/delegates/gpu/common/util.h" |
| #include "tensorflow/lite/delegates/gpu/metal/compute_task.h" |
| #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" |
| |
| using ::tflite::gpu::BHWC; |
| using ::tflite::gpu::metal::ComputeTaskDescriptorPtr; |
| using ::tflite::gpu::CalculationsPrecision; |
| using ::tflite::gpu::ValueId; |
| using ::tflite::gpu::AlignByN; |
| using ::tflite::gpu::HalfBits; |
| using ::tflite::gpu::MemoryStrategy; |
| using ::tflite::gpu::TensorUsageRecord; |
| |
| @implementation TFLInferenceContext { |
| std::vector<TFLComputeTask*> _computeTasks; |
| // contains indexes of _computeTasks |
| std::vector<int> _taskIdsWithInOutBuffers; |
| std::vector<ValueId> _inputIds; |
| std::vector<ValueId> _outputIds; |
| id<MTLDevice> _device; |
| CalculationsPrecision _precision; |
| std::map<ValueId, BHWC> _tensorShapes; |
| } |
| |
| - (absl::Status)compileModelWithDevice:(id<MTLDevice>)device |
| model:(const tflite::gpu::metal::CompiledModel&) compiledModel |
| inputBufferIDs:(const std::vector<tflite::gpu::ValueId>&)inputBufferIDs |
| outputBufferIDs:(const std::vector<tflite::gpu::ValueId>&)outputBufferIDs |
| precision:(tflite::gpu::CalculationsPrecision)precision { |
| _device = device; |
| _inputIds = inputBufferIDs; |
| _outputIds = outputBufferIDs; |
| _precision = precision; |
| // Metal resources are created here. |
| for (const auto& node : compiledModel.nodes) { |
| TFLComputeTask* task = [[TFLComputeTask alloc] init]; |
| RETURN_IF_ERROR([task compileWithDevice:_device |
| taskDescriptor:node |
| precision:_precision]); |
| [task setDescription:node.description]; |
| _computeTasks.emplace_back(task); |
| } |
| _tensorShapes = compiledModel.tensor_shapes; |
| [self allocateTensors]; |
| return absl::OkStatus(); |
| } |
| |
| - (absl::Status)allocateTensors { |
| // These maps contain all input/output/intermediate buffers shared across model. |
| std::map<ValueId, id<MTLBuffer>> buffers; |
| std::set<ValueId> preallocatedIds; |
| // Insert uninitialized input buffers. This buffers will be set externally. |
| for (auto tensor_id : _inputIds) { |
| buffers[tensor_id] = nil; |
| preallocatedIds.insert(tensor_id); |
| } |
| for (const auto& outputId : _outputIds) { |
| preallocatedIds.insert(outputId); |
| } |
| for (auto& task : _computeTasks) { |
| // The same device must be used here as well as on shader compilation stage. |
| RETURN_IF_ERROR([task updateParamsWithDevice:_device tensorShapes:_tensorShapes]); |
| } |
| |
| // TODO(ypisarchyk): it make sense to move it to separate function |
| // Generate usage records for each intermediate tensor in order of their first_task |
| std::vector<TensorUsageRecord<size_t>> usageRecords; |
| std::map<ValueId, size_t> usageRecordIds; |
| for (uint32_t i = 0; i < _computeTasks.size(); ++i) { |
| for (const auto tensor_id : [_computeTasks[i] getOutputIds]) { |
| if (!preallocatedIds.count(tensor_id)) { |
| if (!usageRecordIds.count(tensor_id)) { |
| const auto it = _tensorShapes.find(tensor_id); |
| if (it == _tensorShapes.end()) { |
| return absl::InternalError("Dimensions for intermediate tensor not found."); |
| } |
| usageRecordIds[tensor_id] = usageRecords.size(); |
| usageRecords.emplace_back(it->second.w * it->second.h * AlignByN(it->second.c, 4), i, i); |
| } else { |
| usageRecords[usageRecordIds[tensor_id]].last_task = i; |
| } |
| } |
| } |
| for (const auto tensor_id : [_computeTasks[i] getInputIds]) { |
| if (!preallocatedIds.count(tensor_id)) { |
| usageRecords[usageRecordIds[tensor_id]].last_task = i; |
| } |
| } |
| } |
| |
| tflite::gpu::ObjectsAssignment<size_t> assignment; |
| RETURN_IF_ERROR(AssignObjectsToTensors(usageRecords, MemoryStrategy::GREEDY_BEST, &assignment)); |
| auto objectsCount = assignment.object_sizes.size(); |
| std::vector<id<MTLBuffer>> sharedBuffers(objectsCount); |
| const bool f32_storage = _precision == CalculationsPrecision::F32; |
| size_t dataTypeSize = f32_storage ? sizeof(float) : sizeof(HalfBits); |
| |
| // allocate buffers for each shared object |
| for (size_t i = 0; i < objectsCount; ++i) { |
| // Initialize metal buffer |
| NSUInteger bufferSize = dataTypeSize * assignment.object_sizes[i]; |
| |
| #if (defined(__MAC_10_14) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_14) || \ |
| (defined(__IPHONE_12_0) && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_12_0) || \ |
| (defined(__TVOS_12_0) && __TV_OS_VERSION_MIN_REQUIRED >= __TVOS_12_0) |
| if (bufferSize > [_device maxBufferLength]) { |
| std::string error("Tensor id: "); |
| error += std::to_string(assignment.object_ids[i]) + |
| " with size: " + std::to_string(bufferSize) + |
| " exceeds MTLDevice maxBufferLength: " + std::to_string([_device maxBufferLength]); |
| return absl::ResourceExhaustedError(error); |
| } |
| #endif |
| #if defined(__MAC_10_12) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_12 |
| if ([_device currentAllocatedSize] + bufferSize > [_device recommendedMaxWorkingSetSize]) { |
| std::string error("Out of memory in MTLBuffer allocation. Currently allocated: "); |
| error += std::to_string([_device currentAllocatedSize]); |
| return absl::ResourceExhaustedError(error); |
| } |
| #endif |
| |
| sharedBuffers[i] = [_device newBufferWithLength:bufferSize |
| options:MTLResourceStorageModeShared]; |
| } |
| for (int i = 0; i < _computeTasks.size(); ++i) { |
| auto& task = _computeTasks[i]; |
| if ([task hasInOutIds:preallocatedIds]) { |
| _taskIdsWithInOutBuffers.push_back(i); |
| } |
| RETURN_IF_ERROR([task assignBuffers:&buffers |
| outputIds:_outputIds |
| usageRecordIds:usageRecordIds |
| sharedBufferIds:assignment.object_ids |
| sharedBuffers:sharedBuffers]); |
| } |
| return absl::OkStatus(); |
| } |
| |
| - (void)encodeWithEncoder:(id<MTLComputeCommandEncoder>)commandEncoder |
| inputOutputBuffers:(const std::map<ValueId, id<MTLBuffer>>&)inputOutputBuffers |
| encoderBlock:(id<MTLComputeCommandEncoder> (^)(bool isLast))encoderBlock { |
| for (auto& task_index : _taskIdsWithInOutBuffers) { |
| auto& task = _computeTasks[task_index]; |
| [task updateBuffers:inputOutputBuffers]; |
| } |
| for (int i = 0; i < _computeTasks.size(); ++i) { |
| auto& task = _computeTasks[i]; |
| [task encodeWithEncoder:commandEncoder]; |
| if (encoderBlock != nil) { |
| commandEncoder = encoderBlock(i == _computeTasks.size() - 1); |
| } |
| } |
| } |
| |
| @end |