/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/delegate.h"
#include <cstdint>
#include <memory>
#include <thread> // NOLINT(build/c++11)
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/types/span.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/gpu/api.h"
#include "tensorflow/lite/delegates/gpu/cl/api.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/quantization_util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/minimal_logging.h"
#ifndef CL_DELEGATE_NO_GL
#include "tensorflow/lite/delegates/gpu/gl/api2.h"
#endif
namespace tflite {
namespace gpu {
namespace {
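// Maps the public C API enum values onto the internal InferencePriority and
// InferenceUsage enums; unrecognized values fall back to UNKNOWN.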
InferencePriority ToPriority(int32_t priority) {
switch (priority) {
case TFLITE_GPU_INFERENCE_PRIORITY_AUTO:
return InferencePriority::AUTO;
case TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION:
return InferencePriority::MAX_PRECISION;
case TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY:
return InferencePriority::MIN_LATENCY;
case TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE:
return InferencePriority::MIN_MEMORY_USAGE;
}
return InferencePriority::UNKNOWN;
}
InferenceUsage ToUsage(int32_t usage) {
switch (usage) {
case TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER:
return InferenceUsage::FAST_SINGLE_ANSWER;
case TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED:
return InferenceUsage::SUSTAINED_SPEED;
}
return InferenceUsage::UNKNOWN;
}
// Forward declarations.
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate);
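// Wraps the TfLiteDelegate C struct together with the user-provided options.
// A single Delegate instance is shared by all DelegateKernel instances
// created for the delegated partitions of a model.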
class Delegate {
public:
explicit Delegate(const TfLiteGpuDelegateOptionsV2* options)
: num_delegate_kernels_(0) {
delegate_.data_ = reinterpret_cast<void*>(this);
delegate_.Prepare = DelegatePrepare;
delegate_.CopyFromBufferHandle = nullptr;
delegate_.CopyToBufferHandle = nullptr;
delegate_.FreeBufferHandle = nullptr;
delegate_.flags = kTfLiteDelegateFlagsNone;
options_ = options ? *options : TfLiteGpuDelegateOptionsV2Default();
if (options_.max_delegated_partitions <= 0) {
options_.max_delegated_partitions = 1;
}
}
TfLiteDelegate* tflite_delegate() { return &delegate_; }
const TfLiteGpuDelegateOptionsV2& options() const { return options_; }
bool IsQuantOpsAllowed() const {
return options_.experimental_flags &
TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
}
int MaxDelegatedPartitions() const {
return options_.max_delegated_partitions;
}
int num_delegate_kernels() const { return num_delegate_kernels_; }
private:
TfLiteDelegate delegate_;
TfLiteGpuDelegateOptionsV2 options_;
int num_delegate_kernels_ = 0;
friend class DelegateKernel;
};
// Represents the execution of a subset of nodes on the GPU.
class DelegateKernel {
public:
explicit DelegateKernel(Delegate* delegate) : delegate_(delegate) {
++delegate_->num_delegate_kernels_;
}
~DelegateKernel() { --delegate_->num_delegate_kernels_; }
absl::Status Prepare(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params) {
thread_id_prepare_ = std::this_thread::get_id();
    // Extract the TFLite delegate execution plan from the context and
    // convert it into a GraphFloat32.
GraphFloat32 graph;
std::vector<uint32_t> input_refs;
std::vector<uint32_t> output_refs;
RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph,
&input_refs, &output_refs));
std::unique_ptr<InferenceBuilder> builder;
    bool graph_is_destroyed = false;
const int experimental_flags = delegate_->options().experimental_flags;
if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY) {
RETURN_IF_ERROR(
InitializeOpenClApi(&graph, &builder, &graph_is_destroyed));
} else if (experimental_flags & TFLITE_GPU_EXPERIMENTAL_FLAGS_GL_ONLY) {
RETURN_IF_ERROR(InitializeOpenGlApi(&graph, &builder));
} else {
      // By default, try OpenCL first and fall back to OpenGL if that fails.
absl::Status status =
InitializeOpenClApi(&graph, &builder, &graph_is_destroyed);
if (!status.ok()) {
        TF_LITE_KERNEL_LOG(context, "%s",
                           std::string(status.message()).c_str());
TF_LITE_KERNEL_LOG(context, "Falling back to OpenGL");
        // The graph needs to be re-created because it was moved above.
GraphFloat32 graph2;
if (graph_is_destroyed) {
RETURN_IF_ERROR(InitializeGraph(context, delegate_params, &graph2,
&input_refs, &output_refs));
}
RETURN_IF_ERROR(InitializeOpenGlApi(
graph_is_destroyed ? &graph2 : &graph, &builder));
}
}
    // At this point TFLite has not allocated tensors yet; therefore, collect
    // the indices here and bind the actual input/output tensors later.
input_indices_.reserve(input_refs.size());
for (uint32_t tensor_index : input_refs) {
const int64_t object_index = input_indices_.size();
input_indices_.push_back(tensor_index);
RETURN_IF_ERROR(
builder->SetInputObjectDef(object_index, GetObjectDef(tensor_index)));
}
output_indices_.reserve(output_refs.size());
for (uint32_t tensor_index : output_refs) {
const int64_t object_index = output_indices_.size();
output_indices_.push_back(tensor_index);
RETURN_IF_ERROR(builder->SetOutputObjectDef(object_index,
GetObjectDef(tensor_index)));
}
return builder->Build(&runner_);
}
// This directs the runtime to allocate memory for input/output temporary
// tensors that require dequantization/quantization.
absl::Status GetRequiredTemporaries(TfLiteContext* context, TfLiteNode* node,
TfLiteIntArray** temporaries_array_ptr) {
if (quant_conversion_map_.empty()) return absl::OkStatus();
std::vector<int> temporary_tensors;
for (auto index : input_indices_) {
if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
temporary_tensors.push_back(index);
}
}
for (auto index : output_indices_) {
if (quant_conversion_map_.find(index) != quant_conversion_map_.end()) {
temporary_tensors.push_back(index);
}
}
*temporaries_array_ptr = TfLiteIntArrayCreate(temporary_tensors.size());
for (int i = 0; i < temporary_tensors.size(); ++i) {
(*temporaries_array_ptr)->data[i] = temporary_tensors[i];
}
return absl::OkStatus();
}
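  // Runs GPU inference for this partition: dequantizes 8-bit inputs if
  // needed, binds the CPU buffers, executes the runner, and re-quantizes
  // the outputs.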
absl::Status Invoke(TfLiteContext* context) {
if (thread_id_prepare_ != std::this_thread::get_id()) {
TFLITE_LOG(tflite::TFLITE_LOG_WARNING,
"GpuDelegate invoke thread != prepare thread");
if (enforce_same_thread_) {
return absl::FailedPreconditionError(
"GpuDelegate must run on the same thread where it was "
"initialized.");
}
}
const bool is_dequant_required = !quant_conversion_map_.empty();
if (is_dequant_required) {
RETURN_IF_ERROR(
DequantizeInputs(context, input_indices_, quant_conversion_map_));
}
RETURN_IF_ERROR(SetInputsAndOutputs(context));
RETURN_IF_ERROR(runner_->Run());
if (is_dequant_required) {
RETURN_IF_ERROR(
QuantizeOutputs(context, output_indices_, quant_conversion_map_));
}
return absl::OkStatus();
}
private:
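  // Binds the CPU buffers of the TFLite tensors to the runner. Because the
  // tensors are not yet allocated when Prepare() runs, this is done on
  // every Invoke() (see the TODO in the .prepare callback below).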
absl::Status SetInputsAndOutputs(TfLiteContext* context) {
for (int i = 0; i < input_indices_.size(); ++i) {
RETURN_IF_ERROR(runner_->SetInputObject(
i, GetTensorObject(input_indices_[i], context)));
}
for (int i = 0; i < output_indices_.size(); ++i) {
RETURN_IF_ERROR(runner_->SetOutputObject(
i, GetTensorObject(output_indices_[i], context)));
}
return absl::OkStatus();
}
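  // All inputs and outputs are exposed to TFLite as user-provided float32
  // CPU memory in BHWC layout; 8-bit tensors are handled by converting
  // to/from the float tensors recorded in quant_conversion_map_.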
ObjectDef GetObjectDef(int index) const {
ObjectDef default_object_def;
default_object_def.data_type = DataType::FLOAT32;
default_object_def.data_layout = DataLayout::BHWC;
default_object_def.object_type = ObjectType::CPU_MEMORY;
default_object_def.user_provided = true;
return default_object_def;
}
TensorObject GetTensorObject(int index, TfLiteContext* context) const {
auto& tensor = context->tensors[index];
return MakeCpuMemory(absl::MakeSpan(tensor.data.raw, tensor.bytes));
}
private:
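  // Converts the delegated TFLite subgraph into a GraphFloat32 and records
  // the TFLite tensor indices of the graph inputs and outputs.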
absl::Status InitializeGraph(TfLiteContext* context,
const TfLiteDelegateParams* delegate_params,
GraphFloat32* graph,
std::vector<uint32_t>* input_refs,
std::vector<uint32_t>* output_refs) {
quant_conversion_map_.clear();
if (delegate_->IsQuantOpsAllowed()) {
RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph,
&quant_conversion_map_));
} else {
RETURN_IF_ERROR(BuildFinalModel(context, delegate_params, graph));
}
input_refs->clear();
output_refs->clear();
const auto inputs = graph->inputs();
input_refs->reserve(inputs.size());
for (const auto& input : inputs) {
input_refs->push_back(input->tensor.ref);
}
const auto outputs = graph->outputs();
output_refs->reserve(outputs.size());
for (const auto& output : outputs) {
output_refs->push_back(output->tensor.ref);
}
return absl::OkStatus();
}
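  // Builds an OpenCL-based inference builder from |graph|. The builder call
  // consumes the graph, so |graph_is_destroyed| is set to true right before
  // it, even when the call itself fails.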
absl::Status InitializeOpenClApi(GraphFloat32* graph,
std::unique_ptr<InferenceBuilder>* builder,
bool* graph_is_destroyed) {
*graph_is_destroyed = false;
cl::InferenceEnvironmentOptions env_options;
cl::InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(cl::NewInferenceEnvironment(env_options, &cl_environment_,
&properties));
auto delegate_options = delegate_->options();
cl::InferenceOptions options;
    // If is_precision_loss_allowed == -1, rely solely on the inference
    // priorities and ignore is_precision_loss_allowed.
if (delegate_options.is_precision_loss_allowed == -1) {
options.priority1 = ToPriority(delegate_options.inference_priority1);
options.priority2 = ToPriority(delegate_options.inference_priority2);
options.priority3 = ToPriority(delegate_options.inference_priority3);
} else {
      // The user set is_precision_loss_allowed explicitly, so honor that
      // value.
if (delegate_options.is_precision_loss_allowed == 0) {
options.priority1 = InferencePriority::MAX_PRECISION;
} else {
options.priority1 = InferencePriority::MIN_LATENCY;
}
}
options.usage = ToUsage(delegate_options.inference_preference);
*graph_is_destroyed = true;
RETURN_IF_ERROR(cl_environment_->NewInferenceBuilder(
options, std::move(*graph), builder));
TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
"Initialized OpenCL-based API.");
return absl::OkStatus();
}
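  // Builds an OpenGL-based inference builder. The OpenGL path requires
  // Invoke() to run on the thread where it was initialized, so
  // enforce_same_thread_ is set on success.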
absl::Status InitializeOpenGlApi(GraphFloat32* graph,
std::unique_ptr<InferenceBuilder>* builder) {
#ifndef CL_DELEGATE_NO_GL
gl::InferenceEnvironmentOptions env_options;
gl::InferenceEnvironmentProperties properties;
RETURN_IF_ERROR(
NewInferenceEnvironment(env_options, &gl_environment_, &properties));
auto delegate_options = delegate_->options();
gl::InferenceOptions options;
options.usage = ToUsage(delegate_options.inference_preference);
options.priority1 = ToPriority(delegate_options.inference_priority1);
options.priority2 = ToPriority(delegate_options.inference_priority2);
options.priority3 = ToPriority(delegate_options.inference_priority3);
RETURN_IF_ERROR(gl_environment_->NewInferenceBuilder(std::move(*graph),
options, builder));
enforce_same_thread_ = true;
TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
"Initialized OpenGL-based API.");
return absl::OkStatus();
#else
return absl::UnavailableError("OpenGL-based API disabled");
#endif
}
// The Delegate instance that's shared across all DelegateKernel instances.
Delegate* const delegate_; // doesn't own the memory.
std::unique_ptr<cl::InferenceEnvironment> cl_environment_;
#ifndef CL_DELEGATE_NO_GL
std::unique_ptr<gl::InferenceEnvironment> gl_environment_;
#endif
std::unique_ptr<InferenceRunner> runner_;
std::vector<int64_t> input_indices_;
std::vector<int64_t> output_indices_;
  // Whenever quantized inference is enabled, this maps the tensor index of
  // each originally quantized (8-bit) tensor to the index of its float
  // version added in model_builder, and vice versa.
absl::flat_hash_map<int, int> quant_conversion_map_;
  std::thread::id thread_id_prepare_;  // Thread id on which Prepare() ran.
  bool enforce_same_thread_ = false;  // Require Invoke() on the same thread.
};
inline DelegateKernel* GetDelegateKernel(TfLiteNode* node) {
return reinterpret_cast<DelegateKernel*>(node->user_data);
}
inline Delegate* GetDelegate(TfLiteDelegate* delegate) {
return reinterpret_cast<Delegate*>(delegate->data_);
}
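// Partitions the graph: GetOpsToReplace() selects the GPU-supported node
// subsets, which are then replaced with delegate kernels registered through
// kRegistration.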
TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) {
const TfLiteRegistration kRegistration = {
// .init
[](TfLiteContext* context, const char* buffer, size_t) -> void* {
const auto* params =
reinterpret_cast<const TfLiteDelegateParams*>(buffer);
auto* gpu_delegate = GetDelegate(params->delegate);
        // Everything below should ideally happen in the .prepare callback,
        // but TFLite does not allow that.
auto gpu_delegate_kernel =
absl::make_unique<DelegateKernel>(gpu_delegate);
const auto status = gpu_delegate_kernel->Prepare(context, params);
if (!status.ok()) {
TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Init: %s",
std::string(status.message()).c_str());
return nullptr;
}
return gpu_delegate_kernel.release();
},
// .free
[](TfLiteContext*, void* buffer) -> void {
delete reinterpret_cast<DelegateKernel*>(buffer);
},
// .prepare
[](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
if (!node->user_data) {
TF_LITE_KERNEL_LOG(
context,
"TfLiteGpuDelegate Prepare: delegate is not initialized");
return kTfLiteError;
}
auto* gpu_delegate_kernel = GetDelegateKernel(node);
const auto status = gpu_delegate_kernel->GetRequiredTemporaries(
context, node, &node->temporaries);
if (!status.ok()) {
TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Prepare: %s",
std::string(status.message()).c_str());
return kTfLiteError;
}
// TODO(akulik): tflite tensors are not allocated here either. It would
// be good to set inputs and outputs only once here instead of setting
// them every time in .invoke.
return kTfLiteOk;
},
// .invoke
[](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus {
const auto status = GetDelegateKernel(node)->Invoke(context);
if (!status.ok()) {
TF_LITE_KERNEL_LOG(context, "TfLiteGpuDelegate Invoke: %s",
std::string(status.message()).c_str());
return kTfLiteError;
}
return kTfLiteOk;
},
nullptr, // .profiling_string
0, // .builtin_code
"TfLiteGpuDelegateV2", // .custom_name
1, // .version
};
auto* gpu_delegate = GetDelegate(delegate);
TfLiteIntArray* ops_to_replace =
GetOpsToReplace(context, gpu_delegate->IsQuantOpsAllowed(),
gpu_delegate->MaxDelegatedPartitions());
const auto status = context->ReplaceNodeSubsetsWithDelegateKernels(
context, kRegistration, ops_to_replace, delegate);
TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Created %d GPU delegate kernels.",
gpu_delegate->num_delegate_kernels());
TfLiteIntArrayFree(ops_to_replace);
return status;
}
} // namespace
} // namespace gpu
} // namespace tflite
TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
TfLiteGpuDelegateOptionsV2 options;
  // Set it to -1 to detect whether the user later adjusted it.
options.is_precision_loss_allowed = -1;
  options.inference_preference =
      TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER;
options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO;
options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO;
options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT;
options.max_delegated_partitions = 1;
return options;
}
TfLiteDelegate* TfLiteGpuDelegateV2Create(
const TfLiteGpuDelegateOptionsV2* options) {
auto* gpu_delegate = new tflite::gpu::Delegate(options);
TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
"Created TensorFlow Lite delegate for GPU.");
return gpu_delegate ? gpu_delegate->tflite_delegate() : nullptr;
}
void TfLiteGpuDelegateV2Delete(TfLiteDelegate* delegate) {
delete tflite::gpu::GetDelegate(delegate);
}
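// A minimal usage sketch (for illustration only; `interpreter` is assumed
// to be a tflite::Interpreter built elsewhere and is not part of this file):
//
//   TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default();
//   options.inference_preference =
//       TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;
//   TfLiteDelegate* delegate = TfLiteGpuDelegateV2Create(&options);
//   if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) {
//     // Handle the error; the model may contain ops unsupported on GPU.
//   }
//   // ... run inference via interpreter->Invoke() ...
//   // Destroy the interpreter before deleting the delegate.
//   TfLiteGpuDelegateV2Delete(delegate);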