#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
namespace tflite {
namespace gpu {
namespace cl {
// Groups one or more GPU operations together with the value ids they are
// associated with as inputs and outputs.
//
// The type is move-only: copy construction and copy assignment are deleted,
// while move construction and move assignment are declared here and defined
// out of line.
struct CLNode {
  // Operations owned by this node.
  std::vector<std::unique_ptr<GPUOperation>> operations;
  std::vector<ValueId> inputs;
  std::vector<ValueId> outputs;
  // A CLNode can hold several operations, so `ranges` keeps, for every
  // operation, the range of ids taken from `inputs`.
  std::vector<int2> ranges;

  // Mostly for debug purposes.
  std::string name;

  CLNode() = default;

  CLNode(CLNode&& node);
  CLNode& operator=(CLNode&& node);
  CLNode(const CLNode&) = delete;
  CLNode& operator=(const CLNode&) = delete;
};
class InferenceContext {
struct CreateInferenceInfo {
CalculationsPrecision precision;
TensorStorageType storage_type;
ModelHints hints;
absl::Status InitFromGraph(const CreateInferenceInfo& create_info,
const GraphFloat32& graph, Environment* env);
// Applies OpenCL-specific transformations to the graph before the
// initialization. These transformations are either impossible or useless in
// other backends.
absl::Status InitFromGraphWithTransforms(
const CreateInferenceInfo& create_info, GraphFloat32* graph,
Environment* env);
absl::Status AddToQueue(CLCommandQueue* queue);
absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result);
// for profiling and memory statistics
uint64_t GetSizeOfMemoryAllocatedForIntermediateTensors() const;
absl::Status SetInputTensor(ValueId id, const TensorFloat32& tensor,
CLCommandQueue* queue);
// It will work only with input/output tensor ids. For all other ids we don't
// have any guarantees.
Tensor* GetTensor(ValueId id);
absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue,
TensorFloat32* result);
void CopyInAndOutIds(const GraphFloat32& graph);
absl::Status ConvertOperations(const CreationContext& creation_context,
const GraphFloat32& graph, ModelHints hints);
void CreateLinks();
void ReserveGraphTensors(const CreateInferenceInfo& create_info,
const CreationContext& creation_context,
const GraphFloat32& graph);
void Merge();
absl::Status AllocateMemory(const CLDevice& device, CLContext* context);
absl::Status AllocateMemoryForBuffers(const CLDevice& device,
CLContext* context);
absl::Status AllocateMemoryForStrongShapes(const CLDevice& device,
CLContext* context);
// utility function
void GetUsages(const std::function<bool(const TensorDescriptor&)>& functor,
std::map<ValueId, int2>* usages);
void BindMemoryToOperations();
absl::Status Compile(const CreationContext& creation_context);
absl::Status Tune(const TuningParameters& tuning_parameters);
absl::Status UpdateParams();
// performance hacks
bool need_flush_ = false;
bool flush_periodically_ = false;
int flush_period_ = 1;
// In order to reduce memory leak on Mali a pipeline needs to be synchronized
// with CPU to prevent growing internal global OpenCL kernel pool. One trick
// is to enqueue an event from a previous run. Most of the time is should
// already be executed on GPU and should not stall the pipeline.
bool need_manual_release_ = false;
CLEvent prev_enqueue_start_point_;
CalculationsPrecision precision_;
TensorStorageType storage_type_;
// Directly mapped nodes from graph, but some of them "inactive" due
// to fusion (inactive = fused).
// Memory is allocated only once, in ConvertOperations, and is not modified
// anywhere.
std::vector<CLNode> nodes_;
struct DummyTensor {
BHWC shape;
TensorDescriptor descriptor;
bool operator==(const DummyTensor& b) const {
return shape == b.shape && descriptor == b.descriptor;
class TensorReserver {
ValueId Add(const DummyTensor& dummy) {
reservations_[next_] = dummy;
return next_++;
void Add(ValueId id, const DummyTensor& dummy) {
reservations_[id] = dummy;
void SetNext(ValueId id) { next_ = id; }
DummyTensor Get(ValueId id) { return reservations_[id]; }
std::unordered_map<ValueId, DummyTensor> reservations_;
ValueId next_;
TensorReserver tensor_reserver_;
std::vector<Buffer> shared_buffers_;
shared_buffer_tensors_; // use references to memory from shared_buffers_
std::map<ValueId, int> graph_ids_to_shared_buffer_tensors_;
std::map<ValueId, Tensor> strong_shape_tensors_;
std::map<ValueId, ValueId> graph_ids_to_strong_shape_tensors_;
std::vector<ValueId> input_ids_;
std::vector<ValueId> output_ids_;
// Runs OpenCL specific transforms for the graph.
// The graph is passed by non-const pointer, so it may be modified in place;
// the outcome is reported through the returned absl::Status. Which transforms
// are applied is decided by the implementation, which is not visible here.
absl::Status RunGraphTransforms(GraphFloat32* graph);
} // namespace cl
} // namespace gpu
} // namespace tflite