/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <chrono>  // NOLINT(build/c++11)
#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/time/time.h"
#include "tensorflow/lite/delegates/gpu/cl/environment.h"
#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_builder.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/kernels/register.h"

namespace tflite {
namespace gpu {
namespace cl {
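
// Sample: predefined tensor layout. Loads the model, creates an OpenCL
// environment, and pins the first graph input to a BUFFER/HWC descriptor via
// create_info.predefined before the inference context is initialized, then
// verifies that the runtime honored the requested storage type.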
absl::Status RunPredefinedLayoutSample(const std::string& model_name) {
auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str());
GraphFloat32 graph_cl;
ops::builtin::BuiltinOpResolver op_resolver;
RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl,
/*allow_quant_ops=*/true));
Environment env;
RETURN_IF_ERROR(CreateEnvironment(&env));
CreateGpuModelInfo create_info;
create_info.precision = env.IsSupported(CalculationsPrecision::F16)
? CalculationsPrecision::F16
: CalculationsPrecision::F32;
create_info.storage_type = GetFastestStorageType(env.device().GetInfo());
create_info.hints.Add(ModelHints::kAllowSpecialKernels);
{
    // Example of adding a predefined tensor descriptor.
    // It is assumed that the graph's first input has batch size = 1.
auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
create_info.predefined[graph_cl.inputs()[0]->id] =
TensorDescriptor{data_type, TensorStorageType::BUFFER, Layout::HWC};
}
std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
std::cout << "Storage type: " << ToString(create_info.storage_type)
<< std::endl;
InferenceContext context;
RETURN_IF_ERROR(
context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
  // After initialization we can retrieve the input tensor;
  // in_ten will have the TensorStorageType::BUFFER storage type.
Tensor* in_ten = context.GetTensor(graph_cl.inputs()[0]->id);
if (in_ten->GetStorageType() != TensorStorageType::BUFFER) {
    return absl::InternalError("Failed precondition");
}
RETURN_IF_ERROR(context.AddToQueue(env.queue()));
std::cout << "Finished RunPredefinedLayoutSample." << std::endl;
return absl::OkStatus();
}
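
// Sample: external immutable output tensors. The caller allocates
// TEXTURE_ARRAY/HWC tensors for every graph output and registers them through
// create_info.external_immutable_tensors, so inference writes directly into
// caller-owned tensors; the first output is read back to the CPU after the
// queue completes.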
absl::Status RunExternalImmutableSample(const std::string& model_name) {
auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str());
GraphFloat32 graph_cl;
ops::builtin::BuiltinOpResolver op_resolver;
RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl,
/*allow_quant_ops*/ true));
Environment env;
RETURN_IF_ERROR(CreateEnvironment(&env));
CreateGpuModelInfo create_info;
create_info.precision = env.IsSupported(CalculationsPrecision::F16)
? CalculationsPrecision::F16
: CalculationsPrecision::F32;
create_info.storage_type = GetFastestStorageType(env.device().GetInfo());
create_info.hints.Add(ModelHints::kAllowSpecialKernels);
// Example of external immutable tensors:
std::vector<Tensor> outputs(graph_cl.outputs().size());
for (int i = 0; i < graph_cl.outputs().size(); ++i) {
    // It is assumed that the graph outputs have batch size = 1.
auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
TensorDescriptor required_tensor_desc = TensorDescriptor{
data_type, TensorStorageType::TEXTURE_ARRAY, Layout::HWC};
required_tensor_desc.SetBHWCShape(graph_cl.outputs()[i]->tensor.shape);
RETURN_IF_ERROR(
CreateTensor(env.context(), required_tensor_desc, &outputs[i]));
create_info.external_immutable_tensors[graph_cl.outputs()[i]->id] =
&outputs[i];
}
std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
std::cout << "Storage type: " << ToString(create_info.storage_type)
<< std::endl;
InferenceContext context;
RETURN_IF_ERROR(
context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
RETURN_IF_ERROR(context.AddToQueue(env.queue()));
  // outputs can be used here, but AddToQueue does not perform CPU
  // synchronization.
RETURN_IF_ERROR(env.queue()->WaitForCompletion());
const auto dst_shape = BHWC(outputs[0].Batch(), outputs[0].Height(),
outputs[0].Width(), outputs[0].Channels());
TensorFloat32 cpu_tensor;
cpu_tensor.shape = dst_shape;
cpu_tensor.data.resize(dst_shape.DimensionsProduct());
RETURN_IF_ERROR(outputs[0].ReadData(env.queue(), &cpu_tensor));
std::cout << "First tensor data at index 0 - " << cpu_tensor.data[0]
<< std::endl;
return absl::OkStatus();
}
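
// Sample: serialized inference context. Measures initialization time with and
// without producing a serialized model, restores a second context from the
// serialized blob in a fresh environment, runs both contexts on identical
// inputs, and compares the outputs element by element.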
absl::Status RunSerializedTest(const std::string& model_name) {
auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str());
GraphFloat32 graph_cl;
ops::builtin::BuiltinOpResolver op_resolver;
RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl,
/*allow_quant_ops*/ true));
Environment env;
RETURN_IF_ERROR(CreateEnvironment(&env));
CreateGpuModelInfo create_info;
create_info.precision = env.IsSupported(CalculationsPrecision::F16)
? CalculationsPrecision::F16
: CalculationsPrecision::F32;
create_info.storage_type = GetFastestStorageType(env.device().GetInfo());
create_info.hints.Add(ModelHints::kAllowSpecialKernels);
  {  // Measure initialization time without building the serialized model.
InferenceContext test_context;
const auto start = std::chrono::high_resolution_clock::now();
RETURN_IF_ERROR(
test_context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
const auto end = std::chrono::high_resolution_clock::now();
const double total_time_ms = (end - start).count() * 1e-6f;
std::cout << "Inference context initialization total time - "
<< total_time_ms << "ms" << std::endl;
}
InferenceContext context;
std::vector<uint8_t> serialized_model;
RETURN_IF_ERROR(context.InitFromGraphWithTransforms(create_info, &graph_cl,
&env, &serialized_model));
std::vector<TensorFloat32> src_tensors(graph_cl.inputs().size());
for (int i = 0; i < graph_cl.inputs().size(); ++i) {
src_tensors[i].id = graph_cl.inputs()[i]->id;
src_tensors[i].shape = graph_cl.inputs()[i]->tensor.shape;
src_tensors[i].data.resize(src_tensors[i].shape.DimensionsProduct());
for (int j = 0; j < src_tensors[i].data.size(); ++j) {
src_tensors[i].data[j] = std::sin(j);
}
}
for (int i = 0; i < graph_cl.inputs().size(); ++i) {
RETURN_IF_ERROR(context.SetInputTensor(graph_cl.inputs()[i]->id,
src_tensors[i], env.queue()));
}
RETURN_IF_ERROR(context.AddToQueue(env.queue()));
RETURN_IF_ERROR(env.queue()->WaitForCompletion());
std::vector<TensorFloat32> dst_tensors(graph_cl.outputs().size());
for (int i = 0; i < graph_cl.outputs().size(); ++i) {
RETURN_IF_ERROR(context.GetOutputTensor(graph_cl.outputs()[i]->id,
env.queue(), &dst_tensors[i]));
}
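  // Restore a second inference context from the serialized model in a fresh
  // environment and measure how long deserialization takes.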
Environment env_v2;
RETURN_IF_ERROR(CreateEnvironment(&env_v2));
InferenceContext serialized_context;
{
const auto start = std::chrono::high_resolution_clock::now();
RETURN_IF_ERROR(
serialized_context.RestoreDeserialized(serialized_model, &env_v2));
const auto end = std::chrono::high_resolution_clock::now();
const double total_time_ms = (end - start).count() * 1e-6f;
std::cout << "Serialized inference context initialization total time - "
<< total_time_ms << "ms" << std::endl;
}
for (int i = 0; i < graph_cl.inputs().size(); ++i) {
RETURN_IF_ERROR(serialized_context.SetInputTensor(
graph_cl.inputs()[i]->id, src_tensors[i], env_v2.queue()));
}
RETURN_IF_ERROR(serialized_context.AddToQueue(env_v2.queue()));
RETURN_IF_ERROR(env_v2.queue()->WaitForCompletion());
std::vector<TensorFloat32> dst_tensors_v2(graph_cl.outputs().size());
for (int i = 0; i < graph_cl.outputs().size(); ++i) {
RETURN_IF_ERROR(serialized_context.GetOutputTensor(
graph_cl.outputs()[i]->id, env_v2.queue(), &dst_tensors_v2[i]));
}
for (int i = 0; i < graph_cl.outputs().size(); ++i) {
if (dst_tensors[i].data.size() != dst_tensors_v2[i].data.size()) {
std::cout << "Different sizes for " << i << " output tensor" << std::endl;
break;
}
for (int j = 0; j < dst_tensors[i].data.size(); ++j) {
if (dst_tensors[i].data[j] != dst_tensors_v2[i].data[j]) {
std::cout << "Different elements for " << j << " element in " << i
<< " tensor: " << dst_tensors[i].data[j] << " - "
<< dst_tensors_v2[i].data[j] << std::endl;
break;
}
}
}
return absl::OkStatus();
}
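
// Sample: cl_khr_command_buffer. Records the inference workload into command
// buffers once, then repeatedly enqueues the prebuilt buffers and prints the
// average time per inference. Silently skipped when the device does not
// support the extension.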
absl::Status RunCommandBufferSample(int num_tests, int num_runs_per_sec,
Environment* env,
InferenceContext* context) {
if (!env->device().GetInfo().SupportsExtension("cl_khr_command_buffer")) {
return absl::OkStatus();
}
cl_command_queue command_queue = env->queue()->queue();
cl_int errcode_ret;
std::vector<cl_command_buffer_khr> cbs(num_runs_per_sec);
for (auto& cb : cbs) {
cb = clCreateCommandBufferKHR(1, &command_queue, nullptr, &errcode_ret);
if (errcode_ret != CL_SUCCESS) {
return absl::InternalError("Failed clCreateCommandBufferKHR.");
}
RETURN_IF_ERROR(context->AddToCommanBuffer(cb));
errcode_ret = clFinalizeCommandBufferKHR(cb);
if (errcode_ret != CL_SUCCESS) {
return absl::InternalError("Failed clFinalizeCommandBufferKHR.");
}
}
for (int i = 0; i < num_tests; ++i) {
const auto start = std::chrono::high_resolution_clock::now();
for (auto& cb : cbs) {
cl_int error_code =
clEnqueueCommandBufferKHR(1, &command_queue, cb, 0, nullptr, nullptr);
if (error_code != CL_SUCCESS) {
return absl::UnknownError(
absl::StrCat("Failed to clEnqueueCommandBufferKHR - ",
CLErrorCodeToString(error_code)));
}
clFlush(command_queue);
}
clFinish(command_queue);
const auto end = std::chrono::high_resolution_clock::now();
const double total_time_ms = (end - start).count() * 1e-6f;
const double average_inference_time = total_time_ms / num_runs_per_sec;
std::cout << "Total time CB - " << average_inference_time << "ms"
<< std::endl;
}
for (auto& cb : cbs) {
clReleaseCommandBufferKHR(cb);
}
return absl::OkStatus();
}
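
// Main sample: prints the selected precision and storage type, runs
// per-operation profiling, reports memory used by intermediate and constant
// tensors, and measures average inference time over repeated runs, optionally
// via command buffers.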
absl::Status RunModelSample(const std::string& model_name) {
auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str());
GraphFloat32 graph_cl;
ops::builtin::BuiltinOpResolver op_resolver;
RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl,
/*allow_quant_ops*/ true));
Environment env;
RETURN_IF_ERROR(CreateEnvironment(&env));
CreateGpuModelInfo create_info;
create_info.precision = env.IsSupported(CalculationsPrecision::F16)
? CalculationsPrecision::F16
: CalculationsPrecision::F32;
create_info.storage_type = GetFastestStorageType(env.device().GetInfo());
create_info.hints.Add(ModelHints::kAllowSpecialKernels);
std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
std::cout << "Storage type: " << ToString(create_info.storage_type)
<< std::endl;
InferenceContext context;
RETURN_IF_ERROR(
context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
auto* queue = env.profiling_queue();
ProfilingInfo profiling_info;
RETURN_IF_ERROR(context.Profile(queue, &profiling_info));
std::cout << profiling_info.GetDetailedReport() << std::endl;
const uint64_t runtime_mem_bytes =
context.GetSizeOfMemoryAllocatedForIntermediateTensors();
std::cout << "Memory for intermediate tensors - "
<< runtime_mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
const uint64_t const_mem_bytes = context.GetConstantTensorsSize();
std::cout << "Memory for constant tensors - "
<< const_mem_bytes / 1024.0 / 1024.0 << " MB" << std::endl;
std::cout << "Total tensors memory(const + intermediate) - "
<< (const_mem_bytes + runtime_mem_bytes) / 1024.0 / 1024.0 << " MB"
<< std::endl;
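  // Enqueue roughly one second of work per timing iteration: run as many
  // inferences as fit into 1000 ms of the profiled total time.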
const int num_runs_per_sec = std::max(
1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
profiling_info.GetTotalTime())));
const int kNumRuns = 10;
for (int i = 0; i < kNumRuns; ++i) {
const auto start = std::chrono::high_resolution_clock::now();
for (int k = 0; k < num_runs_per_sec; ++k) {
RETURN_IF_ERROR(context.AddToQueue(env.queue()));
}
RETURN_IF_ERROR(env.queue()->WaitForCompletion());
const auto end = std::chrono::high_resolution_clock::now();
const double total_time_ms = (end - start).count() * 1e-6f;
const double average_inference_time = total_time_ms / num_runs_per_sec;
std::cout << "Total time - " << average_inference_time << "ms" << std::endl;
}
RETURN_IF_ERROR(
RunCommandBufferSample(kNumRuns, num_runs_per_sec, &env, &context));
return absl::OkStatus();
}
} // namespace cl
} // namespace gpu
} // namespace tflite
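
// Example usage (a sketch; the actual binary name depends on the build
// target):
//   ./performance_profiling <path/to/model.tflite>
// The serialized, external-immutable, and predefined-layout samples are
// disabled by default; flip the corresponding local flags in main to enable
// them.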
int main(int argc, char** argv) {
if (argc <= 1) {
std::cerr << "Expected model path as second argument.";
return -1;
}
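  // Dynamically load the OpenCL library and resolve CL symbols before any CL
  // calls are made.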
auto load_status = tflite::gpu::cl::LoadOpenCL();
if (!load_status.ok()) {
std::cerr << load_status.message();
return -1;
}
auto run_status = tflite::gpu::cl::RunModelSample(argv[1]);
if (!run_status.ok()) {
std::cerr << run_status.message();
return -1;
}
bool run_serialized_test = false;
if (run_serialized_test) {
run_status = tflite::gpu::cl::RunSerializedTest(argv[1]);
if (!run_status.ok()) {
std::cerr << run_status.message();
return -1;
}
}
bool run_with_external_immutable_tensors = false;
if (run_with_external_immutable_tensors) {
run_status = tflite::gpu::cl::RunExternalImmutableSample(argv[1]);
if (!run_status.ok()) {
std::cerr << run_status.message();
return -1;
}
}
bool run_with_predefined_layout = false;
if (run_with_predefined_layout) {
run_status = tflite::gpu::cl::RunPredefinedLayoutSample(argv[1]);
if (!run_status.ok()) {
std::cerr << run_status.message();
return -1;
}
}
return EXIT_SUCCESS;
}