Add a scoped logging timer in GpuExecutable::ExecuteAsyncOnStream to measure op enqueue overhead for XLA GPU
PiperOrigin-RevId: 294551183
Change-Id: Ied82a59ed7e75e2b73e407c7cb6af82e0b981876
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 943a7f7..4d3655c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -303,6 +303,8 @@
const ServiceExecutableRunOptions* run_options,
std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
HloExecutionProfile* hlo_execution_profile) {
+ XLA_SCOPED_LOGGING_TIMER(absl::StrCat("GpuExecutable::ExecuteAsyncOnStream(",
+ module().name(), ")"));
se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
// Force synchronous execution if the allocator requires it.
const bool block_host_until_done =