Add a timer to measure op enqueue overhead for XLA GPU

PiperOrigin-RevId: 294551183
Change-Id: Ied82a59ed7e75e2b73e407c7cb6af82e0b981876

commit: 48d155ca605ebaae4ef5a30161f92541ed911007 [log] [tgz]
author: Sanjoy Das <sanjoy@google.com> Tue Feb 11 16:45:51 2020 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> Tue Feb 11 16:48:29 2020 -0800
tree: 89dbd33ba2e68e3f9cf6dd4c49fc017dae0188eb
parent: f903518db3a750508e902172619957537efb7dc5 [diff]
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 943a7f7..4d3655c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc

@@ -303,6 +303,8 @@
     const ServiceExecutableRunOptions* run_options,
     std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
     HloExecutionProfile* hlo_execution_profile) {
+  XLA_SCOPED_LOGGING_TIMER(absl::StrCat("GpuExecutable::ExecuteAsyncOnStream(",
+                                        module().name(), ")"));
   se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
   // Force synchronous execution if the allocator requires it.
   const bool block_host_until_done =
commit	48d155ca605ebaae4ef5a30161f92541ed911007	[log] [tgz]
author	Sanjoy Das <sanjoy@google.com>	Tue Feb 11 16:45:51 2020 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	Tue Feb 11 16:48:29 2020 -0800
tree	89dbd33ba2e68e3f9cf6dd4c49fc017dae0188eb
parent	f903518db3a750508e902172619957537efb7dc5 [diff]