/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/container/node_hash_map.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/profiler/internal/annotation_stack.h"
namespace tensorflow {
namespace profiler {
namespace {
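// Non-zero while the current thread is executing CUDA/CUPTI calls issued by
// the profiler itself; HandleCallback() below skips tracing in that case so
// that we do not record our own bookkeeping work.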
static thread_local int internalCuCall = 0;
// Temporarily disable CUPTI API tracing for this thread for the lifetime of
// this class. Used for API calls that are initiated by us.
class CuptiApiTracingDisabler {
public:
CuptiApiTracingDisabler() { internalCuCall++; }
~CuptiApiTracingDisabler() { internalCuCall--; }
};
Status ToStatus(CUptiResult result) {
if (result == CUPTI_SUCCESS) {
return Status::OK();
}
const char *str = nullptr;
cuptiGetResultString(result, &str);
return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
}
Status ToStatus(CUresult result) {
if (result == CUDA_SUCCESS) {
return Status::OK();
}
const char *str = nullptr;
cuGetErrorName(result, &str);
return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
}
inline void LogIfError(const Status &status) {
if (status.ok()) return;
LOG(ERROR) << status.error_message();
}
// Maps an OverheadKind enum to a const string.
const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
return "COMPILER";
case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
return "BUFFER_FLUSH";
case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
return "INSTRUMENTATION";
case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
return "RESOURCE";
default:
break;
}
return "<UNKNOWN>";
}
const char *getActivityUnifiedMemoryKindString(
CUpti_ActivityUnifiedMemoryCounterKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
return "UM_BYTES_TRANSFER_HTOD";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
return "UM_BYTES_TRANSFER_DTOH";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
return "UM_CPU_PAGE_FAULT";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
return "UM_GPU_PAGE_FAULT";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
return "UM_THRASHING";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
return "UM_THROTTLING";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
return "UM_REMOTE_MAP";
case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
return "UM_BYTES_TRANSFER_DTOD";
default:
break;
}
return "<UNKNOWN>";
}
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES was introduced in CUDA 10.1.
#if CUDA_VERSION <= 10000
#define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
#endif
#define RETURN_IF_CUPTI_ERROR(expr) \
do { \
CUptiResult status = expr; \
if (ABSL_PREDICT_FALSE(status != CUPTI_SUCCESS)) { \
const char *errstr = ""; \
cupti_interface_->GetResultString(status, &errstr); \
LOG(ERROR) << "function " << #expr << "failed with error " << errstr; \
if (status == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) { \
return errors::PermissionDenied("CUPTI need root access!"); \
} else { \
return errors::Internal("CUPTI call error", errstr); \
} \
} \
} while (false)
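// Illustrative usage of the macro above, e.g.:
//   RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));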
// GetCachedTID() caches the thread ID in thread-local storage (which is a
// userspace construct) to avoid unnecessary system calls. Without this caching,
// it can take roughly 98ns, while it takes roughly 1ns with this caching.
int32 GetCachedTID() {
static thread_local int32 current_thread_id =
Env::Default()->GetCurrentThreadId();
return current_thread_id;
}
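// Total number of bytes copied for 2D/3D memcpy descriptors; WidthInBytes
// already accounts for the element size.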
size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
size_t Bytes3D(const CUDA_MEMCPY3D *p) {
return p->Depth * p->Height * p->WidthInBytes;
}
template <typename CudaMemcpy>
CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
return CuptiTracerEventType::MemcpyH2D;
}
if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
p->dstMemoryType == CU_MEMORYTYPE_HOST) {
return CuptiTracerEventType::MemcpyD2H;
}
if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
return CuptiTracerEventType::MemcpyD2D;
}
return CuptiTracerEventType::Unsupported;
}
std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
switch (cbid) {
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
false);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
const auto *p =
reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
false);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
const auto *p =
reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
false);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
const auto *p =
reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::Unsupported,
false);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
return std::make_tuple(p->ByteCount, CuptiTracerEventType::Unsupported,
true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
const auto *p =
reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
const auto *p =
reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
const cuMemcpyPeer_params *p2p_params =
reinterpret_cast<const cuMemcpyPeer_params *>(params);
return std::make_tuple(p2p_params->ByteCount,
CuptiTracerEventType::MemcpyP2P, false);
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
const cuMemcpyPeerAsync_params_st *p2p_params =
reinterpret_cast<const cuMemcpyPeerAsync_params_st *>(params);
return std::make_tuple(p2p_params->ByteCount,
CuptiTracerEventType::MemcpyP2P, true);
}
default: {
LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
}
}
}
// CUPTI callback corresponding to a driver or runtime API. This global
// function is invoked twice for each API: at entry and at exit. The cbdata
// parameter is guaranteed by CUPTI to be thread-safe. Most invocations are
// ignored; entry/exit is tracked only for the APIs we deem
// performance-relevant.
void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) {
CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
}
// Callback which is invoked when an empty buffer is requested by CUPTI.
// Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
// ring buffer in which the device maintains activity records that have been
// collected.
void CUPTIAPI AllocCuptiActivityBuffer(uint8_t **buffer, size_t *size,
size_t *maxNumRecords) {
// Buffer size and alignment, 32K and 8 as in CUPTI samples.
constexpr size_t kBufferSize = 32 * 1024;
constexpr int kBufferAlignSize = 8;
*buffer = reinterpret_cast<uint8_t *>(
port::AlignedMalloc(kBufferSize, kBufferAlignSize));
if (*buffer == nullptr) {
LOG(WARNING)
<< "Cupti Buffer not allocated, activity records will be dropped";
return;
}
*size = kBufferSize;
*maxNumRecords = 0; // Cupti to fill as many records as fit in the buffer.
VLOG(3) << "Allocated Cupti Buffer, buffer=" << std::hex
<< reinterpret_cast<uintptr_t>(*buffer) << std::dec
<< " size=" << *size;
}
// Callback which is invoked when a buffer containing activity records is
// available from CUPTI. Frees the buffer after reading activity records from
// it.
void CUPTIAPI FreeCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
uint8_t *buffer, size_t size,
size_t valid_size) {
VLOG(3) << "Freeing Cupti Buffer, buffer:" << std::hex
<< reinterpret_cast<uintptr_t>(buffer) << std::dec
<< " size: " << size << " valid_size: " << valid_size;
if (valid_size > 0) {
VLOG(3) << "Activity profile for stream " << stream_id;
CuptiTracer *cupti_tracer = CuptiTracer::GetCuptiTracerSingleton();
cupti_tracer->ProcessActivityBuffer(context, stream_id, buffer, valid_size)
.IgnoreError();
}
port::AlignedFree(buffer);
}
void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
const CUpti_CallbackData *cbdata,
uint64 start_time, uint64 end_time) {
CuptiTracerEvent event;
event.type = CuptiTracerEventType::Kernel;
event.source = CuptiTracerEventSource::DriverCallback;
event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
event.start_time_ns = start_time;
event.end_time_ns = end_time;
event.thread_id = GetCachedTID();
event.device_id = device_id;
event.context_id = cbdata->contextUid;
event.correlation_id = cbdata->correlationId;
VLOG(3) << "Cuda Kernel Launched: " << event.name;
collector->AddEvent(std::move(event));
}
// Populates a CuptiTracerEvent for both normal and P2P memcpy operations.
CuptiTracerEvent PopulateMemcpyCallbackEvent(
CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
uint64 start_time, uint64 end_time) {
CuptiTracerEvent event;
event.type = type;
event.source = CuptiTracerEventSource::DriverCallback;
event.start_time_ns = start_time;
event.end_time_ns = end_time;
event.thread_id = GetCachedTID();
event.device_id = src_device;
event.context_id = cbdata->contextUid;
event.correlation_id = cbdata->correlationId;
event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
event.memcpy_info.num_bytes = num_bytes;
event.memcpy_info.destination = dst_device;
event.memcpy_info.async = async;
return event;
}
void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
uint32 device_id, CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata,
uint64 start_time, uint64 end_time) {
size_t num_bytes;
CuptiTracerEventType type;
bool async;
std::tie(num_bytes, type, async) =
DecodeDriverMemcpy(cbid, cbdata->functionParams);
VLOG(3) << "Cuda Memcpy observed :" << num_bytes;
CuptiTracerEvent event =
PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
async, start_time, end_time);
collector->AddEvent(std::move(event));
}
void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
CuptiInterface *cupti_interface,
uint32 device_id, CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata,
uint64 start_time, uint64 end_time) {
size_t num_bytes;
CuptiTracerEventType type;
bool async;
std::tie(num_bytes, type, async) =
DecodeDriverMemcpy(cbid, cbdata->functionParams);
uint32 dst_device = -1, src_device = -1;
const cuMemcpyPeer_params *p2p_params =
reinterpret_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
VLOG(3) << "Cuda P2P Memcpy observed, src: " << src_device
<< " dst: " << dst_device << " size:" << num_bytes;
CuptiTracerEvent event =
PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
dst_device, async, start_time, end_time);
collector->AddEvent(std::move(event));
}
void AddCudaMallocEventUponApiExit(CuptiTraceCollector *collector,
uint32 device_id, CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata,
uint64 start_time, uint64 end_time) {
const cuMemAlloc_v2_params_st *params =
reinterpret_cast<const cuMemAlloc_v2_params_st *>(cbdata->functionParams);
CuptiTracerEvent event;
event.type = CuptiTracerEventType::MemoryAlloc;
event.source = CuptiTracerEventSource::DriverCallback;
event.name = cbdata->functionName;
event.start_time_ns = start_time;
event.end_time_ns = end_time;
event.thread_id = GetCachedTID();
event.device_id = device_id;
event.context_id = cbdata->contextUid;
event.correlation_id = cbdata->correlationId;
event.memalloc_info.num_bytes = params->bytesize;
VLOG(3) << "Cuda Malloc/Free observed: " << params->bytesize;
collector->AddEvent(std::move(event));
}
void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
uint32 device_id, CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata,
uint64 start_time, uint64 end_time) {
CuptiTracerEvent event;
event.type = CuptiTracerEventType::Generic;
event.source = CuptiTracerEventSource::DriverCallback;
event.name = cbdata->functionName;
event.start_time_ns = start_time;
event.end_time_ns = end_time;
event.thread_id = GetCachedTID();
event.device_id = device_id;
event.context_id = cbdata->contextUid;
event.correlation_id = cbdata->correlationId;
collector->AddEvent(std::move(event));
}
void AddKernelActivityEvent(CuptiTraceCollector *collector,
const CUpti_ActivityKernel4 *kernel) {
CuptiTracerEvent event;
event.type = CuptiTracerEventType::Kernel;
event.source = CuptiTracerEventSource::Activity;
event.name = kernel->name;
event.start_time_ns = kernel->start;
event.end_time_ns = kernel->end;
event.device_id = kernel->deviceId;
event.context_id = kernel->contextId;
event.stream_id = kernel->streamId;
event.correlation_id = kernel->correlationId;
event.annotation = collector->annotation_map()->LookUp(event.device_id,
event.correlation_id);
event.kernel_info.registers_per_thread = kernel->registersPerThread;
event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
event.kernel_info.block_x = kernel->blockX;
event.kernel_info.block_y = kernel->blockY;
event.kernel_info.block_z = kernel->blockZ;
event.kernel_info.grid_x = kernel->gridX;
event.kernel_info.grid_y = kernel->gridY;
event.kernel_info.grid_z = kernel->gridZ;
collector->AddEvent(std::move(event));
}
void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
const CUpti_ActivityMemcpy *memcpy) {
CuptiTracerEvent event;
switch (memcpy->copyKind) {
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
event.type = CuptiTracerEventType::MemcpyH2D;
event.name = "MemcpyH2D";
break;
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
event.type = CuptiTracerEventType::MemcpyD2H;
event.name = "MemcpyD2H";
break;
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
event.type = CuptiTracerEventType::MemcpyD2D;
event.name = "MemcpyD2D";
break;
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
event.type = CuptiTracerEventType::MemcpyP2P;
event.name = "MemcpyP2P";
break;
default:
event.type = CuptiTracerEventType::MemcpyOther;
event.name = "MemcpyOther";
break;
}
event.source = CuptiTracerEventSource::Activity;
event.start_time_ns = memcpy->start;
event.end_time_ns = memcpy->end;
event.device_id = memcpy->deviceId;
event.context_id = memcpy->contextId;
event.stream_id = memcpy->streamId;
event.correlation_id = memcpy->correlationId;
event.annotation = collector->annotation_map()->LookUp(event.device_id,
event.correlation_id);
event.memcpy_info.kind = memcpy->copyKind;
event.memcpy_info.num_bytes = memcpy->bytes;
event.memcpy_info.destination = memcpy->deviceId;
event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
event.memcpy_info.src_mem_kind = memcpy->srcKind;
event.memcpy_info.dst_mem_kind = memcpy->dstKind;
collector->AddEvent(std::move(event));
}
// Adds an event for a peer-to-peer memcpy between different GPU devices.
void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
const CUpti_ActivityMemcpy2 *memcpy2) {
CuptiTracerEvent event;
event.type = CuptiTracerEventType::MemcpyP2P;
event.name = "MemcpyP2P";
event.source = CuptiTracerEventSource::Activity;
event.start_time_ns = memcpy2->start;
event.end_time_ns = memcpy2->end;
event.device_id = memcpy2->srcDeviceId;
event.context_id = memcpy2->contextId;
event.stream_id = memcpy2->streamId;
event.correlation_id = memcpy2->correlationId;
event.annotation = collector->annotation_map()->LookUp(event.device_id,
event.correlation_id);
event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
event.memcpy_info.num_bytes = memcpy2->bytes;
event.memcpy_info.destination = memcpy2->dstDeviceId;
event.memcpy_info.async = memcpy2->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
event.memcpy_info.src_mem_kind = memcpy2->srcKind;
event.memcpy_info.dst_mem_kind = memcpy2->dstKind;
collector->AddEvent(std::move(event));
}
void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
const CUpti_ActivityOverhead *overhead) {
CuptiTracerEvent event;
event.type = CuptiTracerEventType::Overhead;
event.name = getActivityOverheadKindString(overhead->overheadKind);
event.source = CuptiTracerEventSource::Activity;
event.start_time_ns = overhead->start;
event.end_time_ns = overhead->end;
// If the overhead is not related to a device, we assign it to device 0.
event.device_id = 0;
// NOTE: no correlation id.
switch (overhead->objectKind) {
case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
// We don't know how to handle such activities because we would need to
// attribute them to either a GPU stream or a CPU thread.
return;
case CUPTI_ACTIVITY_OBJECT_THREAD:
case CUPTI_ACTIVITY_OBJECT_PROCESS:
event.thread_id = overhead->objectId.pt.threadId;
break;
case CUPTI_ACTIVITY_OBJECT_STREAM:
event.stream_id = overhead->objectId.dcs.streamId;
TF_FALLTHROUGH_INTENDED;
case CUPTI_ACTIVITY_OBJECT_DEVICE:
case CUPTI_ACTIVITY_OBJECT_CONTEXT:
event.device_id = overhead->objectId.dcs.deviceId;
break;
default:
LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
return;
}
collector->AddEvent(std::move(event));
}
void AddUnifiedMemoryActivityEvent(
CuptiTraceCollector *collector,
const CUpti_ActivityUnifiedMemoryCounter2 *record) {
VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
<< " src: " << record->srcId << " dst: " << record->dstId;
CuptiTracerEvent event;
event.type = CuptiTracerEventType::UnifiedMemory;
event.name = getActivityUnifiedMemoryKindString(record->counterKind);
event.source = CuptiTracerEventSource::Activity;
event.start_time_ns = record->start;
if (record->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
record->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
record->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
record->end <= record->start) {
// If the end time is not valid, trim it so that it can be shown on the UI.
event.end_time_ns = record->start + 1;
} else {
event.end_time_ns = record->end;
}
event.device_id = record->srcId;
// NOTE: no context id or correlation id.
// For visualization purposes, we assign a pseudo stream id for each
// record->counterKind of unified-memory-related events.
constexpr int kPseudoStreamId = 0x10000000;
event.stream_id = kPseudoStreamId + record->counterKind;
event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
// Check whether the activity is a byte transfer.
if (record->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
record->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
record->counterKind ==
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
event.memcpy_info.num_bytes = record->value;
} else {
event.memcpy_info.num_bytes = 0;
}
event.memcpy_info.destination = record->dstId;
event.memcpy_info.async = false;
collector->AddEvent(std::move(event));
}
// This hook uses the CUPTI Activity API to measure device-side activities.
class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
CuptiInterface *cupti_interface,
CuptiTraceCollector *collector)
: option_(option),
cupti_interface_(cupti_interface),
collector_(collector) {}
Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) override {
// Stash away the current Cupti timestamp into cbdata.
*cbdata->correlationData =
option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
return Status::OK();
}
Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) override {
// If we are not collecting CPU events from Callback API, we can return now.
if (!option_.required_callback_api_events) {
return Status::OK();
}
// Grab timestamp for API exit. API entry timestamp saved in cbdata.
uint64 end_tsc = CuptiTracer::GetTimestamp();
uint64 start_tsc = *cbdata->correlationData;
TrackContext(cbid, cbdata->context);
return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
start_tsc, end_tsc, domain, cbid, cbdata);
}
Status SyncAndFlush() override {
if (option_.sync_devices_before_stop) {
CuptiApiTracingDisabler disabler;
absl::MutexLock lock(&mutex_);
for (auto &ctx : contexts_) {
cuCtxPushCurrent(ctx);
cuCtxSynchronize(); // Ignore error here for best effort.
CUcontext current;
cuCtxPopCurrent(&current);
}
}
return Status::OK();
}
private:
void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
if (!option_.sync_devices_before_stop) return;
if (ctx == NULL) return;
absl::MutexLock lock(&mutex_);
if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
contexts_.erase(ctx);
} else {
contexts_.emplace(ctx);
}
}
const CuptiTracerOptions option_;
CuptiInterface *cupti_interface_;
CuptiTraceCollector *collector_;
absl::Mutex mutex_;
absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
};
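// The records and helpers below implement an alternative measurement path
// that brackets each kernel/memcpy with CUDA events instead of relying on the
// CUPTI Activity API; they are used by CuptiDriverApiHookWithCudaEvent below.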
struct KernelRecord {
const char *kernel_name;
// TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
// record the stream and infer the context during collection.
CUcontext context;
CUstream stream;
uint32 correlation_id;
CUevent start_event;
CUevent stop_event;
KernelDetails details;
uint64 start_timestamp;
};
struct MemcpyRecord {
CuptiTracerEventType type;
size_t size_bytes;
CUcontext context;
CUstream stream;
uint32 correlation_id;
bool async;
CUevent start_event;
CUevent stop_event;
uint64 start_timestamp;
};
Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
CuptiApiTracingDisabler disabler;
TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
return ToStatus(cuEventRecord(*event, stream));
}
#if CUDA_VERSION >= 10000
// Maintains and restores the current thread's CUDA context.
// Note: cuStreamGetCtx is only available in CUDA 9.2 and later.
class ScopedCudaContext {
public:
ScopedCudaContext(CUstream stream) : stream_(stream) {
CuptiApiTracingDisabler disabler; // don't trace cuda call in this func.
CUcontext context;
if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
context_ = context;
uint32 device_ordinal;
if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
device_ordinal_ = device_ordinal;
context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
}
~ScopedCudaContext() {
if (!context_pushed_) return;
CuptiApiTracingDisabler disabler; // don't trace cuda call in this func.
cuCtxPopCurrent(&*context_);
}
// If successful, returns the device ordinal of the relevant cuda stream.
// Otherwise, returns absl::nullopt.
absl::optional<uint32> GetDeviceOrdinal() { return device_ordinal_; }
// If successful, returns the cuda context of the relevant cuda stream.
// Otherwise, returns absl::nullopt.
absl::optional<CUcontext> GetContext() { return context_; }
private:
CUstream stream_;
absl::optional<CUcontext> context_;
absl::optional<uint32> device_ordinal_;
bool context_pushed_ = false;
};
#endif
// Stores a series of kernel and memcpy records.
class CudaEventRecorder {
public:
CudaEventRecorder(CuptiInterface *cupti_interface,
CuptiTraceCollector *collector, int ordinal)
: cupti_interface_(cupti_interface),
collector_(collector),
ordinal_(ordinal) {
device_name_ = absl::StrCat("gpu ", ordinal); // default.
CUdevice device;
if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
char name[100];
if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
device_name_ = name;
}
}
}
// Registers the start of a kernel launch. The returned index should be passed
// to StopKernel() after the kernel launch has completed.
template <typename T>
size_t StartKernel(const char *kernel_name, CUcontext context,
uint32 correlation_id, const T *params) {
CUstream stream = params->hStream;
KernelRecord record = {kernel_name, context, stream, correlation_id};
record.details.registers_per_thread = 0; // unknown.
record.details.static_shared_memory_usage = params->sharedMemBytes;
record.details.dynamic_shared_memory_usage = 0; // unknown
record.details.block_x = params->blockDimX;
record.details.block_y = params->blockDimY;
record.details.block_z = params->blockDimZ;
record.details.grid_x = params->gridDimX;
record.details.grid_y = params->gridDimY;
record.details.grid_z = params->gridDimZ;
record.start_timestamp = CuptiTracer::GetTimestamp();
LogIfError(CreateAndRecordEvent(&record.start_event, stream));
absl::MutexLock lock(&mutex_);
if (stopped_) return -1;
kernel_records_.push_back(record);
return kernel_records_.size() - 1;
}
uint64 StopKernel(size_t index) {
absl::MutexLock lock(&mutex_);
if (index >= kernel_records_.size()) return 0;
auto &record = kernel_records_[index];
LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
return record.start_timestamp;
}
// Registers the start of a copy operation. The returned index should be
// passed to StopMemcpy() after the memcpy has completed.
size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
CUcontext context, CUstream stream, uint32 correlation_id,
bool async) {
MemcpyRecord record = {type, size_bytes, context,
stream, correlation_id, async};
record.start_timestamp = CuptiTracer::GetTimestamp();
LogIfError(CreateAndRecordEvent(&record.start_event, stream));
absl::MutexLock lock(&mutex_);
if (stopped_) return -1;
memcpy_records_.push_back(record);
return memcpy_records_.size() - 1;
}
uint64 StopMemcpy(size_t index) {
absl::MutexLock lock(&mutex_);
if (index >= memcpy_records_.size()) return 0;
auto &record = memcpy_records_[index];
LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
return record.start_timestamp;
}
Status Stop() {
{
absl::MutexLock lock(&mutex_);
stopped_ = true;
LOG(INFO) << "Collecting " << kernel_records_.size()
<< " kernel records, " << memcpy_records_.size()
<< " memcpy records.";
// Gather all profiled streams and contexts.
for (const auto &record : kernel_records_) {
TF_RETURN_IF_ERROR(
AddStreamInfo(record.context, record.stream, "Kernel"));
}
for (const auto &record : memcpy_records_) {
TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
GetTraceEventTypeName(record.type)));
}
}
// Synchronize all contexts, record end events, synchronize again.
// This scheme is an unreliable way to associate an event with wall time.
// Other threads might enqueue kernels that delay the second synchronization.
TF_RETURN_IF_ERROR(Synchronize());
for (auto &pair : context_infos_) {
TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
}
TF_RETURN_IF_ERROR(Synchronize());
end_walltime_us_ = Env::Default()->NowMicros();
return Status::OK();
}
Status Flush(AnnotationMap *annotation_map) {
auto kernel_records = ConsumeKernelRecords();
auto memcpy_records = ConsumeMemcpyRecords();
for (const auto &record : kernel_records) {
TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
}
for (const auto &record : memcpy_records) {
TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
}
return Status::OK();
}
std::vector<KernelRecord> ConsumeKernelRecords() {
absl::MutexLock lock(&mutex_);
return std::move(kernel_records_);
}
std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
absl::MutexLock lock(&mutex_);
return std::move(memcpy_records_);
}
private:
struct ContextInfo {
uint32 context_id = 0;
int num_streams = 0;
CUevent end_event;
};
struct StreamInfo {
uint32 stream_id = 0;
std::string name;
int index; // 0 is reserved for null stream.
const ContextInfo *ctx_info;
};
// Synchronizes all contexts.
Status Synchronize() const {
CuptiApiTracingDisabler disabler;
for (const auto &pair : context_infos_) {
TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
}
return Status::OK();
}
// Returns element from context_infos_, adding it if not yet present.
Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
auto it = context_infos_.find(context);
if (it == context_infos_.end()) {
uint32 context_id = 0;
RETURN_IF_CUPTI_ERROR(
cupti_interface_->GetContextId(context, &context_id));
ContextInfo ctx_info = {context_id};
it = context_infos_.emplace(context, ctx_info).first;
}
*ctx_info_ptr = &it->second;
return Status::OK();
}
// Adds an element to stream_infos_ if not yet present. If present, clears the
// name if it doesn't match the parameter.
Status AddStreamInfo(CUcontext context, CUstream stream,
absl::string_view name) {
StreamKey key(context, stream);
auto it = stream_infos_.find(key);
if (it != stream_infos_.end()) {
if (it->second.name != name) {
it->second.name.clear(); // Stream with inconsistent names, clear it.
}
return Status::OK();
}
ContextInfo *ctx_info;
TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
int index = stream ? ++ctx_info->num_streams : 0;
uint32 stream_id = 0;
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
RETURN_IF_CUPTI_ERROR(
cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
#else
RETURN_IF_CUPTI_ERROR(
cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
#endif
StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
ctx_info};
stream_infos_.emplace(key, stream_info);
return Status::OK();
}
// Returns time in microseconds between events recorded on the GPU.
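// cuEventElapsedTime reports milliseconds; we convert to microseconds and
// clamp negative values (possible if the events completed out of order) to
// zero.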
static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
CuptiApiTracingDisabler disabler;
float elapsed_ms = 0.0f;
LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
return static_cast<uint64>(
std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
}
Status SaveRecord(const KernelRecord &record,
AnnotationMap *annotation_map) const {
if (!record.start_event || !record.stop_event) {
return Status::OK();
}
const auto &stream_info =
stream_infos_.at(StreamKey(record.context, record.stream));
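// Reconstruct wall-clock times: end_walltime_us_ was sampled right after the
// context's end_event, so the kernel start time is approximately
// end_walltime_us_ minus the GPU-measured gap between start_event and
// end_event.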
auto start_us =
GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
std::string annotation;
CuptiTracerEvent event;
event.type = CuptiTracerEventType::Kernel;
event.source = CuptiTracerEventSource::Activity; // on gpu device.
event.name = record.kernel_name;
event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
event.device_id = ordinal_;
event.context_id = stream_info.ctx_info->context_id;
event.stream_id = stream_info.stream_id;
event.correlation_id = record.correlation_id;
event.annotation =
annotation_map->LookUp(event.device_id, event.correlation_id);
event.kernel_info = record.details;
collector_->AddEvent(std::move(event));
return Status::OK();
}
Status SaveRecord(const MemcpyRecord &record,
AnnotationMap *annotation_map) const {
if (!record.start_event || !record.stop_event) {
return Status::OK();
}
const auto &stream_info =
stream_infos_.at(StreamKey(record.context, record.stream));
auto start_us =
GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
CuptiTracerEvent event;
event.type = record.type;
event.name = GetTraceEventTypeName(event.type);
event.source = CuptiTracerEventSource::Activity;
event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
event.device_id = ordinal_;
event.context_id = stream_info.ctx_info->context_id;
event.stream_id = stream_info.stream_id;
event.correlation_id = record.correlation_id;
event.annotation =
annotation_map->LookUp(event.device_id, event.correlation_id);
event.memcpy_info.num_bytes = record.size_bytes;
// TODO: support MemcpyD2D where destination != source;
event.memcpy_info.destination = ordinal_;
event.memcpy_info.async = record.async;
// TODO: set src_mem_kind and dst_mem_kind.
collector_->AddEvent(std::move(event));
return Status::OK();
}
absl::Mutex mutex_;
bool stopped_ TF_GUARDED_BY(mutex_) = false;
std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
CuptiInterface *cupti_interface_;
CuptiTraceCollector *collector_;
const int ordinal_;
std::string device_name_;
uint64 end_walltime_us_;
// Include context in key to distinguish null streams.
using StreamKey = std::pair<CUcontext, CUstream>;
absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
};
// This hook uses CUDA events to measure device-side activities.
class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
CuptiInterface *cupti_interface,
CuptiTraceCollector *collector)
: option_(option),
cupti_interface_(cupti_interface),
collector_(collector) {
int num_gpus = CuptiTracer::NumGpus();
cuda_event_recorders_.reserve(num_gpus);
for (int i = 0; i < num_gpus; ++i) {
cuda_event_recorders_.emplace_back(
absl::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
}
}
~CuptiDriverApiHookWithCudaEvent() {
for (auto *callback_context : callback_contexts_) delete callback_context;
}
Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) override {
auto *recorder = cuda_event_recorders_[device_id].get();
switch (cbid) {
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
DCHECK_NE(cbdata->symbolName, nullptr);
auto params =
static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
*cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
break;
}
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
DCHECK_NE(cbdata->symbolName, nullptr);
auto params = static_cast<const cuLaunchCooperativeKernel_params_st *>(
cbdata->functionParams);
*cbdata->correlationData =
recorder->StartKernel<cuLaunchCooperativeKernel_params_st>(
cbdata->symbolName, cbdata->context, cbdata->correlationId,
params);
break;
}
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
#if CUDA_VERSION >= 10000
auto params =
static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
cbdata->functionParams);
std::vector<uint32> record_indices;
record_indices.reserve(params->numDevices);
*cbdata->correlationData = -1; // Invalid value.
const auto &annotation = AnnotationStack::Get();
for (int i = 0; i < params->numDevices; ++i) {
CUstream stream = params->launchParamsList[i].hStream;
ScopedCudaContext scoped_cuda_context(stream);
auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
auto context = scoped_cuda_context.GetContext();
if (!dev_id) return errors::Internal("Invalid CUDA stream");
// Because annotations are per device, we need to populate the
// annotation for each device involved.
collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
annotation);
record_indices.push_back(
cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
"CooperativeKernelMultiDevice", *context,
cbdata->correlationId, &(params->launchParamsList[i])));
}
auto *callback_context =
new CuptiApiCallbackContext(std::move(record_indices));
callback_contexts_.insert(callback_context);
*cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
#else
VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
#endif
} break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
auto params =
static_cast<const cuMemcpy_params *>(cbdata->functionParams);
StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
cbdata, recorder);
break;
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
auto params =
static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
StartMemcpyAsync<cuMemcpyAsync_params>(
GetMemcpyType(params->src, params->dst), cbdata, recorder);
break;
}
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
cbdata, recorder);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
cbdata, recorder);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
cbdata, recorder);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
break;
default:
VLOG(1) << "Unexpected callback id: " << cbid;
break;
}
return Status::OK();
}
Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) override {
auto *recorder = cuda_event_recorders_[device_id].get();
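// StartKernel()/StartMemcpy() return -1 when the recorder has already been
// stopped, so there is nothing to stop here.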
if (*cbdata->correlationData == static_cast<size_t>(-1))
return Status::OK();
uint64 start_tsc = 0;
switch (cbid) {
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
start_tsc = recorder->StopKernel(*cbdata->correlationData);
break;
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
#if CUDA_VERSION >= 10000
auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
*cbdata->correlationData);
callback_contexts_.erase(callback_context);
auto record_indices = std::move(callback_context->record_indices);
delete callback_context;
auto params =
static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
cbdata->functionParams);
if (record_indices.size() != params->numDevices)
return errors::Internal("Invalid correlation data");
for (int i = 0; i < params->numDevices; ++i) {
CUstream stream = params->launchParamsList[i].hStream;
ScopedCudaContext scoped_cuda_context(stream);
auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
if (!dev_id) return errors::Internal("Invalid CUDA stream");
start_tsc =
cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
}
#endif
} break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
break;
default:
VLOG(1) << "Unexpected callback id: " << cbid;
// TODO: figure out how to get start timestamp in this case.
return Status::OK();
}
// If we are not collecting CPU events from Callback API, we can return now.
if (!option_.required_callback_api_events) {
return Status::OK();
}
// Grab timestamp for API exit. API entry timestamp saved in cbdata.
uint64 end_tsc = CuptiTracer::GetTimestamp();
return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
start_tsc, end_tsc, domain, cbid, cbdata);
}
Status SyncAndFlush() override {
for (auto &recorder : cuda_event_recorders_) {
TF_RETURN_IF_ERROR(recorder->Stop());
}
for (auto &recorder : cuda_event_recorders_) {
TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
}
return Status::OK();
}
private:
template <typename T>
static void StartMemcpy(CuptiTracerEventType type,
const CUpti_CallbackData *cbdata,
CudaEventRecorder *recorder) {
auto params = static_cast<const T *>(cbdata->functionParams);
*cbdata->correlationData =
recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
cbdata->correlationId, /*async*/ false);
}
template <typename T>
static void StartMemcpyAsync(CuptiTracerEventType type,
const CUpti_CallbackData *cbdata,
CudaEventRecorder *recorder) {
auto params = static_cast<const T *>(cbdata->functionParams);
*cbdata->correlationData = recorder->StartMemcpy(
type, params->ByteCount, cbdata->context, params->hStream,
cbdata->correlationId, /*async*/ true);
}
static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
CuptiApiTracingDisabler disabler;
CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
auto status =
cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
if (status == CUDA_ERROR_INVALID_VALUE) {
// Pointer not registered with CUDA, must be host memory.
return CU_MEMORYTYPE_HOST;
}
LogIfError(ToStatus(status));
return mem_type;
}
static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
CUmemorytype src_type = GetMemoryType(src);
CUmemorytype dst_type = GetMemoryType(dst);
// TODO: handle CU_MEMORYTYPE_ARRAY case
if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
return CuptiTracerEventType::MemcpyH2D;
} else if (src_type == CU_MEMORYTYPE_DEVICE &&
dst_type == CU_MEMORYTYPE_HOST) {
return CuptiTracerEventType::MemcpyD2H;
} else if (src_type == CU_MEMORYTYPE_DEVICE &&
dst_type == CU_MEMORYTYPE_DEVICE) {
return CuptiTracerEventType::MemcpyD2D;
}
return CuptiTracerEventType::MemcpyOther;
}
// Each cuLaunchCooperativeKernelMultiDevice call needs to add an entry to
// each participating device, so we keep track of all the record indices in
// each device's record array.
// We allocate this data structure at API entry and free it at API exit.
// However, since there is no guarantee that we receive such callbacks in
// pairs, we maintain a set of ongoing API calls to avoid memory leaks.
struct CuptiApiCallbackContext {
CuptiApiCallbackContext(std::vector<uint32> &&r)
: record_indices(std::move(r)) {}
std::vector<uint32> record_indices;
};
const CuptiTracerOptions option_;
CuptiInterface *cupti_interface_;
CuptiTraceCollector *collector_;
std::set<CuptiApiCallbackContext *> callback_contexts_;
std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
};
} // namespace
/*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
int device_id, uint64 start_tsc, uint64 end_tsc,
CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) {
switch (cbid) {
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
end_tsc);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
start_tsc, end_tsc);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
cbdata, start_tsc, end_tsc);
break;
case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
AddCudaMallocEventUponApiExit(collector, device_id, cbid, cbdata,
start_tsc, end_tsc);
break;
default:
AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
end_tsc);
break;
}
return Status::OK();
}
const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
switch (type) {
case CuptiTracerEventType::MemcpyH2D:
return "MemcpyH2D";
case CuptiTracerEventType::MemcpyD2H:
return "MemcpyD2H";
case CuptiTracerEventType::MemcpyD2D:
return "MemcpyD2D";
case CuptiTracerEventType::MemcpyP2P:
return "MemcpyP2P";
case CuptiTracerEventType::MemcpyOther:
return "MemcpyOther";
case CuptiTracerEventType::Kernel:
return "Compute";
case CuptiTracerEventType::MemoryAlloc:
return "MemoryAlloc";
case CuptiTracerEventType::Overhead:
return "Overhead";
case CuptiTracerEventType::UnifiedMemory:
return "UnifiedMemory";
case CuptiTracerEventType::Generic:
return "Generic";
default:
DCHECK(false);
return "";
}
}
void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
const std::string &annotation) {
if (annotation.empty()) return;
VLOG(3) << "Add annotation: device_id: " << device_id
<< " correlation_id: " << correlation_id
<< " annotation: " << annotation;
if (device_id >= per_device_map_.size()) return;
auto &per_device_map = per_device_map_[device_id];
absl::MutexLock lock(&per_device_map.mutex);
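// Intern the annotation string once per device; the correlation map stores a
// string_view into the interned copy. New entries are only added while the
// number of distinct annotations is below max_size_, bounding memory usage.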
if (per_device_map.annotations.size() < max_size_) {
absl::string_view annotation_str =
*per_device_map.annotations.insert(annotation).first;
per_device_map.correlation_map.emplace(correlation_id, annotation_str);
}
}
absl::string_view AnnotationMap::LookUp(uint32 device_id,
uint32 correlation_id) {
if (device_id >= per_device_map_.size()) return absl::string_view();
auto &per_device_map = per_device_map_[device_id];
absl::MutexLock lock(&per_device_map.mutex);
auto it = per_device_map.correlation_map.find(correlation_id);
return it != per_device_map.correlation_map.end() ? it->second
: absl::string_view();
}
/* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
static auto *singleton = new CuptiTracer(GetCuptiInterface());
return singleton;
}
bool CuptiTracer::IsAvailable() const {
return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
}
int CuptiTracer::NumGpus() {
static int num_gpus = []() -> int {
if (cuInit(0) != CUDA_SUCCESS) {
return 0;
}
int gpu_count;
if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
return 0;
}
LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
return gpu_count;
}();
return num_gpus;
}
void CuptiTracer::Enable(const CuptiTracerOptions &option,
CuptiTraceCollector *collector) {
option_ = option;
collector_ = collector;
if (option_->enable_event_based_activity) {
option_->enable_activity_api = false;
cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
option, cupti_interface_, collector));
} else {
cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
option, cupti_interface_, collector));
}
Status status = EnableApiTracing();
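// Remember whether CUPTI refused due to insufficient privileges so that
// ErrorIfAny() can report that root access is needed.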
need_root_access_ |= status.code() == error::PERMISSION_DENIED;
if (!status.ok()) return;
if (option_->enable_activity_api) {
EnableActivityTracing().IgnoreError();
}
}
void CuptiTracer::Disable() {
DisableApiTracing().IgnoreError();
if (option_->enable_activity_api) {
DisableActivityTracing().IgnoreError();
}
cupti_interface_->CleanUp();
Finalize().IgnoreError();
cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
collector_->Flush();
collector_ = nullptr;
option_.reset();
cupti_driver_api_hook_.reset();
}
Status CuptiTracer::EnableApiTracing() {
if (api_tracing_enabled_) return Status::OK();
VLOG(1) << "Enable subscriber";
// Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED.
// An application that calls CUPTI APIs cannot be used concurrently with
// NVIDIA tools like nvprof, NVIDIA Visual Profiler, Nsight Compute, or
// Nsight Systems.
RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
&subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
api_tracing_enabled_ = true;
if (!option_->cbids_selected.empty()) {
for (auto cbid : option_->cbids_selected) {
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
}
} else { // select all callback ids.
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
}
return Status::OK();
}
Status CuptiTracer::DisableApiTracing() {
if (!api_tracing_enabled_) return Status::OK();
api_tracing_enabled_ = false;
if (!option_->cbids_selected.empty()) {
for (auto cbid : option_->cbids_selected) {
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
}
} else {
RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
}
VLOG(1) << "Disable subscriber";
RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
return Status::OK();
}
Status CuptiTracer::EnableActivityTracing() {
if (!option_->activities_selected.empty()) {
// Initialize callback functions for Cupti Activity API.
VLOG(1) << "Registering CUPTI activity callbacks";
RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
AllocCuptiActivityBuffer, FreeCuptiActivityBuffer));
VLOG(1) << "Enabling activity tracing for "
<< option_->activities_selected.size() << " activities";
for (auto activity : option_->activities_selected) {
VLOG(1) << "Enabling activity tracing for: " << activity;
if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
ConfigureActivityUnifiedMemoryCounter(true);
}
RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
}
}
activity_tracing_enabled_ = true;
return Status::OK();
}
Status CuptiTracer::DisableActivityTracing() {
if (activity_tracing_enabled_) {
VLOG(1) << "Disabling activity tracing for "
<< option_->activities_selected.size() << " activities";
for (auto activity : option_->activities_selected) {
VLOG(1) << "Disabling activity tracing for: " << activity;
if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
ConfigureActivityUnifiedMemoryCounter(false);
}
RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
}
option_->activities_selected.clear();
VLOG(1) << "Flushing CUPTI activity buffer";
RETURN_IF_CUPTI_ERROR(
cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
LOG(INFO) << "CUPTI activity buffer flushed";
}
activity_tracing_enabled_ = false;
return Status::OK();
}
Status CuptiTracer::Finalize() {
if (option_->cupti_finalize) {
RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
}
return Status::OK();
}
/*static*/ uint64 CuptiTracer::GetTimestamp() {
uint64_t tsc;
CuptiInterface *cupti_interface = GetCuptiInterface();
if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
return tsc;
}
// Return 0 on error. If an activity timestamp is 0, the activity will be
// dropped during time normalization.
return 0;
}
Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbdata) {
if (!api_tracing_enabled_) return Status::OK(); // already unsubscribed.
if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
if (internalCuCall) return Status::OK();
if (cbdata->context == nullptr) {
// API callback is called before any CUDA context is created.
// This is expected to be rare, and we ignore this case.
VLOG(3) << "API callback received before creation of CUDA context\n";
return errors::Internal("cutpi callback without context");
}
// Grab a correct device ID.
uint32 device_id = -1;
RETURN_IF_CUPTI_ERROR(
cupti_interface_->GetDeviceId(cbdata->context, &device_id));
if (device_id >= num_gpus_) {
return errors::Internal("Invalid device id:", device_id);
}
if (cbdata->callbackSite == CUPTI_API_ENTER) {
TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
device_id, domain, cbid, cbdata));
} else if (cbdata->callbackSite == CUPTI_API_EXIT) {
// Set up the map from correlation id to annotation string.
const auto &annotation = AnnotationStack::Get();
if (!annotation.empty()) {
collector_->annotation_map()->Add(device_id, cbdata->correlationId,
annotation);
}
TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
device_id, domain, cbid, cbdata));
}
return Status::OK();
}
void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
CUpti_ActivityUnifiedMemoryCounterConfig config[2];
// Based on experiments, currently only measurements from these two activities
// are trustworthy. Others, like GPU page faults, may be problematic.
config[0].kind =
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
config[1].kind =
CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
for (size_t i = 0; i < 2; i++) {
config[i].enable = enable;
}
CUptiResult res;
res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
LOG(ERROR) << "Unified memory is not supported on the "
"underlying platform.\n";
} else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
LOG(ERROR) << "Unified memory is not supported on the device.\n";
} else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
LOG(ERROR) << "Unified memory is not supported on the "
"non-P2P multi-gpu setup.\n";
} else if (res != CUPTI_SUCCESS) {
const char *errstr = "";
cuptiGetResultString(res, &errstr);
LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
} else {
VLOG(1) << "Configuring Unified memory profiling: " << res;
}
}
Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
uint8_t *buffer, size_t size) {
if (!activity_tracing_enabled_) {
LOG(WARNING) << "CUPTI activity buffer is freed after flush.";
return Status::OK();
}
if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");
CUpti_Activity *record = nullptr;
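// Iterate over the records in the buffer; CUPTI_ERROR_MAX_LIMIT_REACHED
// signals that there are no more records to read.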
while (true) {
CUptiResult status =
cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
if (status == CUPTI_SUCCESS) {
switch (record->kind) {
case CUPTI_ACTIVITY_KIND_KERNEL: // sequential
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
AddKernelActivityEvent(
collector_, reinterpret_cast<CUpti_ActivityKernel4 *>(record));
break;
case CUPTI_ACTIVITY_KIND_MEMCPY:
AddMemcpyActivityEvent(
collector_, reinterpret_cast<CUpti_ActivityMemcpy *>(record));
break;
case CUPTI_ACTIVITY_KIND_MEMCPY2:
AddMemcpy2ActivityEvent(
collector_, reinterpret_cast<CUpti_ActivityMemcpy2 *>(record));
break;
case CUPTI_ACTIVITY_KIND_OVERHEAD:
AddCuptiOverheadActivityEvent(
collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
break;
case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
AddUnifiedMemoryActivityEvent(
collector_,
reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
break;
default:
LOG(ERROR) << "Activity type " << record->kind << " not supported.";
break;
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
break;
} else {
return errors::Internal("Parse cupti activity buffer error.");
}
}
// Report dropped records.
size_t dropped;
RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
context, stream_id, &dropped));
if (dropped != 0) {
uint32 device_id = -1;
RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
collector_->OnEventsDropped("CUpti activity buffer", dropped);
}
return Status::OK();
}
/*static*/ std::string CuptiTracer::ErrorIfAny() {
if (CuptiTracer::NumGpus() == 0) {
return "No GPU detected.";
} else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
return "Insufficient privilege to run libcupti (you need root permission).";
} else if (CuptiTracer::GetTimestamp() == 0) {
return "Failed to load libcupti (is it installed and accessible?)";
}
return "";
}
} // namespace profiler
} // namespace tensorflow