Make event.name a std::string, because CUpti_ActivityKernel4::name is not persistent.
Also delay demangling until serialization time to reduce the overhead during profiling.
PiperOrigin-RevId: 268489643
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
index 81a2501..8916e28 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
@@ -20,7 +20,6 @@
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/annotation.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
@@ -851,7 +850,7 @@
CuptiTracerEvent event;
event.type = CuptiTracerEventType::Kernel;
event.source = CuptiTracerEventSource::Activity; // on gpu device.
- event.name = port::MaybeAbiDemangle(record.kernel_name);
+ event.name = record.kernel_name;
event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
event.device_id = ordinal_;
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
index e931f73..87e6c04 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
@@ -102,8 +102,12 @@
std::numeric_limits<uint64_t>::max();
CuptiTracerEventType type;
CuptiTracerEventSource source;
- // name and annotation are only guaranteed to be valid in collector->AddEvent.
- absl::string_view name;
+ // CUpti_CallbackData::functionName is persistent, but
+ // CUpti_ActivityKernel4::name is not, so we need to keep our own copy of
+ // it.
+ std::string name;
+ // This points to strings in AnnotationMap, which should outlive the point
+ // where serialization happens.
absl::string_view annotation;
uint64 start_time_ns;
uint64 end_time_ns;
diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc
index d8b356a..aa7609d 100644
--- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc
@@ -24,6 +24,7 @@
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/annotation.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stringprintf.h"
@@ -82,7 +83,7 @@
}
per_device_adaptor_[event.device_id].AddEvent(std::move(event));
}
- void OnEventsDropped(const string& reason, uint32 num_events) override {}
+ void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
void Flush() override {
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
<< " callback api events and " << num_activity_events_
@@ -138,7 +139,7 @@
if (event.source == CuptiTracerEventSource::DriverCallback) {
DCHECK_EQ(event.name, "cuStreamSynchronize");
- ns->set_node_name(string(event.name));
+ ns->set_node_name(event.name);
ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
ns->set_thread_id(event.thread_id);
collector->Save(sync_device, ns);
@@ -153,13 +154,13 @@
}
auto annotation_stack = ParseAnnotationStack(event.annotation);
- absl::string_view activity_name = !annotation_stack.empty()
- ? annotation_stack.back().name
- : event.name;
- ns->set_node_name(string(activity_name));
+ std::string activity_name =
+ !annotation_stack.empty()
+ ? std::string(annotation_stack.back().name)
+ : port::MaybeAbiDemangle(event.name.c_str());
switch (event.type) {
case CuptiTracerEventType::Kernel: {
- const string details = strings::Printf(
+ const std::string details = strings::Printf(
"regs:%llu shm:%llu grid:%llu,%llu,%llu block:%llu,%llu,%llu",
event.kernel_info.registers_per_thread,
event.kernel_info.static_shared_memory_usage,
@@ -177,7 +178,7 @@
case CuptiTracerEventType::MemcpyD2H:
case CuptiTracerEventType::MemcpyD2D:
case CuptiTracerEventType::MemcpyP2P: {
- string details = absl::StrCat(
+ std::string details = absl::StrCat(
activity_name, " bytes:", event.memcpy_info.num_bytes);
if (event.memcpy_info.async) {
absl::StrAppend(&details, " aync");
@@ -196,9 +197,10 @@
break;
}
default:
- ns->set_timeline_label(string(activity_name));
+ ns->set_timeline_label(activity_name);
collector->Save(stream_device, ns);
}
+ ns->set_node_name(std::move(activity_name));
}
}
}