Make event.name a std::string, because CUpti_ActivityKernel4::name is not persistent.
Also delay demangling until serialization time to reduce the overhead during profiling.

PiperOrigin-RevId: 268489643
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
index 81a2501..8916e28 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc
@@ -20,7 +20,6 @@
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/platform/abi.h"
 #include "tensorflow/core/platform/annotation.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -851,7 +850,7 @@
     CuptiTracerEvent event;
     event.type = CuptiTracerEventType::Kernel;
     event.source = CuptiTracerEventSource::Activity;  // on gpu device.
-    event.name = port::MaybeAbiDemangle(record.kernel_name);
+    event.name = record.kernel_name;
     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
     event.device_id = ordinal_;
diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
index e931f73..87e6c04 100644
--- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
+++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h
@@ -102,8 +102,12 @@
       std::numeric_limits<uint64_t>::max();
   CuptiTracerEventType type;
   CuptiTracerEventSource source;
-  // name and annotation are only guaranteed to be valid in collector->AddEvent.
-  absl::string_view name;
+  // CUpti_CallbackData::functionName is persistent, but
+  // CUpti_ActivityKernel4::name is not, so we need to keep our own copy of
+  // it.
+  std::string name;
+  // This points to strings in AnnotationMap, which should outlive the point
+  // where serialization happens.
   absl::string_view annotation;
   uint64 start_time_ns;
   uint64 end_time_ns;
diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc
index d8b356a..aa7609d 100644
--- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc
@@ -24,6 +24,7 @@
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/abi.h"
 #include "tensorflow/core/platform/annotation.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stringprintf.h"
@@ -82,7 +83,7 @@
     }
     per_device_adaptor_[event.device_id].AddEvent(std::move(event));
   }
-  void OnEventsDropped(const string& reason, uint32 num_events) override {}
+  void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
   void Flush() override {
     LOG(INFO) << " GpuTracer has collected " << num_callback_events_
               << " callback api events and " << num_activity_events_
@@ -138,7 +139,7 @@
 
         if (event.source == CuptiTracerEventSource::DriverCallback) {
           DCHECK_EQ(event.name, "cuStreamSynchronize");
-          ns->set_node_name(string(event.name));
+          ns->set_node_name(event.name);
           ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
           ns->set_thread_id(event.thread_id);
           collector->Save(sync_device, ns);
@@ -153,13 +154,13 @@
           }
 
           auto annotation_stack = ParseAnnotationStack(event.annotation);
-          absl::string_view activity_name = !annotation_stack.empty()
-                                                ? annotation_stack.back().name
-                                                : event.name;
-          ns->set_node_name(string(activity_name));
+          std::string activity_name =
+              !annotation_stack.empty()
+                  ? std::string(annotation_stack.back().name)
+                  : port::MaybeAbiDemangle(event.name.c_str());
           switch (event.type) {
             case CuptiTracerEventType::Kernel: {
-              const string details = strings::Printf(
+              const std::string details = strings::Printf(
                   "regs:%llu shm:%llu grid:%llu,%llu,%llu block:%llu,%llu,%llu",
                   event.kernel_info.registers_per_thread,
                   event.kernel_info.static_shared_memory_usage,
@@ -177,7 +178,7 @@
             case CuptiTracerEventType::MemcpyD2H:
             case CuptiTracerEventType::MemcpyD2D:
             case CuptiTracerEventType::MemcpyP2P: {
-              string details = absl::StrCat(
+              std::string details = absl::StrCat(
                   activity_name, " bytes:", event.memcpy_info.num_bytes);
               if (event.memcpy_info.async) {
                 absl::StrAppend(&details, " aync");
@@ -196,9 +197,10 @@
               break;
             }
             default:
-              ns->set_timeline_label(string(activity_name));
+              ns->set_timeline_label(activity_name);
               collector->Save(stream_device, ns);
           }
+          ns->set_node_name(std::move(activity_name));
         }
       }
     }