[Profiler] Remove `addMemoryUsageActivity` from kineto_shim (#76678)

Summary: `addMemoryUsageActivity` is effectively the same as `addCPUActivity` except it also has `addMetadata` calls. This change generalizes `addCPUActivity` (We have to define an enum to map to `libkineto::ActivityType`) to accept arbitrary metadata strings.

Test Plan: Unit tests

Differential Revision: D36070203

Pull Request resolved: https://github.com/pytorch/pytorch/pull/76678
Approved by: https://github.com/aaronenyeshi
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 716f0da..1f9756f 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -132,6 +132,22 @@
 };
 static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
 
+auto getAnnotations(const MemoryEventData& event) {
+  torch::profiler::impl::kineto::annotation_t out{
+      {"Device Type", std::to_string((int8_t)event.device_type)},
+      {"Device Id", std::to_string(event.device_index)},
+      {"Addr", std::to_string(reinterpret_cast<intptr_t>(event.ptr))},
+      {"Bytes", std::to_string(event.alloc_size)}};
+
+  if (event.total_allocated >= 0) {
+    out.emplace_back("Total Allocated", std::to_string(event.total_allocated));
+  }
+  if (event.total_reserved >= 0) {
+    out.emplace_back("Total Reserved", std::to_string(event.total_reserved));
+  }
+  return out;
+}
+
 // Assumption: Total threads number will not exceed 2^16-1, and total ops will
 // not exceed 2^48 -1.
 static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
@@ -227,15 +243,14 @@
 
     for (const auto& e : memory_events_) {
       auto start_time_us = converter(e.start_time) / 1000;
-      cpu_trace_.addMemoryUsageActivity(
+      cpu_trace_.addCPUActivity(
           kMemoryEventName,
+          torch::profiler::impl::kineto::KinetoActivityType::CPU_INSTANT_EVENT,
           e.kineto_info,
+          /*correlation_id=*/0,
           start_time_us,
-          c10::Device(e.device_type, e.device_index),
-          e.ptr,
-          e.alloc_size,
-          e.total_allocated,
-          e.total_reserved);
+          start_time_us,
+          getAnnotations(e));
 
       kineto_events_.emplace_back();
       auto& evt = kineto_events_.back();
@@ -261,11 +276,12 @@
 
       cpu_trace_.addCPUActivity(
           e.name(),
-          e.record_function_scope(),
+          e.kinetoType(),
           e.kineto_info_,
           e.correlation_id(),
           start_us,
-          end_us);
+          end_us,
+          /*annotations=*/{});
 
       kineto_events_.emplace_back();
       kineto_events_.back()
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index afce19c..c084b9d 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -131,8 +131,12 @@
   return c10::visit([](auto& e){ return e.name_; }, event_);
 }
 
-uint8_t Result::record_function_scope() const {
-  return c10::visit([](auto& e){ return e.record_function_scope_; }, event_);
+torch::profiler::impl::kineto::KinetoActivityType Result::kinetoType() const {
+  auto record_function_scope = static_cast<at::RecordScope>(
+      c10::visit([](auto& e) { return e.record_function_scope_; }, event_));
+  return record_function_scope == at::RecordScope::USER_SCOPE
+      ? torch::profiler::impl::kineto::KinetoActivityType::USER_ANNOTATION
+      : torch::profiler::impl::kineto::KinetoActivityType::CPU_OP;
 }
 
 uint64_t Result::correlation_id() const {
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index fd8b997..509fcd8 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -71,7 +71,7 @@
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 struct Result {
   std::string name() const;
-  uint8_t record_function_scope() const;
+  torch::profiler::impl::kineto::KinetoActivityType kinetoType() const;
   uint64_t correlation_id() const;
 
   int64_t start_time_us_;
diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp
index 0411eb5..df2098f 100644
--- a/torch/csrc/profiler/kineto_shim.cpp
+++ b/torch/csrc/profiler/kineto_shim.cpp
@@ -60,18 +60,36 @@
 }
 #endif // USE_KINETO
 
+#ifdef USE_KINETO
+namespace {
+libkineto::ActivityType toActivityType(const KinetoActivityType type) {
+  switch (type) {
+    case KinetoActivityType::CPU_OP:
+      return libkineto::ActivityType::CPU_OP;
+    case KinetoActivityType::CPU_INSTANT_EVENT:
+      return libkineto::ActivityType::CPU_INSTANT_EVENT;
+    default:
+      TORCH_INTERNAL_ASSERT(
+          type == KinetoActivityType::USER_ANNOTATION,
+          "Invalid KinetoActivityType: ",
+          (int)type);
+      return libkineto::ActivityType::USER_ANNOTATION;
+  }
+}
+} // namespace
+#endif // USE_KINETO
+
 void TraceWrapper::addCPUActivity(
     const std::string& name,
-    const uint8_t scope,
+    const KinetoActivityType kineto_type,
     const DeviceAndResource device_and_resource,
     const uint64_t correlation_id,
     const int64_t start_time,
-    const int64_t end_time) {
+    const int64_t end_time,
+    const annotation_t& annotations) {
 #ifdef USE_KINETO
   TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
-  auto type = ((at::RecordScope)scope == at::RecordScope::USER_SCOPE)
-    ? libkineto::ActivityType::USER_ANNOTATION
-    : libkineto::ActivityType::CPU_OP;
+  auto type = toActivityType(kineto_type);
   cpu_trace_->activities.emplace_back(libkineto::GenericTraceActivity(
     cpu_trace_->span, type, name));
   auto& act = cpu_trace_->activities.back();
@@ -79,36 +97,11 @@
   act.resource = device_and_resource.resource;
   act.id = correlation_id;
   act.startTime = start_time;
-  act.endTime = end_time;
-#endif // USE_KINETO
-}
-
-void TraceWrapper::addMemoryUsageActivity(
-    const std::string& name,
-    const DeviceAndResource device_and_resource,
-    const int64_t time,
-    const c10::Device device,
-    const void* ptr,
-    const int64_t alloc_size,
-    const int64_t total_allocated,
-    const int64_t total_reserved) {
-#ifdef USE_KINETO
-  TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
-  cpu_trace_->activities.emplace_back(libkineto::GenericTraceActivity(
-    cpu_trace_->span, libkineto::ActivityType::CPU_INSTANT_EVENT, name));
-  auto& act = cpu_trace_->activities.back();
-  act.device = device_and_resource.device;
-  act.resource = device_and_resource.resource;
-  act.startTime = time;
-  act.addMetadata("Device Type", std::to_string((int8_t)device.type()));
-  act.addMetadata("Device Id", std::to_string(device.index()));
-  act.addMetadata("Addr", std::to_string(reinterpret_cast<intptr_t>(ptr)));
-  act.addMetadata("Bytes", std::to_string(alloc_size));
-  if (total_allocated >= 0) {
-    act.addMetadata("Total Allocated", std::to_string(total_allocated));
+  if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) {
+    act.endTime = end_time;
   }
-  if (total_reserved >= 0) {
-    act.addMetadata("Total Reserved", std::to_string(total_reserved));
+  for (const auto& i : annotations) {
+    act.addMetadata(i.first, i.second);
   }
 #endif // USE_KINETO
 }
diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
index 76ff18f..59ff529 100644
--- a/torch/csrc/profiler/kineto_shim.h
+++ b/torch/csrc/profiler/kineto_shim.h
@@ -58,31 +58,30 @@
 using interface_trace_t = DummyTraceBuffer;
 #endif // USE_KINETO
 
+// Subset of `libkineto::ActivityType` for `addCPUActivity`.
+enum class KinetoActivityType : uint8_t {
+  CPU_OP = 0,
+  CPU_INSTANT_EVENT,
+  USER_ANNOTATION
+};
+
+using annotation_t = std::vector<std::pair<std::string, std::string>>;
+
 // Wraps: libkineto::CpuTraceBuffer
 struct TraceWrapper {
   TraceWrapper(const int64_t start_time, const std::string& name);
   TraceWrapper(TraceWrapper&&) = default;
   TraceWrapper(const TraceWrapper&) = delete;
 
-  // The caller is expected to hold a mutex when calling `addCPUActivity` and
-  // addMemoryUsageActivity.
+  // The caller is expected to hold a mutex when calling `addCPUActivity`.
   void addCPUActivity(
       const std::string& name,
-      const uint8_t scope,
+      const KinetoActivityType kineto_type,
       const DeviceAndResource device_and_resource,
       const uint64_t correlation_id,
       const int64_t start_time,
-      const int64_t end_time);
-
-  void addMemoryUsageActivity(
-      const std::string& name,
-      const DeviceAndResource device_and_resource,
-      const int64_t time,
-      const c10::Device device,
-      const void* ptr,
-      const int64_t alloc_size,
-      const int64_t total_allocated,
-      const int64_t total_reserved);
+      const int64_t end_time,
+      const annotation_t& annotations);
 
   void transferCpuTrace(int64_t end_time);