[Profiler] Remove `addMemoryUsageActivity` from kineto_shim (#76678)
Summary: `addMemoryUsageActivity` is effectively the same as `addCPUActivity` except it also has `addMetadata` calls. This change generalizes `addCPUActivity` (We have to define an enum to map to `libkineto::ActivityType`) to accept arbitrary metadata strings.
Test Plan: Unit tests
Differential Revision: D36070203
Pull Request resolved: https://github.com/pytorch/pytorch/pull/76678
Approved by: https://github.com/aaronenyeshi
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 716f0da..1f9756f 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -132,6 +132,22 @@
};
static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
+auto getAnnotations(const MemoryEventData& event) {
+ torch::profiler::impl::kineto::annotation_t out{
+ {"Device Type", std::to_string((int8_t)event.device_type)},
+ {"Device Id", std::to_string(event.device_index)},
+ {"Addr", std::to_string(reinterpret_cast<intptr_t>(event.ptr))},
+ {"Bytes", std::to_string(event.alloc_size)}};
+
+ if (event.total_allocated >= 0) {
+ out.emplace_back("Total Allocated", std::to_string(event.total_allocated));
+ }
+ if (event.total_reserved >= 0) {
+ out.emplace_back("Total Reserved", std::to_string(event.total_reserved));
+ }
+ return out;
+}
+
// Assumption: Total threads number will not exceed 2^16-1, and total ops will
// not exceed 2^48 -1.
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
@@ -227,15 +243,14 @@
for (const auto& e : memory_events_) {
auto start_time_us = converter(e.start_time) / 1000;
- cpu_trace_.addMemoryUsageActivity(
+ cpu_trace_.addCPUActivity(
kMemoryEventName,
+ torch::profiler::impl::kineto::KinetoActivityType::CPU_INSTANT_EVENT,
e.kineto_info,
+ /*correlation_id=*/0,
start_time_us,
- c10::Device(e.device_type, e.device_index),
- e.ptr,
- e.alloc_size,
- e.total_allocated,
- e.total_reserved);
+ start_time_us,
+ getAnnotations(e));
kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
@@ -261,11 +276,12 @@
cpu_trace_.addCPUActivity(
e.name(),
- e.record_function_scope(),
+ e.kinetoType(),
e.kineto_info_,
e.correlation_id(),
start_us,
- end_us);
+ end_us,
+ /*annotations=*/{});
kineto_events_.emplace_back();
kineto_events_.back()
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index afce19c..c084b9d 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -131,8 +131,12 @@
return c10::visit([](auto& e){ return e.name_; }, event_);
}
-uint8_t Result::record_function_scope() const {
- return c10::visit([](auto& e){ return e.record_function_scope_; }, event_);
+torch::profiler::impl::kineto::KinetoActivityType Result::kinetoType() const {
+ auto record_function_scope = static_cast<at::RecordScope>(
+ c10::visit([](auto& e) { return e.record_function_scope_; }, event_));
+ return record_function_scope == at::RecordScope::USER_SCOPE
+ ? torch::profiler::impl::kineto::KinetoActivityType::USER_ANNOTATION
+ : torch::profiler::impl::kineto::KinetoActivityType::CPU_OP;
}
uint64_t Result::correlation_id() const {
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index fd8b997..509fcd8 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -71,7 +71,7 @@
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct Result {
std::string name() const;
- uint8_t record_function_scope() const;
+ torch::profiler::impl::kineto::KinetoActivityType kinetoType() const;
uint64_t correlation_id() const;
int64_t start_time_us_;
diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp
index 0411eb5..df2098f 100644
--- a/torch/csrc/profiler/kineto_shim.cpp
+++ b/torch/csrc/profiler/kineto_shim.cpp
@@ -60,18 +60,36 @@
}
#endif // USE_KINETO
+#ifdef USE_KINETO
+namespace {
+libkineto::ActivityType toActivityType(const KinetoActivityType type) {
+ switch (type) {
+ case KinetoActivityType::CPU_OP:
+ return libkineto::ActivityType::CPU_OP;
+ case KinetoActivityType::CPU_INSTANT_EVENT:
+ return libkineto::ActivityType::CPU_INSTANT_EVENT;
+ default:
+ TORCH_INTERNAL_ASSERT(
+ type == KinetoActivityType::USER_ANNOTATION,
+ "Invalid KinetoActivityType: ",
+ (int)type);
+ return libkineto::ActivityType::USER_ANNOTATION;
+ }
+}
+} // namespace
+#endif // USE_KINETO
+
void TraceWrapper::addCPUActivity(
const std::string& name,
- const uint8_t scope,
+ const KinetoActivityType kineto_type,
const DeviceAndResource device_and_resource,
const uint64_t correlation_id,
const int64_t start_time,
- const int64_t end_time) {
+ const int64_t end_time,
+ const annotation_t& annotations) {
#ifdef USE_KINETO
TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
- auto type = ((at::RecordScope)scope == at::RecordScope::USER_SCOPE)
- ? libkineto::ActivityType::USER_ANNOTATION
- : libkineto::ActivityType::CPU_OP;
+ auto type = toActivityType(kineto_type);
cpu_trace_->activities.emplace_back(libkineto::GenericTraceActivity(
cpu_trace_->span, type, name));
auto& act = cpu_trace_->activities.back();
@@ -79,36 +97,11 @@
act.resource = device_and_resource.resource;
act.id = correlation_id;
act.startTime = start_time;
- act.endTime = end_time;
-#endif // USE_KINETO
-}
-
-void TraceWrapper::addMemoryUsageActivity(
- const std::string& name,
- const DeviceAndResource device_and_resource,
- const int64_t time,
- const c10::Device device,
- const void* ptr,
- const int64_t alloc_size,
- const int64_t total_allocated,
- const int64_t total_reserved) {
-#ifdef USE_KINETO
- TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
- cpu_trace_->activities.emplace_back(libkineto::GenericTraceActivity(
- cpu_trace_->span, libkineto::ActivityType::CPU_INSTANT_EVENT, name));
- auto& act = cpu_trace_->activities.back();
- act.device = device_and_resource.device;
- act.resource = device_and_resource.resource;
- act.startTime = time;
- act.addMetadata("Device Type", std::to_string((int8_t)device.type()));
- act.addMetadata("Device Id", std::to_string(device.index()));
- act.addMetadata("Addr", std::to_string(reinterpret_cast<intptr_t>(ptr)));
- act.addMetadata("Bytes", std::to_string(alloc_size));
- if (total_allocated >= 0) {
- act.addMetadata("Total Allocated", std::to_string(total_allocated));
+ if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) {
+ act.endTime = end_time;
}
- if (total_reserved >= 0) {
- act.addMetadata("Total Reserved", std::to_string(total_reserved));
+ for (const auto& i : annotations) {
+ act.addMetadata(i.first, i.second);
}
#endif // USE_KINETO
}
diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
index 76ff18f..59ff529 100644
--- a/torch/csrc/profiler/kineto_shim.h
+++ b/torch/csrc/profiler/kineto_shim.h
@@ -58,31 +58,30 @@
using interface_trace_t = DummyTraceBuffer;
#endif // USE_KINETO
+// Subset of `libkineto::ActivityType` for `addCPUActivity`.
+enum class KinetoActivityType : uint8_t {
+ CPU_OP = 0,
+ CPU_INSTANT_EVENT,
+ USER_ANNOTATION
+};
+
+using annotation_t = std::vector<std::pair<std::string, std::string>>;
+
// Wraps: libkineto::CpuTraceBuffer
struct TraceWrapper {
TraceWrapper(const int64_t start_time, const std::string& name);
TraceWrapper(TraceWrapper&&) = default;
TraceWrapper(const TraceWrapper&) = delete;
- // The caller is expected to hold a mutex when calling `addCPUActivity` and
- // addMemoryUsageActivity.
+ // The caller is expected to hold a mutex when calling `addCPUActivity`.
void addCPUActivity(
const std::string& name,
- const uint8_t scope,
+ const KinetoActivityType kineto_type,
const DeviceAndResource device_and_resource,
const uint64_t correlation_id,
const int64_t start_time,
- const int64_t end_time);
-
- void addMemoryUsageActivity(
- const std::string& name,
- const DeviceAndResource device_and_resource,
- const int64_t time,
- const c10::Device device,
- const void* ptr,
- const int64_t alloc_size,
- const int64_t total_allocated,
- const int64_t total_reserved);
+ const int64_t end_time,
+ const annotation_t& annotations);
void transferCpuTrace(int64_t end_time);