#include <torch/csrc/profiler/kineto_shim.h>

#include <set>
#include <sstream>
#include <string>
#include <type_traits>

#ifdef USE_KINETO
#include <libkineto.h>
#endif

#include <c10/util/Exception.h>
| |
| namespace torch { |
| namespace profiler { |
| namespace impl { |
| namespace kineto { |
| |
| // Here lies pain and `#ifdef USE_KINETO` |
| |
| #ifdef USE_KINETO |
namespace {
// Kineto activity types recorded when the caller requests CPU profiling.
// Runtime launch activities (CUDA_RUNTIME / XPU_RUNTIME) execute on the host,
// so they belong to the CPU set as well as the corresponding device set.
const std::set<libkineto::ActivityType> cpuTypes{
    libkineto::ActivityType::CPU_OP,
    libkineto::ActivityType::CPU_INSTANT_EVENT,
    libkineto::ActivityType::USER_ANNOTATION,
    libkineto::ActivityType::EXTERNAL_CORRELATION,
    libkineto::ActivityType::XPU_RUNTIME,
    libkineto::ActivityType::CUDA_RUNTIME,
    libkineto::ActivityType::PYTHON_FUNCTION,
};

// Activity types recorded when CUDA profiling is requested.
const std::set<libkineto::ActivityType> cudaTypes = {
    libkineto::ActivityType::GPU_MEMCPY,
    libkineto::ActivityType::GPU_MEMSET,
    libkineto::ActivityType::CONCURRENT_KERNEL,
    // CUDA_RUNTIME appears in both cpuTypes and cudaTypes.
    libkineto::ActivityType::CUDA_RUNTIME,
};

// Activity types recorded when XPU profiling is requested.
const std::set<libkineto::ActivityType> xpuTypes = {
    libkineto::ActivityType::GPU_MEMCPY,
    libkineto::ActivityType::GPU_MEMSET,
    libkineto::ActivityType::CONCURRENT_KERNEL,
    // XPU_RUNTIME appears in both cpuTypes and xpuTypes.
    libkineto::ActivityType::XPU_RUNTIME,
};
} // namespace
| #endif // USE_KINETO |
| |
// DeviceAndResource is stamped onto profiler events and copied freely, so it
// must remain a trivial value type with no Kineto-specific members.
static_assert(
    c10::is_pod_v<DeviceAndResource>,
    "Kineto specific details should be in `kineto_ids`.");

// Returns the Kineto process id / system thread id for the calling thread.
// Without Kineto there is nothing to report; the fields are value-initialized.
const DeviceAndResource kineto_ids() {
#ifdef USE_KINETO
  return {
      /*device=*/libkineto::processId(),
      /*resource=*/libkineto::systemThreadId()};
#else
  return {};
#endif // USE_KINETO
}
| |
| void addMetadata( |
| const activity_t* activity, |
| const std::string& key, |
| const std::string& value) { |
| #ifdef USE_KINETO |
| // ActivityTraceInterface returns const pointers, so we have to cast away the |
| // constness to add metadata. |
| const_cast<activity_t*>(activity)->addMetadata(key, value); |
| #endif // USE_KINETO |
| } |
| |
// Allocates a fresh CPU-side trace buffer and stamps its span with the start
// time and trace name. gpuOpCount is set to -1 (presumably a "not yet
// counted" sentinel filled in by Kineto later — TODO confirm against Kineto).
// Without Kineto the wrapper holds nothing and converts to false.
TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name)
#ifdef USE_KINETO
    : cpu_trace_(std::make_unique<libkineto::CpuTraceBuffer>()) {
  cpu_trace_->span.startTime = start_time;
  cpu_trace_->gpuOpCount = -1;
  cpu_trace_->span.name = name;
}
#else
{
}
#endif // USE_KINETO
| |
// Defaulted out of line; any buffer never handed to Kineto is released here.
TraceWrapper::~TraceWrapper() = default;
| |
// Appends one CPU-side activity to the trace buffer and returns a pointer to
// it (owned by the buffer — valid until the buffer is transferred/destroyed).
// Returns nullptr when built without Kineto.
activity_t* TraceWrapper::addCPUActivity(
    const std::string& name,
    const libkineto::ActivityType type,
    const DeviceAndResource device_and_resource,
    const uint64_t correlation_id,
    const int64_t start_time,
    const int64_t end_time) {
#ifdef USE_KINETO
  // Guards against use after transferCpuTrace() moved the buffer out.
  TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
  cpu_trace_->emplace_activity(cpu_trace_->span, type, name);
  auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back());
  act.device = device_and_resource.device;
  act.resource = device_and_resource.resource;
  act.id = correlation_id;
  act.startTime = start_time;
  // Instant events are a single point in time; endTime is left untouched.
  if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) {
    act.endTime = end_time;
  }
  return cpu_trace_->activities.back().get();
#else
  return nullptr;
#endif // USE_KINETO
}
| |
| void TraceWrapper::transferCpuTrace(int64_t end_time) { |
| #ifdef USE_KINETO |
| cpu_trace_->span.endTime = end_time; |
| libkineto::api().activityProfiler().transferCpuTrace(std::move(cpu_trace_)); |
| #endif // USE_KINETO |
| } |
| |
| TraceWrapper::operator bool() const { |
| #ifdef USE_KINETO |
| return cpu_trace_ != nullptr; |
| #else |
| return false; |
| #endif // USE_KINETO |
| } |
| |
// Takes ownership of a completed trace (typically produced by stopTrace()).
// `trace` may be null, in which case the wrapper converts to false.
ActivityTraceWrapper::ActivityTraceWrapper(
    std::unique_ptr<interface_trace_t>&& trace)
    : trace_(std::move(trace)) {}
| |
| ActivityTraceWrapper::operator bool() const { |
| #ifdef USE_KINETO |
| return trace_ != nullptr; |
| #else |
| return false; |
| #endif // USE_KINETO |
| } |
| |
| void ActivityTraceWrapper::save(const std::string& path) { |
| #ifdef USE_KINETO |
| TORCH_CHECK(!saved_, "Trace is already saved."); |
| TORCH_CHECK(trace_ != nullptr, "Missing trace.") |
| trace_->save(path); |
| saved_ = true; |
| #else |
| TORCH_CHECK( |
| false, |
| "Saving a trace requires using torch.profiler with Kineto support (USE_KINETO=1)"); |
| #endif // USE_KINETO |
| } |
| |
| namespace { |
| // Handles processing of Experimental Config options for Kineto |
| class ExperimentalConfigWrapper { |
| public: |
| explicit ExperimentalConfigWrapper( |
| const torch::profiler::impl::ExperimentalConfig& config) |
| : config_(config) {} |
| |
| bool assertValid(const ActivitySet& activities) { |
| // Kineto supports reading performance events per kernel/iteration |
| // using CUPTI Range based profiler API. In this mode however we |
| // do not trace CPU or GPU events. |
| bool cupti_range_profiler = !config_.profiler_metrics.empty(); |
| if (cupti_range_profiler && |
| activities.count(torch::autograd::profiler::ActivityType::CPU)) { |
| LOG(WARNING) |
| << "Cannot run range profiler with CPU activities, please only" |
| << " use CUDA activity type"; |
| return false; |
| } |
| return cupti_range_profiler; |
| } |
| |
| void prepareTraceWithExperimentalOptions() { |
| #ifdef USE_KINETO |
| std::set<libkineto::ActivityType> k_activities{ |
| libkineto::ActivityType::CUDA_PROFILER_RANGE}; |
| |
| const size_t num_metrics = config_.profiler_metrics.size(); |
| std::stringstream configss; |
| |
| LOG(INFO) << "CUPTI profiler metrics size = " << num_metrics; |
| |
| configss << "ACTIVITIES_WARMUP_PERIOD_SECS=0\n" |
| << "CUPTI_PROFILER_METRICS="; |
| |
| for (int i = 0; i < num_metrics; i++) { |
| configss << config_.profiler_metrics[i]; |
| if (num_metrics > 1 && i < (num_metrics - 1)) { |
| configss << ","; |
| } |
| } |
| configss << "\nCUPTI_PROFILER_ENABLE_PER_KERNEL=" |
| << (config_.profiler_measure_per_kernel ? "true" : "false") |
| << "\n"; |
| LOG(INFO) << "Generated config = " << configss.str(); |
| |
| libkineto::api().activityProfiler().prepareTrace( |
| k_activities, configss.str()); |
| #endif // USE_KINETO |
| } |
| |
| private: |
| const torch::profiler::impl::ExperimentalConfig& config_; |
| }; |
| } // namespace |
| |
// Configures Kineto ahead of a profiling session: lazily registers and
// initializes the library, translates the requested PyTorch activity types
// into Kineto activity sets, and forwards them to the activity profiler. If a
// valid experimental config is present, it takes over trace preparation.
// No-op when built without Kineto.
void prepareTrace(
    const bool cpuOnly,
    const ActivitySet& activities,
    const torch::profiler::impl::ExperimentalConfig& config) {
#ifdef USE_KINETO
  // First use in this process: initialize Kineto and silence its own logging.
  if (!libkineto::api().isProfilerRegistered()) {
    libkineto_init(/*cpuOnly=*/cpuOnly, /*logOnError=*/true);
    libkineto::api().suppressLogMessages();
  }

  if (!libkineto::api().isProfilerInitialized()) {
    libkineto::api().initProfilerIfRegistered();
  }

  // Union of the Kineto activity types implied by the requested devices.
  std::set<libkineto::ActivityType> k_activities;
  if (activities.count(torch::autograd::profiler::ActivityType::CPU)) {
    k_activities.insert(cpuTypes.begin(), cpuTypes.end());
  }
  if (activities.count(torch::autograd::profiler::ActivityType::XPU)) {
    k_activities.insert(xpuTypes.begin(), xpuTypes.end());
  }
  if (activities.count(torch::autograd::profiler::ActivityType::CUDA)) {
    k_activities.insert(cudaTypes.begin(), cudaTypes.end());
  }

  ExperimentalConfigWrapper configWrap(config);

  // Experimental Configuration options are present and valid: they replace
  // the normal activity-based preparation entirely.
  if (config && configWrap.assertValid(activities)) {
    configWrap.prepareTraceWithExperimentalOptions();
    return;
  }

  libkineto::api().activityProfiler().prepareTrace(k_activities);
#endif // USE_KINETO
}
| |
// Begins collection on the (already prepared) Kineto activity profiler.
// No-op when built without Kineto.
void startTrace() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().startTrace();
#endif // USE_KINETO
}
| |
// Stops collection and returns the accumulated trace. Without Kineto a
// default-constructed stub trace is returned so callers still receive a
// non-null wrapper.
ActivityTraceWrapper stopTrace() {
  return ActivityTraceWrapper{
#ifdef USE_KINETO
      libkineto::api().activityProfiler().stopTrace()
#else
      std::make_unique<interface_trace_t>()
#endif // USE_KINETO
  };
}
| |
// Pushes `correlation_id` onto Kineto's (default) correlation stack for the
// calling thread. No-op without Kineto.
void pushCorrelationId(uint64_t correlation_id) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().pushCorrelationId(correlation_id);
#endif // USE_KINETO
}
| |
// Pushes `correlation_id` onto Kineto's separate user-annotation correlation
// stack. No-op without Kineto.
void pushUserCorrelationId(uint64_t correlation_id) {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().pushUserCorrelationId(correlation_id);
#endif // USE_KINETO
}
| |
// Pops the top of Kineto's default correlation stack; pairs with
// pushCorrelationId(). No-op without Kineto.
void popCorrelationId() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().popCorrelationId();
#endif // USE_KINETO
}
| |
// Pops the top of Kineto's user-annotation correlation stack; pairs with
// pushUserCorrelationId(). No-op without Kineto.
void popUserCorrelationId() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().popUserCorrelationId();
#endif // USE_KINETO
}
| |
// Asks Kineto to record identifying info for the calling thread so its events
// can be attributed in the trace. No-op without Kineto.
void recordThreadInfo() {
#ifdef USE_KINETO
  libkineto::api().activityProfiler().recordThreadInfo();
#endif // USE_KINETO
}
| |
// Forwards a profiler-invariant violation to Kineto for logging, but only if
// the profiler has been initialized. Note the argument order differs from
// this function's parameter order: Kineto takes profile_id first.
// No-op without Kineto.
void logInvariantViolation(
    const std::string& assertion,
    const std::string& error,
    const std::string& profile_id,
    const std::string& group_profile_id) {
#ifdef USE_KINETO
  if (libkineto::api().isProfilerInitialized()) {
    libkineto::api().activityProfiler().logInvariantViolation(
        profile_id, assertion, error, group_profile_id);
  }
#endif // USE_KINETO
}
| |
| } // namespace kineto |
| } // namespace impl |
| } // namespace profiler |
| |
| namespace autograd { |
| namespace profiler { |
| c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) { |
| // fallthrough |
| switch (activity_type) { |
| case libkineto::ActivityType::GPU_MEMCPY: |
| case libkineto::ActivityType::GPU_MEMSET: |
| case libkineto::ActivityType::CONCURRENT_KERNEL: |
| case libkineto::ActivityType::GPU_USER_ANNOTATION: |
| case libkineto::ActivityType::CUDA_PROFILER_RANGE: |
| return c10::DeviceType::CUDA; |
| case libkineto::ActivityType::CPU_OP: |
| case libkineto::ActivityType::USER_ANNOTATION: |
| case libkineto::ActivityType::EXTERNAL_CORRELATION: |
| case libkineto::ActivityType::CUDA_RUNTIME: |
| case libkineto::ActivityType::CPU_INSTANT_EVENT: |
| case libkineto::ActivityType::GLOW_RUNTIME: |
| case libkineto::ActivityType::PYTHON_FUNCTION: |
| return c10::DeviceType::CPU; |
| default: { |
| TORCH_WARN( |
| "Unknown activity type (", |
| (uint8_t)activity_type, |
| "), assuming CPU device"); |
| return c10::DeviceType::CPU; |
| } |
| } |
| } |
| |
// Attaches a key/value pair to the trace-level metadata. The name suggests
// `value` is expected to be a JSON fragment — TODO confirm against Kineto's
// addMetadata contract. Logs a warning (rather than throwing) when the
// profiler is uninitialized or Kineto support is compiled out.
void addMetadataJson(const std::string& key, const std::string& value) {
#ifdef USE_KINETO
  if (libkineto::api().isProfilerInitialized()) {
    libkineto::api().activityProfiler().addMetadata(key, value);
  } else {
    LOG(WARNING) << "Profiler is not initialized: skipping profiling metadata";
  }
#else
  LOG(WARNING) << "Adding profiling metadata requires using "
               << "torch.profiler with Kineto support (USE_KINETO=1)";
#endif // USE_KINETO
}
| |
// Notifies Kineto that one profiler iteration ("step") has completed. Logs a
// warning instead of throwing when the profiler is not yet initialized.
// No-op without Kineto.
void profilerStep() {
#ifdef USE_KINETO
  if (libkineto::api().isProfilerInitialized()) {
    libkineto::api().activityProfiler().step();
  } else {
    LOG(WARNING) << "Profiler is not initialized: skipping step() invocation";
  }
#endif // USE_KINETO
}
| |
| } // namespace profiler |
| } // namespace autograd |
| } // namespace torch |