torch/csrc/autograd/profiler_legacy.h - platform/external/pytorch - Git at Google

 #pragma once

 #include <cstdint>
 #include <forward_list>
 #include <iostream>
 #include <memory>
 #include <mutex>
 #include <sstream>
 #include <string>
 #include <tuple>
 #include <vector>

 #include <torch/csrc/Export.h>
 #include <torch/csrc/profiler/api.h>
 #include <torch/csrc/profiler/util.h>

 namespace torch {
 namespace autograd {

 struct Node;

 namespace profiler {

 // Kind tag stored in every LegacyEvent. The quoted strings are the labels
 // that LegacyEvent::kindStr() returns for each value.
 enum class C10_API_ENUM EventKind : uint16_t {
   Mark = 0, // "mark"
   PushRange = 1, // "push"
   PopRange = 2, // "pop"
   MemoryAlloc = 3, // "memory_alloc"
 };

 // Single record produced by the legacy autograd profiler.
 // To be deprecated, once we switch to Kineto profiling
 struct TORCH_API LegacyEvent {
   // Builds an event and immediately calls record(record_cuda), which is
   // defined out of line.
   //
   // `shapes` is an rvalue reference, so its contents are moved into the event
   // rather than copied.
   // NOTE(review): `thread_id` is accepted as uint16_t although it is stored in
   // a uint64_t member, so larger ids are truncated at the call site.
   LegacyEvent(
       EventKind kind,
       at::StringView name,
       uint16_t thread_id,
       bool record_cuda,
       at::RecordFunctionHandle handle = 0,
       std::vector<std::vector<int64_t>>&& shapes = {},
       int node_id = -1,
       bool is_async = false)
       : name_(std::move(name)),
         kind_(kind),
         thread_id_(thread_id),
         handle_(handle),
         shapes_(std::move(shapes)),
         node_id_(node_id),
         is_async_(is_async) {
     record(record_cuda);
   }

   // Constructor to be used in conjunction with LegacyEvent::fromIValue.
   // Unlike the constructor above it does not call record(); all timing and
   // memory values are taken from the arguments. `shapes` is moved from.
   // `cuda_us` arrives as a double and is truncated to the int64_t member.
   LegacyEvent(
       EventKind kind,
       at::StringView name,
       uint16_t thread_id,
       at::RecordFunctionHandle handle,
       std::vector<std::vector<int64_t>>&& shapes,
       int node_id,
       bool is_remote,
       int64_t cpu_memory_usage,
       int64_t cpu_ns,
       bool cuda_recorded,
       int64_t cuda_memory_usage = 0,
       int device = -1,
       double cuda_us = -1)
       : cpu_ns_(cpu_ns),
         name_(std::move(name)),
         kind_(kind),
         thread_id_(thread_id),
         handle_(handle),
         shapes_(std::move(shapes)),
         cpu_memory_usage_(cpu_memory_usage),
         cuda_memory_usage_(cuda_memory_usage),
         device_(device),
         node_id_(node_id),
         is_remote_(is_remote),
         cuda_us_(static_cast<int64_t>(cuda_us)) {
     // Sanity check values that were deserialized
     TORCH_INTERNAL_ASSERT(cpu_ns_ > 0);
     if (cuda_recorded) {
       TORCH_INTERNAL_ASSERT(device_ >= 0);
       TORCH_INTERNAL_ASSERT(cuda_us_ >= 0);
     }
   }

   // Returns IValues corresponding to event structure, to be used for
   // serialization.
   at::IValue toIValue() const;

   // Reconstructs an event from IValues given by toIValue.
   static LegacyEvent fromIValue(const at::IValue& eventIValue);

   // Defined out of line; invoked by the recording constructor.
   void record(bool record_cuda);

   // Short textual label for kind(); throws std::runtime_error if the stored
   // value is not one of the known enumerators.
   std::string kindStr() const {
     switch (kind_) {
       case EventKind::Mark:
         return "mark";
       case EventKind::PushRange:
         return "push";
       case EventKind::PopRange:
         return "pop";
       case EventKind::MemoryAlloc:
         return "memory_alloc";
     }
     throw std::runtime_error("unknown event kind");
   }

   EventKind kind() const {
     return kind_;
   }

   const char* name() const {
     return name_.str();
   }

   uint64_t threadId() const {
     return thread_id_;
   }

   // Returns a copy of the recorded shapes.
   std::vector<std::vector<int64_t>> shapes() const {
     return shapes_;
   }

   // Microseconds from this event's CPU timestamp to `e`'s (negative when `e`
   // was stamped earlier).
   double cpuElapsedUs(const LegacyEvent& e) const {
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
     return static_cast<double>(e.cpu_ns_ - cpu_ns_) / (1000.0);
   }

   // Stores a CPU timestamp given in microseconds. Converted with integer
   // arithmetic so that large timestamps are not rounded through a double.
   void setCpuUs(int64_t cpu_us) {
     // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
     cpu_ns_ = cpu_us * 1000;
   }

   // CPU timestamp in microseconds.
   double cpuUs() const {
     return static_cast<double>(cpu_ns_) / (1000.0);
   }

   double cudaElapsedUs(const LegacyEvent& e) const;

   // True when a CUDA event stub is attached, or when the event is remote and
   // carries a device index other than -1.
   bool hasCuda() const {
     return cuda_event != nullptr || (isRemote() && device_ != -1);
   }

   int device() const {
     return device_;
   }

   // Overwrites (does not accumulate) the CUDA or CPU memory figure depending
   // on the device type; any other device type only logs a warning.
   void updateMemoryStats(int64_t alloc_size, c10::Device device) {
     if (device.is_cuda() || device.type() == c10::DeviceType::HIP) {
       cuda_memory_usage_ = alloc_size;
     } else if (
         device.is_cpu() || device.type() == c10::DeviceType::MKLDNN ||
         device.type() == c10::DeviceType::IDEEP) {
       cpu_memory_usage_ = alloc_size;
     } else {
       LOG(WARNING) << "Unsupported memory profiling device: " << device;
     }
   }

   int64_t cpuMemoryUsage() const {
     return cpu_memory_usage_;
   }

   int64_t cudaMemoryUsage() const {
     return cuda_memory_usage_;
   }

   at::RecordFunctionHandle handle() const {
     return handle_;
   }

   // Node ID corresponding to this event.
   int nodeId() const {
     return node_id_;
   }

   // Set Node ID on this event.
   void setNodeId(int node_id) {
     node_id_ = node_id;
   }

   void setName(at::StringView newName_) {
     name_ = std::move(newName_);
   }

   bool isRemote() const {
     return is_remote_;
   }

   void setCudaUs(int64_t cuda_us) {
     cuda_us_ = cuda_us;
   }

   void setSequenceNr(int64_t sequence_nr) {
     sequence_nr_ = sequence_nr;
   }

   int64_t sequenceNr() const {
     return sequence_nr_;
   }

   void setCorrelationId(uint64_t correlation_id) {
     correlation_id_ = correlation_id;
   }

   // 0 until setCorrelationId() is called.
   uint64_t correlationId() const {
     return correlation_id_;
   }

   const std::vector<std::string>& stack() const {
     return stack_;
   }

   void setStack(const std::vector<std::string>& stack) {
     stack_ = stack;
   }

   // 0 until setFwdThreadId() is called.
   uint64_t fwdThreadId() const {
     return fwd_thread_id_;
   }

   void setFwdThreadId(uint64_t fwd_thread_id) {
     fwd_thread_id_ = fwd_thread_id;
   }

   // 0 until setScope() is called.
   uint8_t scope() const {
     return scope_;
   }

   void setScope(uint8_t scope) {
     scope_ = scope;
   }

   const std::unordered_map<std::string, c10::IValue>& extraArgs() const {
     return extra_args_;
   }

   void setExtraArgs(std::unordered_map<std::string, c10::IValue>&& save_args) {
     extra_args_ = std::move(save_args);
   }

   uint64_t flops() const {
     return flops_;
   }

   bool isAsync() const {
     return is_async_;
   }

   void setFlops(uint64_t flops) {
     flops_ = flops;
   }

  private:
   // Member order is significant: the constructors' initializer lists follow
   // it.
   // signed to allow for negative intervals, initialized for safety.
   int64_t cpu_ns_ = 0;
   at::StringView name_;
   EventKind kind_;
   uint64_t thread_id_;
   // Not set by either constructor; zero-initialized so the getter never
   // reads an indeterminate value.
   uint64_t fwd_thread_id_ = 0;
   at::RecordFunctionHandle handle_{0};
   std::vector<std::vector<int64_t>> shapes_;
   int64_t cpu_memory_usage_ = 0;
   int64_t cuda_memory_usage_ = 0;
   int device_ = -1;
   torch::profiler::impl::ProfilerEventStub cuda_event = nullptr;
   int node_id_ = 0;
   bool is_remote_ = false;
   int64_t cuda_us_ = -1;
   int64_t sequence_nr_ = -1;
   bool is_async_ = false;

   std::vector<std::string> stack_;
   // Not set by either constructor; zero-initialized for the same reason as
   // fwd_thread_id_.
   uint8_t scope_ = 0;
   uint64_t correlation_id_ = 0;
   // Extra arguments for computing op flops
   std::unordered_map<std::string, c10::IValue> extra_args_;
   uint64_t flops_ = 0;
 };

 // a linked-list of fixed sized vectors, to avoid
 // a std::vector resize from taking a large amount of time inside
 // a profiling  event
 struct RangeEventList {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,modernize-use-equals-default)
   RangeEventList() {
     events_.reserve(kReservedCapacity);
   }

   template <typename... Args>
   void record(Args&&... args) {
     std::lock_guard<std::mutex> guard(mutex_);
     events_.emplace_back(std::forward<Args>(args)...);
   }

   std::vector<LegacyEvent> consolidate() {
     std::lock_guard<std::mutex> lock(mutex_);
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     std::vector<LegacyEvent> result;
     result.insert(
         result.begin(),
         std::make_move_iterator(events_.begin()),
         std::make_move_iterator(events_.end()));
     events_.erase(events_.begin(), events_.end());
     return result;
   }

   size_t size() {
     std::lock_guard<std::mutex> lock(mutex_);
     return events_.size();
   }

  private:
   // This mutex is used to serialize access when different threads are writing
   // to the same instance of RangeEventList.
   std::mutex mutex_;
   std::vector<LegacyEvent> events_;

   static const size_t kReservedCapacity = 1024;
 };

 // A struct to control settings of disableProfiler options.
 struct TORCH_API ProfilerDisableOptions {
   ProfilerDisableOptions() = default;
   ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate)
       : cleanupTLSState(shouldCleanupTLSState),
         consolidate(shouldConsolidate) {}
   // Whether we should clean up profiler states that are thread local, such as
   // ThreadLocalDebugInfo and thread local RecordFunction callbacks.
   bool cleanupTLSState = true;
   // Whether we should consolidate all currently recorded profiled events. If
   // false, will not consolidate and other threads can continue to write to the
   // event lists.
   bool consolidate = true;
 };

 // Enables the legacy profiler with the given configuration.
 // NOTE: profiler mode is thread local, with automatic propagation
 // across thread boundary (e.g. at::launch tasks)
 TORCH_API void enableProfilerLegacy(
     const torch::profiler::impl::ProfilerConfig&);
 // Result type of disableProfilerLegacy(): a list of event lists (presumably
 // one inner list per recording thread, as the alias name suggests).
 using thread_event_lists = std::vector<std::vector<LegacyEvent>>;
 // Disables the legacy profiler and returns the collected event lists.
 // `profilerDisableOptions` defaults to c10::nullopt; see
 // ProfilerDisableOptions for the individual switches.
 TORCH_API thread_event_lists disableProfilerLegacy(
     c10::optional<ProfilerDisableOptions> profilerDisableOptions =
         c10::nullopt);

 // Adds profiledEvents to the current thread local recorded events.
 // NOTE(review): this function takes no node ID argument; if events need a
 // node ID, callers presumably set it beforehand via LegacyEvent::setNodeId
 // (confirm against the implementation).
 TORCH_API void addEventList(std::vector<LegacyEvent>&& profiledEvents);
 // Writes profiled events to a stream.
 TORCH_API void writeProfilerEventsToStream(
     std::ostream& out,
     const std::vector<LegacyEvent*>& events);

 // Scope guard that writes a profile of the guarded block either to a
 // caller-supplied stream or to a named file.
 // Usage:
 //   {
 //     RecordProfile guard("filename.trace");
 //     // code you want to profile
 //   }
 // Then open filename.trace in chrome://tracing
 struct TORCH_API RecordProfile {
   RecordProfile(std::ostream& out);
   RecordProfile(const std::string& filename);

   ~RecordProfile();

  private:
   // Helpers defined out of line.
   void init();
   void processEvents(const std::vector<LegacyEvent*>& events);

   // Keep file_ declared ahead of out_: members are initialized in declaration
   // order, and out_ presumably refers to *file_ when constructed from a
   // filename (confirm in the .cpp).
   std::unique_ptr<std::ofstream> file_;
   std::ostream& out_;
 };

 // A guard that enables the legacy profiler on construction and disables it on
 // destruction, passing the collected events to an optional callback.
 // Usage:
 // {
 //   TLSLegacyProfilerGuard g(
 //       cfg, [](const thread_event_lists& profilerResults) {
 //         // process profilerResults
 //       });
 //   Code to profile
 // }
 struct TORCH_API TLSLegacyProfilerGuard {
   // cfg:                    forwarded to enableProfilerLegacy().
   // resultCallback:         invoked from the destructor with the events
   //                         returned by disableProfilerLegacy(); may be empty.
   // profilerDisableOptions: forwarded to disableProfilerLegacy().
   explicit TLSLegacyProfilerGuard(
       const torch::profiler::impl::ProfilerConfig& cfg,
       c10::optional<std::function<void(const thread_event_lists&)>>
           resultCallback = c10::nullopt,
       c10::optional<ProfilerDisableOptions> profilerDisableOptions =
           c10::nullopt)
       : cb_(std::move(resultCallback)),
         // NOLINTNEXTLINE(performance-move-const-arg)
         profilerDisableOptions_(std::move(profilerDisableOptions)) {
     enableProfilerLegacy(cfg);
   }
   ~TLSLegacyProfilerGuard() {
     const thread_event_lists event_lists =
         disableProfilerLegacy(profilerDisableOptions_);
     if (cb_) {
       try {
         (*cb_)(event_lists);
       } catch (const std::exception& e) {
         LOG(ERROR) << "Got error processing profiler events: " << e.what();
       } catch (...) {
         // Destructors are implicitly noexcept, so an exception of any other
         // type leaving the callback would terminate the process.
         LOG(ERROR) << "Got unknown error processing profiler events";
       }
     }
   }

  private:
   c10::optional<std::function<void(const thread_event_lists&)>> cb_;
   const c10::optional<ProfilerDisableOptions> profilerDisableOptions_;
 };

 } // namespace profiler
 } // namespace autograd
 } // namespace torch
	#pragma once

	#include <cstdint>
	#include <forward_list>
	#include <iostream>
	#include <memory>
	#include <mutex>
	#include <sstream>
	#include <string>
	#include <tuple>
	#include <vector>

	#include <torch/csrc/Export.h>
	#include <torch/csrc/profiler/api.h>
	#include <torch/csrc/profiler/util.h>

	namespace torch {
	namespace autograd {

	struct Node;

	namespace profiler {

	// Kind tag stored in every LegacyEvent. The quoted strings are the labels
	// that LegacyEvent::kindStr() returns for each value.
	enum class C10_API_ENUM EventKind : uint16_t {
	  Mark = 0, // "mark"
	  PushRange = 1, // "push"
	  PopRange = 2, // "pop"
	  MemoryAlloc = 3, // "memory_alloc"
	};

	// To be deprecated, once we switch to Kineto profiling
	struct TORCH_API LegacyEvent {
	// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
	LegacyEvent(
	EventKind kind,
	at::StringView name,
	uint16_t thread_id,
	bool record_cuda,
	at::RecordFunctionHandle handle = 0,
	std::vector<std::vector<int64_t>>&& shapes = {},
	int node_id = -1,
	bool is_async = false)
	: name_(std::move(name)),
	kind_(kind),
	thread_id_(thread_id),
	handle_(handle),
	shapes_(shapes),
	node_id_(node_id),
	is_async_(is_async) {
	record(record_cuda);
	}

	// Constructor to be used in conjunction with LegacyEvent::fromIValue.
	// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
	LegacyEvent(
	EventKind kind,
	at::StringView name,
	uint16_t thread_id,
	at::RecordFunctionHandle handle,
	std::vector<std::vector<int64_t>>&& shapes,
	int node_id,
	bool is_remote,
	int64_t cpu_memory_usage,
	int64_t cpu_ns,
	bool cuda_recorded,
	int64_t cuda_memory_usage = 0,
	int device = -1,
	double cuda_us = -1)
	: cpu_ns_(cpu_ns),
	name_(std::move(name)),
	kind_(kind),
	thread_id_(thread_id),
	handle_(handle),
	shapes_(shapes),
	cpu_memory_usage_(cpu_memory_usage),
	cuda_memory_usage_(cuda_memory_usage),
	device_(device),
	node_id_(node_id),
	is_remote_(is_remote),
	cuda_us_(cuda_us) {
	// Sanity check values that were deserialized
	TORCH_INTERNAL_ASSERT(cpu_ns_ > 0);
	if (cuda_recorded) {
	TORCH_INTERNAL_ASSERT(device_ >= 0);
	TORCH_INTERNAL_ASSERT(cuda_us_ >= 0);
	}
	}

	// Returns IValues corresponding to event structure, to be used for
	// serialization.
	at::IValue toIValue() const;

	// Reconstructs an event from IValues given by toIValue.
	static LegacyEvent fromIValue(const at::IValue& eventIValue);

	void record(bool record_cuda);

	std::string kindStr() const {
	switch (kind_) {
	case EventKind::Mark:
	return "mark";
	case EventKind::PushRange:
	return "push";
	case EventKind::PopRange:
	return "pop";
	case EventKind::MemoryAlloc:
	return "memory_alloc";
	}
	throw std::runtime_error("unknown event kind");
	}

	EventKind kind() const {
	return kind_;
	}

	const char* name() const {
	return name_.str();
	}

	uint64_t threadId() const {
	return thread_id_;
	}

	std::vector<std::vector<int64_t>> shapes() const {
	return shapes_;
	}

	double cpuElapsedUs(const LegacyEvent& e) const {
	// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers)
	return static_cast<double>(e.cpu_ns_ - cpu_ns_) / (1000.0);
	}

	void setCpuUs(int64_t cpu_us) {
	cpu_ns_ = static_cast<double>(cpu_us) * 1000.0;
	}

	double cpuUs() const {
	return static_cast<double>(cpu_ns_) / (1000.0);
	}

	double cudaElapsedUs(const LegacyEvent& e) const;

	bool hasCuda() const {
	return cuda_event != nullptr \|\| (isRemote() && device_ != -1);
	}

	int device() const {
	return device_;
	}

	void updateMemoryStats(int64_t alloc_size, c10::Device device) {
	if (device.is_cuda() \|\| device.type() == c10::DeviceType::HIP) {
	cuda_memory_usage_ = alloc_size;
	} else if (
	device.is_cpu() \|\| device.type() == c10::DeviceType::MKLDNN \|\|
	device.type() == c10::DeviceType::IDEEP) {
	cpu_memory_usage_ = alloc_size;
	} else {
	LOG(WARNING) << "Unsupported memory profiling device: " << device;
	}
	}

	int64_t cpuMemoryUsage() const {
	return cpu_memory_usage_;
	}

	int64_t cudaMemoryUsage() const {
	return cuda_memory_usage_;
	}

	at::RecordFunctionHandle handle() const {
	return handle_;
	}

	// Node ID corresponding to this event.
	int nodeId() const {
	return node_id_;
	}

	// Set Node ID on this event.
	void setNodeId(int node_id) {
	node_id_ = node_id;
	}

	void setName(at::StringView newName_) {
	name_ = std::move(newName_);
	}

	bool isRemote() const {
	return is_remote_;
	}

	void setCudaUs(int64_t cuda_us) {
	cuda_us_ = cuda_us;
	}

	void setSequenceNr(int64_t sequence_nr) {
	sequence_nr_ = sequence_nr;
	}

	int64_t sequenceNr() const {
	return sequence_nr_;
	}

	void setCorrelationId(uint64_t correlation_id) {
	correlation_id_ = correlation_id;
	}

	uint64_t correlationId() const {
	return correlation_id_;
	}

	const std::vector<std::string>& stack() const {
	return stack_;
	}

	void setStack(const std::vector<std::string>& stack) {
	stack_ = stack;
	}

	uint64_t fwdThreadId() const {
	return fwd_thread_id_;
	}

	void setFwdThreadId(uint64_t fwd_thread_id) {
	fwd_thread_id_ = fwd_thread_id;
	}

	uint8_t scope() const {
	return scope_;
	}

	void setScope(uint8_t scope) {
	scope_ = scope;
	}

	const std::unordered_map<std::string, c10::IValue>& extraArgs() const {
	return extra_args_;
	}

	void setExtraArgs(std::unordered_map<std::string, c10::IValue>&& save_args) {
	extra_args_ = std::move(save_args);
	}

	uint64_t flops() {
	return flops_;
	}

	bool isAsync() {
	return is_async_;
	}

	void setFlops(uint64_t flops) {
	flops_ = flops;
	}

	private:
	// signed to allow for negative intervals, initialized for safety.
	int64_t cpu_ns_ = 0;
	at::StringView name_;
	EventKind kind_;
	uint64_t thread_id_;
	uint64_t fwd_thread_id_;
	at::RecordFunctionHandle handle_{0};
	std::vector<std::vector<int64_t>> shapes_;
	int64_t cpu_memory_usage_ = 0;
	int64_t cuda_memory_usage_ = 0;
	int device_ = -1;
	torch::profiler::impl::ProfilerEventStub cuda_event = nullptr;
	int node_id_ = 0;
	bool is_remote_ = false;
	int64_t cuda_us_ = -1;
	int64_t sequence_nr_ = -1;
	bool is_async_ = false;

	std::vector<std::string> stack_;
	uint8_t scope_;
	uint64_t correlation_id_;
	// Extra arguments for computing op flops
	std::unordered_map<std::string, c10::IValue> extra_args_;
	uint64_t flops_ = 0;
	};

	// a linked-list of fixed sized vectors, to avoid
	// a std::vector resize from taking a large amount of time inside
	// a profiling event
	struct RangeEventList {
	// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,modernize-use-equals-default)
	RangeEventList() {
	events_.reserve(kReservedCapacity);
	}

	template <typename... Args>
	void record(Args&&... args) {
	std::lock_guard<std::mutex> guard(mutex_);
	events_.emplace_back(std::forward<Args>(args)...);
	}

	std::vector<LegacyEvent> consolidate() {
	std::lock_guard<std::mutex> lock(mutex_);
	// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
	std::vector<LegacyEvent> result;
	result.insert(
	result.begin(),
	std::make_move_iterator(events_.begin()),
	std::make_move_iterator(events_.end()));
	events_.erase(events_.begin(), events_.end());
	return result;
	}

	size_t size() {
	std::lock_guard<std::mutex> lock(mutex_);
	return events_.size();
	}

	private:
	// This mutex is used to serialize access when different threads are writing
	// to the same instance of RangeEventList.
	std::mutex mutex_;
	std::vector<LegacyEvent> events_;

	static const size_t kReservedCapacity = 1024;
	};

	// A struct to control settings of disableProfiler options.
	struct TORCH_API ProfilerDisableOptions {
	ProfilerDisableOptions() = default;
	ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate)
	: cleanupTLSState(shouldCleanupTLSState),
	consolidate(shouldConsolidate) {}
	// Whether we should clean up profiler states that are thread local, such as
	// ThreadLocalDebugInfo and thread local RecordFunction callbacks.
	bool cleanupTLSState = true;
	// Whether we should consolidate all currently recorded profiled events. If
	// false, will not consolidate and other threads can continue to write to the
	// event lists.
	bool consolidate = true;
	};

	// Enables the legacy profiler with the given configuration.
	// NOTE: profiler mode is thread local, with automatic propagation
	// across thread boundary (e.g. at::launch tasks)
	TORCH_API void enableProfilerLegacy(
	const torch::profiler::impl::ProfilerConfig&);
	// Result type of disableProfilerLegacy(): a list of event lists (presumably
	// one inner list per recording thread, as the alias name suggests).
	using thread_event_lists = std::vector<std::vector<LegacyEvent>>;
	// Disables the legacy profiler and returns the collected event lists.
	// `profilerDisableOptions` defaults to c10::nullopt; see
	// ProfilerDisableOptions for the individual switches.
	TORCH_API thread_event_lists disableProfilerLegacy(
	c10::optional<ProfilerDisableOptions> profilerDisableOptions =
	c10::nullopt);

	// Adds profiledEvents to the current thread local recorded events.
	// NOTE(review): this function takes no node ID argument; if events need a
	// node ID, callers presumably set it beforehand via LegacyEvent::setNodeId
	// (confirm against the implementation).
	TORCH_API void addEventList(std::vector<LegacyEvent>&& profiledEvents);
	// Writes profiled events to a stream.
	TORCH_API void writeProfilerEventsToStream(
	std::ostream& out,
	const std::vector<LegacyEvent*>& events);

	// Scope guard that writes a profile of the guarded block either to a
	// caller-supplied stream or to a named file.
	// Usage:
	//   {
	//     RecordProfile guard("filename.trace");
	//     // code you want to profile
	//   }
	// Then open filename.trace in chrome://tracing
	struct TORCH_API RecordProfile {
	  RecordProfile(std::ostream& out);
	  RecordProfile(const std::string& filename);

	  ~RecordProfile();

	 private:
	  // Helpers defined out of line.
	  void init();
	  void processEvents(const std::vector<LegacyEvent*>& events);

	  // Keep file_ declared ahead of out_: members are initialized in declaration
	  // order, and out_ presumably refers to *file_ when constructed from a
	  // filename (confirm in the .cpp).
	  std::unique_ptr<std::ofstream> file_;
	  std::ostream& out_;
	};

	// A guard that enables the legacy profiler on construction and disables it on
	// destruction, passing the collected events to an optional callback.
	// Usage:
	// {
	//   TLSLegacyProfilerGuard g(
	//       cfg, [](const thread_event_lists& profilerResults) {
	//         // process profilerResults
	//       });
	//   Code to profile
	// }
	struct TORCH_API TLSLegacyProfilerGuard {
	  // cfg:                    forwarded to enableProfilerLegacy().
	  // resultCallback:         invoked from the destructor with the events
	  //                         returned by disableProfilerLegacy(); may be empty.
	  // profilerDisableOptions: forwarded to disableProfilerLegacy().
	  explicit TLSLegacyProfilerGuard(
	      const torch::profiler::impl::ProfilerConfig& cfg,
	      c10::optional<std::function<void(const thread_event_lists&)>>
	          resultCallback = c10::nullopt,
	      c10::optional<ProfilerDisableOptions> profilerDisableOptions =
	          c10::nullopt)
	      : cb_(std::move(resultCallback)),
	        // NOLINTNEXTLINE(performance-move-const-arg)
	        profilerDisableOptions_(std::move(profilerDisableOptions)) {
	    enableProfilerLegacy(cfg);
	  }
	  ~TLSLegacyProfilerGuard() {
	    const thread_event_lists event_lists =
	        disableProfilerLegacy(profilerDisableOptions_);
	    if (cb_) {
	      try {
	        (*cb_)(event_lists);
	      } catch (const std::exception& e) {
	        LOG(ERROR) << "Got error processing profiler events: " << e.what();
	      } catch (...) {
	        // Destructors are implicitly noexcept, so an exception of any other
	        // type leaving the callback would terminate the process.
	        LOG(ERROR) << "Got unknown error processing profiler events";
	      }
	    }
	  }

	 private:
	  c10::optional<std::function<void(const thread_event_lists&)>> cb_;
	  const c10::optional<ProfilerDisableOptions> profilerDisableOptions_;
	};

	} // namespace profiler
	} // namespace autograd
	} // namespace torch