| #pragma once |
| |
#include <chrono>
#include <cstdint>
#include <forward_list>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
| #include <ATen/ATen.h> |
| #include <torch/csrc/WindowsTorchApiMacro.h> |
| #include <torch/csrc/autograd/profiler_utils.h> |
| #ifndef _WIN32 |
| #include <ctime> |
| #endif |
| #if defined(C10_IOS) && defined(C10_MOBILE) |
| #include <sys/time.h> // for gettimeofday() |
| #endif |
| |
| #include <ATen/record_function.h> |
| |
| #include <torch/csrc/jit/frontend/source_range.h> |
| |
| struct CUevent_st; |
using CUDAEventStub = std::shared_ptr<CUevent_st>;
| |
| namespace torch { namespace autograd { |
| |
| struct Node; |
| |
| namespace profiler { |
| |
| struct TORCH_API CUDAStubs { |
| virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) const { |
| fail(); |
| } |
| virtual float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) const { |
| fail(); |
| return 0.f; |
| } |
| virtual void nvtxMarkA(const char* name) const { |
| fail(); |
| } |
| virtual void nvtxRangePushA(const char* name) const { |
| fail(); |
| } |
| virtual void nvtxRangePop() const { |
| fail(); |
| } |
| virtual bool enabled() const { |
| return false; |
| } |
| virtual void onEachDevice(std::function<void(int)> op) const { |
| fail(); |
| } |
| virtual void synchronize() const { |
| fail(); |
| } |
| virtual ~CUDAStubs(); |
| |
| private: |
| void fail() const { |
| AT_ERROR("CUDA used in profiler but not enabled."); |
| } |
| }; |
| |
| TORCH_API void registerCUDAMethods(CUDAStubs* stubs); |
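
// Illustrative sketch (not the actual CUDA build's implementation) of how a
// backend supplies real CUDA hooks; the subclass name below is hypothetical:
//
//   struct RealCUDAMethods : public CUDAStubs {
//     bool enabled() const override { return true; }
//     void nvtxMarkA(const char* name) const override { /* forward to NVTX */ }
//     // ... also override record/elapsed/synchronize/onEachDevice ...
//   };
//
//   static RealCUDAMethods methods;  // registered once at load time
//   registerCUDAMethods(&methods);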
| |
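// Rounds a up to the nearest multiple of b, e.g. ceilToMultiple(10, 8) == 16
// and ceilToMultiple(16, 8) == 16.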
constexpr size_t ceilToMultiple(size_t a, size_t b) {
| return ((a + b - 1) / b) * b; |
| } |
| |
| inline int64_t getTime() { |
| #if defined(C10_IOS) && defined(C10_MOBILE) |
  // clock_gettime is only available on iOS 10.0 or newer. Unlike on OS X, we
  // can't rely on CLOCK_REALTIME here, since it is defined regardless of
  // whether clock_gettime is actually implemented.
| struct timeval now; |
| gettimeofday(&now, NULL); |
| return static_cast<int64_t>(now.tv_sec) * 1000000000 + static_cast<int64_t>(now.tv_usec) * 1000; |
| #elif defined(_WIN32) || defined(__MACH__) |
| using namespace std::chrono; |
| using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type; |
| return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count(); |
| #else |
| // clock_gettime is *much* faster than std::chrono implementation on Linux |
| struct timespec t{}; |
| clock_gettime(CLOCK_MONOTONIC, &t); |
| return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec); |
| #endif |
| } |
| |
| enum class C10_API_ENUM EventKind : uint16_t { |
| Mark, |
| PushRange, |
| PopRange, |
| MemoryAlloc, |
| }; |
| |
// To be deprecated once we switch to Kineto profiling.
| struct TORCH_API LegacyEvent { |
| LegacyEvent( |
| EventKind kind, |
| at::StringView name, |
| uint16_t thread_id, |
| bool record_cuda, |
| at::RecordFunctionHandle handle = 0, |
| std::vector<std::vector<int64_t>>&& shapes = {}, |
| int node_id = -1) |
| : name_(std::move(name)), |
| kind_(kind), |
| thread_id_(thread_id), |
| handle_(handle), |
      shapes_(std::move(shapes)),
| node_id_(node_id) { |
| record(record_cuda); |
| } |
| |
| // Constructor to be used in conjunction with LegacyEvent::fromIValue. |
| LegacyEvent( |
| EventKind kind, |
| at::StringView name, |
| uint16_t thread_id, |
| at::RecordFunctionHandle handle, |
| std::vector<std::vector<int64_t>>&& shapes, |
| int node_id, |
| bool is_remote, |
| int64_t cpu_memory_usage, |
| int64_t cpu_ns, |
| bool cuda_recorded, |
| int64_t cuda_memory_usage = 0, |
| int device = -1, |
| double cuda_us = -1) |
| : cpu_ns_(cpu_ns), |
| name_(std::move(name)), |
| kind_(kind), |
| thread_id_(thread_id), |
| handle_(handle), |
      shapes_(std::move(shapes)),
| cpu_memory_usage_(cpu_memory_usage), |
| cuda_memory_usage_(cuda_memory_usage), |
| device_(device), |
| node_id_(node_id), |
| is_remote_(is_remote), |
| cuda_us_(cuda_us) { |
| // Sanity check values that were deserialized |
| TORCH_INTERNAL_ASSERT(cpu_ns_ > 0); |
| if (cuda_recorded) { |
| TORCH_INTERNAL_ASSERT(device_ >= 0); |
| TORCH_INTERNAL_ASSERT(cuda_us_ >= 0); |
| } |
| } |
| |
| // Returns IValues corresponding to event structure, to be used for |
| // serialization. |
| at::IValue toIValue() const; |
| |
| // Reconstructs an event from IValues given by toIValue. |
| static LegacyEvent fromIValue(const at::IValue& eventIValue); |
| |
| void record(bool record_cuda); |
| |
| std::string kindStr() const { |
| switch (kind_) { |
| case EventKind::Mark: return "mark"; |
| case EventKind::PushRange: return "push"; |
| case EventKind::PopRange: return "pop"; |
| case EventKind::MemoryAlloc: return "memory_alloc"; |
| } |
| throw std::runtime_error("unknown event kind"); |
| } |
| |
| const char* name() const { |
| return name_.str(); |
| } |
| |
| uint64_t threadId() const { |
| return thread_id_; |
| } |
| |
| std::vector<std::vector<int64_t>> shapes() const { |
| return shapes_; |
| } |
| |
| double cpuElapsedUs(const LegacyEvent& e) const { |
    return (e.cpu_ns_ - cpu_ns_) / 1000.0;
| } |
| |
  void setCpuUs(int64_t cpu_us) {
    cpu_ns_ = cpu_us * 1000;
  }
| |
| double cpuUs() const { |
    return cpu_ns_ / 1000.0;
| } |
| |
| double cudaElapsedUs(const LegacyEvent& e) const; |
| |
| bool hasCuda() const { |
| return cuda_event != nullptr || (isRemote() && device_ != -1); |
| } |
| |
| int device() const { |
| return device_; |
| } |
| |
| void updateMemoryStats(int64_t alloc_size, c10::Device device) { |
| if (device.type() == c10::DeviceType::CUDA || |
| device.type() == c10::DeviceType::HIP) { |
| cuda_memory_usage_ = alloc_size; |
| } else if (device.type() == c10::DeviceType::CPU || |
| device.type() == c10::DeviceType::MKLDNN || |
| device.type() == c10::DeviceType::IDEEP) { |
| cpu_memory_usage_ = alloc_size; |
| } else { |
| LOG(WARNING) << "Unsupported memory profiling device: " << device; |
| } |
| } |
| |
| int64_t cpuMemoryUsage() const { |
| return cpu_memory_usage_; |
| } |
| |
| int64_t cudaMemoryUsage() const { |
| return cuda_memory_usage_; |
| } |
| |
| at::RecordFunctionHandle handle() const { |
| return handle_; |
| } |
| |
| // Node ID corresponding to this event. |
  int nodeId() const {
| return node_id_; |
| } |
| |
| // Set Node ID on this event. |
| void setNodeId(int node_id) { |
| node_id_ = node_id; |
| } |
| |
  void setName(at::StringView new_name) {
    name_ = std::move(new_name);
  }
| |
| bool isRemote() const { |
| return is_remote_; |
| } |
| |
| void setCudaUs(int64_t cuda_us) { |
| cuda_us_ = cuda_us; |
| } |
| |
| void setSequenceNr(int64_t sequence_nr) { |
| sequence_nr_ = sequence_nr; |
| } |
| |
| int64_t sequenceNr() const { |
| return sequence_nr_; |
| } |
| |
| void setCorrelationId(uint64_t correlation_id) { |
| correlation_id_ = correlation_id; |
| } |
| |
| uint64_t correlationId() const { |
| return correlation_id_; |
| } |
| |
| const std::vector<std::string>& stack() const { |
| return stack_; |
| } |
| |
| void setStack(const std::vector<std::string>& stack) { |
| stack_ = stack; |
| } |
| |
| uint64_t fwdThreadId() const { |
| return fwd_thread_id_; |
| } |
| |
| void setFwdThreadId(uint64_t fwd_thread_id) { |
| fwd_thread_id_ = fwd_thread_id; |
| } |
| |
| uint8_t scope() const { |
| return scope_; |
| } |
| |
| void setScope(uint8_t scope) { |
| scope_ = scope; |
| } |
| |
| const std::unordered_map<std::string, c10::IValue>& extraArgs() const { |
| return extra_args_; |
| } |
| |
| void setExtraArgs(std::unordered_map<std::string, c10::IValue>&& save_args) { |
| extra_args_ = std::move(save_args); |
| } |
| |
  uint64_t flops() const {
| return flops_; |
| } |
| |
| void setFlops(uint64_t flops) { |
| flops_ = flops; |
| } |
| |
| private: |
  // Signed to allow for negative intervals; zero-initialized for safety.
| int64_t cpu_ns_ = 0; |
| at::StringView name_; |
| EventKind kind_; |
| uint64_t thread_id_; |
| uint64_t fwd_thread_id_; |
| at::RecordFunctionHandle handle_ {0}; |
| std::vector<std::vector<int64_t>> shapes_; |
| int64_t cpu_memory_usage_ = 0; |
| int64_t cuda_memory_usage_ = 0; |
| int device_ = -1; |
| CUDAEventStub cuda_event = nullptr; |
| int node_id_ = 0; |
| bool is_remote_ = false; |
| int64_t cuda_us_ = -1; |
| int64_t sequence_nr_ = -1; |
| |
| std::vector<std::string> stack_; |
| uint8_t scope_; |
| uint64_t correlation_id_; |
| // Extra arguments for computing op flops |
| std::unordered_map<std::string, c10::IValue> extra_args_; |
| uint64_t flops_ = 0; |
| }; |
| |
// A mutex-protected list of profiler events. Capacity is reserved up front so
// that recording an event typically does not trigger a std::vector
// reallocation inside a profiled region.
| struct RangeEventList { |
| RangeEventList() { |
| events_.reserve(kReservedCapacity); |
| } |
| |
| template<typename... Args> |
| void record(Args&&... args) { |
| std::lock_guard<std::mutex> guard(mutex_); |
| events_.emplace_back(std::forward<Args>(args)...); |
| } |
| |
| std::vector<LegacyEvent> consolidate() { |
| std::lock_guard<std::mutex> lock(mutex_); |
| std::vector<LegacyEvent> result; |
| result.insert( |
| result.begin(), |
| std::make_move_iterator(events_.begin()), |
| std::make_move_iterator(events_.end())); |
    events_.clear();
| return result; |
| } |
| |
| size_t size() { |
| std::lock_guard<std::mutex> lock(mutex_); |
| return events_.size(); |
| } |
| |
| private: |
| // This mutex is used to serialize access when different threads are writing |
| // to the same instance of RangeEventList. |
| std::mutex mutex_; |
| std::vector<LegacyEvent> events_; |
| |
| static const size_t kReservedCapacity = 1024; |
| }; |
| |
| enum class C10_API_ENUM ProfilerState { |
| Disabled = 0, |
| CPU, // CPU-only profiling |
| CUDA, // CPU + CUDA events |
| NVTX, // only emit NVTX markers |
| KINETO, // use libkineto |
| NUM_PROFILER_STATES, // must be the last one |
| }; |
| |
| struct TORCH_API ProfilerConfig { |
| ProfilerConfig( |
| ProfilerState state, |
| bool report_input_shapes = false, |
| bool profile_memory = false, |
| bool with_stack = false, |
| bool with_flops = false) |
| : state(state), |
| report_input_shapes(report_input_shapes), |
| profile_memory(profile_memory), |
| with_stack(with_stack), |
| with_flops(with_flops) {} |
| ~ProfilerConfig() = default; |
| ProfilerState state; |
| bool report_input_shapes; |
| bool profile_memory; |
| bool with_stack; |
| bool with_flops; |
| |
| // Returns IValues corresponding to ProfilerConfig struct, to be used for |
| // serialization. |
| at::IValue toIValue() const; |
| |
| // Reconstructs a ProfilerConfig from IValues given by toIValue. |
| static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); |
| }; |
| |
// Options controlling the behavior of disableProfilerLegacy.
| struct TORCH_API ProfilerDisableOptions { |
| ProfilerDisableOptions() = default; |
| ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) |
| : cleanupTLSState(shouldCleanupTLSState), |
| consolidate(shouldConsolidate) {} |
| // Whether we should clean up profiler states that are thread local, such as |
| // ThreadLocalDebugInfo and thread local RecordFunction callbacks. |
| bool cleanupTLSState = true; |
| // Whether we should consolidate all currently recorded profiled events. If |
| // false, will not consolidate and other threads can continue to write to the |
| // event lists. |
| bool consolidate = true; |
| }; |
| |
| // NOTE: profiler mode is thread local, with automatic propagation |
| // across thread boundary (e.g. at::launch tasks) |
| TORCH_API void enableProfilerLegacy(const ProfilerConfig&); |
| using thread_event_lists = std::vector<std::vector<LegacyEvent>>; |
| TORCH_API thread_event_lists disableProfilerLegacy(c10::optional<ProfilerDisableOptions> profilerDisableOptions = c10::nullopt); |
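
// Usage:
//   enableProfilerLegacy(ProfilerConfig(ProfilerState::CPU));
//   // ... code to profile ...
//   thread_event_lists lists = disableProfilerLegacy();
// Pass ProfilerDisableOptions to keep thread-local state alive or to skip
// consolidation, e.g.:
//   disableProfilerLegacy(ProfilerDisableOptions(
//       /*shouldCleanupTLSState=*/false, /*shouldConsolidate=*/true));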
| |
// Adds profiledEvents to the current thread-local list of recorded events.
| TORCH_API void addEventList(std::vector<LegacyEvent>&& profiledEvents); |
// Returns whether the profiler is currently enabled in the current thread.
| TORCH_API bool profilerEnabled(); |
// Retrieves the thread-local ProfilerConfig.
| TORCH_API ProfilerConfig getProfilerConfig(); |
| // Writes profiled events to a stream. |
| TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector<LegacyEvent*>& events); |
| |
| // Usage: |
| // { |
| // RecordProfile guard("filename.trace"); |
| // // code you want to profile |
| // } |
| // Then open filename.trace in chrome://tracing |
| struct TORCH_API RecordProfile { |
| RecordProfile(std::ostream& out); |
| RecordProfile(const std::string& filename); |
| |
| ~RecordProfile(); |
| private: |
| void init(); |
| std::unique_ptr<std::ofstream> file_; |
| std::ostream& out_; |
| void processEvents(const std::vector<LegacyEvent*>& events); |
| }; |
| |
// A guard that enables the profiler, taking in an optional callback to process
// the results.
| // Usage: |
| // { |
| // TLSProfilerGuard g([](thread_event_lists profilerResults) { |
| // // process profilerResults |
| // }); |
| // Code to profile |
| // } |
| struct TORCH_API TLSProfilerGuard { |
| explicit TLSProfilerGuard( |
| const ProfilerConfig& cfg, |
| c10::optional<std::function<void(const thread_event_lists&)>> |
| resultCallback = c10::nullopt, |
| c10::optional<ProfilerDisableOptions> profilerDisableOptions = |
| c10::nullopt) |
| : cb_(std::move(resultCallback)), |
| profilerDisableOptions_(std::move(profilerDisableOptions)) { |
| enableProfilerLegacy(cfg); |
| } |
| ~TLSProfilerGuard() { |
| thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); |
| if (cb_) { |
| try { |
| (*cb_)(event_lists); |
| } catch (const std::exception& e) { |
| LOG(ERROR) << "Got error processing profiler events: " << e.what(); |
| } |
| } |
| } |
| |
| private: |
| c10::optional<std::function<void(const thread_event_lists&)>> cb_; |
| const c10::optional<ProfilerDisableOptions> profilerDisableOptions_; |
| }; |
| |
| struct TORCH_API FileLineFunc { |
| std::string filename; |
| size_t line; |
| std::string funcname; |
| }; |
| TORCH_API std::vector<FileLineFunc> prepareCallstack(const std::vector<jit::StackEntry>& cs); |
| TORCH_API std::vector<std::string> callstackStr(const std::vector<FileLineFunc>& cs); |
| TORCH_API std::vector<std::vector<int64_t>> inputSizes(const at::RecordFunction& fn); |
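
// Example (illustrative): converting prepared callstack frames into strings
// for attachment to an event via LegacyEvent::setStack:
//
//   std::vector<FileLineFunc> frames = prepareCallstack(entries);
//   evt.setStack(callstackStr(frames));
//
// where `entries` is a std::vector<jit::StackEntry> obtained from the JIT and
// `evt` is a LegacyEvent.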
| |
| struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { |
| explicit ProfilerThreadLocalState(const ProfilerConfig& config) |
| : config_(config), remoteProfiledEvents_{c10::nullopt} {} |
| ~ProfilerThreadLocalState() override = default; |
| |
| const ProfilerConfig& config() const; |
| |
| thread_event_lists consolidate(); |
| |
| void mark(std::string name, bool include_cuda = true); |
| |
| void setOrAddRemoteProfiledEvents( |
| std::vector<LegacyEvent>&& remoteProfiledEvents); |
| |
| void pushRange( |
| const at::RecordFunction& fn, |
| const bool record_cuda, |
| const char* msg = "", |
| std::vector<std::vector<int64_t>>&& shapes = {}); |
| |
| void popRange(const at::RecordFunction& fn, const bool record_cuda); |
| |
| void setCallbackHandle(at::CallbackHandle handle) { |
| handle_ = handle; |
| } |
| |
| at::CallbackHandle callbackHandle() const { |
| return handle_; |
| } |
| |
  bool hasCallbackHandle() const {
| return handle_ > 0; |
| } |
| |
| void reportMemoryUsage( |
| void* /* unused */, |
| int64_t alloc_size, |
| c10::Device device) override; |
| |
| bool memoryProfilingEnabled() const override; |
| |
| protected: |
| std::string getNvtxStr( |
| const at::StringView& name, |
| const char* msg, |
| int64_t sequence_nr, |
| const std::vector<std::vector<int64_t>>& shapes) const; |
| |
| RangeEventList& getEventList(int64_t thread_id = -1); |
| |
| std::mutex state_mutex_; |
| std::unordered_map<uint64_t, std::shared_ptr<RangeEventList>> |
| event_lists_map_; |
| |
| ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); |
| at::CallbackHandle handle_ = 0; |
| c10::optional<std::vector<std::vector<LegacyEvent>>> remoteProfiledEvents_; |
| }; |
| |
| |
| } // namespace profiler |
| }} // namespace torch::autograd |