// torch/csrc/autograd/profiler.h - platform/external/pytorch - Git at Google

 #pragma once

 #include <iostream>
 #include <mutex>
 #include <memory>
 #include <vector>
 #include <cstdint>
 #include <string>
 #include <sstream>
 #include <forward_list>
 #include <tuple>
 #include <ATen/ATen.h>
 #include <torch/csrc/WindowsTorchApiMacro.h>
 #ifndef _WIN32
 #include <ctime>
 #endif

 #include <torch/csrc/autograd/record_function.h>

 typedef struct CUevent_st* CUDAEventStub;

 namespace torch { namespace autograd {

 struct Node;

 namespace profiler {

 // Virtual interface through which the profiler reaches CUDA/NVTX
 // functionality. Every default implementation below except enabled() calls
 // fail(), which invokes AT_ERROR; enabled() returns false. The methods are
 // virtual so a subclass can override them; a CUDAStubs* is what
 // registerCUDAMethods() (declared right after this struct) accepts.
 struct TORCH_API CUDAStubs {
   // Default: errors out via fail().
   // NOTE(review): overrides presumably write the device index, a CUDA event
   // and a CPU timestamp in nanoseconds through the out-pointers - confirm
   // against the CUDA-side implementation.
   virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
     fail();
   }
   // Default: errors out via fail(); the `return 0.f` only satisfies the
   // return type. NOTE(review): the unit of the result is not visible here.
   virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
     fail();
     return 0.f;
   }
   // Default: errors out via fail().
   virtual void nvtxMarkA(const char* name) {
     fail();
   }
   // Default: errors out via fail().
   virtual void nvtxRangePushA(const char* name) {
     fail();
   }
   // Default: errors out via fail().
   virtual void nvtxRangePop() {
     fail();
   }
   // The only default that does not fail: reports false.
   virtual bool enabled() {
     return false;
   }
   // Default: errors out via fail(); `op` is not invoked.
   virtual void onEachDevice(std::function<void(int)> op) {
     fail();
   }
   // Default: errors out via fail().
   virtual void synchronize() {
     fail();
   }
   // Declared here, defined out of line.
   virtual ~CUDAStubs();

 private:
   // Shared failure path for the default stubs.
   void fail() {
     AT_ERROR("CUDA used in profiler but not enabled.");
   }
 };

 TORCH_API void registerCUDAMethods(CUDAStubs* stubs);

 // Rounds `a` up to the nearest multiple of `b` and returns it; a value that
 // is already a multiple of `b` is returned unchanged.
 //
 // Computed from the remainder rather than as ((a + b - 1) / b) * b, so the
 // intermediate sum cannot wrap around when `a` is close to the maximum
 // size_t value but the rounded result is still representable.
 //
 // Preconditions: `b` must be non-zero (division by zero otherwise), and the
 // rounded result must fit in size_t.
 // Kept as a single return statement so it stays a valid C++11 constexpr
 // function.
 constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
   return (a % b == 0) ? a : a + (b - a % b);
 }

 #if (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
 #include <sys/time.h>
 // clock_gettime is not implemented on older versions of OS X (< 10.12).
 // If implemented, CLOCK_REALTIME will have already been defined.

 // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on
 // CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not
 #endif

 // Returns a timestamp in nanoseconds as a signed 64-bit integer.
 // The clock source is chosen at compile time:
 //   - _WIN32: std::chrono (high_resolution_clock if it is steady, otherwise
 //     steady_clock);
 //   - Mach without CLOCK_REALTIME, or C10_IOS: gettimeofday();
 //   - everything else: clock_gettime(CLOCK_MONOTONIC).
 inline int64_t getTime() {
 #ifdef _WIN32
   using steady_source = std::conditional<
       std::chrono::high_resolution_clock::is_steady,
       std::chrono::high_resolution_clock,
       std::chrono::steady_clock>::type;
   const auto since_epoch = steady_source::now().time_since_epoch();
   return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
 #elif (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
   // gettimeofday() reports seconds + microseconds; scale both to nanoseconds.
   struct timeval tv;
   gettimeofday(&tv, NULL);
   const int64_t seconds_as_ns = static_cast<int64_t>(tv.tv_sec) * 1000000000;
   const int64_t micros_as_ns = static_cast<int64_t>(tv.tv_usec) * 1000;
   return seconds_as_ns + micros_as_ns;
 #else
   // clock_gettime is *much* faster than std::chrono implementation on Linux
   struct timespec ts{};
   clock_gettime(CLOCK_MONOTONIC, &ts);
   const int64_t seconds_as_ns = static_cast<int64_t>(ts.tv_sec) * 1000000000;
   return seconds_as_ns + static_cast<int64_t>(ts.tv_nsec);
 #endif
 }

 // Old GCC versions generate warnings incorrectly
 // see https://stackoverflow.com/questions/2463113/g-c0x-enum-class-compiler-warnings
 #ifndef _MSC_VER
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wattributes"
 #endif
 // Profiling mode; stored in ProfilerConfig::state.
 enum class TORCH_API ProfilerState {
     Disabled,
     CPU, // CPU-only profiling
     CUDA, // CPU + CUDA events
     NVTX,  // only emit NVTX markers
 };

 // Settings bundle taken by value by enableProfiler(): the profiling mode and
 // a flag for reporting input shapes.
 struct TORCH_API ProfilerConfig {
   // Stores both arguments in the members of the same name.
   ProfilerConfig(ProfilerState state, bool report_input_shapes)
       : state(state), report_input_shapes(report_input_shapes) {}
   // Declared here, defined out of line.
   ~ProfilerConfig();
   // Requested profiling mode.
   ProfilerState state;
   // NOTE(review): presumably controls whether Event::shapes() gets populated
   // - confirm where this flag is read.
   bool report_input_shapes;
 };

 // Kind tag carried by every Event, stored as a uint16_t. Event::kind()
 // renders these as "mark", "push" and "pop" respectively.
 enum class TORCH_API EventKind : uint16_t {
   Mark,
   PushRange,
   PopRange
 };
 #ifndef _MSC_VER
 #  pragma GCC diagnostic pop
 #endif

 // One profiler record: a kind (mark / push / pop), a name, the recording
 // thread's id, optional input shapes, a CPU timestamp in nanoseconds and,
 // when CUDA recording was requested, a device index and CUDA event handle.
 struct TORCH_API Event final {
   // Builds the event and immediately calls record(record_cuda).
   // `shapes` is an rvalue reference, so its contents are moved into the
   // event rather than copied; the caller's vector is left moved-from.
   Event(
       EventKind kind,
       StringView name,
       uint16_t thread_id,
       bool record_cuda,
       std::vector<std::vector<int64_t>>&& shapes = {})
       : name_(std::move(name)),
         kind_(kind),
         thread_id_(thread_id),
         shapes_(std::move(shapes)) {
     record(record_cuda);
   }

   // Defined out of line. NOTE(review): presumably fills cpu_ns_ and, when
   // record_cuda is true, device_ and event - confirm in the .cpp.
   void record(bool record_cuda);

   // Returns "mark", "push" or "pop" for the stored kind.
   // Throws std::runtime_error if kind_ holds a value outside the enum.
   std::string kind() const {
     switch(kind_) {
       case EventKind::Mark: return "mark";
       case EventKind::PushRange: return "push";
       case EventKind::PopRange: return "pop";
     }
     throw std::runtime_error("unknown EventKind");
   }
   // C string view of the event name, as returned by StringView::str().
   const char* name() const {
     return name_.str();
   }
   uint16_t thread_id() const {
     return thread_id_;
   }
   // Returns a copy of the recorded shapes.
   std::vector<std::vector<int64_t>> shapes() const {
     return shapes_;
   }
   // CPU time from this event to `e`, in microseconds (negative when `e` was
   // recorded earlier). Const-qualified because it only reads both events.
   double cpu_elapsed_us(const Event & e) const {
     return (e.cpu_ns_ - cpu_ns_)/(1000.0);
   }
   // Defined out of line.
   double cuda_elapsed_us(const Event & e);
   // True when a CUDA event handle is attached to this record.
   bool has_cuda() const {
     return event != nullptr;
   }
   // Device index; -1 until something assigns it.
   int device() const {
     return device_;
   }
 private:
   // signed to allow for negative intervals, initialized for safety.
   int64_t cpu_ns_ = 0;
   StringView name_;
   EventKind kind_;
   uint16_t thread_id_;
   std::vector<std::vector<int64_t>> shapes_;
   int device_ = -1;
   struct CUevent_st* event = nullptr;
 };

 // a linked-list of fixed sized vectors, to avoid
 // a std::vector resize from taking a large amount of time inside
 // a profiling  event
 struct RangeEventList {
   // This mutex is used to serialize access when different threads are writing
   // to the same instance of RangeEventList.
   std::mutex mutex_;
   constexpr static size_t MB = 1024 * 1024;
   constexpr static size_t event_block_size = 16 * MB;
   constexpr static size_t num_block_elements =
     event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
   static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
                 "num_block_elements is calculated incorrectly");
   using block_type = std::vector<Event>;

   template<typename... Args>
   void record(Args&&... args) {
     std::lock_guard<std::mutex> guard(mutex_);
     if (blocks.empty() || blocks.front().size() == num_block_elements) {
       allocBlock();
     }
     blocks.front().emplace_back(std::forward<Args>(args)...);
   }

   std::vector<Event> consolidate() {
     std::unique_lock<std::mutex> lock(mutex_);
     std::forward_list<block_type> localBlocks;
     localBlocks.swap(blocks);
     lock.unlock();
     std::vector<Event> result;

     for (auto & block : localBlocks) {
       result.insert(result.begin(),
                     std::make_move_iterator(block.begin()),
                     std::make_move_iterator(block.end()));
     }
     return result;
   }

   std::forward_list<block_type> blocks;
   private:
      // allocBlock() assumes that mutex_ is held when called, in order to prevent
     // multiple threads' block writes stomping over each other.
     void allocBlock() {
       blocks.emplace_front();
       auto & new_block = blocks.front();
       new_block.reserve(num_block_elements);
       // Materialize all pages in the new block to release jitter when recording events.
       const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
       for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
           ptr < end_ptr; ptr += 4 * 1024) {
         (*ptr);
       }
     }
 };

 TORCH_API RangeEventList& getEventList();
 TORCH_API void mark(std::string name, bool include_cuda = true);

 using thread_event_lists = std::vector<std::vector<Event>>;
 // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
 // no autograd functions are being executed when these functions are used.
 TORCH_API void enableProfiler(ProfilerConfig);
 TORCH_API thread_event_lists disableProfiler();
 TORCH_API bool profilerEnabled();


 // Usage:
 //   {
 //     RecordProfile guard("filename.trace");
 //     // code you want to profile
 //   }
 // Then open filename.trace in chrome://tracing
 // Scope guard for profiling a region of code; see the usage example directly
 // above. All member functions are defined out of line.
 struct TORCH_API RecordProfile {
   // Variant that takes an existing output stream.
   RecordProfile(std::ostream& out);
   // Variant that takes a file name.
   // NOTE(review): presumably opens `filename` into file_ and binds out_ to
   // it - confirm in the .cpp.
   RecordProfile(const std::string& filename);

   // NOTE(review): per the usage comment the trace is available once the
   // guard goes out of scope, so this presumably stops profiling and writes
   // the collected events to out_ - confirm in the .cpp.
   ~RecordProfile();
 private:
   void init();
   // Owned file stream; a unique_ptr, so it may be empty.
   std::unique_ptr<std::ofstream> file_;
   // Output stream held by reference; it must outlive this object.
   std::ostream& out_;
   void processEvents(const std::vector<Event*>& events);
 };


 } // namespace profiler
 }} // namespace torch::autograd
	#pragma once

	#include <iostream>
	#include <mutex>
	#include <memory>
	#include <vector>
	#include <cstdint>
	#include <string>
	#include <sstream>
	#include <forward_list>
	#include <tuple>
	#include <ATen/ATen.h>
	#include <torch/csrc/WindowsTorchApiMacro.h>
	#ifndef _WIN32
	#include <ctime>
	#endif

	#include <torch/csrc/autograd/record_function.h>

	typedef struct CUevent_st* CUDAEventStub;

	namespace torch { namespace autograd {

	struct Node;

	namespace profiler {

	// Virtual interface through which the profiler reaches CUDA/NVTX
	// functionality. Every default implementation below except enabled() calls
	// fail(), which invokes AT_ERROR; enabled() returns false. The methods are
	// virtual so a subclass can override them; a CUDAStubs* is what
	// registerCUDAMethods() (declared right after this struct) accepts.
	struct TORCH_API CUDAStubs {
	// Default: errors out via fail().
	// NOTE(review): overrides presumably write the device index, a CUDA event
	// and a CPU timestamp in nanoseconds through the out-pointers - confirm
	// against the CUDA-side implementation.
	virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
	fail();
	}
	// Default: errors out via fail(); the `return 0.f` only satisfies the
	// return type. NOTE(review): the unit of the result is not visible here.
	virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
	fail();
	return 0.f;
	}
	// Default: errors out via fail().
	virtual void nvtxMarkA(const char* name) {
	fail();
	}
	// Default: errors out via fail().
	virtual void nvtxRangePushA(const char* name) {
	fail();
	}
	// Default: errors out via fail().
	virtual void nvtxRangePop() {
	fail();
	}
	// The only default that does not fail: reports false.
	virtual bool enabled() {
	return false;
	}
	// Default: errors out via fail(); `op` is not invoked.
	virtual void onEachDevice(std::function<void(int)> op) {
	fail();
	}
	// Default: errors out via fail().
	virtual void synchronize() {
	fail();
	}
	// Declared here, defined out of line.
	virtual ~CUDAStubs();

	private:
	// Shared failure path for the default stubs.
	void fail() {
	AT_ERROR("CUDA used in profiler but not enabled.");
	}
	};

	TORCH_API void registerCUDAMethods(CUDAStubs* stubs);

	// Rounds `a` up to the nearest multiple of `b` and returns it; a value that
	// is already a multiple of `b` is returned unchanged.
	//
	// Computed from the remainder rather than as ((a + b - 1) / b) * b, so the
	// intermediate sum cannot wrap around when `a` is close to the maximum
	// size_t value but the rounded result is still representable.
	//
	// Preconditions: `b` must be non-zero (division by zero otherwise), and the
	// rounded result must fit in size_t.
	// Kept as a single return statement so it stays a valid C++11 constexpr
	// function.
	constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
	  return (a % b == 0) ? a : a + (b - a % b);
	}

	#if (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
	#include <sys/time.h>
	// clock_gettime is not implemented on older versions of OS X (< 10.12).
	// If implemented, CLOCK_REALTIME will have already been defined.

	// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on
	// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not
	#endif

	// Returns a timestamp in nanoseconds as a signed 64-bit integer.
	// The clock source is chosen at compile time:
	//   - _WIN32: std::chrono (high_resolution_clock if it is steady, otherwise
	//     steady_clock);
	//   - Mach without CLOCK_REALTIME, or C10_IOS: gettimeofday();
	//   - everything else: clock_gettime(CLOCK_MONOTONIC).
	// The #elif condition must stay identical to the #if that guards the
	// <sys/time.h> include above, since that branch relies on gettimeofday().
	inline int64_t getTime() {
	#ifdef _WIN32
	  using namespace std::chrono;
	  using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
	  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
	#elif (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
	  // gettimeofday() reports seconds + microseconds; scale both to nanoseconds.
	  struct timeval now;
	  gettimeofday(&now, nullptr);
	  return static_cast<int64_t>(now.tv_sec) * 1000000000 + static_cast<int64_t>(now.tv_usec) * 1000;
	#else
	  // clock_gettime is much faster than std::chrono implementation on Linux
	  struct timespec t{};
	  clock_gettime(CLOCK_MONOTONIC, &t);
	  return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
	#endif
	}

	// Old GCC versions generate warnings incorrectly
	// see https://stackoverflow.com/questions/2463113/g-c0x-enum-class-compiler-warnings
	#ifndef _MSC_VER
	# pragma GCC diagnostic push
	# pragma GCC diagnostic ignored "-Wattributes"
	#endif
	// Profiling mode; stored in ProfilerConfig::state.
	enum class TORCH_API ProfilerState {
	Disabled,
	CPU, // CPU-only profiling
	CUDA, // CPU + CUDA events
	NVTX, // only emit NVTX markers
	};

	// Settings bundle taken by value by enableProfiler(): the profiling mode and
	// a flag for reporting input shapes.
	struct TORCH_API ProfilerConfig {
	// Stores both arguments in the members of the same name.
	ProfilerConfig(ProfilerState state, bool report_input_shapes)
	: state(state), report_input_shapes(report_input_shapes) {}
	// Declared here, defined out of line.
	~ProfilerConfig();
	// Requested profiling mode.
	ProfilerState state;
	// NOTE(review): presumably controls whether Event::shapes() gets populated
	// - confirm where this flag is read.
	bool report_input_shapes;
	};

	// Kind tag carried by every Event, stored as a uint16_t. Event::kind()
	// renders these as "mark", "push" and "pop" respectively.
	enum class TORCH_API EventKind : uint16_t {
	Mark,
	PushRange,
	PopRange
	};
	#ifndef _MSC_VER
	# pragma GCC diagnostic pop
	#endif

	// One profiler record: a kind (mark / push / pop), a name, the recording
	// thread's id, optional input shapes, a CPU timestamp in nanoseconds and,
	// when CUDA recording was requested, a device index and CUDA event handle.
	struct TORCH_API Event final {
	  // Builds the event and immediately calls record(record_cuda).
	  // `shapes` is an rvalue reference, so its contents are moved into the
	  // event rather than copied; the caller's vector is left moved-from.
	  Event(
	      EventKind kind,
	      StringView name,
	      uint16_t thread_id,
	      bool record_cuda,
	      std::vector<std::vector<int64_t>>&& shapes = {})
	      : name_(std::move(name)),
	        kind_(kind),
	        thread_id_(thread_id),
	        shapes_(std::move(shapes)) {
	    record(record_cuda);
	  }

	  // Defined out of line. NOTE(review): presumably fills cpu_ns_ and, when
	  // record_cuda is true, device_ and event - confirm in the .cpp.
	  void record(bool record_cuda);

	  // Returns "mark", "push" or "pop" for the stored kind.
	  // Throws std::runtime_error if kind_ holds a value outside the enum.
	  std::string kind() const {
	    switch(kind_) {
	      case EventKind::Mark: return "mark";
	      case EventKind::PushRange: return "push";
	      case EventKind::PopRange: return "pop";
	    }
	    throw std::runtime_error("unknown EventKind");
	  }
	  // C string view of the event name, as returned by StringView::str().
	  const char* name() const {
	    return name_.str();
	  }
	  uint16_t thread_id() const {
	    return thread_id_;
	  }
	  // Returns a copy of the recorded shapes.
	  std::vector<std::vector<int64_t>> shapes() const {
	    return shapes_;
	  }
	  // CPU time from this event to `e`, in microseconds (negative when `e` was
	  // recorded earlier). Const-qualified because it only reads both events.
	  double cpu_elapsed_us(const Event & e) const {
	    return (e.cpu_ns_ - cpu_ns_)/(1000.0);
	  }
	  // Defined out of line.
	  double cuda_elapsed_us(const Event & e);
	  // True when a CUDA event handle is attached to this record.
	  bool has_cuda() const {
	    return event != nullptr;
	  }
	  // Device index; -1 until something assigns it.
	  int device() const {
	    return device_;
	  }
	private:
	  // signed to allow for negative intervals, initialized for safety.
	  int64_t cpu_ns_ = 0;
	  StringView name_;
	  EventKind kind_;
	  uint16_t thread_id_;
	  std::vector<std::vector<int64_t>> shapes_;
	  int device_ = -1;
	  struct CUevent_st* event = nullptr;
	};

	// a linked-list of fixed sized vectors, to avoid
	// a std::vector resize from taking a large amount of time inside
	// a profiling event
	struct RangeEventList {
	// This mutex is used to serialize access when different threads are writing
	// to the same instance of RangeEventList.
	std::mutex mutex_;
	constexpr static size_t MB = 1024 * 1024;
	constexpr static size_t event_block_size = 16 * MB;
	constexpr static size_t num_block_elements =
	event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
	static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
	"num_block_elements is calculated incorrectly");
	using block_type = std::vector<Event>;

	template<typename... Args>
	void record(Args&&... args) {
	std::lock_guard<std::mutex> guard(mutex_);
	if (blocks.empty() \|\| blocks.front().size() == num_block_elements) {
	allocBlock();
	}
	blocks.front().emplace_back(std::forward<Args>(args)...);
	}

	std::vector<Event> consolidate() {
	std::unique_lock<std::mutex> lock(mutex_);
	std::forward_list<block_type> localBlocks;
	localBlocks.swap(blocks);
	lock.unlock();
	std::vector<Event> result;

	for (auto & block : localBlocks) {
	result.insert(result.begin(),
	std::make_move_iterator(block.begin()),
	std::make_move_iterator(block.end()));
	}
	return result;
	}

	std::forward_list<block_type> blocks;
	private:
	// allocBlock() assumes that mutex_ is held when called, in order to prevent
	// multiple threads' block writes stomping over each other.
	void allocBlock() {
	blocks.emplace_front();
	auto & new_block = blocks.front();
	new_block.reserve(num_block_elements);
	// Materialize all pages in the new block to release jitter when recording events.
	const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
	for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
	ptr < end_ptr; ptr += 4 * 1024) {
	(*ptr);
	}
	}
	};

	TORCH_API RangeEventList& getEventList();
	TORCH_API void mark(std::string name, bool include_cuda = true);

	using thread_event_lists = std::vector<std::vector<Event>>;
	// NOTE: changing profiler modes is NOT THREAD SAFE. You should ensure that
	// no autograd functions are being executed when these functions are used.
	TORCH_API void enableProfiler(ProfilerConfig);
	TORCH_API thread_event_lists disableProfiler();
	TORCH_API bool profilerEnabled();


	// Usage:
	// {
	// RecordProfile guard("filename.trace");
	// // code you want to profile
	// }
	// Then open filename.trace in chrome://tracing
	// Scope guard for profiling a region of code; see the usage example directly
	// above. All member functions are defined out of line.
	struct TORCH_API RecordProfile {
	// Variant that takes an existing output stream.
	RecordProfile(std::ostream& out);
	// Variant that takes a file name.
	// NOTE(review): presumably opens `filename` into file_ and binds out_ to
	// it - confirm in the .cpp.
	RecordProfile(const std::string& filename);

	// NOTE(review): per the usage comment the trace is available once the
	// guard goes out of scope, so this presumably stops profiling and writes
	// the collected events to out_ - confirm in the .cpp.
	~RecordProfile();
	private:
	void init();
	// Owned file stream; a unique_ptr, so it may be empty.
	std::unique_ptr<std::ofstream> file_;
	// Output stream held by reference; it must outlive this object.
	std::ostream& out_;
	void processEvents(const std::vector<Event*>& events);
	};


	} // namespace profiler
	}} // namespace torch::autograd