torch/csrc/autograd/profiler.h - platform/external/pytorch - Git at Google

 #pragma once

 #ifdef USE_CUDA
 #include <nvToolsExt.h>
 #endif
 #include <thread>
 #include <iostream>
 #include <mutex>
 #include <memory>
 #include <vector>
 #include <cstdint>
 #include <string>
 #include <list>
 #include <sstream>
 #include <forward_list>
 #include <tuple>
 #include "ATen/ATen.h"
 #include "torch/csrc/WindowsTorchApiMacro.h"
 #include "torch/csrc/cuda/cuda_check.h"
 #ifdef USE_CUDA
 #include "ATen/cuda/CUDAContext.h"
 #include <cuda_runtime.h>
 #endif

 namespace torch { namespace autograd {

 struct Function;

 namespace profiler {

 constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
   return ((a + b - 1) / b) * b;
 }

 inline uint64_t getTime() {
   using namespace std::chrono;
   using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
   return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
 }

 enum class EventKind {
   Mark,
   PushRange,
   PopRange
 };

 struct Event {
   Event(EventKind kind, std::string name, uint32_t thread_id, bool record_cuda)
   : kind_(kind)
   , name_(std::move(name))
   , thread_id_(thread_id) {
 #ifdef USE_CUDA
     if(record_cuda) {
       TORCH_CUDA_CHECK(cudaGetDevice(&device_));
       TORCH_CUDA_CHECK(cudaEventCreate(&event));
       auto stream = at::cuda::getCurrentCUDAStream();
       cpu_ns_ = getTime();
       TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
     } else {
       cpu_ns_ = getTime();
     }
 #else
     cpu_ns_ = getTime();
 #endif
   }
   std::string kind() const {
     switch(kind_) {
       case EventKind::Mark: return "mark";
       case EventKind::PushRange: return "push";
       case EventKind::PopRange: return "pop";
     }
     throw std::runtime_error("unknown EventKind");
   }
   const std::string & name() const {
     return name_;
   }
   uint32_t thread_id() const {
     return thread_id_;
   }
   double cpu_elapsed_us(const Event & e) {
     return (e.cpu_ns_ - cpu_ns_)/(1000.0);
   }
   double cuda_elapsed_us(const Event & e) {
 #ifdef USE_CUDA
     if(!e.has_cuda() || !has_cuda()) {
       throw std::logic_error("Events were not recorded for CUDA");
     }
     if(e.device() != device()) {
       throw std::logic_error("Events are not on the same device");
     }
     TORCH_CUDA_CHECK(cudaEventSynchronize(event));
     TORCH_CUDA_CHECK(cudaEventSynchronize(e.event));
     float ms;
     TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, e.event));
     return ms*1000.0;
 #else
     throw std::logic_error("CUDA not enabled");
 #endif
   }
   bool has_cuda() const {
 #ifdef USE_CUDA
     return event != nullptr;
 #else
     return false;
 #endif
   }
   int device() const {
     return device_;
   }
 private:
   EventKind kind_;
   std::string name_;
   uint32_t thread_id_;
   int64_t cpu_ns_; // signed to allow for negative intervals
 #ifdef USE_CUDA
   cudaEvent_t event = nullptr;
 #endif
   int device_ = -1;
 };

 // a linked-list of fixed sized vectors, to avoid
 // a std::vector resize from taking a large amount of time inside
 // a profiling  event
 struct RangeEventList {
   constexpr static size_t MB = 1024 * 1024;
   constexpr static size_t event_block_size = 16 * MB;
   constexpr static size_t num_block_elements =
     event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
   static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
                 "num_block_elements is calculated incorrectly");
   using block_type = std::vector<Event>;

   void allocBlock() {
     blocks.emplace_front();
     blocks.front().reserve(num_block_elements);
   }

   template<typename... Args>
   void record(Args&&... args) {
     if (blocks.empty() || blocks.front().size() == num_block_elements) {
       allocBlock();
     }
     blocks.front().emplace_back(std::forward<Args>(args)...);
   }

   std::vector<Event> consolidate() {
     std::vector<Event> result;
     for (auto & block : blocks) {
       result.insert(result.begin(),
                     std::make_move_iterator(block.begin()),
                     std::make_move_iterator(block.end()));
     }
     blocks.clear();
     return result;
   }

   std::forward_list<block_type> blocks;
 };

 enum class ProfilerState {
     Disabled,
     CPU, // CPU-only profiling
     CUDA, // CPU + CUDA events
     NVTX,  // only emit NVTX markers
 };

 TORCH_API RangeEventList& getEventList();
 TORCH_API void mark(std::string name, bool include_cuda = true);
 TORCH_API void pushRange(std::string name);
 TORCH_API void popRange();

 struct TORCH_API RecordFunction {
   explicit RecordFunction(Function* fn);

   explicit RecordFunction(std::string name);

   explicit RecordFunction(const char* name);

   ~RecordFunction();

   // Needed only because we don't have Function defined yet.
   void pushFunctionRange(Function *fn);
 };

 using thread_event_lists = std::vector<std::vector<Event>>;
 // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
 // there no autograd functions are being executed when these function are used.
 TORCH_API void enableProfiler(ProfilerState new_state);
 TORCH_API thread_event_lists disableProfiler();

 } // namespace profiler
 }} // namespace torch::autograd
	#pragma once

	#ifdef USE_CUDA
	#include <nvToolsExt.h>
	#endif
	#include <thread>
	#include <iostream>
	#include <mutex>
	#include <memory>
	#include <vector>
	#include <cstdint>
	#include <string>
	#include <list>
	#include <sstream>
	#include <forward_list>
	#include <tuple>
	#include "ATen/ATen.h"
	#include "torch/csrc/WindowsTorchApiMacro.h"
	#include "torch/csrc/cuda/cuda_check.h"
	#ifdef USE_CUDA
	#include "ATen/cuda/CUDAContext.h"
	#include <cuda_runtime.h>
	#endif

	namespace torch { namespace autograd {

	struct Function;

	namespace profiler {

	constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
	return ((a + b - 1) / b) * b;
	}

	inline uint64_t getTime() {
	using namespace std::chrono;
	using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
	return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
	}

	enum class EventKind {
	Mark,
	PushRange,
	PopRange
	};

	struct Event {
	Event(EventKind kind, std::string name, uint32_t thread_id, bool record_cuda)
	: kind_(kind)
	, name_(std::move(name))
	, thread_id_(thread_id) {
	#ifdef USE_CUDA
	if(record_cuda) {
	TORCH_CUDA_CHECK(cudaGetDevice(&device_));
	TORCH_CUDA_CHECK(cudaEventCreate(&event));
	auto stream = at::cuda::getCurrentCUDAStream();
	cpu_ns_ = getTime();
	TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
	} else {
	cpu_ns_ = getTime();
	}
	#else
	cpu_ns_ = getTime();
	#endif
	}
	std::string kind() const {
	switch(kind_) {
	case EventKind::Mark: return "mark";
	case EventKind::PushRange: return "push";
	case EventKind::PopRange: return "pop";
	}
	throw std::runtime_error("unknown EventKind");
	}
	const std::string & name() const {
	return name_;
	}
	uint32_t thread_id() const {
	return thread_id_;
	}
	double cpu_elapsed_us(const Event & e) {
	return (e.cpu_ns_ - cpu_ns_)/(1000.0);
	}
	double cuda_elapsed_us(const Event & e) {
	#ifdef USE_CUDA
	if(!e.has_cuda() \|\| !has_cuda()) {
	throw std::logic_error("Events were not recorded for CUDA");
	}
	if(e.device() != device()) {
	throw std::logic_error("Events are not on the same device");
	}
	TORCH_CUDA_CHECK(cudaEventSynchronize(event));
	TORCH_CUDA_CHECK(cudaEventSynchronize(e.event));
	float ms;
	TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, e.event));
	return ms*1000.0;
	#else
	throw std::logic_error("CUDA not enabled");
	#endif
	}
	bool has_cuda() const {
	#ifdef USE_CUDA
	return event != nullptr;
	#else
	return false;
	#endif
	}
	int device() const {
	return device_;
	}
	private:
	EventKind kind_;
	std::string name_;
	uint32_t thread_id_;
	int64_t cpu_ns_; // signed to allow for negative intervals
	#ifdef USE_CUDA
	cudaEvent_t event = nullptr;
	#endif
	int device_ = -1;
	};

	// a linked-list of fixed sized vectors, to avoid
	// a std::vector resize from taking a large amount of time inside
	// a profiling event
	struct RangeEventList {
	constexpr static size_t MB = 1024 * 1024;
	constexpr static size_t event_block_size = 16 * MB;
	constexpr static size_t num_block_elements =
	event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
	static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
	"num_block_elements is calculated incorrectly");
	using block_type = std::vector<Event>;

	void allocBlock() {
	blocks.emplace_front();
	blocks.front().reserve(num_block_elements);
	}

	template<typename... Args>
	void record(Args&&... args) {
	if (blocks.empty() \|\| blocks.front().size() == num_block_elements) {
	allocBlock();
	}
	blocks.front().emplace_back(std::forward<Args>(args)...);
	}

	std::vector<Event> consolidate() {
	std::vector<Event> result;
	for (auto & block : blocks) {
	result.insert(result.begin(),
	std::make_move_iterator(block.begin()),
	std::make_move_iterator(block.end()));
	}
	blocks.clear();
	return result;
	}

	std::forward_list<block_type> blocks;
	};

	enum class ProfilerState {
	Disabled,
	CPU, // CPU-only profiling
	CUDA, // CPU + CUDA events
	NVTX, // only emit NVTX markers
	};

	TORCH_API RangeEventList& getEventList();
	TORCH_API void mark(std::string name, bool include_cuda = true);
	TORCH_API void pushRange(std::string name);
	TORCH_API void popRange();

	struct TORCH_API RecordFunction {
	explicit RecordFunction(Function* fn);

	explicit RecordFunction(std::string name);

	explicit RecordFunction(const char* name);

	~RecordFunction();

	// Needed only because we don't have Function defined yet.
	void pushFunctionRange(Function *fn);
	};

	using thread_event_lists = std::vector<std::vector<Event>>;
	// NOTE: changing profiler modes is NOT THREAD SAFE. You should ensure that
	// there no autograd functions are being executed when these function are used.
	TORCH_API void enableProfiler(ProfilerState new_state);
	TORCH_API thread_event_lists disableProfiler();

	} // namespace profiler
	}} // namespace torch::autograd