torch/csrc/autograd/profiler.cpp - platform/external/pytorch - Git at Google

 #include <torch/csrc/autograd/profiler.h>
 #include <torch/csrc/jit/code_template.h>

 #include <fstream>
 #include <list>
 #include <mutex>
 #include <sstream>
 #include <string>
 #include <vector>

 namespace torch { namespace autograd { namespace profiler {

 CUDAStubs default_stubs;
 constexpr CUDAStubs* default_stubs_addr = &default_stubs;
 // constant initialization, so it is guarenteed to be initialized before
 // static initialization calls which may invoke registerCUDAMethods
 static CUDAStubs* cuda_stubs = default_stubs_addr;

 void registerCUDAMethods(CUDAStubs* stubs) {
   cuda_stubs = stubs;
 }

 ProfilerState state = ProfilerState::Disabled;
 uint16_t next_thread_id = 0;
 std::mutex all_event_lists_mutex;
 std::list<std::shared_ptr<RangeEventList>> all_event_lists;
 thread_local std::shared_ptr<RangeEventList> event_list;
 thread_local uint16_t thread_id;

 ProfilerConfig::~ProfilerConfig() = default;

 RangeEventList& getEventList() {
   if (!event_list) {
     std::lock_guard<std::mutex> guard(all_event_lists_mutex);
     event_list = std::make_shared<RangeEventList>();
     thread_id = next_thread_id++;
     all_event_lists.emplace_front(event_list);
   }
   return *event_list;
 }

 void mark(std::string name, bool include_cuda /* = true */) {
   if (state == ProfilerState::Disabled) {
     return;
   }
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxMarkA(name.c_str());
   } else {
     getEventList().record(
         EventKind::Mark,
         StringView(std::move(name)),
         thread_id,
         include_cuda && state == ProfilerState::CUDA);
   }
 }

 void pushRangeImpl(
     const StringView& name,
     const char* msg = "",
     int64_t sequence_nr = -1,
     std::vector<std::vector<int64_t>>&& shapes = {}) {
   if (state == ProfilerState::Disabled) {
     return;
   }
   if (state == ProfilerState::NVTX) {
     if(sequence_nr >= 0 || shapes.size() > 0) {
       std::stringstream s;
       if(sequence_nr >= 0)
         s << name.str() << msg << sequence_nr;
       if(shapes.size() > 0) {
         s << ", sizes = [";
         for(int i = 0; i < shapes.size(); i++) {
           if(shapes[i].size() > 0) {
             s << "[";
             for(int dim = 0; dim < shapes[i].size(); dim++) {
               s << shapes[i][dim];
               if(dim < shapes[i].size() - 1)
                 s << ", ";
             }
             s << "]";
           }
           else
             s << "[]";
           if(i < shapes.size() - 1)
             s << ", ";
         }
         s << "]";
       }
       cuda_stubs->nvtxRangePushA(s.str().c_str());
     } else {
       cuda_stubs->nvtxRangePushA(name.str());
     }
   } else {
     getEventList().record(
         EventKind::PushRange,
         name,
         thread_id,
         state == ProfilerState::CUDA,
         std::move(shapes));
   }
 }

 void pushRange(std::string name) {
   pushRangeImpl(StringView(std::move(name)));
 }

 void popRange() {
   if (state == ProfilerState::Disabled) {
     return;
   }
   if (state == ProfilerState::NVTX) {
     cuda_stubs->nvtxRangePop();
   } else {
     getEventList().record(
         EventKind::PopRange,
         StringView(""),
         thread_id,
         state == ProfilerState::CUDA);
   }
 }

 void enableProfiler(ProfilerConfig config) {
   ProfilerState new_state = config.state;
   AT_ASSERT(new_state != ProfilerState::Disabled);
   if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
     throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
   if (state != ProfilerState::Disabled && new_state != state) {
     throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
   }

   pushCallback(
       [config](const RecordFunction& fn) {
         auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
         if (config.report_input_shapes) {
           std::vector<std::vector<int64_t>> inputSizes;
           inputSizes.reserve(fn.inputs().size());
           for (const c10::IValue& input : fn.inputs()) {
             if (!input.isTensor()) {
               inputSizes.emplace_back();
               continue;
             }
             const at::Tensor& tensor = input.toTensor();
             if (tensor.defined()) {
               inputSizes.push_back(input.toTensor().sizes().vec());
             } else {
               inputSizes.emplace_back();
             }
           }
           pushRangeImpl(fn.name(), msg, fn.seqNr(), std::move(inputSizes));
         } else {
           pushRangeImpl(fn.name(), msg, fn.seqNr(), {});
         }
       },
       [](const RecordFunction& /* unused */) { popRange(); },
       config.report_input_shapes);
   state = new_state;

   if(state == ProfilerState::CUDA) {
     // event recording appears to have some startup overhead, so we need to
     // to generate some dummy events first before recording syncrhonization events
     for(int i = 0; i < 5; i++) {
       cuda_stubs->onEachDevice([](int d) {
           mark("__cuda_startup");
           cuda_stubs->synchronize();
       });
     }

     // cuda events must be on the same device, so we need a start event recorded
     // for each gpu. we then use this event to synchronize time on the GPU
     // with the CPU clock.
     cuda_stubs->onEachDevice([](int d) {
         mark("__cuda_start_event");
     });
   }
   mark("__start_profile", false);
 }

 thread_event_lists disableProfiler() {
   if (state == ProfilerState::Disabled) {
     throw std::runtime_error("can't disable profiler when it's not running");
   }
   ProfilerState old_state = state;
   mark("__stop_profile");

   popCallback();
   state = ProfilerState::Disabled;

   if (old_state == ProfilerState::NVTX) {
     return thread_event_lists();
   } else {
     thread_event_lists result;
     std::lock_guard<std::mutex> guard(all_event_lists_mutex);
     for (auto it = all_event_lists.begin(); it != all_event_lists.end();) {
       auto & list = *it;
       result.emplace_back(list->consolidate());
       // GC lists that are not held by any threads
       if (list.use_count() == 1) {
         auto current_it = it;
         ++it;
         all_event_lists.erase(current_it);
       } else {
         ++it;
       }
     }
     return result;
   }
 }

 void Event::record(bool record_cuda) {
   if (record_cuda) {
     cuda_stubs->record(&device_, &event, &cpu_ns_);
     return;
   }
   cpu_ns_ = getTime();
 }

 double Event::cuda_elapsed_us(const Event & e) {
   if(!e.has_cuda() || !has_cuda()) {
     throw std::logic_error("Events were not recorded for CUDA");
   }
   if(e.device() != device()) {
     throw std::logic_error("Events are not on the same device");
   }
   return cuda_stubs->elapsed(event, e.event);
 }

 CUDAStubs::~CUDAStubs() = default;


 static jit::CodeTemplate event_template(R"(
 {
   "name": "${name}",
   "ph": "X",
   "ts": ${ts},
   "dur": ${dur},
   "tid": ${tid},
   "pid": "CPU Functions",
   "args": {}
 })");


 RecordProfile::RecordProfile(std::ostream& out)
 : out_(out) {
   init();
 }

 RecordProfile::RecordProfile(const std::string& filename)
 : file_(new std::ofstream(filename)), out_(*file_) {
   init();
 }

 void RecordProfile::init() {
   enableProfiler(ProfilerConfig(ProfilerState::CPU, false /* report shapes */));
 }

 RecordProfile::~RecordProfile() {
   thread_event_lists event_lists = disableProfiler();
   std::vector<Event*> events;
   for(auto& l : event_lists) {
     for(auto& e : l) {
         events.push_back(&e);
     }
   }
   processEvents(events);
   if (file_){
     file_->close();
   }
 }

 void RecordProfile::processEvents(const std::vector<Event*>& events) {
   TORCH_CHECK(out_, "could not open file");
   Event* start = nullptr;
   for (Event* e : events) {
     if(0 == strcmp(e->name(), "__start_profile")) {
       start = e;
       break;
     }
   }
   TORCH_CHECK(start, "could not find start?");
   std::vector<Event*> stack;
   out_ << "[\n";
   bool first = true;
   for(Event* e : events) {
     if(e->kind() == "push") {
       stack.push_back(e);
     } else if(e->kind() == "pop") {
       if(!first) {
         out_ << ",\n";
       }
       first = false;
       Event* e_start = stack.back();
       stack.pop_back();
       jit::TemplateEnv env;
       env.s("name", e_start->name());
       env.d("ts", start->cpu_elapsed_us(*e_start));
       env.d("dur", e_start->cpu_elapsed_us(*e));
       env.d("tid", e_start->thread_id());
       out_ << event_template.format(env);
     }
   }
   out_ << "]\n";
 }

 }}}
	#include <torch/csrc/autograd/profiler.h>
	#include <torch/csrc/jit/code_template.h>

	#include <fstream>
	#include <list>
	#include <mutex>
	#include <sstream>
	#include <string>
	#include <vector>

	namespace torch { namespace autograd { namespace profiler {

	CUDAStubs default_stubs;
	constexpr CUDAStubs* default_stubs_addr = &default_stubs;
	// constant initialization, so it is guarenteed to be initialized before
	// static initialization calls which may invoke registerCUDAMethods
	static CUDAStubs* cuda_stubs = default_stubs_addr;

	void registerCUDAMethods(CUDAStubs* stubs) {
	cuda_stubs = stubs;
	}

	ProfilerState state = ProfilerState::Disabled;
	uint16_t next_thread_id = 0;
	std::mutex all_event_lists_mutex;
	std::list<std::shared_ptr<RangeEventList>> all_event_lists;
	thread_local std::shared_ptr<RangeEventList> event_list;
	thread_local uint16_t thread_id;

	ProfilerConfig::~ProfilerConfig() = default;

	RangeEventList& getEventList() {
	if (!event_list) {
	std::lock_guard<std::mutex> guard(all_event_lists_mutex);
	event_list = std::make_shared<RangeEventList>();
	thread_id = next_thread_id++;
	all_event_lists.emplace_front(event_list);
	}
	return *event_list;
	}

	void mark(std::string name, bool include_cuda /* = true */) {
	if (state == ProfilerState::Disabled) {
	return;
	}
	if (state == ProfilerState::NVTX) {
	cuda_stubs->nvtxMarkA(name.c_str());
	} else {
	getEventList().record(
	EventKind::Mark,
	StringView(std::move(name)),
	thread_id,
	include_cuda && state == ProfilerState::CUDA);
	}
	}

	void pushRangeImpl(
	const StringView& name,
	const char* msg = "",
	int64_t sequence_nr = -1,
	std::vector<std::vector<int64_t>>&& shapes = {}) {
	if (state == ProfilerState::Disabled) {
	return;
	}
	if (state == ProfilerState::NVTX) {
	if(sequence_nr >= 0 \|\| shapes.size() > 0) {
	std::stringstream s;
	if(sequence_nr >= 0)
	s << name.str() << msg << sequence_nr;
	if(shapes.size() > 0) {
	s << ", sizes = [";
	for(int i = 0; i < shapes.size(); i++) {
	if(shapes[i].size() > 0) {
	s << "[";
	for(int dim = 0; dim < shapes[i].size(); dim++) {
	s << shapes[i][dim];
	if(dim < shapes[i].size() - 1)
	s << ", ";
	}
	s << "]";
	}
	else
	s << "[]";
	if(i < shapes.size() - 1)
	s << ", ";
	}
	s << "]";
	}
	cuda_stubs->nvtxRangePushA(s.str().c_str());
	} else {
	cuda_stubs->nvtxRangePushA(name.str());
	}
	} else {
	getEventList().record(
	EventKind::PushRange,
	name,
	thread_id,
	state == ProfilerState::CUDA,
	std::move(shapes));
	}
	}

	void pushRange(std::string name) {
	pushRangeImpl(StringView(std::move(name)));
	}

	void popRange() {
	if (state == ProfilerState::Disabled) {
	return;
	}
	if (state == ProfilerState::NVTX) {
	cuda_stubs->nvtxRangePop();
	} else {
	getEventList().record(
	EventKind::PopRange,
	StringView(""),
	thread_id,
	state == ProfilerState::CUDA);
	}
	}

	void enableProfiler(ProfilerConfig config) {
	ProfilerState new_state = config.state;
	AT_ASSERT(new_state != ProfilerState::Disabled);
	if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
	throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
	if (state != ProfilerState::Disabled && new_state != state) {
	throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
	}

	pushCallback(
	[config](const RecordFunction& fn) {
	auto* msg = (fn.seqNr() >= 0) ? ", seq = " : "";
	if (config.report_input_shapes) {
	std::vector<std::vector<int64_t>> inputSizes;
	inputSizes.reserve(fn.inputs().size());
	for (const c10::IValue& input : fn.inputs()) {
	if (!input.isTensor()) {
	inputSizes.emplace_back();
	continue;
	}
	const at::Tensor& tensor = input.toTensor();
	if (tensor.defined()) {
	inputSizes.push_back(input.toTensor().sizes().vec());
	} else {
	inputSizes.emplace_back();
	}
	}
	pushRangeImpl(fn.name(), msg, fn.seqNr(), std::move(inputSizes));
	} else {
	pushRangeImpl(fn.name(), msg, fn.seqNr(), {});
	}
	},
	[](const RecordFunction& /* unused */) { popRange(); },
	config.report_input_shapes);
	state = new_state;

	if(state == ProfilerState::CUDA) {
	// event recording appears to have some startup overhead, so we need to
	// to generate some dummy events first before recording syncrhonization events
	for(int i = 0; i < 5; i++) {
	cuda_stubs->onEachDevice([](int d) {
	mark("__cuda_startup");
	cuda_stubs->synchronize();
	});
	}

	// cuda events must be on the same device, so we need a start event recorded
	// for each gpu. we then use this event to synchronize time on the GPU
	// with the CPU clock.
	cuda_stubs->onEachDevice([](int d) {
	mark("__cuda_start_event");
	});
	}
	mark("__start_profile", false);
	}

	thread_event_lists disableProfiler() {
	if (state == ProfilerState::Disabled) {
	throw std::runtime_error("can't disable profiler when it's not running");
	}
	ProfilerState old_state = state;
	mark("__stop_profile");

	popCallback();
	state = ProfilerState::Disabled;

	if (old_state == ProfilerState::NVTX) {
	return thread_event_lists();
	} else {
	thread_event_lists result;
	std::lock_guard<std::mutex> guard(all_event_lists_mutex);
	for (auto it = all_event_lists.begin(); it != all_event_lists.end();) {
	auto & list = *it;
	result.emplace_back(list->consolidate());
	// GC lists that are not held by any threads
	if (list.use_count() == 1) {
	auto current_it = it;
	++it;
	all_event_lists.erase(current_it);
	} else {
	++it;
	}
	}
	return result;
	}
	}

	void Event::record(bool record_cuda) {
	if (record_cuda) {
	cuda_stubs->record(&device_, &event, &cpu_ns_);
	return;
	}
	cpu_ns_ = getTime();
	}

	double Event::cuda_elapsed_us(const Event & e) {
	if(!e.has_cuda() \|\| !has_cuda()) {
	throw std::logic_error("Events were not recorded for CUDA");
	}
	if(e.device() != device()) {
	throw std::logic_error("Events are not on the same device");
	}
	return cuda_stubs->elapsed(event, e.event);
	}

	CUDAStubs::~CUDAStubs() = default;


	static jit::CodeTemplate event_template(R"(
	{
	"name": "${name}",
	"ph": "X",
	"ts": ${ts},
	"dur": ${dur},
	"tid": ${tid},
	"pid": "CPU Functions",
	"args": {}
	})");


	RecordProfile::RecordProfile(std::ostream& out)
	: out_(out) {
	init();
	}

	RecordProfile::RecordProfile(const std::string& filename)
	: file_(new std::ofstream(filename)), out_(*file_) {
	init();
	}

	void RecordProfile::init() {
	enableProfiler(ProfilerConfig(ProfilerState::CPU, false /* report shapes */));
	}

	RecordProfile::~RecordProfile() {
	thread_event_lists event_lists = disableProfiler();
	std::vector<Event*> events;
	for(auto& l : event_lists) {
	for(auto& e : l) {
	events.push_back(&e);
	}
	}
	processEvents(events);
	if (file_){
	file_->close();
	}
	}

	void RecordProfile::processEvents(const std::vector<Event*>& events) {
	TORCH_CHECK(out_, "could not open file");
	Event* start = nullptr;
	for (Event* e : events) {
	if(0 == strcmp(e->name(), "__start_profile")) {
	start = e;
	break;
	}
	}
	TORCH_CHECK(start, "could not find start?");
	std::vector<Event*> stack;
	out_ << "[\n";
	bool first = true;
	for(Event* e : events) {
	if(e->kind() == "push") {
	stack.push_back(e);
	} else if(e->kind() == "pop") {
	if(!first) {
	out_ << ",\n";
	}
	first = false;
	Event* e_start = stack.back();
	stack.pop_back();
	jit::TemplateEnv env;
	env.s("name", e_start->name());
	env.d("ts", start->cpu_elapsed_us(*e_start));
	env.d("dur", e_start->cpu_elapsed_us(*e));
	env.d("tid", e_start->thread_id());
	out_ << event_template.format(env);
	}
	}
	out_ << "]\n";
	}

	}}}