#include <torch/csrc/autograd/profiler.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/irange.h>
#include <nvToolsExt.h>

#include <sstream>

namespace torch { namespace autograd { namespace profiler {

namespace {

static inline void cudaCheck(cudaError_t result, const char* file, int line) {
  if (result != cudaSuccess) {
    std::stringstream ss;
    ss << file << ":" << line << ": ";
    if (result == cudaErrorInitializationError) {
      // It is common for users to use DataLoader with multiple workers
      // and the autograd profiler. Throw a nice error message here.
      ss << "CUDA initialization error. "
         << "This can occur if one runs the profiler in CUDA mode on code "
         << "that creates a DataLoader with num_workers > 0. This operation "
         << "is currently unsupported; potential workarounds are: "
         << "(1) don't use the profiler in CUDA mode or (2) use num_workers=0 "
         << "in the DataLoader or (3) don't profile the data loading portion "
         << "of your code. https://github.com/pytorch/pytorch/issues/6313 "
         << "tracks profiler support for multi-worker DataLoader.";
    } else {
      ss << cudaGetErrorString(result);
    }
    throw std::runtime_error(ss.str());
  }
}
#define TORCH_CUDA_CHECK(result) cudaCheck(result, __FILE__, __LINE__)

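// Concrete implementation of the profiler's CUDAStubs interface. It times GPU
// work with CUDA events and forwards marker/range calls to NVTX, and is only
// registered when this CUDA-enabled translation unit is linked in.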
struct CUDAMethods : public CUDAStubs {
  void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) const override {
    if (device) {
      TORCH_CUDA_CHECK(cudaGetDevice(device));
    }
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    CUevent_st* cuda_event_ptr;
    TORCH_CUDA_CHECK(cudaEventCreate(&cuda_event_ptr));
    // Hand the raw event to a shared_ptr whose deleter destroys it.
    *event = std::shared_ptr<CUevent_st>(cuda_event_ptr, [](CUevent_st* ptr) {
      TORCH_CUDA_CHECK(cudaEventDestroy(ptr));
    });
    auto stream = at::cuda::getCurrentCUDAStream();
    // Capture the CPU timestamp right before recording the event so the two
    // clocks refer to (approximately) the same point in time.
    if (cpu_ns) {
      *cpu_ns = getTime();
    }
    TORCH_CUDA_CHECK(cudaEventRecord(cuda_event_ptr, stream));
  }

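  // Time between two previously recorded events, in microseconds. Both events
  // are synchronized first so their timestamps are final before being compared.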
  float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) const override {
    TORCH_CUDA_CHECK(cudaEventSynchronize(event->get()));
    TORCH_CUDA_CHECK(cudaEventSynchronize(event2->get()));
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    float ms;
    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event->get(), event2->get()));
    // cudaEventElapsedTime reports milliseconds; convert to microseconds.
    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions)
    return ms * 1000.0;
  }

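  // Thin wrappers over the NVTX C API; markers and ranges emitted here show up
  // in NVIDIA profiling tools such as nvprof and Nsight.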
  void nvtxMarkA(const char* name) const override {
    ::nvtxMark(name);
  }

  void nvtxRangePushA(const char* name) const override {
    ::nvtxRangePushA(name);
  }

  void nvtxRangePop() const override {
    ::nvtxRangePop();
  }

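  // Invokes `op` once per visible CUDA device; OptionalCUDAGuard switches the
  // current device for each call and restores the original device on exit.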
  void onEachDevice(std::function<void(int)> op) const override {
    at::cuda::OptionalCUDAGuard device_guard;
    // NOLINTNEXTLINE(bugprone-signed-char-misuse)
    int count = at::cuda::device_count();
    for (const auto i : c10::irange(count)) {
      device_guard.set_index(i);
      op(i);
    }
  }

  void synchronize() const override {
    TORCH_CUDA_CHECK(cudaDeviceSynchronize());
  }

  bool enabled() const override {
    return true;
  }
};

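// Registering at static-initialization time makes the CUDA stubs available to
// the profiler as soon as this file is linked into the process.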
struct RegisterCUDAMethods {
  RegisterCUDAMethods() {
    static CUDAMethods methods;
    registerCUDAMethods(&methods);
  }
};
RegisterCUDAMethods reg;

} // namespace
} // namespace profiler
} // namespace autograd
} // namespace torch