#include <sstream>
#ifndef FBCODE_CAFFE2
#include <nvtx3/nvToolsExt.h>
#else
#include <nvToolsExt.h>
#endif
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/irange.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace profiler {
namespace impl {
namespace {
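// Throws std::runtime_error with the failing file:line (and the CUDA error
// description) when a CUDA runtime call does not return cudaSuccess.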
static inline void cudaCheck(cudaError_t result, const char* file, int line) {
if (result != cudaSuccess) {
std::stringstream ss;
ss << file << ":" << line << ": ";
if (result == cudaErrorInitializationError) {
// It is common for users to use DataLoader with multiple workers
// and the autograd profiler. Throw a nice error message here.
ss << "CUDA initialization error. "
<< "This can occur if one runs the profiler in CUDA mode on code "
<< "that creates a DataLoader with num_workers > 0. This operation "
<< "is currently unsupported; potential workarounds are: "
<< "(1) don't use the profiler in CUDA mode or (2) use num_workers=0 "
<< "in the DataLoader or (3) Don't profile the data loading portion "
<< "of your code. https://github.com/pytorch/pytorch/issues/6313 "
<< "tracks profiler support for multi-worker DataLoader.";
} else {
ss << cudaGetErrorString(result);
}
throw std::runtime_error(ss.str());
}
}
// Wraps a CUDA runtime call so failures are reported with file/line context.
#define TORCH_CUDA_CHECK(result) cudaCheck(result, __FILE__, __LINE__)
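// CUDA implementation of the profiler stubs: CUDA events for device-side
// timing and NVTX markers/ranges for annotation.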
struct CUDAMethods : public ProfilerStubs {
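// Creates a CUDA event, wraps it in a shared_ptr that destroys it on release,
// and records it on the current stream. Optionally reports the current device
// and a host-side timestamp in nanoseconds.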
void record(int* device, ProfilerEventStub* event, int64_t* cpu_ns)
const override {
if (device) {
TORCH_CUDA_CHECK(cudaGetDevice(device));
}
CUevent_st* cuda_event_ptr = nullptr;
TORCH_CUDA_CHECK(cudaEventCreate(&cuda_event_ptr));
*event = std::shared_ptr<CUevent_st>(cuda_event_ptr, [](CUevent_st* ptr) {
TORCH_CUDA_CHECK(cudaEventDestroy(ptr));
});
auto stream = at::cuda::getCurrentCUDAStream();
if (cpu_ns) {
*cpu_ns = torch::profiler::impl::getTime();
}
TORCH_CUDA_CHECK(cudaEventRecord(cuda_event_ptr, stream));
}
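// Synchronizes on both events and returns the elapsed time between them,
// converted from milliseconds (as reported by CUDA) to microseconds.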
float elapsed(const ProfilerEventStub* event, const ProfilerEventStub* event2)
const override {
TORCH_CUDA_CHECK(cudaEventSynchronize(event->get()));
TORCH_CUDA_CHECK(cudaEventSynchronize(event2->get()));
float ms = 0.0f;
TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event->get(), event2->get()));
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions)
return ms * 1000.0;
}
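// Emits an instantaneous NVTX marker, visible in NVIDIA profiling tools.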
void mark(const char* name) const override {
::nvtxMark(name);
}
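// Opens a named NVTX range on the calling thread; closed by rangePop().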
void rangePush(const char* name) const override {
::nvtxRangePushA(name);
}
void rangePop() const override {
::nvtxRangePop();
}
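// Invokes op once per visible CUDA device, switching the current device
// with an OptionalCUDAGuard.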
void onEachDevice(std::function<void(int)> op) const override {
at::cuda::OptionalCUDAGuard device_guard;
for (const auto i : c10::irange(at::cuda::device_count())) {
device_guard.set_index(i);
op(i);
}
}
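// Blocks the host until all outstanding work on the current device finishes.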
void synchronize() const override {
TORCH_CUDA_CHECK(cudaDeviceSynchronize());
}
bool enabled() const override {
return true;
}
};
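// Constructs a static CUDAMethods instance and registers it with the
// profiler; the namespace-scope `reg` below triggers this at library load.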
struct RegisterCUDAMethods {
RegisterCUDAMethods() {
static CUDAMethods methods;
registerCUDAMethods(&methods);
}
};
RegisterCUDAMethods reg;
} // namespace
} // namespace impl
} // namespace profiler
} // namespace torch