add device to CUDAEvent (#9415)
Summary:
This PR adds a device_ member to CUDAEvent. This is necessary because if we create a cudaEvent on one device but destroy it from another, the destroy call also creates an additional context on that other device. The device information is therefore needed to guard the cudaEventDestroy call. (cc: ngimel, is this expected behavior? I can provide a simple .cu script to reproduce it.)
c10d tests are probably not in CI yet; please let me know how the tests are run so that I can double-check.
Thanks pietern apaszke for help debugging!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9415
Reviewed By: apaszke
Differential Revision: D8839688
Pulled By: ailzhang
fbshipit-source-id: b950ba37d57b9e3c5fe71726ec92f6a9601c4d0e
diff --git a/torch/lib/c10d/CUDAUtils.cpp b/torch/lib/c10d/CUDAUtils.cpp
index 7ef2617..06d26c6 100644
--- a/torch/lib/c10d/CUDAUtils.cpp
+++ b/torch/lib/c10d/CUDAUtils.cpp
@@ -5,13 +5,20 @@
namespace c10d {
CUDAEvent CUDAEvent::create(unsigned int flags) {
- CUDAEvent event;
+ int current_device;
+ C10D_CUDA_CHECK(cudaGetDevice(&current_device));
+ CUDAEvent event(nullptr, current_device);
+
C10D_CUDA_CHECK(cudaEventCreateWithFlags(&event.event_, flags));
return event;
}
CUDAEvent::~CUDAEvent() {
if (event_ != nullptr) {
+ // cudaEventDestroy must run on the same device as the event;
+ // otherwise it creates a context on the default device as well.
+ at::DeviceGuard guard(device_);
+
C10D_CUDA_CHECK(cudaEventDestroy(event_));
}
}
diff --git a/torch/lib/c10d/CUDAUtils.hpp b/torch/lib/c10d/CUDAUtils.hpp
index 7c43ef8..f57f692 100644
--- a/torch/lib/c10d/CUDAUtils.hpp
+++ b/torch/lib/c10d/CUDAUtils.hpp
@@ -12,9 +12,9 @@
// RAII wrapper for CUDA events.
class CUDAEvent {
public:
- CUDAEvent(cudaEvent_t event) : event_(event) {}
+ CUDAEvent(cudaEvent_t event, int device) : device_(device), event_(event) {}
- CUDAEvent() : CUDAEvent(nullptr) {}
+ CUDAEvent() : CUDAEvent(nullptr, 0) {}
~CUDAEvent();
@@ -27,11 +27,13 @@
// Must be move constructable.
CUDAEvent(CUDAEvent&& other) {
std::swap(event_, other.event_);
+ std::swap(device_, other.device_);
}
// Must be move assignable.
CUDAEvent& operator=(CUDAEvent&& other) {
std::swap(event_, other.event_);
+ std::swap(device_, other.device_);
return *this;
}
@@ -39,7 +41,12 @@
return event_;
}
+ int getDevice() const {
+ return device_;
+ }
+
protected:
+ int device_;
cudaEvent_t event_;
};