add device to CUDAEvent (#9415)

Summary:
This PR adds a device_ member to CUDAEvent. This is necessary because if we create a cudaEvent on one device but destroy it while a different device is current, the destroy call also creates an additional context on that other device. The device information is therefore needed to guard the call to cudaEventDestroy. (cc: ngimel, is this expected behavior? I can provide a simple .cu script to reproduce it.)

The c10d tests are probably not in CI yet; please let me know how the tests are run so that I can double-check.

Thanks to pietern and apaszke for help debugging!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9415

Reviewed By: apaszke

Differential Revision: D8839688

Pulled By: ailzhang

fbshipit-source-id: b950ba37d57b9e3c5fe71726ec92f6a9601c4d0e
diff --git a/torch/lib/c10d/CUDAUtils.cpp b/torch/lib/c10d/CUDAUtils.cpp
index 7ef2617..06d26c6 100644
--- a/torch/lib/c10d/CUDAUtils.cpp
+++ b/torch/lib/c10d/CUDAUtils.cpp
@@ -5,13 +5,20 @@
 namespace c10d {
 
 CUDAEvent CUDAEvent::create(unsigned int flags) {
-  CUDAEvent event;
+  int current_device;
+  C10D_CUDA_CHECK(cudaGetDevice(&current_device));
+  CUDAEvent event(nullptr, current_device);
+
   C10D_CUDA_CHECK(cudaEventCreateWithFlags(&event.event_, flags));
   return event;
 }
 
 CUDAEvent::~CUDAEvent() {
   if (event_ != nullptr) {
+    // cudaEventDestroy must run on the same device of the event,
+    // otherwise it creates a context on default device as well.
+    at::DeviceGuard guard(device_);
+
     C10D_CUDA_CHECK(cudaEventDestroy(event_));
   }
 }
diff --git a/torch/lib/c10d/CUDAUtils.hpp b/torch/lib/c10d/CUDAUtils.hpp
index 7c43ef8..f57f692 100644
--- a/torch/lib/c10d/CUDAUtils.hpp
+++ b/torch/lib/c10d/CUDAUtils.hpp
@@ -12,9 +12,9 @@
 // RAII wrapper for CUDA events.
 class CUDAEvent {
  public:
-  CUDAEvent(cudaEvent_t event) : event_(event) {}
+  CUDAEvent(cudaEvent_t event, int device) : device_(device), event_(event) {}
 
-  CUDAEvent() : CUDAEvent(nullptr) {}
+  CUDAEvent() : CUDAEvent(nullptr, 0) {}
 
   ~CUDAEvent();
 
@@ -27,11 +27,13 @@
   // Must be move constructable.
   CUDAEvent(CUDAEvent&& other) {
     std::swap(event_, other.event_);
+    std::swap(device_, other.device_);
   }
 
   // Must be move assignable.
   CUDAEvent& operator=(CUDAEvent&& other) {
     std::swap(event_, other.event_);
+    std::swap(device_, other.device_);
     return *this;
   }
 
@@ -39,7 +41,12 @@
     return event_;
   }
 
+  int getDevice() const {
+    return device_;
+  }
+
  protected:
+  int device_;
   cudaEvent_t event_;
 };