[Easy][c10d][DDP] (Reland) Minor fixes (#73569)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/73569

Reland https://github.com/pytorch/pytorch/pull/73299 and https://github.com/pytorch/pytorch/pull/73318 together.

Test Plan: Imported from OSS

Reviewed By: zhaojuanmao

Differential Revision: D34552418

Pulled By: awgu

fbshipit-source-id: 95088d2c1c67cd4fb9bbb115e15ba6b26ae06bdb
(cherry picked from commit 695ebc3dc0ccb08a167445588c293b3a6c3c00b7)
diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
index a2f03f8..118ee3e 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
@@ -69,11 +69,11 @@
     // Create output tensor data structure to pass into allgather.
     std::vector<std::vector<at::Tensor>> output_tensors;
     output_tensors.reserve(tensors_to_verify.size());
-    for (auto& tensor_shape : tensors_to_verify) {
+    for (const auto& tensor_shape : tensors_to_verify) {
       std::vector<at::Tensor> outputs;
       outputs.reserve(pg->getSize());
       for (const auto i : c10::irange(pg->getSize())) {
-        (void)i; //Suppress unused variable warning
+        (void)i;  // Suppress unused variable warning
         outputs.emplace_back(at::zeros_like(tensor_shape));
       }
       output_tensors.emplace_back(outputs);
@@ -143,12 +143,12 @@
     std::vector<std::string> dtype_strs;
     std::vector<std::string> device_type_strs;
     for (const auto& tensor_dtype : collective_fingerprint.tensor_dtypes_) {
-      dtype_strs.push_back(
+      dtype_strs.emplace_back(
           c10::toString(static_cast<at::ScalarType>(tensor_dtype)));
     }
     for (const auto& tensor_device_type :
          collective_fingerprint.tensor_device_types_) {
-      device_type_strs.push_back(
+      device_type_strs.emplace_back(
           c10::toString(static_cast<at::DeviceType>(tensor_device_type)));
     }
 
diff --git a/torch/csrc/distributed/c10d/Utils.cpp b/torch/csrc/distributed/c10d/Utils.cpp
index c8eb6e0..924d0a2 100644
--- a/torch/csrc/distributed/c10d/Utils.cpp
+++ b/torch/csrc/distributed/c10d/Utils.cpp
@@ -22,15 +22,10 @@
   std::vector<at::Tensor> shapeTensors;
   shapeTensors.reserve(tensors.size());
   for (const auto& tensor : tensors) {
-    auto shapesVec = tensor.sizes().vec();
-    int64_t shapes_size = shapesVec.size();
-    // Need to clone here otherwise the shapesVec.data() memory is not copied
-    // and can be released under the hood.
-    at::Tensor shapesTensor = at::from_blob(
-                                  shapesVec.data(),
-                                  {shapes_size},
-                                  at::TensorOptions().dtype(at::kLong))
-                                  .clone();
+    // Use `at::tensor()` to copy the sizes into a tensor that owns its
+    // storage; `sizes()` only returns a non-owning view of them.
+    at::Tensor shapesTensor =
+        at::tensor(tensor.sizes(), at::TensorOptions().dtype(at::kLong));
     shapeTensors.emplace_back(std::move(shapesTensor));
   }
   return shapeTensors;
diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp
index 4e01863..d55eebc 100644
--- a/torch/csrc/distributed/c10d/logger.cpp
+++ b/torch/csrc/distributed/c10d/logger.cpp
@@ -125,11 +125,11 @@
   return per_bucket_variable_indices;
 }
 
-std::vector<int> Logger::get_bucket_sizes() {
-  std::vector<int> bucket_sizes;
+std::vector<int64_t> Logger::get_bucket_sizes() {
+  std::vector<int64_t> bucket_sizes;
   for (const auto& bucket : reducer_->buckets_) {
     const auto& variables = bucket.variables;
-    int bucket_size = 0;
+    int64_t bucket_size = 0;
     for (const auto& v : variables) {
       bucket_size += v.numel() * v.element_size();
     }
diff --git a/torch/csrc/distributed/c10d/logger.hpp b/torch/csrc/distributed/c10d/logger.hpp
index 07174d4..62c678d 100644
--- a/torch/csrc/distributed/c10d/logger.hpp
+++ b/torch/csrc/distributed/c10d/logger.hpp
@@ -41,7 +41,7 @@
   // Set parameters stats.
   void set_parameter_stats();
   // Get size of each bucket (Bytes).
-  std::vector<int> get_bucket_sizes();
+  std::vector<int64_t> get_bucket_sizes();
   // Get bucket size limits specified during DDP construction.
   std::vector<int> get_bucket_size_limits();
   // Get variable indices for each bucket.
diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index 42df400..815e36c 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -1903,7 +1903,7 @@
 // composite key of a tensor's type identifier and its device.
 struct BucketKey {
   BucketKey(c10::ScalarType type, c10::Device device)
-      : type(std::move(type)), device(std::move(device)) {}
+      : type(type), device(device) {}
 
   const c10::ScalarType type;
   const c10::Device device;