[Distributed] [1/N] Fix clang-tidy warnings in torch/csrc/distributed/c10d (#122884)

This PR fixes assorted clang-tidy warnings in torch/csrc/distributed/c10d. The changes fall into a few recurring patterns:

* Drop unused #includes.
* Pass non-trivial parameters (std::string, at::Tensor, c10::intrusive_ptr, std::vector) by const reference instead of by value, or take a std::string_view; std::move values that are stored into members.
* reserve() vectors before push_back loops of known size.
* Initialize members and locals that were previously left indeterminate (e.g. backendType, dist_debug_level_, the IntraNodeComm pointer members, sockaddr_storage).
* Use C++17 nested namespace definitions (namespace c10d::detail { ... }).
* Prefer '\n' over std::endl, !empty() over size() in boolean checks, std::is_trivially_copyable_v over ::value, and an explicit static_cast where a size_t is returned as int64_t.
* Mark the IntraNodeComm destructor override and add NOLINTNEXTLINE suppressions where the flagged pattern is intentional.

Two illustrative sketches of the most common patterns follow this summary; they use hypothetical names, not code from this PR.
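A minimal sketch of the by-value/reserve pattern (a hypothetical prefixKeys helper, loosely modeled on PrefixStore::multiGet, not code from this PR):

    #include <string>
    #include <vector>

    // Before: the vector parameter was copied on every call, and the output
    // vector grew through repeated reallocations.
    //   std::vector<std::string> prefixKeys(std::vector<std::string> keys);

    // After: read-only parameters are taken by const reference, and the
    // output is reserved up front since its final size is known.
    std::vector<std::string> prefixKeys(const std::vector<std::string>& keys) {
      std::vector<std::string> out;
      out.reserve(keys.size()); // one allocation instead of several
      for (const auto& key : keys) {
        out.push_back("prefix/" + key);
      }
      return out;
    }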

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122884
Approved by: https://github.com/kwen2501
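
A second sketch covers the constructor change in ProcessGroup.cpp: accept the smart pointer by value, move it into the member, and then read the member rather than the moved-from parameter. Names and types here are illustrative only (std::shared_ptr stands in for c10::intrusive_ptr):

    #include <memory>
    #include <string>
    #include <string_view>
    #include <utility>

    enum class BackendType { UNDEFINED, GLOO, NCCL };

    struct Options {
      std::string backend;
    };

    BackendType parseBackend(std::string_view b) {
      if (b == "gloo") return BackendType::GLOO;
      if (b == "nccl") return BackendType::NCCL;
      return BackendType::UNDEFINED;
    }

    class Group {
     public:
      explicit Group(std::shared_ptr<Options> options)
          // options_ is declared before backendType_, so it is initialized
          // first and is safe to dereference in the next initializer.
          : options_(std::move(options)),
            backendType_(parseBackend(options_->backend)) {}

     private:
      std::shared_ptr<Options> options_;
      BackendType backendType_{BackendType::UNDEFINED}; // never indeterminate
    };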
diff --git a/torch/csrc/distributed/c10d/FileStore.cpp b/torch/csrc/distributed/c10d/FileStore.cpp
index a47deca..98b9029 100644
--- a/torch/csrc/distributed/c10d/FileStore.cpp
+++ b/torch/csrc/distributed/c10d/FileStore.cpp
@@ -17,11 +17,6 @@
 
 #include <chrono>
 #include <cstdio>
-#include <functional>
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <system_error>
 #include <thread>
 #include <utility>
 
@@ -260,7 +255,7 @@
     File& file,
     off_t pos,
     std::unordered_map<std::string, std::vector<uint8_t>>& cache,
-    const std::string deletePrefix) {
+    const std::string& deletePrefix) {
   auto size = file.size();
   if (size != pos) {
     std::string tmpKey;
@@ -429,7 +424,7 @@
   File file(path_, O_RDONLY, timeout_);
   auto lock = file.lockShared();
   pos_ = refresh(file, pos_, cache_, deletePrefix_);
-  return cache_.size();
+  return static_cast<int64_t>(cache_.size());
 }
 
 bool FileStore::deleteKey(const std::string& key) {
diff --git a/torch/csrc/distributed/c10d/PrefixStore.cpp b/torch/csrc/distributed/c10d/PrefixStore.cpp
index 54f6ac9..bd69c9a 100644
--- a/torch/csrc/distributed/c10d/PrefixStore.cpp
+++ b/torch/csrc/distributed/c10d/PrefixStore.cpp
@@ -83,6 +83,7 @@
 std::vector<std::vector<uint8_t>> PrefixStore::multiGet(
     const std::vector<std::string>& keys) {
   std::vector<std::string> prefixed_keys;
+  prefixed_keys.reserve(keys.size());
   for (auto& key : keys) {
     prefixed_keys.push_back(joinKey(key));
   }
@@ -93,6 +94,7 @@
     const std::vector<std::string>& keys,
     const std::vector<std::vector<uint8_t>>& values) {
   std::vector<std::string> prefixed_keys;
+  prefixed_keys.reserve(keys.size());
   for (auto& key : keys) {
     prefixed_keys.push_back(joinKey(key));
   }
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp
index 248a2c9..ec80ed9 100644
--- a/torch/csrc/distributed/c10d/ProcessGroup.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp
@@ -3,6 +3,7 @@
 
 #include <c10/util/Logging.h>
 #include <fmt/format.h>
+#include <string_view>
 
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
@@ -13,7 +14,7 @@
 
 namespace c10d {
 
-static ProcessGroup::BackendType strToBackendType(std::string backend) {
+static ProcessGroup::BackendType strToBackendType(std::string_view backend) {
   if (backend == "undefined") {
     return ProcessGroup::BackendType::UNDEFINED;
   } else if (backend == "gloo") {
@@ -111,7 +112,7 @@
   }
 
   // Get the backend type associated with the device
-  ProcessGroup::BackendType backendType;
+  ProcessGroup::BackendType backendType{ProcessGroup::BackendType::UNDEFINED};
   try {
     backendType = deviceTypeToBackendType_.at(deviceType);
   } catch (const std::out_of_range& e) {
@@ -142,8 +143,8 @@
     : store_(store),
       rank_(rank),
       size_(size),
-      options_(options),
-      backendType_(strToBackendType(options->backend)),
+      options_(std::move(options)),
+      backendType_(strToBackendType(options_->backend)),
       dist_debug_level_(debug_level()) {
   C10_LOG_API_USAGE_ONCE("c10d.process_group");
 }
@@ -159,7 +160,7 @@
 }
 
 const std::string& ProcessGroup::getGroupName() const {
-  TORCH_CHECK(deviceTypeToBackend_.size(), "ProcessGroup name not set");
+  TORCH_CHECK(!deviceTypeToBackend_.empty(), "ProcessGroup name not set");
   return deviceTypeToBackend_.begin()->second->getGroupName();
 }
 
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp
index a21b98c..5aba2c4 100644
--- a/torch/csrc/distributed/c10d/ProcessGroup.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -727,7 +727,7 @@
 
   // Debug level setting. It is parsed once when ProcessGroup is constructed and
   // remains the same across use of this process group.
-  DebugLevel dist_debug_level_;
+  DebugLevel dist_debug_level_{DebugLevel::Off};
 
   // Backend classes for this ProcessGroup
   std::unordered_set<c10::DeviceType> deviceTypes_;
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
index d3c9a61..50e1b95 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -7,8 +7,6 @@
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
 #include <chrono>
 #include <exception>
-#include <ratio>
-#include <tuple>
 
 #ifdef _WIN32
 #include <gloo/common/win.h>
diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
index 45f5135..a6086d2 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
@@ -89,15 +89,15 @@
   // Takes a serialized fingerprint from
   // CollectiveFingerPrint::serialize_fingerprint and deserializes it back to a
   // CollectiveFingerPrint struct
-  CollectiveFingerPrint deserialize_fingerprint(at::Tensor serialized_tensor) {
-    OpType optype;
+  CollectiveFingerPrint deserialize_fingerprint(
+      const at::Tensor& serialized_tensor) {
     auto dtypes = std::vector<int8_t>();
     auto device_types = std::vector<int8_t>();
     auto sizes = std::vector<std::vector<int64_t>>();
     int index = 0;
-    int seq = 0;
+    int64_t seq = 0;
     // 1. OpType
-    optype = OpType(serialized_tensor[index].item<int>());
+    auto optype = OpType(serialized_tensor[index].item<int>());
     index++;
     int num_tensors = 0;
     if (index < serialized_tensor.size(0)) {
@@ -383,7 +383,7 @@
 } // namespace
 
 ProcessGroupWrapper::ProcessGroupWrapper(
-    c10::intrusive_ptr<Backend> backend,
+    const c10::intrusive_ptr<Backend>& backend,
     c10::intrusive_ptr<Backend> glooBackend)
     : Backend(backend->getRank(), backend->getSize()),
       backend_(backend),
diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.hpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.hpp
index 13503ca..50c0bfc 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.hpp
@@ -11,7 +11,7 @@
 class TORCH_API ProcessGroupWrapper : public Backend {
  public:
   explicit ProcessGroupWrapper(
-      c10::intrusive_ptr<Backend> backend,
+      const c10::intrusive_ptr<Backend>& backend,
       c10::intrusive_ptr<Backend> glooBackend);
 
   const std::string getBackendName() const override;
diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/intra_node_comm.cpp
index c4120e3..2579b38 100644
--- a/torch/csrc/distributed/c10d/intra_node_comm.cpp
+++ b/torch/csrc/distributed/c10d/intra_node_comm.cpp
@@ -6,7 +6,7 @@
 #include <torch/csrc/distributed/c10d/Utils.hpp>
 
 #include <iostream>
-#include <random>
+#include <utility>
 
 #include <fcntl.h>
 #include <pthread.h>
@@ -22,8 +22,7 @@
 
 #include <cuda_runtime.h>
 
-namespace c10d {
-namespace intra_node_comm {
+namespace c10d::intra_node_comm {
 
 static std::vector<std::string> ENABLE_INTRA_NODE_COMM = {
     "ENABLE_INTRA_NODE_COMM"};
@@ -57,7 +56,7 @@
     for (size_t j = 0; j < kMaxDevices; ++j) {
       oss << nvlMesh[i][j] << " ";
     }
-    oss << std::endl;
+    oss << '\n';
   }
   os << oss.str();
   return os;
@@ -77,7 +76,7 @@
 /**
  * Query the nvlink connection among devices.
  */
-static NvlMesh getNvlMesh(std::vector<std::string> rankToBusId) {
+static NvlMesh getNvlMesh(const std::vector<std::string>& rankToBusId) {
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
   using namespace c10::cuda;
 
@@ -88,12 +87,12 @@
   }
 
   const auto worldSize = rankToBusId.size();
-  std::vector<nvmlDevice_t> devices(worldSize, 0);
+  std::vector<nvmlDevice_t> devices(worldSize, nullptr);
   std::unordered_map<std::string, size_t> busIdToRank;
   std::vector<size_t> switchLinkCount(worldSize, 0);
 
   for (size_t r = 0; r < worldSize; ++r) {
-    busIdToRank.emplace(std::make_pair(rankToBusId[r], r));
+    busIdToRank.emplace(rankToBusId[r], r);
     TORCH_CHECK(
         driverApi->nvmlDeviceGetHandleByPciBusId_v2_(
             rankToBusId[r].c_str(), &devices[r]) == NVML_SUCCESS);
@@ -209,7 +208,7 @@
     size_t rank,
     size_t worldSize,
     c10::optional<size_t> bufferSize)
-    : store_(store),
+    : store_(std::move(store)),
       rank_(rank),
       worldSize_(worldSize),
       bufferSize_(bufferSize.has_value() ? *bufferSize : kDefaultBufferSize) {
@@ -248,12 +247,12 @@
  */
 template <typename T>
 std::vector<T> storeAllGather(
-    c10::intrusive_ptr<c10d::Store> store,
+    const c10::intrusive_ptr<c10d::Store>& store,
     const std::string& prefix,
     size_t rank,
     size_t worldSize,
     T val) {
-  static_assert(std::is_trivially_copyable<T>::value);
+  static_assert(std::is_trivially_copyable_v<T>);
 
   std::vector<std::string> peerKeys;
   for (size_t r = 0; r < worldSize; ++r) {
@@ -278,7 +277,7 @@
     store->wait({peerKeys[r]});
     auto payload = store->get(peerKeys[r]);
     TORCH_CHECK(payload.size() == sizeof(T));
-    T peerVal;
+    T peerVal{};
     std::memcpy(&peerVal, payload.data(), sizeof(T));
     peerVals.push_back(peerVal);
   }
@@ -429,5 +428,4 @@
   return false;
 }
 
-} // namespace intra_node_comm
-} // namespace c10d
+} // namespace c10d::intra_node_comm
diff --git a/torch/csrc/distributed/c10d/intra_node_comm.hpp b/torch/csrc/distributed/c10d/intra_node_comm.hpp
index 7addb49..5c5bcdc 100644
--- a/torch/csrc/distributed/c10d/intra_node_comm.hpp
+++ b/torch/csrc/distributed/c10d/intra_node_comm.hpp
@@ -6,8 +6,7 @@
 #include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
 
-namespace c10d {
-namespace intra_node_comm {
+namespace c10d::intra_node_comm {
 
 constexpr size_t kMaxDevices = 8;
 constexpr size_t kDefaultBufferSize = 10 * 1024 * 1024;
@@ -27,7 +26,7 @@
       size_t worldSize,
       c10::optional<size_t> bufferSize = c10::nullopt);
 
-  ~IntraNodeComm();
+  ~IntraNodeComm() override;
 
   static bool isEnabled();
 
@@ -69,11 +68,11 @@
    */
   bool isInitialized_ = false;
   Topology topology_ = Topology::UNKNOWN;
-  std::array<void*, kMaxDevices> p2pStates_;
-  std::array<void*, kMaxDevices> buffers_;
-  void* p2pStatesDev_;
-  void* buffersDev_;
-  void* topoInfo_;
+  std::array<void*, kMaxDevices> p2pStates_{};
+  std::array<void*, kMaxDevices> buffers_{};
+  void* p2pStatesDev_{};
+  void* buffersDev_{};
+  void* topoInfo_{};
 };
 
 /**
@@ -115,5 +114,4 @@
 
 TORCH_API int64_t getIntraNodeCommUsageCounter();
 
-} // namespace intra_node_comm
-} // namespace c10d
+} // namespace c10d::intra_node_comm
diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp
index 5013f25..093a47a 100644
--- a/torch/csrc/distributed/c10d/socket.cpp
+++ b/torch/csrc/distributed/c10d/socket.cpp
@@ -8,7 +8,6 @@
 
 #include <cstring>
 #include <system_error>
-#include <thread>
 #include <utility>
 #include <vector>
 
@@ -38,8 +37,7 @@
 
 #include <c10/util/CallOnce.h>
 
-namespace c10d {
-namespace detail {
+namespace c10d::detail {
 namespace {
 #ifdef _WIN32
 
@@ -183,8 +181,7 @@
 
   Handle hnd_;
 };
-} // namespace detail
-} // namespace c10d
+} // namespace c10d::detail
 
 //
 // libfmt formatters for `addrinfo` and `Socket`
@@ -251,8 +248,7 @@
 
 } // namespace fmt
 
-namespace c10d {
-namespace detail {
+namespace c10d::detail {
 
 SocketImpl::~SocketImpl() {
 #ifdef _WIN32
@@ -458,6 +454,7 @@
   bool tryListen(const ::addrinfo& addr);
 
   template <typename... Args>
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
   void recordError(fmt::string_view format, Args&&... args) {
     auto msg = fmt::vformat(format, fmt::make_format_args(args...));
 
@@ -614,7 +611,9 @@
   std::unique_ptr<SocketImpl> run() const;
 
  private:
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const int fd_;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const std::uint16_t expected_port_;
 };
 
@@ -624,7 +623,7 @@
 std::unique_ptr<SocketImpl> SocketListenFromFdOp::run() const {
   C10D_DEBUG("listenFromFd: fd {}, expected port {}", fd_, expected_port_);
 
-  ::sockaddr_storage addr_storage;
+  ::sockaddr_storage addr_storage{};
   ::socklen_t addr_len = sizeof(addr_storage);
   if (::getsockname(
           fd_, reinterpret_cast<::sockaddr*>(&addr_storage), &addr_len) < 0) {
@@ -691,6 +690,7 @@
   [[noreturn]] void throwTimeoutError() const;
 
   template <typename... Args>
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
   void recordError(fmt::string_view format, Args&&... args) {
     auto msg = fmt::vformat(format, fmt::make_format_args(args...));
 
@@ -1033,6 +1033,4 @@
   return impl_->waitForInput(timeout);
 }
 
-} // namespace detail
-
-} // namespace c10d
+} // namespace c10d::detail