[profiler] Fix description to use nelems rather than size (#114735)

The fields were labeled as message "size" (which implies bytes) even though they actually store the number of tensor elements. Rename the fields, accessors, and macro parameters to "nelems" so the names match the values being stored.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114735
Approved by: https://github.com/aaronenyeshi, https://github.com/yoyoyocmu, https://github.com/kwen2501, https://github.com/fduwjj
diff --git a/torch/csrc/distributed/c10d/ParamCommsUtils.cpp b/torch/csrc/distributed/c10d/ParamCommsUtils.cpp
index a0818a8..0b01bda 100644
--- a/torch/csrc/distributed/c10d/ParamCommsUtils.cpp
+++ b/torch/csrc/distributed/c10d/ParamCommsUtils.cpp
@@ -10,8 +10,8 @@
 ParamCommsDebugInfo::ParamCommsDebugInfo(
     int rank,
     std::string&& colName,
-    int inSize,
-    int outSize,
+    int inNelems,
+    int outNelems,
     at::ScalarType dType,
     std::vector<int64_t> inSplitSizes,
     std::vector<int64_t> outSplitSizes,
@@ -19,8 +19,8 @@
     : rank_(rank),
       worldSize_(worldSize),
       columnName_(colName),
-      inMessageSize_(inSize),
-      outMessageSize_(outSize),
+      inMessageNelems_(inNelems),
+      outMessageNelems_(outNelems),
       dType_(dType),
       inputSplitSizes_(std::move(inSplitSizes)),
       outputSplitSizes_(std::move(outSplitSizes)) {}
diff --git a/torch/csrc/distributed/c10d/ParamCommsUtils.hpp b/torch/csrc/distributed/c10d/ParamCommsUtils.hpp
index f6fd374..25a0b6c 100644
--- a/torch/csrc/distributed/c10d/ParamCommsUtils.hpp
+++ b/torch/csrc/distributed/c10d/ParamCommsUtils.hpp
@@ -15,8 +15,8 @@
   ParamCommsDebugInfo(
       int rank,
       std::string&& colName,
-      int inSize,
-      int outSize,
+      int inNelems,
+      int outNelems,
       at::ScalarType dType,
       std::vector<int64_t> inSplitSizes,
       std::vector<int64_t> outSplitSizes,
@@ -36,12 +36,12 @@
     return columnName_;
   }
 
-  int getInMessageSize() const {
-    return inMessageSize_;
+  int getInMessageNelems() const {
+    return inMessageNelems_;
   }
 
-  int getOutMessageSize() const {
-    return outMessageSize_;
+  int getOutMessageNelems() const {
+    return outMessageNelems_;
   }
 
   at::ScalarType getDType() const {
@@ -60,8 +60,8 @@
   int rank_{};
   int worldSize_{};
   std::string columnName_;
-  int inMessageSize_{};
-  int outMessageSize_{};
+  int inMessageNelems_{};
+  int outMessageNelems_{};
   at::ScalarType dType_ = at::kByte;
   std::vector<int64_t> inputSplitSizes_;
   std::vector<int64_t> outputSplitSizes_;
@@ -72,8 +72,8 @@
     pg_ptr,                                                                    \
     rank,                                                                      \
     colName,                                                                   \
-    inSize,                                                                    \
-    outSize,                                                                   \
+    inNelems,                                                                  \
+    outNelems,                                                                 \
     dType,                                                                     \
     inSplitSizes,                                                              \
     outSplitSizes,                                                             \
@@ -81,8 +81,8 @@
   auto paramCommsInfo = std::make_shared<torch::ParamCommsDebugInfo>(          \
       rank,                                                                    \
       colName,                                                                 \
-      inSize,                                                                  \
-      outSize,                                                                 \
+      inNelems,                                                                \
+      outNelems,                                                               \
       dType,                                                                   \
       inSplitSizes,                                                            \
       outSplitSizes,                                                           \
@@ -106,8 +106,8 @@
     OutputTensors,                                                             \
     rank,                                                                      \
     colName,                                                                   \
-    inSize,                                                                    \
-    outSize,                                                                   \
+    inNelems,                                                                  \
+    outNelems,                                                                 \
     dType,                                                                     \
     inSplitSizes,                                                              \
     outSplitSizes,                                                             \
@@ -115,8 +115,8 @@
   auto paramCommsInfo = std::make_shared<torch::ParamCommsDebugInfo>(          \
       rank,                                                                    \
       colName,                                                                 \
-      inSize,                                                                  \
-      outSize,                                                                 \
+      inNelems,                                                                \
+      outNelems,                                                               \
       dType,                                                                   \
       inSplitSizes,                                                            \
       outSplitSizes,                                                           \
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 44dc1d0..f26f21c 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -620,8 +620,8 @@
       0, // process group ptr
       rank_, // rank
       "wait", // colName
-      0, // inSize
-      0, // outSize
+      0, // inNelems
+      0, // outNelems
       at::kByte, // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -792,8 +792,8 @@
       this->getID(),
       rank, // rank
       "init", // colName
-      0, // inSize
-      0, // outSize
+      0, // inNelems
+      0, // outNelems
       at::kByte, // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -2591,8 +2591,8 @@
       tensors, // outputTensors
       rank_, // rank
       "allreduce", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -2616,8 +2616,8 @@
       tensors, // outputTensors
       rank_, // rank
       "allreduce_coalesced", // colName
-      total_numel, // inSize
-      total_numel, // outSize
+      total_numel, // inNelems
+      total_numel, // outNelems
       tensors[0].scalar_type(), // dType
       // I'm not sure what in,outSplitSizes mean here.
       std::vector<int64_t>(), // inSplitSizes
@@ -2643,8 +2643,8 @@
       tensors, // outputTensors
       opts.rootRank, // root rank
       "broadcast", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -2706,8 +2706,8 @@
       outputTensors, // outputTensors
       opts.rootRank, // root rank
       "_broadcast_oop", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -2748,8 +2748,8 @@
       tensors, // outputTensors
       opts.rootRank, // root rank
       "reduce", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -2812,8 +2812,8 @@
       outputTensors, // outputTensors
       opts.rootRank, // root rank
       "_reduce_oop", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -2869,8 +2869,8 @@
         outputTensors, // outputTensors
         rank_, // rank
         "all_gather", // colName
-        tensor.numel(), // inSize
-        tensor.numel() * // outSize
+        tensor.numel(), // inNelems
+        tensor.numel() * // outNelems
             this->getSize(),
         tensor.scalar_type(), // dType
         std::vector<int64_t>(), // inSplitSizes
@@ -3013,8 +3013,8 @@
         outputTensors, // outputTensors
         rank_, // rank
         "reduce_scatter", // colName
-        tensor.numel() * this->getSize(), // inSize
-        tensor.numel(), // outSize
+        tensor.numel() * this->getSize(), // inNelems
+        tensor.numel(), // outNelems
         tensor.scalar_type(), // dType
         std::vector<int64_t>(), // inSplitSizes
         std::vector<int64_t>(), // outSplitSizes
@@ -3132,8 +3132,8 @@
       outputTensor, // outputTensor
       rank_, // rank
       "_reduce_scatter_base", // colName
-      inputTensor.numel(), // inSize
-      tensor.numel(), // outSize
+      inputTensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dtype
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -3220,8 +3220,8 @@
       this->getID(),
       rank_, // rank
       "barrier", // colName
-      0, // inSize
-      0, // outSize
+      0, // inNelems
+      0, // outNelems
       at::kByte, // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -3301,8 +3301,8 @@
         outputTensor, // outputTensor
         rank_, // rank
         "all_to_all", // colName
-        inputTensor.numel(), // inSize
-        outputTensor.numel(), // outSize
+        inputTensor.numel(), // inNelems
+        outputTensor.numel(), // outNelems
         inputTensor.scalar_type(), // dType
         std::vector<int64_t>(), // inSplitSizes
         std::vector<int64_t>(), // outSplitSizes
@@ -3343,8 +3343,8 @@
         outputTensor, // outputTensor
         rank_, // rank
         "all_to_allv", // colName
-        inputTensor.numel(), // inSize
-        outputTensor.numel(), // outSize
+        inputTensor.numel(), // inNelems
+        outputTensor.numel(), // outNelems
         inputTensor.scalar_type(), // dType
         inputSplitSizes, // inSplitSizes
         outputSplitSizes, // outSplitSizes
@@ -3419,8 +3419,8 @@
       outputTensors, // outputTensors
       rank_, // rank
       "all_to_all", // colName
-      total_numel, // inSize
-      total_numel, // outSize
+      total_numel, // inNelems
+      total_numel, // outNelems
       inputTensors.front().scalar_type(), // dType
       inSplitSizes, // inSplitSizes
       outSplitSizes, // outSplitSizes
@@ -3470,8 +3470,8 @@
       tensors, // outputTensors
       dstRank, // dst rank
       "send", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -3508,8 +3508,8 @@
       tensors, // outputTensors
       srcRank, // src rank
       "recv", // colName
-      tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSizes
@@ -3668,8 +3668,8 @@
       outputTensors, // outputTensors
       opts.rootRank, // root rank
       "gather", // colName
-      tensor.numel(), // inSize
-      tensor.numel() * this->getSize(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel() * this->getSize(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSize
@@ -3755,8 +3755,8 @@
       outputTensors, // outputTensors
       opts.rootRank, // root rank
       "scatter", // colName
-      tensor.numel(), // inSize
-      tensor.numel() * this->getSize(), // outSize
+      tensor.numel(), // inNelems
+      tensor.numel() * this->getSize(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSize
@@ -3826,8 +3826,8 @@
       output_tensor, // outputTensors
       rank_, // rank
       "_allgather_base", // colName
-      input_tensor.numel(), // inSize
-      tensor.numel(), // outSize
+      input_tensor.numel(), // inNelems
+      tensor.numel(), // outNelems
       tensor.scalar_type(), // dType
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>(), // outSplitSize
diff --git a/torch/csrc/distributed/c10d/UCCTracing.cpp b/torch/csrc/distributed/c10d/UCCTracing.cpp
index db7d687..d75bb8c 100644
--- a/torch/csrc/distributed/c10d/UCCTracing.cpp
+++ b/torch/csrc/distributed/c10d/UCCTracing.cpp
@@ -85,8 +85,8 @@
     const int world_size,
     const std::vector<at::Tensor>& inputTensors,
     const std::vector<at::Tensor>& outputTensors) {
-  auto inSize = (!inputTensors.empty()) ? inputTensors[0].numel() : 0;
-  auto outSize = (!outputTensors.empty()) ? outputTensors[0].numel() : 0;
+  auto inNelems = (!inputTensors.empty()) ? inputTensors[0].numel() : 0;
+  auto outNelems = (!outputTensors.empty()) ? outputTensors[0].numel() : 0;
   auto dtype =
       (!outputTensors.empty()) ? outputTensors[0].scalar_type() : at::kByte;
   auto devType = (!outputTensors.empty()) ? outputTensors[0].device().type()
@@ -116,14 +116,14 @@
       ",\n\t\t\"world_size\": ",
       world_size);
 
-  if (inSize > 0 || outSize > 0) {
+  if (inNelems > 0 || outNelems > 0) {
     // for most collectives - append msg sizes, data type, device type
     cur_trace_ = c10::str(
         cur_trace_,
         ",\n\t\t\"in_msg_size\": ",
-        inSize,
+        inNelems,
         ",\n\t\t\"out_msg_size\": ",
-        outSize,
+        outNelems,
         ",\n\t\t\"dtype\": \"",
         at::toString(dtype),
         "\",\n\t\t\"devType\": \"",
@@ -153,8 +153,8 @@
       0, // process group ptr
       rank,
       commName.c_str(),
-      inSize,
-      outSize,
+      inNelems,
+      outNelems,
       dtype,
       curInSplitSizes_,
       curOutSplitSizes_,
diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp
index 75500c55..2183a32 100644
--- a/torch/csrc/profiler/util.cpp
+++ b/torch/csrc/profiler/util.cpp
@@ -340,8 +340,8 @@
 #ifdef USE_C10D
 static constexpr auto kCommuName = "Collective name";
 static constexpr auto kDtype = "dtype";
-static constexpr auto kInMsgSize = "In msg size";
-static constexpr auto kOutMsgSize = "Out msg size";
+static constexpr auto kInMsgNelems = "In msg nelems";
+static constexpr auto kOutMsgNelems = "Out msg nelems";
 static constexpr auto kInSplit = "In split size";
 static constexpr auto kOutSplit = "Out split size";
 static constexpr auto kGroupSize = "Group size";
@@ -365,8 +365,8 @@
   map.emplace(kCommuName, fmt::format("\"{}\"", debugInfo->getColumnName()));
   map.emplace(
       kDtype, fmt::format("\"{}\"", c10::toString(debugInfo->getDType())));
-  map.emplace(kInMsgSize, std::to_string(debugInfo->getInMessageSize()));
-  map.emplace(kOutMsgSize, std::to_string(debugInfo->getOutMessageSize()));
+  map.emplace(kInMsgNelems, std::to_string(debugInfo->getInMessageNelems()));
+  map.emplace(kOutMsgNelems, std::to_string(debugInfo->getOutMessageNelems()));
   auto& inSplitSizes = debugInfo->getInputSplitSizes();
   if (!inSplitSizes.empty() && inSplitSizes.size() <= kTruncatLength) {
     map.emplace(