[profiler] Fix description to use nelems rather than size (#114735)
The fields labeled "size" in the collective-communication profiler metadata were actually storing the tensor's number of elements (`numel()`), not a byte count, so the labels were misleading. Rename the fields, accessors, and macro parameters from `size` to `nelems` so the name matches what is stored.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114735
Approved by: https://github.com/aaronenyeshi, https://github.com/yoyoyocmu, https://github.com/kwen2501, https://github.com/fduwjj
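
Note that "nelems" is a count of tensor elements, not bytes; a consumer that wants byte sizes has to scale by the dtype width. A minimal sketch of that conversion (the helper name `messageBytes` is ours, not part of this PR):

```cpp
#include <c10/core/ScalarType.h>
#include <cstdint>

// Convert a recorded element count to bytes using the dtype width.
// c10::elementSize returns the per-element size in bytes (e.g. 4 for kFloat).
int64_t messageBytes(int64_t nelems, at::ScalarType dtype) {
  return nelems * static_cast<int64_t>(c10::elementSize(dtype));
}
```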
diff --git a/torch/csrc/distributed/c10d/ParamCommsUtils.cpp b/torch/csrc/distributed/c10d/ParamCommsUtils.cpp
index a0818a8..0b01bda 100644
--- a/torch/csrc/distributed/c10d/ParamCommsUtils.cpp
+++ b/torch/csrc/distributed/c10d/ParamCommsUtils.cpp
@@ -10,8 +10,8 @@
ParamCommsDebugInfo::ParamCommsDebugInfo(
int rank,
std::string&& colName,
- int inSize,
- int outSize,
+ int inNelems,
+ int outNelems,
at::ScalarType dType,
std::vector<int64_t> inSplitSizes,
std::vector<int64_t> outSplitSizes,
@@ -19,8 +19,8 @@
: rank_(rank),
worldSize_(worldSize),
columnName_(colName),
- inMessageSize_(inSize),
- outMessageSize_(outSize),
+ inMessageNelems_(inNelems),
+ outMessageNelems_(outNelems),
dType_(dType),
inputSplitSizes_(std::move(inSplitSizes)),
outputSplitSizes_(std::move(outSplitSizes)) {}
diff --git a/torch/csrc/distributed/c10d/ParamCommsUtils.hpp b/torch/csrc/distributed/c10d/ParamCommsUtils.hpp
index f6fd374..25a0b6c 100644
--- a/torch/csrc/distributed/c10d/ParamCommsUtils.hpp
+++ b/torch/csrc/distributed/c10d/ParamCommsUtils.hpp
@@ -15,8 +15,8 @@
ParamCommsDebugInfo(
int rank,
std::string&& colName,
- int inSize,
- int outSize,
+ int inNelems,
+ int outNelems,
at::ScalarType dType,
std::vector<int64_t> inSplitSizes,
std::vector<int64_t> outSplitSizes,
@@ -36,12 +36,12 @@
return columnName_;
}
- int getInMessageSize() const {
- return inMessageSize_;
+ int getInMessageNelems() const {
+ return inMessageNelems_;
}
- int getOutMessageSize() const {
- return outMessageSize_;
+ int getOutMessageNelems() const {
+ return outMessageNelems_;
}
at::ScalarType getDType() const {
@@ -60,8 +60,8 @@
int rank_{};
int worldSize_{};
std::string columnName_;
- int inMessageSize_{};
- int outMessageSize_{};
+ int inMessageNelems_{};
+ int outMessageNelems_{};
at::ScalarType dType_ = at::kByte;
std::vector<int64_t> inputSplitSizes_;
std::vector<int64_t> outputSplitSizes_;
@@ -72,8 +72,8 @@
pg_ptr, \
rank, \
colName, \
- inSize, \
- outSize, \
+ inNelems, \
+ outNelems, \
dType, \
inSplitSizes, \
outSplitSizes, \
@@ -81,8 +81,8 @@
auto paramCommsInfo = std::make_shared<torch::ParamCommsDebugInfo>( \
rank, \
colName, \
- inSize, \
- outSize, \
+ inNelems, \
+ outNelems, \
dType, \
inSplitSizes, \
outSplitSizes, \
@@ -106,8 +106,8 @@
OutputTensors, \
rank, \
colName, \
- inSize, \
- outSize, \
+ inNelems, \
+ outNelems, \
dType, \
inSplitSizes, \
outSplitSizes, \
@@ -115,8 +115,8 @@
auto paramCommsInfo = std::make_shared<torch::ParamCommsDebugInfo>( \
rank, \
colName, \
- inSize, \
- outSize, \
+ inNelems, \
+ outNelems, \
dType, \
inSplitSizes, \
outSplitSizes, \
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 44dc1d0..f26f21c 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -620,8 +620,8 @@
0, // process group ptr
rank_, // rank
"wait", // colName
- 0, // inSize
- 0, // outSize
+ 0, // inNelems
+ 0, // outNelems
at::kByte, // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -792,8 +792,8 @@
this->getID(),
rank, // rank
"init", // colName
- 0, // inSize
- 0, // outSize
+ 0, // inNelems
+ 0, // outNelems
at::kByte, // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -2591,8 +2591,8 @@
tensors, // outputTensors
rank_, // rank
"allreduce", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -2616,8 +2616,8 @@
tensors, // outputTensors
rank_, // rank
"allreduce_coalesced", // colName
- total_numel, // inSize
- total_numel, // outSize
+ total_numel, // inNelems
+ total_numel, // outNelems
tensors[0].scalar_type(), // dType
// I'm not sure what in,outSplitSizes mean here.
std::vector<int64_t>(), // inSplitSizes
@@ -2643,8 +2643,8 @@
tensors, // outputTensors
opts.rootRank, // root rank
"broadcast", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -2706,8 +2706,8 @@
outputTensors, // outputTensors
opts.rootRank, // root rank
"_broadcast_oop", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -2748,8 +2748,8 @@
tensors, // outputTensors
opts.rootRank, // root rank
"reduce", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -2812,8 +2812,8 @@
outputTensors, // outputTensors
opts.rootRank, // root rank
"_reduce_oop", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -2869,8 +2869,8 @@
outputTensors, // outputTensors
rank_, // rank
"all_gather", // colName
- tensor.numel(), // inSize
- tensor.numel() * // outSize
+ tensor.numel(), // inNelems
+ tensor.numel() * // outNelems
this->getSize(),
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
@@ -3013,8 +3013,8 @@
outputTensors, // outputTensors
rank_, // rank
"reduce_scatter", // colName
- tensor.numel() * this->getSize(), // inSize
- tensor.numel(), // outSize
+ tensor.numel() * this->getSize(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3132,8 +3132,8 @@
outputTensor, // outputTensor
rank_, // rank
"_reduce_scatter_base", // colName
- inputTensor.numel(), // inSize
- tensor.numel(), // outSize
+ inputTensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dtype
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3220,8 +3220,8 @@
this->getID(),
rank_, // rank
"barrier", // colName
- 0, // inSize
- 0, // outSize
+ 0, // inNelems
+ 0, // outNelems
at::kByte, // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3301,8 +3301,8 @@
outputTensor, // outputTensor
rank_, // rank
"all_to_all", // colName
- inputTensor.numel(), // inSize
- outputTensor.numel(), // outSize
+ inputTensor.numel(), // inNelems
+ outputTensor.numel(), // outNelems
inputTensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3343,8 +3343,8 @@
outputTensor, // outputTensor
rank_, // rank
"all_to_allv", // colName
- inputTensor.numel(), // inSize
- outputTensor.numel(), // outSize
+ inputTensor.numel(), // inNelems
+ outputTensor.numel(), // outNelems
inputTensor.scalar_type(), // dType
inputSplitSizes, // inSplitSizes
outputSplitSizes, // outSplitSizes
@@ -3419,8 +3419,8 @@
outputTensors, // outputTensors
rank_, // rank
"all_to_all", // colName
- total_numel, // inSize
- total_numel, // outSize
+ total_numel, // inNelems
+ total_numel, // outNelems
inputTensors.front().scalar_type(), // dType
inSplitSizes, // inSplitSizes
outSplitSizes, // outSplitSizes
@@ -3470,8 +3470,8 @@
tensors, // outputTensors
dstRank, // dst rank
"send", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3508,8 +3508,8 @@
tensors, // outputTensors
srcRank, // src rank
"recv", // colName
- tensor.numel(), // inSize
- tensor.numel(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3668,8 +3668,8 @@
outputTensors, // outputTensors
opts.rootRank, // root rank
"gather", // colName
- tensor.numel(), // inSize
- tensor.numel() * this->getSize(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel() * this->getSize(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3755,8 +3755,8 @@
outputTensors, // outputTensors
opts.rootRank, // root rank
"scatter", // colName
- tensor.numel(), // inSize
- tensor.numel() * this->getSize(), // outSize
+ tensor.numel(), // inNelems
+ tensor.numel() * this->getSize(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
@@ -3826,8 +3826,8 @@
output_tensor, // outputTensors
rank_, // rank
"_allgather_base", // colName
- input_tensor.numel(), // inSize
- tensor.numel(), // outSize
+ input_tensor.numel(), // inNelems
+ tensor.numel(), // outNelems
tensor.scalar_type(), // dType
std::vector<int64_t>(), // inSplitSizes
std::vector<int64_t>(), // outSplitSizes
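
The NCCL call sites above follow a consistent accounting: symmetric collectives (allreduce, broadcast, send, recv) report `tensor.numel()` on both sides, while gather-style collectives scale one side by the group size. A standalone sketch checking that arithmetic (helper names are hypothetical, not from this PR):

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical helpers mirroring the nelems accounting in the call sites above.
int64_t allGatherOutNelems(int64_t perRankNelems, int worldSize) {
  // all_gather: every rank contributes perRankNelems elements to the output.
  return perRankNelems * worldSize;
}

int64_t reduceScatterInNelems(int64_t perRankNelems, int worldSize) {
  // reduce_scatter: the input holds one perRankNelems-sized shard per rank.
  return perRankNelems * worldSize;
}

int main() {
  // A 1024-element tensor on a world of 8 ranks.
  assert(allGatherOutNelems(1024, 8) == 8192);
  assert(reduceScatterInNelems(1024, 8) == 8192);
  return 0;
}
```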
diff --git a/torch/csrc/distributed/c10d/UCCTracing.cpp b/torch/csrc/distributed/c10d/UCCTracing.cpp
index db7d687..d75bb8c 100644
--- a/torch/csrc/distributed/c10d/UCCTracing.cpp
+++ b/torch/csrc/distributed/c10d/UCCTracing.cpp
@@ -85,8 +85,8 @@
const int world_size,
const std::vector<at::Tensor>& inputTensors,
const std::vector<at::Tensor>& outputTensors) {
- auto inSize = (!inputTensors.empty()) ? inputTensors[0].numel() : 0;
- auto outSize = (!outputTensors.empty()) ? outputTensors[0].numel() : 0;
+ auto inNelems = (!inputTensors.empty()) ? inputTensors[0].numel() : 0;
+ auto outNelems = (!outputTensors.empty()) ? outputTensors[0].numel() : 0;
auto dtype =
(!outputTensors.empty()) ? outputTensors[0].scalar_type() : at::kByte;
auto devType = (!outputTensors.empty()) ? outputTensors[0].device().type()
@@ -116,14 +116,14 @@
",\n\t\t\"world_size\": ",
world_size);
- if (inSize > 0 || outSize > 0) {
+ if (inNelems > 0 || outNelems > 0) {
// for most collectives - append msg sizes, data type, device type
cur_trace_ = c10::str(
cur_trace_,
",\n\t\t\"in_msg_size\": ",
- inSize,
+ inNelems,
",\n\t\t\"out_msg_size\": ",
- outSize,
+ outNelems,
",\n\t\t\"dtype\": \"",
at::toString(dtype),
"\",\n\t\t\"devType\": \"",
@@ -153,8 +153,8 @@
0, // process group ptr
rank,
commName.c_str(),
- inSize,
- outSize,
+ inNelems,
+ outNelems,
dtype,
curInSplitSizes_,
curOutSplitSizes_,
diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp
index 75500c55..2183a32 100644
--- a/torch/csrc/profiler/util.cpp
+++ b/torch/csrc/profiler/util.cpp
@@ -340,8 +340,8 @@
#ifdef USE_C10D
static constexpr auto kCommuName = "Collective name";
static constexpr auto kDtype = "dtype";
-static constexpr auto kInMsgSize = "In msg size";
-static constexpr auto kOutMsgSize = "Out msg size";
+static constexpr auto kInMsgNelems = "In msg nelems";
+static constexpr auto kOutMsgNelems = "Out msg nelems";
static constexpr auto kInSplit = "In split size";
static constexpr auto kOutSplit = "Out split size";
static constexpr auto kGroupSize = "Group size";
@@ -365,8 +365,8 @@
map.emplace(kCommuName, fmt::format("\"{}\"", debugInfo->getColumnName()));
map.emplace(
kDtype, fmt::format("\"{}\"", c10::toString(debugInfo->getDType())));
- map.emplace(kInMsgSize, std::to_string(debugInfo->getInMessageSize()));
- map.emplace(kOutMsgSize, std::to_string(debugInfo->getOutMessageSize()));
+ map.emplace(kInMsgNelems, std::to_string(debugInfo->getInMessageNelems()));
+ map.emplace(kOutMsgNelems, std::to_string(debugInfo->getOutMessageNelems()));
auto& inSplitSizes = debugInfo->getInputSplitSizes();
if (!inSplitSizes.empty() && inSplitSizes.size() <= kTruncatLength) {
map.emplace(
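
With the renamed labels in util.cpp, a collective's profiler metadata now reports element counts under "In msg nelems"/"Out msg nelems". For illustration only (values are hypothetical, not captured from a real trace), the entries for an all_gather of a 1024-element float tensor across 8 ranks would look roughly like this:

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
  // Hypothetical metadata map for an all_gather, following the labels above;
  // "Out msg nelems" is tensor.numel() * world_size per the NCCL call site.
  std::map<std::string, std::string> meta{
      {"Collective name", "\"all_gather\""},
      {"dtype", "\"Float\""},
      {"In msg nelems", "1024"},
      {"Out msg nelems", "8192"},
      {"Group size", "8"},
  };
  for (const auto& [key, value] : meta) {
    std::cout << key << ": " << value << '\n';
  }
  return 0;
}
```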