make torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp data_ptr-correct (#100888)
Test Plan: Rely on CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/100888
Approved by: https://github.com/ezyang
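
For context (not part of the original PR description): "data_ptr-correct" means that read-only accesses go through `Tensor::const_data_ptr<T>()`, so the mutable `data_ptr<T>()` is reserved for code that actually writes. A minimal sketch of the distinction, assuming an ATen build that already provides `const_data_ptr`:

```cpp
#include <ATen/ATen.h>

// Read-only access: const_data_ptr<T>() yields a const pointer and documents
// that the tensor's bytes are never written through it.
float first_element(const at::Tensor& t) {
  const float* p = t.const_data_ptr<float>();
  return p[0];
}

// Mutable access: data_ptr<T>() remains the right call when the bytes are
// actually modified.
void zero_first_element(at::Tensor& t) {
  float* p = t.data_ptr<float>();
  p[0] = 0.f;
}
```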
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 3894b57..255a730 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -85,12 +85,15 @@
ncclRedOp_t preMulSum;
bool has_tensor = preMulSupplement->tensor_factor.defined();
auto residence = has_tensor ? ncclScalarDevice : ncclScalarHostImmediate;
- T* ptr_factor =
- has_tensor ? preMulSupplement->tensor_factor.data_ptr<T>() : nullptr;
+ const T* ptr_factor = has_tensor
+ ? preMulSupplement->tensor_factor.const_data_ptr<T>()
+ : nullptr;
T scalar_factor = T(preMulSupplement->double_factor);
ncclRedOpCreatePreMulSum(
&preMulSum,
- has_tensor ? ptr_factor : &scalar_factor,
+ // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum
+ // tells us that the scalar input is strictly a multiplier.
+ /*scalar=*/has_tensor ? const_cast<T*>(ptr_factor) : &scalar_factor,
dataType,
residence,
comm);
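
The `const_cast` in the hunk above is needed because `ncclRedOpCreatePreMulSum` declares its scalar argument as a non-const pointer even though, per the linked NCCL documentation, the value is only read. The pattern can be sketched in isolation; `c_api_premul_sum` and `call_with_readonly_factor` below are hypothetical stand-ins, not NCCL or PyTorch APIs:

```cpp
#include <cstdio>

// Hypothetical stand-in for a C API whose scalar parameter is declared as a
// non-const pointer even though the callee only reads the value.
void c_api_premul_sum(void* scalar) {
  std::printf("factor = %f\n", *static_cast<const double*>(scalar));
}

// Hypothetical caller mirroring the hunk above: hold the factor as `const T*`
// to express read-only intent, and confine the const_cast to the single call
// into the non-const-correct C API.
void call_with_readonly_factor(const double* ptr_factor, double scalar_factor) {
  const bool has_tensor = (ptr_factor != nullptr);
  c_api_premul_sum(
      /*scalar=*/has_tensor ? const_cast<double*>(ptr_factor) : &scalar_factor);
}

int main() {
  double device_factor = 0.5;
  call_with_readonly_factor(&device_factor, /*scalar_factor=*/1.0);  // tensor path
  call_with_readonly_factor(nullptr, /*scalar_factor=*/2.0);         // host-scalar path
  return 0;
}
```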