Fix: out-of-bounds memory access on the ROCm platform (#76100)

Fixes #76095, an out-of-bounds memory access on the ROCm platform. In the embedding backward kernel, every thread read indices_batch[chunk_start - batch_start + threadIdx.x] before the threadIdx.x < n_this_chunk bounds check, so threads past the end of the chunk read beyond the buffer even though their result was discarded afterwards. The load is now guarded by the bounds check.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/76100
Approved by: https://github.com/jeffdaily, https://github.com/kit1980
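
Below is a minimal standalone sketch of the guard-before-read pattern the
patch applies. The kernel, buffer names, and sizes here are hypothetical
illustrations, not PyTorch code; they only reproduce the shape of the bug
and of the fix under the assumption that more threads run than valid
elements exist.

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void guarded_read(const int* data, int n, int* out) {
      // Buggy pattern (what the old code did): every thread executes the
      // load, including threads with threadIdx.x >= n, so the read can go
      // past the end of `data` even though its value is discarded later.
      // This is the kind of out-of-bounds access reported in #76095.
      //   int v = data[threadIdx.x];
      //   if (threadIdx.x >= n) v = 0;

      // Fixed pattern (what the diff does): initialize to the neutral
      // value and load only from in-range threads.
      int v = 0;
      if (threadIdx.x < n)
        v = data[threadIdx.x];
      out[threadIdx.x] = v;
    }

    int main() {
      const int kThreads = 64;  // one ROCm wavefront / two CUDA warps
      const int kValid = 40;    // fewer valid elements than threads
      int *d_data, *d_out;
      cudaMalloc(&d_data, kValid * sizeof(int));  // only kValid ints exist
      cudaMalloc(&d_out, kThreads * sizeof(int));
      cudaMemset(d_data, 0, kValid * sizeof(int));
      guarded_read<<<1, kThreads>>>(d_data, kValid, d_out);
      cudaDeviceSynchronize();
      printf("kernel status: %s\n", cudaGetErrorString(cudaGetLastError()));
      cudaFree(d_data);
      cudaFree(d_out);
      return 0;
    }

Guarding the load rather than zeroing the result afterwards keeps the
warp-ballot logic unchanged: out-of-range lanes still contribute
match_found_this_thread == 0 to WARP_BALLOT, but they no longer touch
memory past the end of indices_batch.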
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 8a241ca..568669e 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -98,10 +98,9 @@
       // then finishes by adding the accumulated buffer to dst_row in grad_weight.
       if(dst_row != padding_idx && src_row < n) // Per-warp exit condition, safe with ballot_sync
       {
-        int match_found_this_thread =
-          (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]);
-        if(threadIdx.x >= n_this_chunk)
-          match_found_this_thread = 0;
+        int match_found_this_thread = 0;
+        if(threadIdx.x < n_this_chunk)
+          match_found_this_thread = (dst_row == indices_batch[chunk_start - batch_start + threadIdx.x]);
 #if defined(USE_ROCM)
         unsigned long long int matchmask = WARP_BALLOT(match_found_this_thread);
         int first_remaining_peer = __ffsll(matchmask) - 1;