Fix DistributedDataParallelTest.test_accumulate_gradients (#20351)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/20351

The test was broken by a merge race between #20282 and the stack in #20236: the signature of the _prepare_single_device_module helper changed underneath it, so this test's call site no longer matched.

Cleaned up the test and comments a bit as well.
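
For context, a minimal sketch of the old and new call shapes inside the test (not standalone code; the meaning of the second devices argument as a device_ids-style parameter is an assumption inferred from the updated call site in the diff):

    # Old call shape (what the test was still using): raw integer GPU indices.
    gpus = gpus_for_rank(self.world_size)[self.rank][0:1]
    model, ddp_model, input, target = self._prepare_single_device_module(
        process_group, gpus, global_batch_size)

    # New call shape: torch.device objects, passed twice (devices and the
    # assumed device_ids parameter), plus the global batch size.
    int_devices = gpus_for_rank(self.world_size)[self.rank][:1]
    devices = [torch.device('cuda:' + str(i)) for i in int_devices]
    model, ddp_model, input, target = self._prepare_single_device_module(
        process_group, devices, devices, global_batch_size)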

Differential Revision: D15292786

fbshipit-source-id: a4379ea700cad959d3a6921fc5ddf9384fb8f228
diff --git a/test/test_c10d.py b/test/test_c10d.py
index e897059..842092b 100644
--- a/test/test_c10d.py
+++ b/test/test_c10d.py
@@ -2366,16 +2366,15 @@
     @skip_if_not_multigpu
     @skip_if_not_nccl
     def test_accumulate_gradients(self):
-        gpus = gpus_for_rank(self.world_size)[self.rank][0:1]
-        self.assertEqual(len(gpus), 1)
+        int_devices = gpus_for_rank(self.world_size)[self.rank][:1]
+        devices = list([torch.device('cuda:' + str(i)) for i in int_devices])
         store = c10d.FileStore(self.file.name, self.world_size)
         process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
-        local_batch_size = len(gpus)
-        global_batch_size = self.world_size * local_batch_size
+        global_batch_size = self.world_size
 
         model, ddp_model, input, target = \
             self._prepare_single_device_module(
-                process_group, gpus, global_batch_size)
+                process_group, devices, devices, global_batch_size)
 
         def step_model(model, input, target):
             model.train()
@@ -2388,25 +2387,25 @@
             ddp_model.train()
             ddp_model.module(input)
 
-        # check two model parameters over 2 iterations
+        # Check two model parameters over 4 iterations.
+        # Use 4 iterations because we alternate between reducing and
+        # not reducing and want to make sure we switch both ways.
         for iteration in range(4):
-            # single cpu/gpu training
             step_model(model, input, target)
 
             if iteration % 2 == 0:
                 # Skip gradients sync without calling prepare_for_backward
-                step_model(ddp_model.module,
-                           input[self.rank * local_batch_size: (self.rank + 1) * local_batch_size],
-                           target[self.rank * local_batch_size: (self.rank + 1) * local_batch_size])
-
+                step_model(
+                    ddp_model.module,
+                    input[self.rank : (self.rank + 1)],
+                    target[self.rank : (self.rank + 1)])
                 for i, j in zip(model.parameters(), ddp_model.parameters()):
                     self.assertNotEqual(i.grad, j.grad)
             else:
-                # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs
-                step_model(ddp_model,
-                           input[self.rank * local_batch_size: (self.rank + 1) * local_batch_size],
-                           target[self.rank * local_batch_size: (self.rank + 1) * local_batch_size])
-
+                step_model(
+                    ddp_model,
+                    input[self.rank : (self.rank + 1)],
+                    target[self.rank : (self.rank + 1)])
                 for i, j in zip(model.parameters(), ddp_model.parameters()):
                     self.assertEqual(i.grad, j.grad)