[FSDP] Subtest `CPUOffload` for `test_fsdp_grad_acc.py` (#90545)
In preparation for the next PR, I moved the `CPUOffload` parametrization into the subtest config dict to reduce the time to run these gradient accumulation tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90545
Approved by: https://github.com/mrshenli
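
Why this saves time: each `@parametrize` value becomes a separate test entry that spins up its own distributed processes, whereas values in the subtest config dict are iterated inside one already-spawned test run. Below is a minimal sketch (not the actual PyTorch test-infra implementation) of how a subtest helper in the spirit of `FSDPTest.run_subtests` can expand such a config dict; the helper name and signature here are illustrative assumptions.

```python
# Sketch only: illustrates iterating a subtest config dict inside a single
# spawned distributed test, rather than parametrizing at collection time.
import itertools
from typing import Any, Callable, Dict, List


def run_subtests_sketch(
    subtest_config: Dict[str, List[Any]],
    test_fn: Callable[..., None],
    **common_kwargs: Any,
) -> None:
    keys = sorted(subtest_config.keys())
    for values in itertools.product(*(subtest_config[k] for k in keys)):
        subtest_kwargs = dict(zip(keys, values))
        # Each combination (e.g. backward_prefetch x cpu_offload) reuses the
        # same processes and communicators; only the test body reruns.
        test_fn(**common_kwargs, **subtest_kwargs)
```

With `cpu_offload` folded into this dict, the offload variants no longer multiply the number of separately spawned test instances.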
diff --git a/test/distributed/fsdp/test_fsdp_grad_acc.py b/test/distributed/fsdp/test_fsdp_grad_acc.py
index ef20d2a..78c2731 100644
--- a/test/distributed/fsdp/test_fsdp_grad_acc.py
+++ b/test/distributed/fsdp/test_fsdp_grad_acc.py
@@ -220,7 +220,11 @@
None,
BackwardPrefetch.BACKWARD_PRE,
BackwardPrefetch.BACKWARD_POST,
- ]
+ ],
+ "cpu_offload": [
+ CPUOffload(offload_params=False),
+ CPUOffload(offload_params=True),
+ ],
}
@skip_if_lt_x_gpu(2)
@@ -244,10 +248,6 @@
],
)
@parametrize(
- "cpu_offload",
- [CPUOffload(offload_params=False), CPUOffload(offload_params=True)],
- )
- @parametrize(
"sharding_strategy",
[
ShardingStrategy.FULL_SHARD,
@@ -258,7 +258,6 @@
def test_grad_acc(
self,
configs: _GradAccConfigs,
- cpu_offload: CPUOffload,
sharding_strategy: ShardingStrategy,
):
"""
@@ -281,7 +280,6 @@
self._test_grad_acc,
batch_dim=1,
configs=configs.configs,
- cpu_offload=cpu_offload,
sharding_strategy=sharding_strategy,
)