| commit | 33ce9cf7f9ddd19d1b71048f55e36610328ac0e1 | [log] [tgz] |
|---|---|---|
| author | Andrew Gu <andgu@fb.com> | Mon Jul 29 20:48:32 2024 -0700 |
| committer | PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> | Tue Jul 30 14:28:12 2024 +0000 |
| tree | e52188a6e89464be3a67bd692c47b79fcfb5523d | |
| parent | 16e0868a3d27ab8aabdb26d7ff90deed34e8de48 [diff] |
[FSDP2] Relaxed overlap timing check to avoid flakiness (#132116). Trying to fix https://github.com/pytorch/pytorch/issues/131081. See https://github.com/pytorch/pytorch/issues/131081#issuecomment-2239443504 for detailed context. This PR relaxes one assertion against the _baseline_ to try to fix the flakiness: the baseline time is now only required to be greater than the measured test time, rather than greater than the expected test time. Pull Request resolved: https://github.com/pytorch/pytorch/pull/132116 Approved by: https://github.com/Skylion007
diff --git a/test/distributed/_composable/fsdp/test_fully_shard_overlap.py b/test/distributed/_composable/fsdp/test_fully_shard_overlap.py index b31b5a8..3b5d5f0 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_overlap.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_overlap.py
@@ -139,7 +139,11 @@ num_iters * (3 * compute_sleep_ms + buffer_ms) + comm_sleep_ms ) self.assertLessEqual(test_time, expected_test_time) - self.assertGreater(baseline_time, expected_test_time) + # Since `get_cycles_per_ms` uses lru cache, there may be some variance + # between the initially determined cycles vs. the current cycles per + # ms, so we relax the baseline check to just that it is greater than + # the test time rather than the expected test time + self.assertGreater(baseline_time, test_time) def _time_fn(self, fn: Callable): start_event = torch.cuda.Event(enable_timing=True)