Remove compile_threads=1 in test_inductor_collectives.py (#128580)
Summary: I believe https://github.com/pytorch/pytorch/issues/125235 should be fixed now that Inductor has switched to subprocess-based parallel compile, so these tests no longer need to force compile_threads=1.
Test Plan: Ran the affected tests locally with Python 3.9.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128580
Approved by: https://github.com/eellison
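
For context, the workaround being removed patched `torch._inductor.config.compile_threads` to 1 via `unittest.mock.patch.object`, forcing Inductor to compile serially for the duration of each test. A minimal sketch of that pattern is below (the test itself is hypothetical, not one from this file); with subprocess-based parallel compile the patch should no longer be necessary:

```python
# Minimal sketch of the old workaround: force Inductor to compile serially
# for one test by patching compile_threads to 1. Mirrors the decorator lines
# deleted in this diff; the test body is illustrative only.
import unittest
from unittest.mock import patch

import torch
import torch._inductor.config


class ExampleTest(unittest.TestCase):
    # With subprocess-based parallel compile this patch should be redundant;
    # it is kept here only to show the pattern being removed.
    @patch.object(torch._inductor.config, "compile_threads", 1)
    def test_compiled_add(self):
        fn = torch.compile(lambda x: x + 1)
        self.assertTrue(torch.equal(fn(torch.ones(4)), torch.ones(4) + 1))


if __name__ == "__main__":
    unittest.main()
```
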
diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py
index 35e44b1..ee4535f 100644
--- a/test/distributed/test_inductor_collectives.py
+++ b/test/distributed/test_inductor_collectives.py
@@ -60,8 +60,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_broadcast_inductor(self):
"""
Testing if broadcast works correctly when using inductor
@@ -94,8 +92,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_allreduce_inductor(self):
"""
This is matmul/cat/allreduce is a pattern we aim to optimize.
@@ -129,8 +125,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_allreduce_inductor_cudagraph_trees(self):
"""
Tests whether cudagraph trees support all_reduce from nccl
@@ -177,8 +171,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_eager_allreduce_inductor_wait(self):
def eager_func(a, b, c, d, *, tag, ranks, group_size):
x = torch.matmul(a, b)
@@ -218,8 +210,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_inductor_allreduce_eager_wait(self):
def inductor_func(a, b, c, d, *, tag, ranks, group_size):
x = torch.matmul(a, b)
@@ -256,8 +246,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
@patch.object(torch._inductor.config, "allow_buffer_reuse", True)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_allreduce_input_buffer_reuse(self):
def func(a, *, tag, ranks, group_size):
ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
@@ -275,8 +263,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_permute_tensor(self):
def func(tensor, src_dst_pairs, *, tag, ranks, group_size):
return _functional_collectives.permute_tensor(
@@ -304,8 +290,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
@patch.object(torch._inductor.config, "allow_buffer_reuse", True)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_allgather_output_buffer_reuse(self):
class Model(torch.nn.Module):
def __init__(self, *args, **kwargs) -> None:
@@ -329,8 +313,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_allgather_contiguous_input(self):
class Model(torch.nn.Module):
def __init__(self, *args, **kwargs) -> None:
@@ -355,8 +337,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_allgather_into_tensor_inductor(self):
"""
This is matmul/cat/allreduce is a pattern we aim to optimize.
@@ -388,8 +368,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_reduce_scatter_tensor_inductor(self):
def example(a, b, *, tag, ranks, group_size):
c = torch.matmul(a, b)
@@ -418,8 +396,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
@patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_all_to_all_single_inductor(self):
def example(
inp,
@@ -488,8 +464,6 @@
@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
@skip_if_lt_x_gpu(2)
- # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
- @patch.object(torch._inductor.config, "compile_threads", 1)
def test_all_to_all_single_inductor_split_sizes_none(self):
def example(inp, *, tag, ranks, group_size):
a2a = torch.ops.c10d_functional.all_to_all_single(