# Owner(s): ["oncall: distributed"]

import os
import sys
import tempfile
import threading
from functools import partial, wraps

import torch
import torch.distributed as dist
import torch.distributed._hooks as dhooks

if not dist.is_available():
    print("torch.distributed not available, skipping tests", file=sys.stderr)
    sys.exit(0)

from torch.testing._internal.common_distributed import (
    MultiProcessTestCase,
    skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import run_tests, TestCase
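
# The tests below exercise the private torch.distributed._hooks API through the entry points used
# in this file: register_process_group_hook, register_collective_start_hook and
# register_collective_end_hook. A rough usage sketch mirroring those calls (the hook names here are
# illustrative placeholders, not part of the API):
#
#   def on_pg_created(pg, pg_name): ...
#   dhooks.register_process_group_hook(on_pg_created)
#
#   def on_coll_start(status: dhooks.CollectiveStatus): ...
#   dhooks.register_collective_start_hook(on_coll_start)
#
#   def on_coll_end(status): ...
#   dhooks.register_collective_end_hook(on_coll_end)
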
class PgHooks(MultiProcessTestCase):
    @property
    def world_size(self) -> int:
        return 4

    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self):
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def test_pg_hook(self):
        pgs = []

        def pg_hook(pg, pg_name):
            pgs.append((pg, pg_name))

        dhooks.register_process_group_hook(pg_hook)
        dist.init_process_group(
            backend="gloo",
            rank=self.rank,
            world_size=self.world_size,
            store=dist.FileStore(self.file_name, self.world_size),
        )
        self.assertEqual(len(pgs), 1)
        self.assertEqual(pgs[0][0], dist.group.WORLD)

        # Create two partial-world PGs.
        pg0 = dist.new_group(ranks=[0, 1])
        pg1 = dist.new_group(ranks=[2, 3])

        # Each rank only observes two PGs being created: the default PG and the one covering its
        # ranks. No event is emitted for a PG the current rank doesn't belong to. For example,
        # rank 1 gets an event for pg0 but not for pg1, even though the API contract requires
        # calling new_group for both. (See the sketch after this test.)
        self.assertEqual(len(pgs), 2)
        self.assertEqual(pgs[1][0], pg0 if self.rank < 2 else pg1)
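
    # Illustrative sketch (comments only, not executed): with the 4-rank setup above, each rank's
    # `pgs` list ends up with two entries -- the default WORLD group plus only the subgroup that
    # contains that rank:
    #
    #   ranks 0 and 1: [(WORLD, <world pg_name>), (pg0, <pg0 pg_name>)]
    #   ranks 2 and 3: [(WORLD, <world pg_name>), (pg1, <pg1 pg_name>)]
    #
    # The pg_name values are strings assigned by the process-group machinery; the placeholders
    # above are not literal values.
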
def with_comms(func=None):
    # Support both the bare @with_comms form and @with_comms(): when called without a function,
    # return a partial of itself so it can be applied to the test function afterwards.
    if func is None:
        return partial(
            with_comms,
        )

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        self.init_comms()
        func(self, *args, **kwargs)
        self.destroy_comms()

    return wrapper

class CollectiveHooks:
    @property
    def world_size(self) -> int:
        return 4

    def _collective_hooks(self):
        # It's OK to access these lists without synchronization: a single background thread
        # delivers the hook callbacks.
        starts = []
        ends = []
        cv = threading.Condition()

        def coll_start(status):
            starts.append(status)
            print(f"coll_start {len(starts)} rank{self.rank}")

        def coll_end(status):
            ends.append(status)
            print(f"coll_end {len(ends)} rank{self.rank}")
            if len(ends) == 2:
                with cv:
                    cv.notify()

        dhooks.register_collective_start_hook(coll_start)
        dhooks.register_collective_end_hook(coll_end)

        # Issue two collectives; each should produce one start event and one end event.
        tensor = torch.ones([2, 3]).to(self.device) * self.rank
        tensor_list = [torch.empty_like(tensor) for _ in range(self.world_size)]
        dist.all_gather(tensor_list, tensor)
        tensor2 = torch.ones([2, 3]).to(self.device) * self.rank
        dist.all_reduce(tensor2)

        # Wait up to one second for the background thread to deliver both end events.
        with cv:
            cv.wait(1)

        default_pg_name = dist.group.WORLD.group_name
        self.assertEqual(2, len(starts))
        self.assertEqual(2, len(ends))

        def check_op(idx, coll_name):
            self.assertEqual(default_pg_name, starts[idx].pg_name)
            self.assertEqual(self.backend_name, starts[idx].backend)
            self.assertGreaterEqual(starts[idx].sequence_number, 0)
            self.assertGreaterEqual(starts[idx].timestamp, 0)
            self.assertEqual(coll_name, starts[idx].operation)

            self.assertEqual(default_pg_name, ends[idx].pg_name)
            self.assertEqual(self.backend_name, ends[idx].backend)
            self.assertEqual(starts[idx].sequence_number, ends[idx].sequence_number)
            self.assertLessEqual(starts[idx].timestamp, ends[idx].timestamp)
            self.assertEqual(coll_name, ends[idx].operation)

        check_op(0, "ALLGATHER")
        check_op(1, "ALLREDUCE")
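
        # For reference, the fields exercised on each status object above (as asserted in
        # check_op): pg_name, backend, sequence_number, timestamp and operation (e.g. "ALLGATHER",
        # "ALLREDUCE"). A collective's end event carries the same sequence_number as its start
        # event and a timestamp no earlier than the start's; SingleRankTests below additionally
        # checks drop_count.
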
class GlooHooks(MultiProcessTestCase, CollectiveHooks):
    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self):
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def init_comms(self):
        dist.init_process_group(
            backend="gloo",
            rank=self.rank,
            world_size=self.world_size,
            store=dist.FileStore(self.file_name, self.world_size),
        )

    def destroy_comms(self):
        dist.destroy_process_group()

    @property
    def backend_name(self):
        return "gloo"

    @property
    def device(self):
        return "cpu"

    @with_comms
    def test_collective_hooks(self):
        self._collective_hooks()

class NcclHooks(MultiProcessTestCase, CollectiveHooks):
    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    def tearDown(self):
        super().tearDown()
        try:
            os.remove(self.file_name)
        except OSError:
            pass

    def init_comms(self):
        dist.init_process_group(
            backend="nccl",
            rank=self.rank,
            world_size=self.world_size,
            store=dist.FileStore(self.file_name, self.world_size),
        )

    def destroy_comms(self):
        dist.destroy_process_group()

    @property
    def backend_name(self):
        return "nccl"

    @property
    def device(self):
        return f"cuda:{self.rank}"

    @skip_if_lt_x_gpu(4)
    @with_comms
    def test_collective_hooks(self):
        self._collective_hooks()

class SingleRankTests(TestCase):
    def setUp(self) -> None:
        super().setUp()
        self.rank = 0
        self.file_name = tempfile.NamedTemporaryFile(delete=False).name
        dist.init_process_group(
            backend="gloo",
            rank=0,
            world_size=1,
            store=dist.FileStore(self.file_name, 1),
        )

    def tearDown(self) -> None:
        dist.destroy_process_group()

    def test_queue_overflow(self) -> None:
        cv_done_colls = threading.Condition()
        cv_done_cb = threading.Condition()
        colls_done = False
        starts = []
        status_with_dropped = None

        def coll_start(status: dhooks.CollectiveStatus):
            starts.append(status)
            # Block the callback thread until all collectives have been issued so the event
            # queue fills up and overflows.
            with cv_done_colls:
                while not colls_done:
                    cv_done_colls.wait()
            if status.drop_count > 0:
                nonlocal status_with_dropped
                status_with_dropped = status
                with cv_done_cb:
                    cv_done_cb.notify()

        dhooks.register_collective_start_hook(coll_start)

        # The native event queue holds 512 entries, so issuing 600 collectives while the hook
        # is blocked forces events to be dropped. (See the note after this test.)
        for _ in range(600):
            dist.all_reduce(torch.ones([2, 3]))
        colls_done = True
        with cv_done_colls:
            cv_done_colls.notify()

        with cv_done_cb:
            cv_done_cb.wait(10)
        self.assertIsNotNone(status_with_dropped)
        self.assertGreater(status_with_dropped.drop_count, 0)
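
        # Note on the behavior exercised above: drop_count is reported on a status object delivered
        # to the start hook once the internal event queue (512 entries, per the comment above) has
        # overflowed. The exact number of dropped events depends on timing, which is why the test
        # only asserts that drop_count is positive.
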
if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"
    run_tests()