test/distributed/_composable/test_compose.py - platform/external/pytorch - Git at Google

 # Owner(s): ["oncall: distributed"]

 import copy
 import sys

 from typing import Dict

 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed._composable import checkpoint, fully_shard, replicate
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
 from torch.distributed.fsdp.api import MixedPrecision, ShardingStrategy
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.testing._internal.common_dist_composable import (
     CompositeModel,
     CompositeParamModel,
     UnitModule,
 )
 from torch.testing._internal.common_distributed import (
     SaveForwardInputsModel,
     skip_if_lt_x_gpu,
 )
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
 )


 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)


 if TEST_WITH_DEV_DBG_ASAN:
     print(
         "Skip dev-asan as torch + multiprocessing spawn have known issues",
         file=sys.stderr,
     )
     sys.exit(0)


 class TestFSDPCheckpoint(FSDPTest):
     @property
     def world_size(self) -> int:
         return 2

     # TODO: Define `use_same_inputs_across_ranks` for now for BC since some
     # test model configs do not have a simple base model to compare against. In
     # those cases, we use the same inputs across ranks so that the averaged
     # gradient equals the local gradient to check for parity. This means that
     # the gradient reduction is unchecked.
     def _test_parity(
         self,
         base_model: nn.Module,
         test_model: nn.Module,
         inp_size: torch.Size,
         inp_device: torch.device,
         grad_to_none: bool,
         use_same_inputs_across_ranks: bool,
     ):
         LR = 0.01
         base_optim = torch.optim.Adam(base_model.parameters(), lr=LR)
         test_optim = torch.optim.Adam(test_model.parameters(), lr=LR)

         for _ in range(5):
             if use_same_inputs_across_ranks:
                 torch.manual_seed(0)
             x = torch.randn(inp_size, device=inp_device)
             test_loss = test_model(x).sum()
             base_loss = base_model(x).sum()

             self.assertEqual(test_loss, base_loss)

             test_loss.backward()
             test_optim.step()
             test_optim.zero_grad(set_to_none=grad_to_none)

             base_loss.backward()
             base_optim.step()
             base_optim.zero_grad(set_to_none=grad_to_none)

     @skip_if_lt_x_gpu(2)
     def test_wrap_same_submodule(self):
         model = UnitModule(device=torch.device("cuda"))

         base_model = copy.deepcopy(model)

         test_model = copy.deepcopy(model)
         # compose checkpoint and fully_shard
         test_model.seq = checkpoint(test_model.seq)
         test_model.seq = fully_shard(
             test_model.seq,
             policy=ModuleWrapPolicy({nn.Linear}),
         )

         self.run_subtests(
             {
                 "base_model": [base_model],
                 "test_model": [test_model],
                 "inp_size": [torch.Size((2, 100))],
                 "inp_device": [torch.device("cuda")],
                 "grad_to_none": [True, False],
                 "use_same_inputs_across_ranks": [True],
             },
             self._test_parity,
         )

     def _test_checkpoint_fsdp_submodules(self):
         model = CompositeModel(device=torch.device("cuda"))

         base_model = copy.deepcopy(model)

         test_model = copy.deepcopy(model)
         test_model.u1 = fully_shard(test_model.u1, policy=None)
         test_model.u2 = fully_shard(test_model.u2)

         test_model.u1.seq = checkpoint(test_model.u1.seq)
         test_model.u2.seq = checkpoint(test_model.u2.seq)

         self.run_subtests(
             {
                 "base_model": [base_model],
                 "test_model": [test_model],
                 "inp_size": [torch.Size((2, 100))],
                 "inp_device": [torch.device("cuda")],
                 "grad_to_none": [True, False],
                 "use_same_inputs_across_ranks": [True],
             },
             self._test_parity,
         )

     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_non_reentrant(self):
         self._test_checkpoint_fsdp_submodules()

     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fully_shard_cast_forward_inputs(self):
         self.run_subtests(
             {
                 "checkpoint_strict_submodule": [False, True],
             },
             self._test_checkpoint_fully_shard_cast_forward_inputs,
         )

     def _test_checkpoint_fully_shard_cast_forward_inputs(
         self, checkpoint_strict_submodule: bool
     ):
         forward_inputs: Dict[nn.Module, torch.Tensor] = {}
         fp16_mp = MixedPrecision(param_dtype=torch.float16, cast_forward_inputs=True)
         fp32_mp = MixedPrecision(param_dtype=torch.float32, cast_forward_inputs=True)

         model = SaveForwardInputsModel(
             forward_inputs=forward_inputs, cast_forward_inputs=False
         ).cuda()
         x = torch.zeros(2, 100, device="cuda")

         fully_shard(model.c2, mixed_precision=fp16_mp)
         if checkpoint_strict_submodule:
             checkpoint(model.c2.l)
         else:
             checkpoint(model.c2)
         fully_shard(model, mixed_precision=fp32_mp)

         loss = model(x).sum()
         loss.backward()

         self.assertEqual(forward_inputs[model].dtype, torch.float32)
         self.assertEqual(forward_inputs[model.c1].dtype, torch.float32)
         # Notably, check that the recomputed forward preserves the right dtype
         self.assertEqual(forward_inputs[model.c2].dtype, torch.float16)

     @skip_if_lt_x_gpu(2)
     def test_fully_shard_replicate_correct_replicate_params(self):
         model = CompositeParamModel(device=torch.device("cuda"))
         # Shard Linears within UnitModule
         fully_shard(model.u1, policy=ModuleWrapPolicy({nn.Linear}))
         fully_shard(model.u2, policy=ModuleWrapPolicy({nn.Linear}))
         # replicate the rest
         replicate(model)
         # Run fwd + bwd to initialize DDP
         inp = torch.randn(2, 100, device="cuda")
         model(inp).sum().backward()
         # Ensure replicate param names are as expected, i.e.
         # immediate parameters of model and parameters of model's non-UnitModule
         # submodules are replicated
         param_names = replicate.state(model)._replicate_param_names
         replicated_modules = [
             (name, mod)
             for (name, mod) in model.named_children()
             if mod not in [model.u1, model.u2]
         ]
         replicated_param_names = [
             f"{module_name}.{n}"
             for module_name, mod in replicated_modules
             for n, _ in mod.named_parameters()
         ]
         replicated_param_names.extend(
             [n for n, _ in model.named_parameters(recurse=False)]
         )
         self.assertEqual(set(param_names), set(replicated_param_names))

     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_with_param(self):
         model = CompositeParamModel(device=torch.device("cuda"))

         base_model = copy.deepcopy(model)

         test_model = copy.deepcopy(model)
         test_model.u1.seq = checkpoint(test_model.u1.seq)
         test_model.u2.seq = checkpoint(test_model.u2.seq)
         test_model = fully_shard(test_model)

         self.run_subtests(
             {
                 "base_model": [base_model],
                 "test_model": [test_model],
                 "inp_size": [torch.Size((2, 100))],
                 "inp_device": [torch.device("cuda")],
                 "grad_to_none": [True, False],
                 "use_same_inputs_across_ranks": [True],
             },
             self._test_parity,
         )

     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_with_param_no_shard(self):
         model = CompositeParamModel(device=torch.device("cuda"))

         base_model = copy.deepcopy(model)

         test_model = copy.deepcopy(model)
         test_model.u1.seq = checkpoint(test_model.u1.seq)
         test_model.u2.seq = checkpoint(test_model.u2.seq)
         test_model = fully_shard(test_model, strategy=ShardingStrategy.NO_SHARD)

         self.run_subtests(
             {
                 "base_model": [base_model],
                 "test_model": [test_model],
                 "inp_size": [torch.Size((2, 100))],
                 "inp_device": [torch.device("cuda")],
                 "grad_to_none": [True, False],
                 "use_same_inputs_across_ranks": [True],
             },
             self._test_parity,
         )

     @skip_if_lt_x_gpu(2)
     def test_composable_fsdp_replicate(self):
         # Verify how the APIs can be composed, e.g. if both `fully_shard` and
         # `replicate` are applied on the same module, it should raise exception.
         model = CompositeModel(device=torch.device("cpu"))
         fully_shard(model.l1)
         with self.assertRaisesRegex(AssertionError, "Cannot apply .*replicate"):
             replicate(model.l1)
         replicate(model.l2)  # should not raise

     @skip_if_lt_x_gpu(2)
     def test_fully_shard_replicate_composability(self):
         """
         Tests composing ``fully_shard`` and ``replicate``. To save unit test
         time, we run the different configs in subtests.
         """
         self.run_subtests(
             {
                 "config": [
                     "1fm,1r",
                     "1r,1fm",
                     "1r,1fa",
                     "1r1fm,1fm",
                     "1r1fa,1fm",
                     "1fm1fm,1r1r,1fm",
                 ]
             },
             self._test_replicate_in_fully_shard,
         )

     def _test_replicate_in_fully_shard(self, config: str):
         """
         To interpret the config, each comma delineates a level in the module
         tree ordered bottom-up; 'r' means ``replicate``; 'f' means
         ``fully_shard``; 'a' means auto wrap; and 'm' means manual wrap.
         """
         # Set the seed to ensure that all ranks initialize the same model
         torch.manual_seed(0)
         if config == "1fm,1r":
             base_model = CompositeModel(device=torch.device("cuda"))
             test_model = copy.deepcopy(base_model)
             fully_shard(test_model.l1)
             replicate(test_model)
         elif config == "1r,1fm":
             base_model = CompositeParamModel(torch.device("cuda"))
             test_model = copy.deepcopy(base_model)
             replicate(test_model.u1)
             fully_shard(test_model)
         elif config == "1r,1fa":
             base_model = CompositeParamModel(torch.device("cuda"))
             test_model = copy.deepcopy(base_model)
             replicate(test_model.u1)
             fully_shard(test_model, policy=ModuleWrapPolicy({UnitModule}))
         elif config == "1r1fm,1fm":
             base_model = CompositeParamModel(torch.device("cuda"))
             test_model = copy.deepcopy(base_model)
             replicate(test_model.u1)
             fully_shard(test_model.u2)
             fully_shard(test_model)
         elif config == "1r1fa,1fm":
             base_model = CompositeParamModel(torch.device("cuda"))
             test_model = copy.deepcopy(base_model)
             replicate(test_model.u1)
             fully_shard(test_model.u2, policy=ModuleWrapPolicy({UnitModule}))
             fully_shard(test_model)
         elif config == "1fm1fm,1r1r,1fm":
             base_model = CompositeParamModel(torch.device("cuda"))
             test_model = copy.deepcopy(base_model)
             fully_shard(test_model.u1.seq)
             fully_shard(test_model.u2.seq)
             replicate(test_model.u1)
             replicate(test_model.u2)
             fully_shard(test_model)
         else:
             raise ValueError(f"Unknown config: {config}")
         # Apply data parallelism to the base model for parity since we apply
         # data parallelism to the test model
         replicate(base_model)

         # Set the seed to ensure that ranks get different input data
         torch.manual_seed(self.rank + 1)
         self._test_parity(
             base_model,
             test_model,
             torch.Size((2, 100)),
             torch.device("cuda"),
             True,
             False,
         )

     @skip_if_lt_x_gpu(2)
     def test_state_dict_fsdp_submodules(self):
         model = CompositeModel(device=torch.device("cuda"))

         full_shard_args = {"strategy": ShardingStrategy.FULL_SHARD}
         no_shard_args = {"strategy": ShardingStrategy.NO_SHARD}

         model.u1 = fully_shard(model.u1, **full_shard_args)
         model.u2 = fully_shard(model.u2, **no_shard_args)

         FSDP.set_state_dict_type(
             model,
             StateDictType.SHARDED_STATE_DICT,
         )

         state_dict = model.state_dict()
         for fqn, tensor in state_dict.items():
             if "u1" in fqn:
                 self.assertIsInstance(tensor, ShardedTensor)
             elif "u2" in fqn:
                 self.assertIsInstance(tensor, torch.Tensor)
         # Ensure that get_state_dict_type can still correctly get the settings.
         _ = FSDP.get_state_dict_type(model)


 instantiate_parametrized_tests(TestFSDPCheckpoint)


 if __name__ == "__main__":
     run_tests()
	# Owner(s): ["oncall: distributed"]

	import copy
	import sys

	from typing import Dict

	import torch
	import torch.distributed as dist
	import torch.nn as nn
	from torch.distributed._composable import checkpoint, fully_shard, replicate
	from torch.distributed._shard.sharded_tensor import ShardedTensor
	from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
	from torch.distributed.fsdp.api import MixedPrecision, ShardingStrategy
	from torch.distributed.fsdp.wrap import ModuleWrapPolicy
	from torch.testing._internal.common_dist_composable import (
	CompositeModel,
	CompositeParamModel,
	UnitModule,
	)
	from torch.testing._internal.common_distributed import (
	SaveForwardInputsModel,
	skip_if_lt_x_gpu,
	)
	from torch.testing._internal.common_fsdp import FSDPTest
	from torch.testing._internal.common_utils import (
	instantiate_parametrized_tests,
	run_tests,
	TEST_WITH_DEV_DBG_ASAN,
	)


	if not dist.is_available():
	print("Distributed not available, skipping tests", file=sys.stderr)
	sys.exit(0)


	if TEST_WITH_DEV_DBG_ASAN:
	print(
	"Skip dev-asan as torch + multiprocessing spawn have known issues",
	file=sys.stderr,
	)
	sys.exit(0)


	class TestFSDPCheckpoint(FSDPTest):
	@property
	def world_size(self) -> int:
	return 2

	# TODO: Define `use_same_inputs_across_ranks` for now for BC since some
	# test model configs do not have a simple base model to compare against. In
	# those cases, we use the same inputs across ranks so that the averaged
	# gradient equals the local gradient to check for parity. This means that
	# the gradient reduction is unchecked.
	def _test_parity(
	self,
	base_model: nn.Module,
	test_model: nn.Module,
	inp_size: torch.Size,
	inp_device: torch.device,
	grad_to_none: bool,
	use_same_inputs_across_ranks: bool,
	):
	LR = 0.01
	base_optim = torch.optim.Adam(base_model.parameters(), lr=LR)
	test_optim = torch.optim.Adam(test_model.parameters(), lr=LR)

	for _ in range(5):
	if use_same_inputs_across_ranks:
	torch.manual_seed(0)
	x = torch.randn(inp_size, device=inp_device)
	test_loss = test_model(x).sum()
	base_loss = base_model(x).sum()

	self.assertEqual(test_loss, base_loss)

	test_loss.backward()
	test_optim.step()
	test_optim.zero_grad(set_to_none=grad_to_none)

	base_loss.backward()
	base_optim.step()
	base_optim.zero_grad(set_to_none=grad_to_none)

	@skip_if_lt_x_gpu(2)
	def test_wrap_same_submodule(self):
	model = UnitModule(device=torch.device("cuda"))

	base_model = copy.deepcopy(model)

	test_model = copy.deepcopy(model)
	# compose checkpoint and fully_shard
	test_model.seq = checkpoint(test_model.seq)
	test_model.seq = fully_shard(
	test_model.seq,
	policy=ModuleWrapPolicy({nn.Linear}),
	)

	self.run_subtests(
	{
	"base_model": [base_model],
	"test_model": [test_model],
	"inp_size": [torch.Size((2, 100))],
	"inp_device": [torch.device("cuda")],
	"grad_to_none": [True, False],
	"use_same_inputs_across_ranks": [True],
	},
	self._test_parity,
	)

	def _test_checkpoint_fsdp_submodules(self):
	model = CompositeModel(device=torch.device("cuda"))

	base_model = copy.deepcopy(model)

	test_model = copy.deepcopy(model)
	test_model.u1 = fully_shard(test_model.u1, policy=None)
	test_model.u2 = fully_shard(test_model.u2)

	test_model.u1.seq = checkpoint(test_model.u1.seq)
	test_model.u2.seq = checkpoint(test_model.u2.seq)

	self.run_subtests(
	{
	"base_model": [base_model],
	"test_model": [test_model],
	"inp_size": [torch.Size((2, 100))],
	"inp_device": [torch.device("cuda")],
	"grad_to_none": [True, False],
	"use_same_inputs_across_ranks": [True],
	},
	self._test_parity,
	)

	@skip_if_lt_x_gpu(2)
	def test_checkpoint_fsdp_submodules_non_reentrant(self):
	self._test_checkpoint_fsdp_submodules()

	@skip_if_lt_x_gpu(2)
	def test_checkpoint_fully_shard_cast_forward_inputs(self):
	self.run_subtests(
	{
	"checkpoint_strict_submodule": [False, True],
	},
	self._test_checkpoint_fully_shard_cast_forward_inputs,
	)

	def _test_checkpoint_fully_shard_cast_forward_inputs(
	self, checkpoint_strict_submodule: bool
	):
	forward_inputs: Dict[nn.Module, torch.Tensor] = {}
	fp16_mp = MixedPrecision(param_dtype=torch.float16, cast_forward_inputs=True)
	fp32_mp = MixedPrecision(param_dtype=torch.float32, cast_forward_inputs=True)

	model = SaveForwardInputsModel(
	forward_inputs=forward_inputs, cast_forward_inputs=False
	).cuda()
	x = torch.zeros(2, 100, device="cuda")

	fully_shard(model.c2, mixed_precision=fp16_mp)
	if checkpoint_strict_submodule:
	checkpoint(model.c2.l)
	else:
	checkpoint(model.c2)
	fully_shard(model, mixed_precision=fp32_mp)

	loss = model(x).sum()
	loss.backward()

	self.assertEqual(forward_inputs[model].dtype, torch.float32)
	self.assertEqual(forward_inputs[model.c1].dtype, torch.float32)
	# Notably, check that the recomputed forward preserves the right dtype
	self.assertEqual(forward_inputs[model.c2].dtype, torch.float16)

	@skip_if_lt_x_gpu(2)
	def test_fully_shard_replicate_correct_replicate_params(self):
	model = CompositeParamModel(device=torch.device("cuda"))
	# Shard Linears within UnitModule
	fully_shard(model.u1, policy=ModuleWrapPolicy({nn.Linear}))
	fully_shard(model.u2, policy=ModuleWrapPolicy({nn.Linear}))
	# replicate the rest
	replicate(model)
	# Run fwd + bwd to initialize DDP
	inp = torch.randn(2, 100, device="cuda")
	model(inp).sum().backward()
	# Ensure replicate param names are as expected, i.e.
	# immediate parameters of model and parameters of model's non-UnitModule
	# submodules are replicated
	param_names = replicate.state(model)._replicate_param_names
	replicated_modules = [
	(name, mod)
	for (name, mod) in model.named_children()
	if mod not in [model.u1, model.u2]
	]
	replicated_param_names = [
	f"{module_name}.{n}"
	for module_name, mod in replicated_modules
	for n, _ in mod.named_parameters()
	]
	replicated_param_names.extend(
	[n for n, _ in model.named_parameters(recurse=False)]
	)
	self.assertEqual(set(param_names), set(replicated_param_names))

	@skip_if_lt_x_gpu(2)
	def test_checkpoint_fsdp_submodules_with_param(self):
	model = CompositeParamModel(device=torch.device("cuda"))

	base_model = copy.deepcopy(model)

	test_model = copy.deepcopy(model)
	test_model.u1.seq = checkpoint(test_model.u1.seq)
	test_model.u2.seq = checkpoint(test_model.u2.seq)
	test_model = fully_shard(test_model)

	self.run_subtests(
	{
	"base_model": [base_model],
	"test_model": [test_model],
	"inp_size": [torch.Size((2, 100))],
	"inp_device": [torch.device("cuda")],
	"grad_to_none": [True, False],
	"use_same_inputs_across_ranks": [True],
	},
	self._test_parity,
	)

	@skip_if_lt_x_gpu(2)
	def test_checkpoint_fsdp_submodules_with_param_no_shard(self):
	model = CompositeParamModel(device=torch.device("cuda"))

	base_model = copy.deepcopy(model)

	test_model = copy.deepcopy(model)
	test_model.u1.seq = checkpoint(test_model.u1.seq)
	test_model.u2.seq = checkpoint(test_model.u2.seq)
	test_model = fully_shard(test_model, strategy=ShardingStrategy.NO_SHARD)

	self.run_subtests(
	{
	"base_model": [base_model],
	"test_model": [test_model],
	"inp_size": [torch.Size((2, 100))],
	"inp_device": [torch.device("cuda")],
	"grad_to_none": [True, False],
	"use_same_inputs_across_ranks": [True],
	},
	self._test_parity,
	)

	@skip_if_lt_x_gpu(2)
	def test_composable_fsdp_replicate(self):
	# Verify how the APIs can be composed, e.g. if both `fully_shard` and
	# `replicate` are applied on the same module, it should raise exception.
	model = CompositeModel(device=torch.device("cpu"))
	fully_shard(model.l1)
	with self.assertRaisesRegex(AssertionError, "Cannot apply .*replicate"):
	replicate(model.l1)
	replicate(model.l2) # should not raise

	@skip_if_lt_x_gpu(2)
	def test_fully_shard_replicate_composability(self):
	"""
	Tests composing ``fully_shard`` and ``replicate``. To save unit test
	time, we run the different configs in subtests.
	"""
	self.run_subtests(
	{
	"config": [
	"1fm,1r",
	"1r,1fm",
	"1r,1fa",
	"1r1fm,1fm",
	"1r1fa,1fm",
	"1fm1fm,1r1r,1fm",
	]
	},
	self._test_replicate_in_fully_shard,
	)

	def _test_replicate_in_fully_shard(self, config: str):
	"""
	To interpret the config, each comma delineates a level in the module
	tree ordered bottom-up; 'r' means ``replicate``; 'f' means
	``fully_shard``; 'a' means auto wrap; and 'm' means manual wrap.
	"""
	# Set the seed to ensure that all ranks initialize the same model
	torch.manual_seed(0)
	if config == "1fm,1r":
	base_model = CompositeModel(device=torch.device("cuda"))
	test_model = copy.deepcopy(base_model)
	fully_shard(test_model.l1)
	replicate(test_model)
	elif config == "1r,1fm":
	base_model = CompositeParamModel(torch.device("cuda"))
	test_model = copy.deepcopy(base_model)
	replicate(test_model.u1)
	fully_shard(test_model)
	elif config == "1r,1fa":
	base_model = CompositeParamModel(torch.device("cuda"))
	test_model = copy.deepcopy(base_model)
	replicate(test_model.u1)
	fully_shard(test_model, policy=ModuleWrapPolicy({UnitModule}))
	elif config == "1r1fm,1fm":
	base_model = CompositeParamModel(torch.device("cuda"))
	test_model = copy.deepcopy(base_model)
	replicate(test_model.u1)
	fully_shard(test_model.u2)
	fully_shard(test_model)
	elif config == "1r1fa,1fm":
	base_model = CompositeParamModel(torch.device("cuda"))
	test_model = copy.deepcopy(base_model)
	replicate(test_model.u1)
	fully_shard(test_model.u2, policy=ModuleWrapPolicy({UnitModule}))
	fully_shard(test_model)
	elif config == "1fm1fm,1r1r,1fm":
	base_model = CompositeParamModel(torch.device("cuda"))
	test_model = copy.deepcopy(base_model)
	fully_shard(test_model.u1.seq)
	fully_shard(test_model.u2.seq)
	replicate(test_model.u1)
	replicate(test_model.u2)
	fully_shard(test_model)
	else:
	raise ValueError(f"Unknown config: {config}")
	# Apply data parallelism to the base model for parity since we apply
	# data parallelism to the test model
	replicate(base_model)

	# Set the seed to ensure that ranks get different input data
	torch.manual_seed(self.rank + 1)
	self._test_parity(
	base_model,
	test_model,
	torch.Size((2, 100)),
	torch.device("cuda"),
	True,
	False,
	)

	@skip_if_lt_x_gpu(2)
	def test_state_dict_fsdp_submodules(self):
	model = CompositeModel(device=torch.device("cuda"))

	full_shard_args = {"strategy": ShardingStrategy.FULL_SHARD}
	no_shard_args = {"strategy": ShardingStrategy.NO_SHARD}

	model.u1 = fully_shard(model.u1, **full_shard_args)
	model.u2 = fully_shard(model.u2, **no_shard_args)

	FSDP.set_state_dict_type(
	model,
	StateDictType.SHARDED_STATE_DICT,
	)

	state_dict = model.state_dict()
	for fqn, tensor in state_dict.items():
	if "u1" in fqn:
	self.assertIsInstance(tensor, ShardedTensor)
	elif "u2" in fqn:
	self.assertIsInstance(tensor, torch.Tensor)
	# Ensure that get_state_dict_type can still correctly get the settings.
	_ = FSDP.get_state_dict_type(model)


	instantiate_parametrized_tests(TestFSDPCheckpoint)


	if __name__ == "__main__":
	run_tests()