[dtensor] update README to make all examples runnable (#115365)

As titled; also add torchrun commands so each example can be launched directly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115365
Approved by: https://github.com/fegin
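For reference, the README comment about the `mp.spawn` path only mentions initializing the process group; below is a minimal, hypothetical sketch of what that launch path could look like. The worker function, master address/port, and the 4-GPU assumption are illustrative and not part of this PR:

```python
# hypothetical mp.spawn launch sketch -- the master address/port and the
# 4-GPU assumption are illustrative, not part of this PR
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed._tensor import Shard, distribute_tensor, init_device_mesh


def worker(rank, world_size):
    # mp.spawn does not set up the process group for us, so do it manually
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    # the same three lines as the README example
    mesh = init_device_mesh("cuda", (world_size,))
    big_tensor = torch.randn(100000, 88)
    my_dtensor = distribute_tensor(big_tensor, mesh, [Shard(dim=0)])
    print(f"rank {rank} local shard shape: {my_dtensor.to_local().shape}")

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 4  # assumes 4 GPUs on a single host
    mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True)
```

With the elastic launcher (torchrun) none of this boilerplate is needed, which is why the README recommends it.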
diff --git a/torch/distributed/_tensor/README.md b/torch/distributed/_tensor/README.md
index 3902248..bb58d7e 100644
--- a/torch/distributed/_tensor/README.md
+++ b/torch/distributed/_tensor/README.md
@@ -6,14 +6,18 @@
 We propose distributed tensor primitives to allow easier distributed computation authoring in the SPMD (Single Program Multiple Devices) paradigm. The primitives are simple but powerful when used to express tensor distributions with both sharding and replication parallelism strategies. This could empower native Tensor parallelism, among other advanced parallelism explorations. For example, to shard a big tensor across devices with 3 lines of code:
 
 ```python
+# to run this example (e.g. saved as dtensor_example.py):
+# torchrun --standalone --nnodes=1 --nproc-per-node=4 dtensor_example.py
+import os
 import torch
-from torch.distributed._tensor import DeviceMesh, Shard, distribute_tensor
+from torch.distributed._tensor import init_device_mesh, Shard, distribute_tensor
 
 # Create a mesh topology with the available devices:
-# 1. We can directly create the mesh using elastic launcher,
-# 2. If using mp.spawn, we need to initialize the world process_group first.
+# 1. We can directly create the mesh using the elastic launcher (recommended),
+# 2. If using mp.spawn, one needs to initialize the world process group first and set the device,
 #   i.e. torch.distributed.init_process_group(backend="nccl", world_size=world_size)
-mesh = DeviceMesh("cuda", list(range(world_size)))
+
+mesh = init_device_mesh("cuda", (int(os.environ["WORLD_SIZE"]),))
 big_tensor = torch.randn(100000, 88)
 # Shard this tensor over the mesh by sharding `big_tensor`'s 0th dimension over the 0th dimension of `mesh`.
 my_dtensor = distribute_tensor(big_tensor, mesh, [Shard(dim=0)])
@@ -48,11 +52,12 @@
 3. How to “reshard” an existing DTensor to a different DTensor with modified placement strategy or world size.
 
 ```python
+# torchrun --standalone --nnodes=1 --nproc-per-node=4 dtensor_example.py
 import torch
-from torch.distributed._tensor import DTensor, DeviceMesh, Shard, Replicate, distribute_tensor, distribute_module
+from torch.distributed._tensor import DTensor, Shard, Replicate, distribute_tensor, distribute_module, init_device_mesh
 
 # construct a device mesh with available devices (multi-host or single host)
-device_mesh = DeviceMesh("cuda", [0, 1, 2, 3])
+device_mesh = init_device_mesh("cuda", (4,))
 # if we want to do row-wise sharding
 rowwise_placement=[Shard(0)]
 # if we want to do col-wise sharding
@@ -68,7 +73,7 @@
 replica_tensor = distribute_tensor(big_tensor, device_mesh=device_mesh, placements=replica_placement)
 
 # if we want to distribute a tensor with both replication and sharding
-device_mesh = DeviceMesh("cuda", [[0, 1], [2, 3]])
+device_mesh = init_device_mesh("cuda", (2, 2))
 # replicate across the first dimension of the device mesh, then shard on the second dimension of the device mesh
 spec=[Replicate(), Shard(0)]
 partial_replica = distribute_tensor(big_tensor, device_mesh=device_mesh, placements=spec)
@@ -80,7 +85,6 @@
 # reshard the current row-wise tensor to a column-wise or replicated tensor
 colwise_tensor = rowwise_tensor.redistribute(device_mesh, colwise_placement)
 replica_tensor = colwise_tensor.redistribute(device_mesh, replica_placement)
-
 ```
 
 #### High level User Facing APIs
@@ -109,6 +113,9 @@
 #### High level API examples:
 
 ```python
+import torch.nn as nn
+from torch.distributed._tensor import Shard, distribute_tensor, distribute_module, init_device_mesh
+
 class MyModule(nn.Module):
     def __init__(self):
         super().__init__()
@@ -119,22 +126,20 @@
     def forward(self, input):
         return self.relu(self.fc1(input) + self.fc2(input))
 
-mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1], [2, 3]])
+mesh = init_device_mesh("cuda", (4,))
 
 def shard_params(mod_name, mod, mesh):
-    rowwise_placement = [Shard(0)]
-    def to_dist_tensor(t): return distribute_tensor(t, mesh, rowwise_placement)
-    mod._apply(to_dist_tensor)
+    col_linear_placement = [Shard(0)]
+    # shard fc1 and fc2 column-wise: Shard(0) splits each parameter along its 0th (output) dim
+    if isinstance(mod, nn.Linear):
+        for name, param in mod.named_parameters():
+            dist_param = nn.Parameter(
+                distribute_tensor(param, mesh, col_linear_placement)
+            )
+            mod.register_parameter(name, dist_param)
 
 sharded_module = distribute_module(MyModule(), mesh, partition_fn=shard_params)
 
-def shard_fc(mod_name, mod, mesh):
-    rowwise_placement = [Shard(0)]
-    if mod_name == "fc1":
-        mod.weight = torch.nn.Parameter(distribute_tensor(mod.weight, mesh, rowwise_placement))
-
-sharded_module = distribute_module(MyModule(), mesh, partition_fn=shard_fc)
-
 ```
 
 ## Compiler and PyTorch DTensor