[DataPipe] Add docstrings for IterDataPipe and MapDataPipe, along with small doc changes for consistency (#72618)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72618
The major changes are in torch/utils/data/dataset.py
Let me know if anything is unclear. I'm open to suggestions.
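For reference, a minimal runnable sketch of the two invocation styles the new docstrings document (mirroring the example added to the IterDataPipe docstring below; it assumes the separate torchdata package is installed, though the same classes also live under torch.utils.data.datapipes.iter):

    from torchdata.datapipes.iter import IterableWrapper, Mapper

    dp = IterableWrapper(range(10))
    map_dp_1 = Mapper(dp, lambda x: x + 1)  # class constructor form
    map_dp_2 = dp.map(lambda x: x + 1)      # functional form ("map")
    assert list(map_dp_1) == list(map_dp_2) == list(range(1, 11))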
Test Plan: Imported from OSS
Reviewed By: VitalyFedyunin
Differential Revision: D34119492
Pulled By: NivekT
fbshipit-source-id: 358cb6d33d18501f9042431350f872ebaa9b4070
(cherry picked from commit 53b484f60ad942c9b86b060c40fe5a3b994424f9)
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index 255a14a..200f0c3 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -13,7 +13,7 @@
@functional_datapipe("map")
class MapperIterDataPipe(IterDataPipe[T_co]):
r"""
- Apply a function over each item from the source DataPipe (functional name: ``map``).
+ Applies a function over each item from the source DataPipe (functional name: ``map``).
The function can be any regular Python function or partial object. Lambda
function is not recommended as it is not supported by pickle.
@@ -137,7 +137,7 @@
@functional_datapipe("collate")
class CollatorIterDataPipe(MapperIterDataPipe):
r"""
- Collate samples from DataPipe to Tensor(s) by a custom collate function (functional name: ``collate``).
+ Collates samples from DataPipe to Tensor(s) by a custom collate function (functional name: ``collate``).
By default, it uses :func:`torch.utils.data.default_collate`.
.. note::
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index 585ca8e..e08ad75 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -8,7 +8,7 @@
class SamplerIterDataPipe(IterDataPipe[T_co]):
r"""
- Generate sample elements using the provided ``Sampler`` (defaults to :class:`SequentialSampler`).
+ Generates sample elements using the provided ``Sampler`` (defaults to :class:`SequentialSampler`).
Args:
datapipe: IterDataPipe to sample from
@@ -46,7 +46,7 @@
@functional_datapipe('shuffle')
class ShufflerIterDataPipe(IterDataPipe[T_co]):
r"""
- Shuffle the input DataPipe with a buffer (functional name: ``shuffle``). The buffer
+ Shuffles the input DataPipe with a buffer (functional name: ``shuffle``). The buffer
with ``buffer_size`` is filled with elements from the datapipe first. Then,
each item will be yielded from the buffer by reservoir sampling via iterator.
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 4d62375..01cbc8c 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -16,7 +16,7 @@
@functional_datapipe('concat')
class ConcaterIterDataPipe(IterDataPipe):
r"""
- Concatenate multiple Iterable DataPipes (functional name: ``concat``). The resulting DataPipe will
+ Concatenates multiple Iterable DataPipes (functional name: ``concat``). The resulting DataPipe will
yield all the elements from the first input DataPipe, before yielding from the subsequent ones.
Args:
@@ -53,7 +53,7 @@
@functional_datapipe('fork')
class ForkerIterDataPipe(IterDataPipe):
r"""
- Create multiple instances of the same Iterable DataPipe (functional name: ``fork``).
+ Creates multiple instances of the same Iterable DataPipe (functional name: ``fork``).
Args:
datapipe: Iterable DataPipe being copied
@@ -176,7 +176,7 @@
@functional_datapipe('demux')
class DemultiplexerIterDataPipe(IterDataPipe):
r"""
- Split the input DataPipe into multiple child DataPipes, using the given
+ Splits the input DataPipe into multiple child DataPipes, using the given
classification function (functional name: ``demux``). A list of the child DataPipes is returned from this operation.
Args:
diff --git a/torch/utils/data/datapipes/iter/filelister.py b/torch/utils/data/datapipes/iter/filelister.py
index 2512ea6..4de205e 100644
--- a/torch/utils/data/datapipes/iter/filelister.py
+++ b/torch/utils/data/datapipes/iter/filelister.py
@@ -6,7 +6,7 @@
class FileListerIterDataPipe(IterDataPipe[str]):
r"""
- Given path(s) to the root directory, yield file pathname(s) (path + filename) of files within the root directory.
+ Given path(s) to the root directory, yields file pathname(s) (path + filename) of files within the root directory.
Multiple root directories can be provided.
Args:
diff --git a/torch/utils/data/datapipes/iter/fileopener.py b/torch/utils/data/datapipes/iter/fileopener.py
index b44b8ed..b7198d7 100644
--- a/torch/utils/data/datapipes/iter/fileopener.py
+++ b/torch/utils/data/datapipes/iter/fileopener.py
@@ -7,7 +7,7 @@
class FileOpenerIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
r"""
- Given pathnames, open files and yield pathname and file stream in a tuple.
+ Given pathnames, opens files and yields pathname and file stream in a tuple.
Args:
datapipe: Iterable datapipe that provides pathnames
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index 196141b..7e92a49 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -48,7 +48,7 @@
@functional_datapipe('batch')
class BatcherIterDataPipe(IterDataPipe[DataChunk]):
r"""
- Create mini-batches of data (functional name: ``batch``). An outer dimension will be added as
+ Creates mini-batches of data (functional name: ``batch``). An outer dimension will be added as
``batch_size`` if ``drop_last`` is set to ``True``, or ``length % batch_size`` for the
last batch if ``drop_last`` is set to ``False``.
@@ -104,7 +104,7 @@
@functional_datapipe('unbatch')
class UnBatcherIterDataPipe(IterDataPipe):
r"""
- Undo batching of data (functional name: ``unbatch``). In other words, it flattens the data up to the specified level
+ Undoes batching of data (functional name: ``unbatch``). In other words, it flattens the data up to the specified level
within a batched DataPipe.
Args:
@@ -148,8 +148,8 @@
@functional_datapipe('groupby')
class GrouperIterDataPipe(IterDataPipe[DataChunk]):
r"""
- Group data from input IterDataPipe by keys which are generated from ``group_key_fn``,
- and yield a ``DataChunk`` with size ranging from ``guaranteed_group_size``
+ Groups data from input IterDataPipe by keys which are generated from ``group_key_fn``,
+ and yields a ``DataChunk`` with size ranging from ``guaranteed_group_size``
to ``group_size`` (functional name: ``groupby``).
Args:
diff --git a/torch/utils/data/datapipes/iter/routeddecoder.py b/torch/utils/data/datapipes/iter/routeddecoder.py
index 79b08d0..7925023 100644
--- a/torch/utils/data/datapipes/iter/routeddecoder.py
+++ b/torch/utils/data/datapipes/iter/routeddecoder.py
@@ -14,7 +14,7 @@
@functional_datapipe('routed_decode')
class RoutedDecoderIterDataPipe(IterDataPipe[Tuple[str, Any]]):
r"""
- Decode binary streams from input DataPipe, yield pathname and decoded data
+ Decodes binary streams from input DataPipe, yields pathname and decoded data
in a tuple (functional name: ``routed_decode``).
Args:
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index f4180fa..818a27e 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -22,7 +22,7 @@
@functional_datapipe('filter')
class FilterIterDataPipe(IterDataPipe[T_co]):
r"""
- Filter out elements from the source datapipe according to input ``filter_fn`` (functional name: ``filter``).
+ Filters out elements from the source datapipe according to input ``filter_fn`` (functional name: ``filter``).
Args:
datapipe: Iterable DataPipe being filtered
diff --git a/torch/utils/data/datapipes/iter/streamreader.py b/torch/utils/data/datapipes/iter/streamreader.py
index 13444c9..3a731f1 100644
--- a/torch/utils/data/datapipes/iter/streamreader.py
+++ b/torch/utils/data/datapipes/iter/streamreader.py
@@ -4,7 +4,7 @@
class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
r"""
- Given IO streams and their label names, yield bytes with label name in a tuple.
+ Given IO streams and their label names, yields bytes with label name in a tuple.
Args:
datapipe: Iterable DataPipe provides label/URL and byte stream
diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index b6b1594..5411e7e 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -76,6 +76,34 @@
class MapDataPipe(Dataset[T_co], metaclass=_DataPipeMeta):
+ r"""
+ Map-style DataPipe.
+
+ All datasets that represent a map from keys to data samples should subclass this.
+ Subclasses should overwrite :meth:`__getitem__`, supporting fetching a
+ data sample for a given, unique key. Subclasses can also optionally overwrite
+ :meth:`__len__`, which is expected to return the size of the dataset by many
+ :class:`~torch.utils.data.Sampler` implementations and the default options
+ of :class:`~torch.utils.data.DataLoader`.
+
+ These DataPipes can be invoked in two ways, using the class constructor or applying their
+ functional form onto an existing `MapDataPipe` (available to most but not all DataPipes).
+
+ Note:
+ :class:`~torch.utils.data.DataLoader` by default constructs an index
+ sampler that yields integral indices. To make it work with a map-style
+ DataPipe with non-integral indices/keys, a custom sampler must be provided.
+
+ Example:
+ >>> from torchdata.datapipes.map import SequenceWrapper, Mapper
+ >>> dp = SequenceWrapper(range(10))
+ >>> map_dp_1 = dp.map(lambda x: x + 1) # Using functional form
+ >>> list(map_dp_1) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> map_dp_2 = Mapper(dp, lambda x: x + 1) # Using class constructor
+ >>> list(map_dp_2) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> batch_dp = map_dp_1.batch(batch_size=2)
+ >>> list(batch_dp) # [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
+ """
functions: Dict[str, Callable] = {}
def __getattr__(self, attribute_name):
@@ -215,6 +243,45 @@
class IterDataPipe(IterableDataset[T_co], metaclass=_DataPipeMeta):
+ r"""
+ Iterable-style DataPipe.
+
+ All DataPipes that represent an iterable of data samples should subclass this.
+ This style of DataPipes is particularly useful when data come from a stream, or
+ when the number of samples is too large to fit them all in memory.
+
+ All subclasses should overwrite :meth:`__iter__`, which would return an
+ iterator of samples in this DataPipe.
+
+ `IterDataPipe` is lazily initialized and its elements are computed only when ``next()`` is called
+ on its iterator.
+
+ These DataPipes can be invoked in two ways, using the class constructor or applying their
+ functional form onto an existing `IterDataPipe` (available to most but not all DataPipes).
+ You can chain multiple `IterDataPipe` together to form a pipeline that will perform multiple
+ operations in succession.
+
+ Note:
+ When a subclass is used with :class:`~torch.utils.data.DataLoader`, each
+ item in the DataPipe will be yielded from the :class:`~torch.utils.data.DataLoader`
+ iterator. When :attr:`num_workers > 0`, each worker process will have a
+ different copy of the DataPipe object, so it is often desired to configure
+ each copy independently to avoid having duplicate data returned from the
+ workers. :func:`~torch.utils.data.get_worker_info`, when called in a worker
+ process, returns information about the worker. It can be used in either the
+ dataset's :meth:`__iter__` method or the :class:`~torch.utils.data.DataLoader` 's
+ :attr:`worker_init_fn` option to modify each copy's behavior.
+
+ Example:
+ >>> from torchdata.datapipes.iter import IterableWrapper, Mapper
+ >>> dp = IterableWrapper(range(10))
+ >>> map_dp_1 = Mapper(dp, lambda x: x + 1) # Using class constructor
+ >>> map_dp_2 = dp.map(lambda x: x + 1) # Using functional form
+ >>> list(map_dp_1) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> list(map_dp_2) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> filter_dp = map_dp_1.filter(lambda x: x % 2 == 0)
+ >>> list(filter_dp) # [2, 4, 6, 8, 10]
+ """
functions: Dict[str, Callable] = {}
reduce_ex_hook : Optional[Callable] = None
getstate_hook: Optional[Callable] = None
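For completeness, a minimal sketch of the pipeline chaining that the new IterDataPipe docstring describes, built only from functional forms documented in the diff above (map, filter, batch); again assuming torchdata is installed:

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(range(10))
    # map -> filter -> batch, each applied via its functional form
    pipeline = dp.map(lambda x: x + 1).filter(lambda x: x % 2 == 0).batch(batch_size=2)
    print(list(pipeline))  # [[2, 4], [6, 8], [10]] (batches are DataChunk lists)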