[DataPipe] Add docstrings for IterDataPipe and MapDataPipe, along with small doc changes for consistency (#72618)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72618
The major changes are in torch/utils/data/dataset.py
Let me know if anything is unclear. I'm open to suggestions.
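For reference, a minimal runnable sketch of the two invocation styles the new docstrings document (mirroring the example added to the IterDataPipe docstring below; it assumes the separate torchdata package is installed, though the same classes also live under torch.utils.data.datapipes.iter):

    from torchdata.datapipes.iter import IterableWrapper, Mapper

    dp = IterableWrapper(range(10))
    map_dp_1 = Mapper(dp, lambda x: x + 1)  # class constructor form
    map_dp_2 = dp.map(lambda x: x + 1)      # functional form ("map")
    assert list(map_dp_1) == list(map_dp_2) == list(range(1, 11))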
Test Plan: Imported from OSS
Reviewed By: VitalyFedyunin
Differential Revision: D34119492
Pulled By: NivekT
fbshipit-source-id: 358cb6d33d18501f9042431350f872ebaa9b4070
(cherry picked from commit 53b484f60ad942c9b86b060c40fe5a3b994424f9)
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index 255a14a..200f0c3 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -13,7 +13,7 @@
@functional_datapipe("map")
class MapperIterDataPipe(IterDataPipe[T_co]):
r"""
- Apply a function over each item from the source DataPipe (functional name: ``map``).
+ Applies a function over each item from the source DataPipe (functional name: ``map``).
The function can be any regular Python function or partial object. Lambda
function is not recommended as it is not supported by pickle.
@@ -137,7 +137,7 @@
@functional_datapipe("collate")
class CollatorIterDataPipe(MapperIterDataPipe):
r"""
- Collate samples from DataPipe to Tensor(s) by a custom collate function (functional name: ``collate``).
+ Collates samples from DataPipe to Tensor(s) by a custom collate function (functional name: ``collate``).
By default, it uses :func:`torch.utils.data.default_collate`.
.. note::
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index 585ca8e..e08ad75 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -8,7 +8,7 @@
class SamplerIterDataPipe(IterDataPipe[T_co]):
r"""
- Generate sample elements using the provided ``Sampler`` (defaults to :class:`SequentialSampler`).
+ Generates sample elements using the provided ``Sampler`` (defaults to :class:`SequentialSampler`).
Args:
datapipe: IterDataPipe to sample from
@@ -46,7 +46,7 @@
@functional_datapipe('shuffle')
class ShufflerIterDataPipe(IterDataPipe[T_co]):
r"""
- Shuffle the input DataPipe with a buffer (functional name: ``shuffle``). The buffer
+ Shuffles the input DataPipe with a buffer (functional name: ``shuffle``). The buffer
with ``buffer_size`` is filled with elements from the datapipe first. Then,
each item will be yielded from the buffer by reservoir sampling via iterator.
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 4d62375..01cbc8c 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -16,7 +16,7 @@
@functional_datapipe('concat')
class ConcaterIterDataPipe(IterDataPipe):
r"""
- Concatenate multiple Iterable DataPipes (functional name: ``concat``). The resulting DataPipe will
+ Concatenates multiple Iterable DataPipes (functional name: ``concat``). The resulting DataPipe will
yield all the elements from the first input DataPipe, before yielding from the subsequent ones.
Args:
@@ -53,7 +53,7 @@
@functional_datapipe('fork')
class ForkerIterDataPipe(IterDataPipe):
r"""
- Create multiple instances of the same Iterable DataPipe (functional name: ``fork``).
+ Creates multiple instances of the same Iterable DataPipe (functional name: ``fork``).
Args:
datapipe: Iterable DataPipe being copied
@@ -176,7 +176,7 @@
@functional_datapipe('demux')
class DemultiplexerIterDataPipe(IterDataPipe):
r"""
- Split the input DataPipe into multiple child DataPipes, using the given
+ Splits the input DataPipe into multiple child DataPipes, using the given
classification function (functional name: ``demux``). A list of the child DataPipes is returned from this operation.
Args:
diff --git a/torch/utils/data/datapipes/iter/filelister.py b/torch/utils/data/datapipes/iter/filelister.py
index 2512ea6..4de205e 100644
--- a/torch/utils/data/datapipes/iter/filelister.py
+++ b/torch/utils/data/datapipes/iter/filelister.py
@@ -6,7 +6,7 @@
class FileListerIterDataPipe(IterDataPipe[str]):
r"""
- Given path(s) to the root directory, yield file pathname(s) (path + filename) of files within the root directory.
+ Given path(s) to the root directory, yields file pathname(s) (path + filename) of files within the root directory.
Multiple root directories can be provided.
Args:
diff --git a/torch/utils/data/datapipes/iter/fileopener.py b/torch/utils/data/datapipes/iter/fileopener.py
index b44b8ed..b7198d7 100644
--- a/torch/utils/data/datapipes/iter/fileopener.py
+++ b/torch/utils/data/datapipes/iter/fileopener.py
@@ -7,7 +7,7 @@
class FileOpenerIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
r"""
- Given pathnames, open files and yield pathname and file stream in a tuple.
+ Given pathnames, opens files and yields pathname and file stream in a tuple.
Args:
datapipe: Iterable datapipe that provides pathnames
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index 196141b..7e92a49 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -48,7 +48,7 @@
@functional_datapipe('batch')
class BatcherIterDataPipe(IterDataPipe[DataChunk]):
r"""
- Create mini-batches of data (functional name: ``batch``). An outer dimension will be added as
+ Creates mini-batches of data (functional name: ``batch``). An outer dimension will be added as
``batch_size`` if ``drop_last`` is set to ``True``, or ``length % batch_size`` for the
last batch if ``drop_last`` is set to ``False``.
@@ -104,7 +104,7 @@
@functional_datapipe('unbatch')
class UnBatcherIterDataPipe(IterDataPipe):
r"""
- Undo batching of data (functional name: ``unbatch``). In other words, it flattens the data up to the specified level
+ Undoes batching of data (functional name: ``unbatch``). In other words, it flattens the data up to the specified level
within a batched DataPipe.
Args:
@@ -148,8 +148,8 @@
@functional_datapipe('groupby')
class GrouperIterDataPipe(IterDataPipe[DataChunk]):
r"""
- Group data from input IterDataPipe by keys which are generated from ``group_key_fn``,
- and yield a ``DataChunk`` with size ranging from ``guaranteed_group_size``
+ Groups data from input IterDataPipe by keys which are generated from ``group_key_fn``,
+ and yields a ``DataChunk`` with size ranging from ``guaranteed_group_size``
to ``group_size`` (functional name: ``groupby``).
Args:
diff --git a/torch/utils/data/datapipes/iter/routeddecoder.py b/torch/utils/data/datapipes/iter/routeddecoder.py
index 79b08d0..7925023 100644
--- a/torch/utils/data/datapipes/iter/routeddecoder.py
+++ b/torch/utils/data/datapipes/iter/routeddecoder.py
@@ -14,7 +14,7 @@
@functional_datapipe('routed_decode')
class RoutedDecoderIterDataPipe(IterDataPipe[Tuple[str, Any]]):
r"""
- Decode binary streams from input DataPipe, yield pathname and decoded data
+ Decodes binary streams from input DataPipe, yields pathname and decoded data
in a tuple (functional name: ``routed_decode``).
Args:
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index f4180fa..818a27e 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -22,7 +22,7 @@
@functional_datapipe('filter')
class FilterIterDataPipe(IterDataPipe[T_co]):
r"""
- Filter out elements from the source datapipe according to input ``filter_fn`` (functional name: ``filter``).
+ Filters out elements from the source datapipe according to input ``filter_fn`` (functional name: ``filter``).
Args:
datapipe: Iterable DataPipe being filtered
diff --git a/torch/utils/data/datapipes/iter/streamreader.py b/torch/utils/data/datapipes/iter/streamreader.py
index 13444c9..3a731f1 100644
--- a/torch/utils/data/datapipes/iter/streamreader.py
+++ b/torch/utils/data/datapipes/iter/streamreader.py
@@ -4,7 +4,7 @@
class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
r"""
- Given IO streams and their label names, yield bytes with label name in a tuple.
+ Given IO streams and their label names, yields bytes with label name in a tuple.
Args:
datapipe: Iterable DataPipe provides label/URL and byte stream
diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index b6b1594..5411e7e 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -76,6 +76,34 @@
class MapDataPipe(Dataset[T_co], metaclass=_DataPipeMeta):
+ r"""
+ Map-style DataPipe.
+
+ All datasets that represent a map from keys to data samples should subclass this.
+ Subclasses should overwrite :meth:`__getitem__`, supporting fetching a
+ data sample for a given, unique key. Subclasses can also optionally overwrite
+ :meth:`__len__`, which is expected to return the size of the dataset by many
+ :class:`~torch.utils.data.Sampler` implementations and the default options
+ of :class:`~torch.utils.data.DataLoader`.
+
+ These DataPipes can be invoked in two ways, using the class constructor or applying their
+ functional form onto an existing `MapDataPipe` (available to most but not all DataPipes).
+
+ Note:
+ :class:`~torch.utils.data.DataLoader` by default constructs an index
+ sampler that yields integral indices. To make it work with a map-style
+ DataPipe with non-integral indices/keys, a custom sampler must be provided.
+
+ Example:
+ >>> from torchdata.datapipes.map import SequenceWrapper, Mapper
+ >>> dp = SequenceWrapper(range(10))
+ >>> map_dp_1 = dp.map(lambda x: x + 1) # Using functional form
+ >>> list(map_dp_1) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> map_dp_2 = Mapper(dp, lambda x: x + 1) # Using class constructor
+ >>> list(map_dp_2) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> batch_dp = map_dp_1.batch(batch_size=2)
+ >>> list(batch_dp) # [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
+ """
functions: Dict[str, Callable] = {}
def __getattr__(self, attribute_name):
@@ -215,6 +243,45 @@
class IterDataPipe(IterableDataset[T_co], metaclass=_DataPipeMeta):
+ r"""
+ Iterable-style DataPipe.
+
+ All DataPipes that represent an iterable of data samples should subclass this.
+ This style of DataPipes is particularly useful when data come from a stream, or
+ when the number of samples is too large to fit them all in memory.
+
+ All subclasses should overwrite :meth:`__iter__`, which would return an
+ iterator of samples in this DataPipe.
+
+ `IterDataPipe` is lazily initialized and its elements are computed only when ``next()`` is called
+ on its iterator.
+
+ These DataPipes can be invoked in two ways, using the class constructor or applying their
+ functional form onto an existing `IterDataPipe` (available to most but not all DataPipes).
+ You can chain multiple `IterDataPipe` together to form a pipeline that will perform multiple
+ operations in succession.
+
+ Note:
+ When a subclass is used with :class:`~torch.utils.data.DataLoader`, each
+ item in the DataPipe will be yielded from the :class:`~torch.utils.data.DataLoader`
+ iterator. When :attr:`num_workers > 0`, each worker process will have a
+ different copy of the DataPipe object, so it is often desired to configure
+ each copy independently to avoid having duplicate data returned from the
+ workers. :func:`~torch.utils.data.get_worker_info`, when called in a worker
+ process, returns information about the worker. It can be used in either the
+ dataset's :meth:`__iter__` method or the :class:`~torch.utils.data.DataLoader` 's
+ :attr:`worker_init_fn` option to modify each copy's behavior.
+
+ Example:
+ >>> from torchdata.datapipes.iter import IterableWrapper, Mapper
+ >>> dp = IterableWrapper(range(10))
+ >>> map_dp_1 = Mapper(dp, lambda x: x + 1) # Using class constructor
+ >>> map_dp_2 = dp.map(lambda x: x + 1) # Using functional form
+ >>> list(map_dp_1) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> list(map_dp_2) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ >>> filter_dp = map_dp_1.filter(lambda x: x % 2 == 0)
+ >>> list(filter_dp) # [2, 4, 6, 8, 10]
+ """
functions: Dict[str, Callable] = {}
reduce_ex_hook : Optional[Callable] = None
getstate_hook: Optional[Callable] = None
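For completeness, a minimal sketch of the pipeline chaining that the new IterDataPipe docstring describes, built only from functional forms documented in the diff above (map, filter, batch); again assuming torchdata is installed:

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(range(10))
    # map -> filter -> batch, each applied via its functional form
    pipeline = dp.map(lambda x: x + 1).filter(lambda x: x % 2 == 0).batch(batch_size=2)
    print(list(pipeline))  # [[2, 4], [6, 8], [10]] (batches are DataChunk lists)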