Add __all__ for torch.optim and torch.nn.modules modules (#80237)

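Declaring __all__ makes the public surface of each module explicit: a
wildcard import now re-exports only the listed names, so implementation
imports such as Optimizer, Optional, List and Tensor stop leaking as
public API, and their entries can be dropped from
test/allowlist_for_publicAPI.json (see the diff below).

Minimal sketch of the intended effect, using the __all__ added to
torch/optim/adam.py in this change; the script itself is illustrative
and not part of the patch:

    # Without __all__, `import *` pulls in every non-underscore name in
    # the module, including helpers imported there (Optimizer, List, ...).
    # With __all__ = ['Adam', 'adam'], only those two names are re-exported.
    from torch.optim.adam import *  # noqa: F403

    exported = {name for name in ("Adam", "adam", "Optimizer", "List", "Optional", "Tensor")
                if name in globals()}
    print(sorted(exported))  # expected after this change: ['Adam', 'adam']
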
Pull Request resolved: https://github.com/pytorch/pytorch/pull/80237
Approved by: https://github.com/albanD
diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index c844156..c7aa017 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -2022,60 +2022,6 @@
   "torch.nn.intrinsic.quantized.modules.conv_relu": [
     "fuse_conv_bn_weights"
   ],
-  "torch.nn.modules.activation": [
-    "Module",
-    "NonDynamicallyQuantizableLinear",
-    "Optional",
-    "Parameter",
-    "Tensor",
-    "Tuple",
-    "constant_",
-    "xavier_normal_",
-    "xavier_uniform_"
-  ],
-  "torch.nn.modules.adaptive": [
-    "Linear",
-    "List",
-    "Module",
-    "ModuleList",
-    "Sequence",
-    "Sequential",
-    "Tensor",
-    "log_softmax",
-    "namedtuple"
-  ],
-  "torch.nn.modules.batchnorm": [
-    "Any",
-    "LazyModuleMixin",
-    "Module",
-    "Optional",
-    "Parameter",
-    "Tensor",
-    "UninitializedBuffer",
-    "UninitializedParameter",
-    "sync_batch_norm"
-  ],
-  "torch.nn.modules.channelshuffle": [
-    "Module",
-    "Tensor"
-  ],
-  "torch.nn.modules.container": [
-    "Any",
-    "Dict",
-    "Iterable",
-    "Iterator",
-    "Mapping",
-    "Module",
-    "Optional",
-    "OrderedDict",
-    "Parameter",
-    "Tuple",
-    "TypeVar",
-    "Union",
-    "chain",
-    "islice",
-    "overload"
-  ],
   "torch.nn.modules.conv": [
     "LazyModuleMixin",
     "List",
@@ -2405,94 +2351,6 @@
     "TensorProtoDataType",
     "TrainingMode"
   ],
-  "torch.optim.adadelta": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.adagrad": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.adam": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.adamax": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.adamw": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.asgd": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.lbfgs": [
-    "Optimizer",
-    "reduce"
-  ],
-  "torch.optim.lr_scheduler": [
-    "Counter",
-    "Optimizer",
-    "bisect_right",
-    "wraps"
-  ],
-  "torch.optim.nadam": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.optimizer": [
-    "chain",
-    "deepcopy",
-    "defaultdict"
-  ],
-  "torch.optim.radam": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.rmsprop": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.rprop": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.sgd": [
-    "List",
-    "Optimizer",
-    "Optional",
-    "Tensor"
-  ],
-  "torch.optim.sparse_adam": [
-    "Optimizer"
-  ],
-  "torch.optim.swa_utils": [
-    "Module",
-    "deepcopy"
-  ],
   "torch.overrides": [
     "BaseTorchFunctionMode",
     "TorchFunctionMode",
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index a0ee1ef..3aae87b 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -9,6 +9,10 @@
 from .module import Module
 from .. import functional as F
 
+__all__ = ['Threshold', 'ReLU', 'RReLU', 'Hardtanh', 'ReLU6', 'Sigmoid', 'Hardsigmoid', 'Tanh',
+           'SiLU', 'Mish', 'Hardswish', 'ELU', 'CELU', 'SELU', 'GLU', 'GELU', 'Hardshrink', 'LeakyReLU',
+           'LogSigmoid', 'Softplus', 'Softshrink', 'MultiheadAttention', 'PReLU', 'Softsign', 'Tanhshrink',
+           'Softmin', 'Softmax', 'Softmax2d', 'LogSoftmax']
 
 class Threshold(Module):
     r"""Thresholds each element of the input Tensor.
diff --git a/torch/nn/modules/adaptive.py b/torch/nn/modules/adaptive.py
index 621bcee..5f6fb08 100644
--- a/torch/nn/modules/adaptive.py
+++ b/torch/nn/modules/adaptive.py
@@ -11,6 +11,7 @@
 from .module import Module
 from ..functional import log_softmax
 
+__all__ = ['AdaptiveLogSoftmaxWithLoss']
 
 _ASMoutput = namedtuple('_ASMoutput', ['output', 'loss'])
 
diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py
index 65271eb..6d6d1d2 100644
--- a/torch/nn/modules/batchnorm.py
+++ b/torch/nn/modules/batchnorm.py
@@ -10,6 +10,8 @@
 from .lazy import LazyModuleMixin
 from .module import Module
 
+__all__ = ['BatchNorm1d', 'LazyBatchNorm1d', 'BatchNorm2d', 'LazyBatchNorm2d', 'BatchNorm3d',
+           'LazyBatchNorm3d', 'SyncBatchNorm']
 
 class _NormBase(Module):
     """Common base of _InstanceNorm and _BatchNorm"""
diff --git a/torch/nn/modules/channelshuffle.py b/torch/nn/modules/channelshuffle.py
index 740ee60..efaa76d 100644
--- a/torch/nn/modules/channelshuffle.py
+++ b/torch/nn/modules/channelshuffle.py
@@ -3,6 +3,7 @@
 
 from torch import Tensor
 
+__all__ = ['ChannelShuffle']
 
 class ChannelShuffle(Module):
     r"""Divide the channels in a tensor of shape :math:`(*, C , H, W)`
diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py
index 9061669..7280a44 100644
--- a/torch/nn/modules/container.py
+++ b/torch/nn/modules/container.py
@@ -10,6 +10,8 @@
 
 from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, overload, Tuple, TypeVar, Union
 
+__all__ = ['Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict']
+
 T = TypeVar('T', bound=Module)
 
 
diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py
index eb1d4e3f..027130a 100644
--- a/torch/optim/adadelta.py
+++ b/torch/optim/adadelta.py
@@ -4,6 +4,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['Adadelta', 'adadelta']
 
 class Adadelta(Optimizer):
     r"""Implements Adadelta algorithm.
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index c19202e..123a18f 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -4,6 +4,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['Adagrad', 'adagrad']
 
 class Adagrad(Optimizer):
     r"""Implements Adagrad algorithm.
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 4ed3555..ccef933 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -4,6 +4,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['Adam', 'adam']
 
 class Adam(Optimizer):
     r"""Implements Adam algorithm.
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index f73c86a..8faa644 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -4,6 +4,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['Adamax', 'adamax']
 
 class Adamax(Optimizer):
     r"""Implements Adamax algorithm (a variant of Adam based on infinity norm).
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index cfa33d1..9b0dace 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -4,6 +4,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['AdamW', 'adamw']
 
 class AdamW(Optimizer):
     r"""Implements AdamW algorithm.
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py
index 687ab50..51903c2 100644
--- a/torch/optim/asgd.py
+++ b/torch/optim/asgd.py
@@ -5,6 +5,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['ASGD', 'asgd']
 
 class ASGD(Optimizer):
     """Implements Averaged Stochastic Gradient Descent.
diff --git a/torch/optim/lbfgs.py b/torch/optim/lbfgs.py
index d82c5ed..dc5a11b 100644
--- a/torch/optim/lbfgs.py
+++ b/torch/optim/lbfgs.py
@@ -2,6 +2,7 @@
 from functools import reduce
 from .optimizer import Optimizer
 
+__all__ = ['LBFGS']
 
 def _cubic_interpolate(x1, f1, g1, x2, f2, g2, bounds=None):
     # ported from https://github.com/torch/optim/blob/master/polyinterp.lua
diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py
index 0cd53f8..de64c88 100644
--- a/torch/optim/lr_scheduler.py
+++ b/torch/optim/lr_scheduler.py
@@ -9,6 +9,9 @@
 
 from .optimizer import Optimizer
 
+__all__ = ['LambdaLR', 'MultiplicativeLR', 'StepLR', 'MultiStepLR', 'ConstantLR', 'LinearLR',
+           'ExponentialLR', 'SequentialLR', 'CosineAnnealingLR', 'ChainedScheduler', 'ReduceLROnPlateau',
+           'CyclicLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR']
 
 EPOCH_DEPRECATION_WARNING = (
     "The epoch parameter in `scheduler.step()` was not necessary and is being "
diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py
index 7916fa6..0b79474 100644
--- a/torch/optim/nadam.py
+++ b/torch/optim/nadam.py
@@ -4,6 +4,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['NAdam', 'nadam']
 
 class NAdam(Optimizer):
     r"""Implements NAdam algorithm.
diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py
index 2d4ca41..4619550 100644
--- a/torch/optim/optimizer.py
+++ b/torch/optim/optimizer.py
@@ -6,6 +6,7 @@
 import warnings
 import functools
 
+__all__ = ['Optimizer']
 
 class _RequiredParameter(object):
     """Singleton class representing a required parameter for an Optimizer."""
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
index f416b51..9847087 100644
--- a/torch/optim/radam.py
+++ b/torch/optim/radam.py
@@ -5,6 +5,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['RAdam', 'radam']
 
 class RAdam(Optimizer):
     r"""Implements RAdam algorithm.
diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py
index 313c4e9..32f5bd9 100644
--- a/torch/optim/rmsprop.py
+++ b/torch/optim/rmsprop.py
@@ -3,6 +3,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['RMSprop', 'rmsprop']
 
 class RMSprop(Optimizer):
     r"""Implements RMSprop algorithm.
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py
index d976647..db67014 100644
--- a/torch/optim/rprop.py
+++ b/torch/optim/rprop.py
@@ -3,6 +3,7 @@
 from .optimizer import Optimizer
 from typing import List, Optional
 
+__all__ = ['Rprop', 'rprop']
 
 class Rprop(Optimizer):
     r"""Implements the resilient backpropagation algorithm.
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py
index d52301e..aac475f 100644
--- a/torch/optim/sgd.py
+++ b/torch/optim/sgd.py
@@ -3,6 +3,7 @@
 from .optimizer import Optimizer, required
 from typing import List, Optional
 
+__all__ = ['SGD', 'sgd']
 
 class SGD(Optimizer):
     r"""Implements stochastic gradient descent (optionally with momentum).
diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py
index f31ddda..5cfcbf5 100644
--- a/torch/optim/sparse_adam.py
+++ b/torch/optim/sparse_adam.py
@@ -2,6 +2,7 @@
 from . import _functional as F
 from .optimizer import Optimizer
 
+__all__ = ['SparseAdam']
 
 class SparseAdam(Optimizer):
     r"""Implements lazy version of Adam algorithm suitable for sparse tensors.
diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py
index 83b6291..d44fc3f 100644
--- a/torch/optim/swa_utils.py
+++ b/torch/optim/swa_utils.py
@@ -7,6 +7,7 @@
 from torch.nn import Module
 from torch.optim.lr_scheduler import _LRScheduler
 
+__all__ = ['AveragedModel', 'update_bn', 'SWALR']
 
 class AveragedModel(Module):
     r"""Implements averaged model for Stochastic Weight Averaging (SWA).