Revert D29241736: [pytorch][PR] To add Rectified Adam Algorithm to Optimizers

Test Plan: revert-hammer

Differential Revision:
D29241736 (https://github.com/pytorch/pytorch/commit/0d2a936176ab8c4ecf3651f3360212f088039ff1)

Original commit changeset: 288b9b1f3125

fbshipit-source-id: 56c4ec98647c6f1822b130726741a1c9ca193670
diff --git a/docs/source/optim.rst b/docs/source/optim.rst
index db2ced4..b8ef01f 100644
--- a/docs/source/optim.rst
+++ b/docs/source/optim.rst
@@ -132,7 +132,6 @@
     Adamax
     ASGD
     LBFGS
-    RAdam
     RMSprop
     Rprop
     SGD
diff --git a/test/optim/tests.json b/test/optim/tests.json
index 38d0de4..3412eb6 100644
--- a/test/optim/tests.json
+++ b/test/optim/tests.json
@@ -26,17 +26,6 @@
         ]
     },
     {
-        "algorithm": "radam",
-        "config": [
-            {},
-            {"learningRate": 1e-4},
-            {"learningRate": 1e-4, "beta1": 0.92},
-            {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96},
-            {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3},
-            {"learningRate": 1e-4, "weightDecay": 0.1}
-        ]
-    },
-    {
         "algorithm": "adamw",
         "config": [
             {},
diff --git a/test/test_optim.py b/test/test_optim.py
index e0e3902..54ae721 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -533,29 +533,6 @@
             with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
                 optimizer(None, lr=1e-2, betas=(0.0, 1.0))
 
-    def test_radam(self):
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam(
-                self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3, weight_decay=0.1)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3),
-            [lambda opt: ExponentialLR(opt, gamma=0.9),
-                lambda opt: ReduceLROnPlateau(opt)]
-        )
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
-            optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
-
-        with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
-            optim.RAdam(None, lr=1e-2, weight_decay=-1)
-
     def test_rmsprop(self):
         for optimizer in [optim.RMSprop, optim_mt.RMSprop]:
             self._test_basic_cases(
diff --git a/torch/optim/__init__.py b/torch/optim/__init__.py
index 3c22fbd..5d36510 100644
--- a/torch/optim/__init__.py
+++ b/torch/optim/__init__.py
@@ -13,7 +13,6 @@
 from .adamax import Adamax
 from .asgd import ASGD
 from .sgd import SGD
-from .radam import RAdam
 from .rprop import Rprop
 from .rmsprop import RMSprop
 from .optimizer import Optimizer
@@ -29,7 +28,6 @@
 del adamax
 del asgd
 del sgd
-del radam
 del rprop
 del rmsprop
 del optimizer
diff --git a/torch/optim/__init__.pyi b/torch/optim/__init__.pyi
index e25adc4..f1ea0d3 100644
--- a/torch/optim/__init__.pyi
+++ b/torch/optim/__init__.pyi
@@ -8,7 +8,6 @@
 from .asgd import ASGD as ASGD
 from .lbfgs import LBFGS as LBFGS
 from .optimizer import Optimizer as Optimizer
-from .radam import RAdam as RAdam
 from .rmsprop import RMSprop as RMSprop
 from .rprop import Rprop as Rprop
 from .sgd import SGD as SGD
diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py
index 610d9df..cd5815e 100644
--- a/torch/optim/_functional.py
+++ b/torch/optim/_functional.py
@@ -357,53 +357,3 @@
             ax.add_(param.sub(ax).mul(mu))
         else:
             ax.copy_(param)
-
-
-def radam(params: List[Tensor],
-          grads: List[Tensor],
-          exp_avgs: List[Tensor],
-          exp_avg_sqs: List[Tensor],
-          state_steps: List[int],
-          *,
-          beta1: float,
-          beta2: float,
-          lr: float,
-          weight_decay: float,
-          eps: float):
-    r"""Functional API that performs RAdam algorithm computation.
-
-    See :class:`~torch.optim.Adam` for details.
-    """
-
-    for i, param in enumerate(params):
-        grad = grads[i]
-        exp_avg = exp_avgs[i]
-        exp_avg_sq = exp_avg_sqs[i]
-        step = state_steps[i]
-
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
-
-        if weight_decay != 0:
-            grad = grad.add(param, alpha=weight_decay)
-
-        # Decay the first and second moment running average coefficient
-        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
-        # correcting bias for the first moving moment
-        bias_corrected_exp_avg = exp_avg / bias_correction1
-
-        # maximum length of the approximated SMA
-        rho_inf = 2 / (1 - beta2) - 1
-        # compute the length of the approximated SMA
-        rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
-
-        if rho_t > 5.:
-            # Compute the variance rectification term and update parameters accordingly
-            rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
-            adaptive_lr = math.sqrt(bias_correction2) / exp_avg_sq.sqrt().add_(eps)
-
-            param.add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha=-1.0)
-        else:
-            param.add_(bias_corrected_exp_avg * lr, alpha=-1.0)
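
For reference, the removed functional implementation above performs the variance-rectified update from the paper cited in its docstring. In the notation of the removed code (m_t = exp_avg, v_t = exp_avg_sq, t = step), the update it computes is roughly:

    \rho_\infty = \frac{2}{1-\beta_2} - 1, \qquad
    \rho_t = \rho_\infty - \frac{2\, t\, \beta_2^{t}}{1-\beta_2^{t}}, \qquad
    \hat{m}_t = \frac{m_t}{1-\beta_1^{t}}

    r_t = \sqrt{\frac{(\rho_t-4)(\rho_t-2)\,\rho_\infty}{(\rho_\infty-4)(\rho_\infty-2)\,\rho_t}}

    \theta_t = \theta_{t-1} -
      \begin{cases}
        \mathrm{lr}\cdot r_t \cdot \hat{m}_t \cdot \dfrac{\sqrt{1-\beta_2^{t}}}{\sqrt{v_t}+\epsilon} & \text{if } \rho_t > 5 \\[1ex]
        \mathrm{lr}\cdot \hat{m}_t & \text{otherwise}
      \end{cases}
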
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
deleted file mode 100644
index 37f8708..0000000
--- a/torch/optim/radam.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import torch
-from . import _functional as F
-from .optimizer import Optimizer
-
-
-class RAdam(Optimizer):
-    r"""Implements RAdam algorithm.
-    It has been proposed in `On the variance of the adaptive learning rate and beyond`_.
-    Args:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 2e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-    .. _On the variance of the adaptive learning rate and beyond:
-        https://arxiv.org/pdf/1908.03265.pdf
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=0):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-        if not 0.0 <= weight_decay:
-            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        super(RAdam, self).__init__(params, defaults)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-        Args:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            params_with_grad = []
-            grads = []
-            exp_avgs = []
-            exp_avg_sqs = []
-            max_exp_avg_sqs = []
-            state_steps = []
-            beta1, beta2 = group['betas']
-
-            for p in group['params']:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError('RAdam does not support sparse gradients')
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # Lazy state initialization
-                    if len(state) == 0:
-                        state['step'] = 0
-                        # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                        # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    exp_avgs.append(state['exp_avg'])
-                    exp_avg_sqs.append(state['exp_avg_sq'])
-
-                    # update the steps for each param group update
-                    state['step'] += 1
-                    # record the step after step update
-                    state_steps.append(state['step'])
-
-            F.radam(params_with_grad,
-                    grads,
-                    exp_avgs,
-                    exp_avg_sqs,
-                    state_steps,
-                    beta1=beta1,
-                    beta2=beta2,
-                    lr=group['lr'],
-                    weight_decay=group['weight_decay'],
-                    eps=group['eps'])
-        return loss
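
Before this revert, the class removed above could be driven like any other torch.optim optimizer. A minimal sketch, only valid on commits that still ship torch/optim/radam.py and using a hypothetical model and data:

    import torch
    from torch import nn, optim

    model = nn.Linear(10, 1)  # hypothetical model for illustration
    opt = optim.RAdam(model.parameters(), lr=1e-3,
                      betas=(0.9, 0.999), weight_decay=0.0)

    x, y = torch.randn(32, 10), torch.randn(32, 1)
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()       # applies the rectified update sketched earlier
    opt.zero_grad()
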
diff --git a/torch/optim/radam.pyi b/torch/optim/radam.pyi
deleted file mode 100644
index 1bc77ce..0000000
--- a/torch/optim/radam.pyi
+++ /dev/null
@@ -1,5 +0,0 @@
-from typing import Tuple
-from .optimizer import _params_t, Optimizer
-
-class RAdam(Optimizer):
-    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ...