Revert D29241736: [pytorch][PR] To add Rectified Adam Algorithm to Optimizers
Test Plan: revert-hammer
Differential Revision:
D29241736 (https://github.com/pytorch/pytorch/commit/0d2a936176ab8c4ecf3651f3360212f088039ff1)
Original commit changeset: 288b9b1f3125
fbshipit-source-id: 56c4ec98647c6f1822b130726741a1c9ca193670
diff --git a/docs/source/optim.rst b/docs/source/optim.rst
index db2ced4..b8ef01f 100644
--- a/docs/source/optim.rst
+++ b/docs/source/optim.rst
@@ -132,7 +132,6 @@
Adamax
ASGD
LBFGS
- RAdam
RMSprop
Rprop
SGD
diff --git a/test/optim/tests.json b/test/optim/tests.json
index 38d0de4..3412eb6 100644
--- a/test/optim/tests.json
+++ b/test/optim/tests.json
@@ -26,17 +26,6 @@
]
},
{
- "algorithm": "radam",
- "config": [
- {},
- {"learningRate": 1e-4},
- {"learningRate": 1e-4, "beta1": 0.92},
- {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96},
- {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3},
- {"learningRate": 1e-4, "weightDecay": 0.1}
- ]
- },
- {
"algorithm": "adamw",
"config": [
{},
diff --git a/test/test_optim.py b/test/test_optim.py
index e0e3902..54ae721 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -533,29 +533,6 @@
with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
optimizer(None, lr=1e-2, betas=(0.0, 1.0))
- def test_radam(self):
- self._test_basic_cases(
- lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3)
- )
- self._test_basic_cases(
- lambda weight, bias: optim.RAdam(
- self._build_params_dict(weight, bias, lr=1e-2),
- lr=1e-3)
- )
- self._test_basic_cases(
- lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3, weight_decay=0.1)
- )
- self._test_basic_cases(
- lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3),
- [lambda opt: ExponentialLR(opt, gamma=0.9),
- lambda opt: ReduceLROnPlateau(opt)]
- )
- with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
- optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
-
- with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
- optim.RAdam(None, lr=1e-2, weight_decay=-1)
-
def test_rmsprop(self):
for optimizer in [optim.RMSprop, optim_mt.RMSprop]:
self._test_basic_cases(
diff --git a/torch/optim/__init__.py b/torch/optim/__init__.py
index 3c22fbd..5d36510 100644
--- a/torch/optim/__init__.py
+++ b/torch/optim/__init__.py
@@ -13,7 +13,6 @@
from .adamax import Adamax
from .asgd import ASGD
from .sgd import SGD
-from .radam import RAdam
from .rprop import Rprop
from .rmsprop import RMSprop
from .optimizer import Optimizer
@@ -29,7 +28,6 @@
del adamax
del asgd
del sgd
-del radam
del rprop
del rmsprop
del optimizer
diff --git a/torch/optim/__init__.pyi b/torch/optim/__init__.pyi
index e25adc4..f1ea0d3 100644
--- a/torch/optim/__init__.pyi
+++ b/torch/optim/__init__.pyi
@@ -8,7 +8,6 @@
from .asgd import ASGD as ASGD
from .lbfgs import LBFGS as LBFGS
from .optimizer import Optimizer as Optimizer
-from .radam import RAdam as RAdam
from .rmsprop import RMSprop as RMSprop
from .rprop import Rprop as Rprop
from .sgd import SGD as SGD
diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py
index 610d9df..cd5815e 100644
--- a/torch/optim/_functional.py
+++ b/torch/optim/_functional.py
@@ -357,53 +357,3 @@
ax.add_(param.sub(ax).mul(mu))
else:
ax.copy_(param)
-
-
-def radam(params: List[Tensor],
- grads: List[Tensor],
- exp_avgs: List[Tensor],
- exp_avg_sqs: List[Tensor],
- state_steps: List[int],
- *,
- beta1: float,
- beta2: float,
- lr: float,
- weight_decay: float,
- eps: float):
- r"""Functional API that performs RAdam algorithm computation.
-
- See :class:`~torch.optim.Adam` for details.
- """
-
- for i, param in enumerate(params):
- grad = grads[i]
- exp_avg = exp_avgs[i]
- exp_avg_sq = exp_avg_sqs[i]
- step = state_steps[i]
-
- bias_correction1 = 1 - beta1 ** step
- bias_correction2 = 1 - beta2 ** step
-
- if weight_decay != 0:
- grad = grad.add(param, alpha=weight_decay)
-
- # Decay the first and second moment running average coefficient
- exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
- exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
- # correcting bias for the first moving moment
- bias_corrected_exp_avg = exp_avg / bias_correction1
-
- # maximum length of the approximated SMA
- rho_inf = 2 / (1 - beta2) - 1
- # compute the length of the approximated SMA
- rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
-
- if rho_t > 5.:
- # Compute the variance rectification term and update parameters accordingly
- rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
- adaptive_lr = math.sqrt(bias_correction2) / exp_avg_sq.sqrt().add_(eps)
-
- param.add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha=-1.0)
- else:
- param.add_(bias_corrected_exp_avg * lr, alpha=-1.0)
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
deleted file mode 100644
index 37f8708..0000000
--- a/torch/optim/radam.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import torch
-from . import _functional as F
-from .optimizer import Optimizer
-
-
-class RAdam(Optimizer):
- r"""Implements RAdam algorithm.
- It has been proposed in `On the variance of the adaptive learning rate and beyond`_.
- Args:
- params (iterable): iterable of parameters to optimize or dicts defining
- parameter groups
- lr (float, optional): learning rate (default: 2e-3)
- betas (Tuple[float, float], optional): coefficients used for computing
- running averages of gradient and its square (default: (0.9, 0.999))
- eps (float, optional): term added to the denominator to improve
- numerical stability (default: 1e-8)
- weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
- .. _On the variance of the adaptive learning rate and beyond:
- https://arxiv.org/pdf/1908.03265.pdf
- """
-
- def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
- weight_decay=0):
- if not 0.0 <= lr:
- raise ValueError("Invalid learning rate: {}".format(lr))
- if not 0.0 <= eps:
- raise ValueError("Invalid epsilon value: {}".format(eps))
- if not 0.0 <= betas[0] < 1.0:
- raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
- if not 0.0 <= betas[1] < 1.0:
- raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
- if not 0.0 <= weight_decay:
- raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
- defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
- super(RAdam, self).__init__(params, defaults)
-
- @torch.no_grad()
- def step(self, closure=None):
- """Performs a single optimization step.
- Args:
- closure (callable, optional): A closure that reevaluates the model
- and returns the loss.
- """
- loss = None
- if closure is not None:
- with torch.enable_grad():
- loss = closure()
-
- for group in self.param_groups:
- params_with_grad = []
- grads = []
- exp_avgs = []
- exp_avg_sqs = []
- max_exp_avg_sqs = []
- state_steps = []
- beta1, beta2 = group['betas']
-
- for p in group['params']:
- if p.grad is not None:
- params_with_grad.append(p)
- if p.grad.is_sparse:
- raise RuntimeError('RAdam does not support sparse gradients')
- grads.append(p.grad)
-
- state = self.state[p]
- # Lazy state initialization
- if len(state) == 0:
- state['step'] = 0
- # Exponential moving average of gradient values
- state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
- # Exponential moving average of squared gradient values
- state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
- exp_avgs.append(state['exp_avg'])
- exp_avg_sqs.append(state['exp_avg_sq'])
-
- # update the steps for each param group update
- state['step'] += 1
- # record the step after step update
- state_steps.append(state['step'])
-
- F.radam(params_with_grad,
- grads,
- exp_avgs,
- exp_avg_sqs,
- state_steps,
- beta1=beta1,
- beta2=beta2,
- lr=group['lr'],
- weight_decay=group['weight_decay'],
- eps=group['eps'])
- return loss
diff --git a/torch/optim/radam.pyi b/torch/optim/radam.pyi
deleted file mode 100644
index 1bc77ce..0000000
--- a/torch/optim/radam.pyi
+++ /dev/null
@@ -1,5 +0,0 @@
-from typing import Tuple
-from .optimizer import _params_t, Optimizer
-
-class RAdam(Optimizer):
- def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ...