Use amax/maximum instead of max in optimizers (#43797)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43797
Reviewed By: malfet
Differential Revision: D23406641
Pulled By: mruberry
fbshipit-source-id: 0cd075124aa6533b21375fe2c90c44a5d05ad6e6
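`torch.max` is overloaded: called with two tensors it computes an elementwise maximum, and called with a `dim` it performs a reduction that also returns indices. `torch.maximum` and `torch.amax` are the unambiguous, NumPy-style names for those two behaviours, and this patch moves the optimizers onto them. A minimal sketch of the difference, using illustrative tensor values (not part of the patch):

```python
import torch

a = torch.tensor([1.0, 4.0, 2.0])
b = torch.tensor([3.0, 0.0, 5.0])

# Elementwise maximum of two tensors: torch.maximum names what the
# two-tensor overload of torch.max also computes.
print(torch.maximum(a, b))    # tensor([3., 4., 5.])
print(torch.max(a, b))        # same values, older overloaded spelling

# Reduction over a dimension: torch.max(t, dim) returns (values, indices),
# torch.amax(t, dim) returns only the values.
t = torch.stack([a, b])       # shape (2, 3)
values, indices = torch.max(t, 0)
print(values, indices)        # tensor([3., 4., 5.]) tensor([1, 0, 1])
print(torch.amax(t, 0))       # tensor([3., 4., 5.])
```

The diffs below are a mechanical rename: the computed values are unchanged, only the call names differ.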
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 93cf3a5..22a9e38 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -104,7 +104,7 @@
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
- torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+ torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
# Use the max. for normalizing running avg. of gradient
denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
else:
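The AMSGrad branch above keeps a running elementwise maximum of the second-moment estimate and updates it in place by writing the result back into `max_exp_avg_sq` through `out=`. A small sketch of that pattern with illustrative values (not part of the patch):

```python
import torch

# Illustrative stand-ins for the optimizer state tensors.
max_exp_avg_sq = torch.tensor([0.5, 2.0, 1.0])   # running maximum so far
exp_avg_sq = torch.tensor([1.0, 1.5, 3.0])       # current 2nd-moment estimate

# Elementwise maximum written back into max_exp_avg_sq, so the state tensor
# is updated in place instead of allocating a new one.
torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
print(max_exp_avg_sq)   # tensor([1., 2., 3.])
```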
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index 21cd43d..1bb8423 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -80,7 +80,7 @@
exp_inf.mul_(beta2).unsqueeze(0),
grad.abs().add_(eps).unsqueeze_(0)
], 0)
- torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))
+ torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)
bias_correction = 1 - beta1 ** state['step']
clr = group['lr'] / bias_correction
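The adamax change is slightly different: the reduction form of `torch.max` returns `(values, indices)`, so the old call had to allocate a throwaway `long` tensor just to receive indices it never used, while `torch.amax` produces only the values and writes them straight into `exp_inf`. A sketch contrasting the two calls with illustrative values (not part of the patch):

```python
import torch

norm_buf = torch.tensor([[2.0, 0.5],
                         [1.0, 3.0]])
exp_inf = torch.empty(2)

# Old form: the dim-reduction overload of torch.max yields (values, indices),
# so an unused index tensor had to be passed just to satisfy the out= tuple.
torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))

# New form: torch.amax produces only the values, written directly into exp_inf.
torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)
print(exp_inf)   # tensor([2., 3.])
```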
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 5d61c8d..07d5831 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -105,7 +105,7 @@
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
- torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+ torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
# Use the max. for normalizing running avg. of gradient
denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
else: