Use amax/maximum instead of max in optimizers (#43797)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43797

Replaces the overloaded torch.max calls in the optimizers: Adam and AdamW (amsgrad path) now use torch.maximum for the elementwise maximum, and Adamax uses torch.amax for the dimension-wise reduction, which returns only values and so no longer needs a throwaway index tensor as a second out target.

Reviewed By: malfet

Differential Revision: D23406641

Pulled By: mruberry

fbshipit-source-id: 0cd075124aa6533b21375fe2c90c44a5d05ad6e6
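
Why the rename matters, in brief: torch.max is overloaded (full reduction, dimension-wise reduction returning values and indices, and elementwise binary max), while torch.maximum and torch.amax each cover exactly one of those meanings. Below is a minimal sketch, not part of the patch, of the elementwise case used in adam.py and adamw.py, with hypothetical tensors a and b standing in for max_exp_avg_sq and exp_avg_sq:

```python
import torch

a = torch.tensor([1.0, 4.0, 2.0])   # stand-in for max_exp_avg_sq
b = torch.tensor([3.0, 0.0, 5.0])   # stand-in for exp_avg_sq

# Old call: the two-tensor overload of torch.max is elementwise, but the
# same name also covers full and dimension-wise reductions.
legacy = torch.max(a, b)             # tensor([3., 4., 5.])

# New call: torch.maximum is the dedicated elementwise binary max; the
# optimizers pass out= to update the running maximum in place.
torch.maximum(a, b, out=a)           # a is now tensor([3., 4., 5.])

assert torch.equal(legacy, a)
```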
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 93cf3a5..22a9e38 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -104,7 +104,7 @@
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                 if amsgrad:
                     # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                     # Use the max. for normalizing running avg. of gradient
                     denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                 else:
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index 21cd43d..1bb8423 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -80,7 +80,7 @@
                     exp_inf.mul_(beta2).unsqueeze(0),
                     grad.abs().add_(eps).unsqueeze_(0)
                 ], 0)
-                torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))
+                torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)
 
                 bias_correction = 1 - beta1 ** state['step']
                 clr = group['lr'] / bias_correction
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 5d61c8d..07d5831 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -105,7 +105,7 @@
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                 if amsgrad:
                     # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                     # Use the max. for normalizing running avg. of gradient
                     denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                 else:
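
The adamax.py change is slightly different: the dimension-wise torch.max returns a (values, indices) pair, which is why the old code had to pass a throwaway long tensor as the second out target, whereas torch.amax returns only the values. A minimal sketch, not part of the patch, with a hypothetical stand-in for norm_buf:

```python
import torch

norm_buf = torch.tensor([[1.0, 5.0, 2.0],
                         [4.0, 3.0, 6.0]])   # stand-in for the real norm_buf

# Old call: dimension-wise torch.max yields values and argmax indices, so
# out= needs a (values, indices) tuple even when the indices are unused.
values, indices = torch.max(norm_buf, 0, keepdim=False)

# New call: torch.amax reduces over the dimension and returns only the
# values, so a single out tensor (exp_inf in the optimizer) suffices.
exp_inf = torch.empty(3)
torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)

assert torch.equal(values, exp_inf)          # both are tensor([4., 5., 6.])
```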