Use amax/maximum instead of max in optimizers (#43797)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43797

Replaces the overloaded torch.max calls in the optimizers: Adam and AdamW (amsgrad path) now use torch.maximum for the elementwise maximum, and Adamax uses torch.amax for the dimension-wise reduction, which returns only values and so no longer needs a throwaway index tensor as a second out target.

Reviewed By: malfet

Differential Revision: D23406641

Pulled By: mruberry

fbshipit-source-id: 0cd075124aa6533b21375fe2c90c44a5d05ad6e6
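
Why the rename matters, in brief: torch.max is overloaded (full reduction, dimension-wise reduction returning values and indices, and elementwise binary max), while torch.maximum and torch.amax each cover exactly one of those meanings. Below is a minimal sketch, not part of the patch, of the elementwise case used in adam.py and adamw.py, with hypothetical tensors a and b standing in for max_exp_avg_sq and exp_avg_sq:

```python
import torch

a = torch.tensor([1.0, 4.0, 2.0])   # stand-in for max_exp_avg_sq
b = torch.tensor([3.0, 0.0, 5.0])   # stand-in for exp_avg_sq

# Old call: the two-tensor overload of torch.max is elementwise, but the
# same name also covers full and dimension-wise reductions.
legacy = torch.max(a, b)             # tensor([3., 4., 5.])

# New call: torch.maximum is the dedicated elementwise binary max; the
# optimizers pass out= to update the running maximum in place.
torch.maximum(a, b, out=a)           # a is now tensor([3., 4., 5.])

assert torch.equal(legacy, a)
```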
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 93cf3a5..22a9e38 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -104,7 +104,7 @@
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                 if amsgrad:
                     # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                     # Use the max. for normalizing running avg. of gradient
                     denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                 else:
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index 21cd43d..1bb8423 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -80,7 +80,7 @@
                     exp_inf.mul_(beta2).unsqueeze(0),
                     grad.abs().add_(eps).unsqueeze_(0)
                 ], 0)
-                torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))
+                torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)
 
                 bias_correction = 1 - beta1 ** state['step']
                 clr = group['lr'] / bias_correction
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 5d61c8d..07d5831 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -105,7 +105,7 @@
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                 if amsgrad:
                     # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                     # Use the max. for normalizing running avg. of gradient
                     denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                 else:
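
The adamax.py change is slightly different: the dimension-wise torch.max returns a (values, indices) pair, which is why the old code had to pass a throwaway long tensor as the second out target, whereas torch.amax returns only the values. A minimal sketch, not part of the patch, with a hypothetical stand-in for norm_buf:

```python
import torch

norm_buf = torch.tensor([[1.0, 5.0, 2.0],
                         [4.0, 3.0, 6.0]])   # stand-in for the real norm_buf

# Old call: dimension-wise torch.max yields values and argmax indices, so
# out= needs a (values, indices) tuple even when the indices are unused.
values, indices = torch.max(norm_buf, 0, keepdim=False)

# New call: torch.amax reduces over the dimension and returns only the
# values, so a single out tensor (exp_inf in the optimizer) suffices.
exp_inf = torch.empty(3)
torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)

assert torch.equal(values, exp_inf)          # both are tensor([4., 5., 6.])
```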