Don't copy on clamp, clamp_out (#10352)

Summary:
This makes clamp and relu faster (fixes #10276).

The extra copying was introduced when clamp moved to ATen and
the in-place _th_clamp_ wrapper was used to forward to TH/THC:
the _out variants first resized result, copied self into it, and
only then clamped in place. We remove that wrapper and add
_th_clamp(_out) instead, so TH/THC writes the clamped values
directly into the output tensor.
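
A minimal sketch of the before/after forwarding pattern, condensed
from the CPU diff below (the real functions also dispatch on NaN
min/max to the one-sided _min/_max variants):

    // Before: the _out variant forwarded through the in-place
    // _th_clamp_ wrapper, paying a full resize + copy of self first.
    Tensor& _clamp_out(Tensor& result, const Tensor& self,
                       Scalar min, Scalar max) {
      result.resize_(self.sizes());
      result.copy_(self);                  // extra full copy of self
      return _th_clamp_(result, min, max); // clamp in place on the copy
    }

    // After: _th_clamp_out lets TH/THC write the clamped values
    // straight into result, with no intermediate copy.
    Tensor& _clamp_out(Tensor& result, const Tensor& self,
                       Scalar min, Scalar max) {
      return _th_clamp_out(result, self, min, max);
    }

The in-place variants reuse the same path by passing self as both
output and input, e.g. _th_clamp_out(self, self, min, max).
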
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10352

Reviewed By: ezyang

Differential Revision: D9233590

Pulled By: SsnL

fbshipit-source-id: 4f86a045498e5e577fb22656c71f171add7ed0ac
diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap
index 33eb98d..983f496 100644
--- a/aten/src/ATen/Declarations.cwrap
+++ b/aten/src/ATen/Declarations.cwrap
@@ -2267,39 +2267,42 @@
         - THTensor* other
 ]]
 [[
-  name: _th_clamp_
+  name: _th_clamp
   cname: clamp
   variants:
     - method
     - function
   return: argument 0
   arguments:
-    - THTensor* self
+    - arg: THTensor* result
+      output: True
     - THTensor* self
     - real min
     - real max
 ]]
 [[
-  name: _th_clamp_min_
+  name: _th_clamp_min
   cname: cmaxValue
   variants:
     - method
     - function
   return: argument 0
   arguments:
-    - THTensor* self
+    - arg: THTensor* result
+      output: True
     - THTensor* self
     - real min
 ]]
 [[
-  name: _th_clamp_max_
+  name: _th_clamp_max
   cname: cminValue
   variants:
     - method
     - function
   return: argument 0
   arguments:
-    - THTensor* self
+    - arg: THTensor* result
+      output: True
     - THTensor* self
     - real max
 ]]
diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp
index 2e349aa..2d8a3e2 100644
--- a/aten/src/ATen/native/UnaryOps.cpp
+++ b/aten/src/ATen/native/UnaryOps.cpp
@@ -47,11 +47,11 @@
 
 Tensor& _clamp__cpu(Tensor& self, Scalar min, Scalar max) {
   if (!std::isnan(min.toDouble()) && !std::isnan(max.toDouble())) {
-    return _th_clamp_(self, min, max);
+    return _th_clamp_out(self, self, min, max);
   } else if (std::isnan(min.toDouble())) {
-    return _th_clamp_max_(self, max);
+    return _th_clamp_max_out(self, self, max);
   } else if (std::isnan(max.toDouble())) {
-    return _th_clamp_min_(self, min);
+    return _th_clamp_min_out(self, self, min);
   } else {
     return self;
   }
@@ -62,36 +62,30 @@
     const Tensor& self,
     Scalar min,
     Scalar max) {
-  result.resize_(self.sizes());
-  result.copy_(self);
   if (!std::isnan(min.toDouble()) && !std::isnan(max.toDouble())) {
-    _th_clamp_(result, min, max);
+    _th_clamp_out(result, self, min, max);
   } else if (std::isnan(min.toDouble())) {
-    _th_clamp_max_(result, max);
+    _th_clamp_max_out(result, self, max);
   } else if (std::isnan(max.toDouble())) {
-    _th_clamp_min_(result, min);
+    _th_clamp_min_out(result, self, min);
   }
   return result;
 }
 
 Tensor& _clamp_max__cpu(Tensor& self, Scalar max) {
-  return _th_clamp_max_(self, max);
+  return _th_clamp_max_out(self, self, max);
 }
 
 Tensor& _clamp_max_out_cpu(Tensor& result, const Tensor& self, Scalar max) {
-  result.resize_(self.sizes());
-  result.copy_(self);
-  return _th_clamp_max_(result, max);
+  return _th_clamp_max_out(result, self, max);
 }
 
 Tensor& _clamp_min__cpu(Tensor& self, Scalar min) {
-  return _th_clamp_min_(self, min);
+  return _th_clamp_min_out(self, self, min);
 }
 
 Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) {
-  result.resize_(self.sizes());
-  result.copy_(self);
-  return _th_clamp_min_(result, min);
+  return _th_clamp_min_out(result, self, min);
 }
 
 Tensor& fill_(Tensor& self, Scalar value) {
diff --git a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp
index 1380039..b55d561 100644
--- a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp
+++ b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp
@@ -4,11 +4,11 @@
 
 Tensor& _clamp__cuda(Tensor& self, Scalar min, Scalar max) {
   if (!std::isnan(min.toDouble()) && !std::isnan(max.toDouble())) {
-    return _th_clamp_(self, min, max);
+    return _th_clamp_out(self, self, min, max);
   } else if (std::isnan(min.toDouble())) {
-    return _th_clamp_max_(self, max);
+    return _th_clamp_max_out(self, self, max);
   } else if (std::isnan(max.toDouble())) {
-    return _th_clamp_min_(self, min);
+    return _th_clamp_min_out(self, self, min);
   } else {
     return self;
   }
@@ -19,36 +19,30 @@
     const Tensor& self,
     Scalar min,
     Scalar max) {
-  result.resize_(self.sizes());
-  result.copy_(self);
   if (!std::isnan(min.toDouble()) && !std::isnan(max.toDouble())) {
-    _th_clamp_(result, min, max);
+    _th_clamp_out(result, self, min, max);
   } else if (std::isnan(min.toDouble())) {
-    _th_clamp_max_(result, max);
+    _th_clamp_max_out(result, self, max);
   } else if (std::isnan(max.toDouble())) {
-    _th_clamp_min_(result, min);
+    _th_clamp_min_out(result, self, min);
   }
   return result;
 }
 
 Tensor& _clamp_max__cuda(Tensor& self, Scalar max) {
-  return _th_clamp_max_(self, max);
+  return _th_clamp_max_out(self, self, max);
 }
 
 Tensor& _clamp_max_out_cuda(Tensor& result, const Tensor& self, Scalar max) {
-  result.resize_(self.sizes());
-  result.copy_(self);
-  return _th_clamp_max_(result, max);
+  return _th_clamp_max_out(result, self, max);
 }
 
 Tensor& _clamp_min__cuda(Tensor& self, Scalar min) {
-  return _th_clamp_min_(self, min);
+  return _th_clamp_min_out(self, self, min);
 }
 
 Tensor& _clamp_min_out_cuda(Tensor& result, const Tensor& self, Scalar min) {
-  result.resize_(self.sizes());
-  result.copy_(self);
-  return _th_clamp_min_(result, min);
+  return _th_clamp_min_out(result, self, min);
 }
 
 // These are just forwarding stubs