Some fixes to smooth_l1_loss (#45532)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45532

- updated documentation
- explicitly rejecting negative values for beta (previously the result
  was incorrect); see the first sketch below
- removing the default value for beta in the backward function, since it's
  only used internally by autograd (as per convention); see the gradcheck
  sketch below
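
A minimal sketch of the new user-facing behavior (illustrative only, and
assuming a PyTorch build that includes this change; beta is exposed through
torch.nn.functional.smooth_l1_loss):

    import torch
    import torch.nn.functional as F

    input = torch.randn(4)
    target = torch.randn(4)

    # beta == 0 now dispatches to the L1 loss rather than running the
    # smooth-L1 kernel with a degenerate beta.
    assert torch.allclose(
        F.smooth_l1_loss(input, target, beta=0.0),
        F.l1_loss(input, target))

    # A negative beta previously produced an incorrect result; it now raises.
    try:
        F.smooth_l1_loss(input, target, beta=-1.0)
    except RuntimeError:
        pass  # "smooth_l1_loss does not support negative values for beta."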

Test Plan: Imported from OSS

Reviewed By: gchanan

Differential Revision: D24002415

Pulled By: bdhirsh

fbshipit-source-id: 980c141019ec2d437b771ee11fc1cec4b1fcfb48
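
Since the backward op no longer carries a default for beta, autograd must
pass beta through explicitly. A quick gradcheck sketch (again illustrative,
using only public APIs) confirming the backward path works with an explicit
beta:

    import torch
    from torch.autograd import gradcheck

    x = torch.randn(4, dtype=torch.double, requires_grad=True)
    t = torch.randn(4, dtype=torch.double)

    # autograd supplies beta to smooth_l1_loss_backward internally;
    # gradcheck raises if the analytic gradients are wrong.
    assert gradcheck(
        lambda inp: torch.nn.functional.smooth_l1_loss(inp, t, beta=0.5),
        (x,))
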
diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp
index 3563a74..2a3e97c 100644
--- a/aten/src/ATen/native/Loss.cpp
+++ b/aten/src/ATen/native/Loss.cpp
@@ -296,8 +296,10 @@
 }
 
 Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction, double beta) {
-  if (beta <= 0)
+  TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.")
+  if (beta == 0) {
       return at::native::l1_loss(input, target, reduction);
+  }
   Tensor loss;
   auto iter = TensorIterator::binary_op(loss, input, target);
   smooth_l1_stub(iter.device_type(), iter, beta);
@@ -305,8 +307,10 @@
 }
 
 Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
-  if (beta <= 0)
+  TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.")
+  if (beta == 0) {
       return at::native::l1_loss_out(result, input, target, reduction);
+  }
   if (reduction != Reduction::None) {
     Tensor loss;
     auto iter = TensorIterator::binary_op(loss, input, target);
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 43df603..2aa5cbf 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6794,13 +6794,13 @@
   dispatch:
     CPU, CUDA: smooth_l1_loss
 
-- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0, *, Tensor(a!) grad_input) -> Tensor(a!)
+- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: smooth_l1_loss_backward_out
     CUDA: smooth_l1_loss_backward_out
 
-- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0) -> Tensor
+- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
   use_c10_dispatcher: full
   python_module: nn
 
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 707baa3..92ee277 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1589,7 +1589,7 @@
   grad_output: replication_pad3d(grad, padding)
   self: zeros_like(self)
 
-- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0) -> Tensor
+- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
   grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta)
   self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
   target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index bd3be4e..e408bde 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -758,7 +758,7 @@
 
 class SmoothL1Loss(_Loss):
     r"""Creates a criterion that uses a squared term if the absolute
-    element-wise error falls below 1 and an L1 term otherwise.
+    element-wise error falls below beta and an L1 term otherwise.
     It is less sensitive to outliers than the `MSELoss` and in some cases
     prevents exploding gradients (e.g. see `Fast R-CNN` paper by Ross Girshick).
     Also known as the Huber loss:
@@ -780,6 +780,9 @@
 
     beta is an optional parameter that defaults to 1.
 
+    Note: When beta is set to 0, this is equivalent to :class:`L1Loss`.
+    Passing a negative value for beta will result in an exception.
+
     The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
 
     Args: