Remove default arguments before calling __torch_dispatch__ (#61123)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61123

This applies the design pattern of removing explicit arguments when they
coincide with the default arguments.  This simplifies the argument
patterns that dispatch kernels receive and makes it easier for us to
maintain BC (since adding a new default argument isn't immediately
BC-breaking for dispatch implementors).
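
For example, aten::add's schema is
`add(Tensor self, Tensor other, *, Scalar alpha=1)`; previously a call
that left alpha at its default arrived in __torch_dispatch__ with three
positional arguments, and after this patch it arrives with two.  A
minimal sketch of observing this (the EchoTensor wrapper below is
illustrative, loosely patterned after the LoggingTensor in the tests,
and is not part of this PR):

    import torch
    from torch.utils._pytree import tree_map

    class EchoTensor(torch.Tensor):
        # Illustrative wrapper subclass: holds a real tensor in `elem`
        # and prints whatever argument pattern the dispatcher hands it.
        @staticmethod
        def __new__(cls, elem):
            r = torch.Tensor._make_subclass(cls, elem)
            r.elem = elem
            return r

        @classmethod
        def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
            print(f"{func}: {len(args)} positional arg(s)")
            def unwrap(e):
                return e.elem if isinstance(e, cls) else e
            def wrap(e):
                return cls(e) if isinstance(e, torch.Tensor) else e
            out = func(*tree_map(unwrap, args),
                       **tree_map(unwrap, kwargs or {}))
            return tree_map(wrap, out)

    x, y = EchoTensor(torch.ones(2)), EchoTensor(torch.ones(2))
    torch.add(x, y)           # alpha at its default: now 2 positional args
    torch.add(x, y, alpha=2)  # non-default alpha still passed: 3 args

Note that alpha=2 still shows up positionally: as the TODO in the C++
change below says, kwargs are never populated for now.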

There is an important companion API which I haven't implemented here
yet: one that takes an incomplete sequence of arguments and fills in
their defaults (in case the user does want normalization).  I plan on
adding that in a future PR.
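
A purely hypothetical sketch of what that normalization helper could
look like (every name below is invented for illustration; none of this
is in this PR):

    REQUIRED = object()

    def fill_defaults(arg_specs, args, kwargs=None):
        # arg_specs: list of (name, default_or_REQUIRED) pairs standing
        # in for the operator schema's argument list.
        kwargs = kwargs or {}
        out = list(args)
        for name, default in arg_specs[len(args):]:
            if name in kwargs:
                out.append(kwargs[name])
            elif default is not REQUIRED:
                out.append(default)
            else:
                raise TypeError(f"missing required argument: {name}")
        return out

    # e.g. aten::add's (self, other, *, alpha=1):
    specs = [("self", REQUIRED), ("other", REQUIRED), ("alpha", 1)]
    fill_defaults(specs, ("x", "y"))  # -> ['x', 'y', 1]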

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Reviewed By: saketh-are

Differential Revision: D29853616

Pulled By: ezyang

fbshipit-source-id: 71c672cb3a7d4d01f838a1c7fcdb75a8ce7d058e
diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index 5a55b8e..5688c75 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -122,7 +122,7 @@
 $2 = input('grad_y')
 $3 = torch._ops.aten.mul($2, $0)
 $4 = torch._ops.aten.mul($2, $0)
-$5 = torch._ops.aten.add($4, $3, 1)''')
+$5 = torch._ops.aten.add($4, $3)''')
 
     def test_out(self) -> None:
         with capture_logs() as logs:
@@ -172,9 +172,9 @@
             def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
                 return "arf"
 
-        self.assertExpectedRaisesInline(
-            RuntimeError, lambda: A(torch.zeros(1)).neg(),
-            """Unable to cast Python instance of type <class 'str'> to C++ type 'at::Tensor'"""
+        # Wobbles depending on NDEBUG mode of pybind11
+        self.assertRaisesRegexp(
+            RuntimeError, "Unable to cast", lambda: A(torch.zeros(1)).neg(),
         )
         self.assertExpectedRaisesInline(
             RuntimeError, lambda: A(torch.zeros(1)).detach(),
@@ -250,7 +250,7 @@
 $3 = input('grad_output')
 $4 = torch._ops.aten.mul($3, tensor(2))
 $5 = torch._ops.aten.mul($4, $0)
-$6 = torch._ops.aten.add_($1, $5, 1)''')
+$6 = torch._ops.aten.add_($1, $5)''')
 
 
 if __name__ == '__main__':
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index c358630..f0cd80d 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -1501,17 +1501,34 @@
   py::gil_scoped_acquire g;
 
   std::vector<py::handle> overloaded_args;
-  auto args = py::reinterpret_steal<py::object>(PyTuple_New(num_arguments));
-  // TODO: actually populate kwargs sometimes?  At the moment, every argument
-  // just gets passed positionally
-  py::dict kwargs;
   // For now, overloads get coalesced.  Might be easier for users if they get
   // overload resolution but is more complicated (need to expose separate
   // functions per overload)
   py::handle torch_api_function = py::module::import("torch").attr("ops").attr(ns).attr(func_name);
   std::string module_name_str = "torch.ops." + ns_str;
 
-  for (int64_t idx = 0; idx < arguments.size(); idx++) {
+  // Pre-scan for arguments that match defaults
+  int64_t default_suffix_len = 0;
+  for (int64_t idx = arguments.size() - 1; idx >= 0; idx--) {
+    const auto& arg = schema.arguments()[idx];
+    if (!arg.default_value().has_value()) {
+      break;
+    }
+    const auto& default_ivalue = *arg.default_value();
+    const auto& ivalue = arguments[idx];
+    if (default_ivalue != ivalue) {
+      break;
+    }
+    default_suffix_len++;
+  }
+
+  auto args = py::reinterpret_steal<py::object>(PyTuple_New(num_arguments - default_suffix_len));
+
+  // TODO: actually populate kwargs sometimes?  At the moment, every argument
+  // just gets passed positionally
+  py::dict kwargs;
+
+  for (int64_t idx = 0; idx < arguments.size() - default_suffix_len; idx++) {
     auto& ivalue = arguments[idx];
     // Search for Tensors (as they may have the torch functions we need)
     if (ivalue.isTensor()) {