[inductor] A less ambitious way to handle the scalar CPU tensor (#132702)

Fixes #121374

The previous attempt, https://github.com/pytorch/pytorch/pull/131775, tried to convert the 0-dim CPU tensor to a DynamicScalar in the lowering stage, but too many lowering rules are incompatible with that approach. This PR performs the conversion in the codegen stage instead.
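
A minimal sketch of the pattern this PR targets (mirroring the new test added below; the exact repro in #121374 may differ, and `fn`, `x`, `y` are illustrative names):

```python
import torch

def fn(x, y):
    # y.sum() produces a 0-dim CPU tensor that the GPU add must consume;
    # this cross-device scalar is the case the PR makes Inductor handle.
    return x + y.sum()

x = torch.rand([20], device="cuda")  # GPU operand
y = torch.rand([4], device="cpu")    # CPU operand reduced to a 0-dim scalar
out = torch.compile(fn)(x, y)
```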

Pull Request resolved: https://github.com/pytorch/pytorch/pull/132702
Approved by: https://github.com/eellison
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 5e0235a..bc984c6 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10893,6 +10893,28 @@
         _, code = run_and_get_code(fn, x, x2)
         FileCheck().check("aten.view.dtype(reinterpret_tensor").run(code[0])
 
+    @requires_gpu()
+    def test_scalar_cpu_tensor_arg(self):
+        def fn(x, y):
+            return x + y.sum()
+
+        test_dtypes = [
+            torch.float32,
+            torch.float64,
+            torch.float16,
+            torch.bfloat16,
+        ]
+        for cpu_dtype in test_dtypes:
+            x = torch.rand([20], device=GPU_TYPE)
+            y = torch.rand([4], device="cpu", dtype=cpu_dtype)
+            self.common(
+                fn,
+                (x, y),
+                check_lowp=False,
+                copy_to_gpu=False,
+                reference_in_float=False,
+            )
+
     def test_float16_to_int16(self):
         def fn(x):
             x_view = x.view(dtype=torch.int16)
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 6d09528..d116da7 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -350,6 +350,7 @@
         self.graph_input_names: List[str] = []
         self.graph_inputs: Dict[str, TensorBox] = {}
         self.graph_inputs_original: Dict[str, InputBuffer] = {}
+        self.zero_dim_cpu_tensor_list: OrderedSet[str] = OrderedSet()
         self.device_types: OrderedSet[str] = (
             const_module.device_types if const_module else OrderedSet()
         )
@@ -1867,4 +1868,4 @@
             name in self.graph_inputs.keys()
             and self.graph_inputs[name].get_numel() == 1
             and self.graph_inputs[name].get_device().type == "cpu"
-        )
+        ) or name in self.zero_dim_cpu_tensor_list
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index a595630..97953f7 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -1640,7 +1640,7 @@
         )
 
         self.nodes = [self.create_scheduler_node(n) for n in nodes]
-
+        self.update_zero_dim_cpu_tensor()
         # some new constants could have been created above
         self.available_buffer_names.update(V.graph.constants.keys())
         for node in self.nodes:
@@ -3320,6 +3320,20 @@
         assert buf.node is not None
         return buf.node.get_layout()
 
+    def update_zero_dim_cpu_tensor(self) -> None:
+        for node in self.nodes:
+            if node.get_device() and is_gpu(node.get_device().type):
+                for read in node.read_writes.reads:
+                    buffer = V.graph.name_to_buffer.get(read.name)
+                    if (
+                        buffer
+                        and buffer.get_device()
+                        and buffer.get_device().type == "cpu"
+                        and not isinstance(buffer.layout, MultiOutputLayout)
+                        and buffer.get_size() == []
+                    ):
+                        V.graph.zero_dim_cpu_tensor_list.add(read.name)
+
 
 class BaseScheduling:
     @classmethod