[inductor] a less ambitious way to solve the scalar tensor issue (#132702)
Fixes #121374
The previous attempt, https://github.com/pytorch/pytorch/pull/131775, tried to convert the 0-dim CPU tensor to a DynamicScalar during the lowering stage, but too many lowering rules are incompatible with that approach. This PR instead performs the conversion during the codegen stage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132702
Approved by: https://github.com/eellison
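For context, the pattern the new test exercises is a GPU kernel that consumes a 0-dim CPU tensor produced inside the graph. A minimal repro sketch distilled from that test (assuming a CUDA device; the actual test parametrizes over `GPU_TYPE` and several dtypes via `self.common`):

```python
import torch

def fn(x, y):
    # y.sum() yields a 0-dim CPU tensor that the GPU add must consume
    return x + y.sum()

x = torch.rand([20], device="cuda")
y = torch.rand([4], device="cpu", dtype=torch.float16)

# With this PR, the compiled version matches eager instead of failing
compiled = torch.compile(fn)
torch.testing.assert_close(compiled(x, y), fn(x, y))
```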
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 5e0235a..bc984c6 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10893,6 +10893,28 @@
         _, code = run_and_get_code(fn, x, x2)
         FileCheck().check("aten.view.dtype(reinterpret_tensor").run(code[0])

+    @requires_gpu()
+    def test_scalar_cpu_tensor_arg(self):
+        def fn(x, y):
+            return x + y.sum()
+
+        test_dtypes = [
+            torch.float32,
+            torch.float64,
+            torch.float16,
+            torch.bfloat16,
+        ]
+        for cpu_dtype in test_dtypes:
+            x = torch.rand([20], device=GPU_TYPE)
+            y = torch.rand([4], device="cpu", dtype=cpu_dtype)
+            self.common(
+                fn,
+                (x, y),
+                check_lowp=False,
+                copy_to_gpu=False,
+                reference_in_float=False,
+            )
+
     def test_float16_to_int16(self):
         def fn(x):
             x_view = x.view(dtype=torch.int16)
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 6d09528..d116da7 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -350,6 +350,7 @@
         self.graph_input_names: List[str] = []
         self.graph_inputs: Dict[str, TensorBox] = {}
         self.graph_inputs_original: Dict[str, InputBuffer] = {}
+        self.zero_dim_cpu_tensor_list: OrderedSet[str] = OrderedSet()
         self.device_types: OrderedSet[str] = (
             const_module.device_types if const_module else OrderedSet()
         )
@@ -1867,4 +1868,4 @@
             name in self.graph_inputs.keys()
             and self.graph_inputs[name].get_numel() == 1
             and self.graph_inputs[name].get_device().type == "cpu"
-        )
+        ) or name in self.zero_dim_cpu_tensor_list
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index a595630..97953f7 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -1640,7 +1640,7 @@
         )
         self.nodes = [self.create_scheduler_node(n) for n in nodes]
-
+        self.update_zero_dim_cpu_tensor()
         # some new constants could have been created above
         self.available_buffer_names.update(V.graph.constants.keys())
         for node in self.nodes:
@@ -3320,6 +3320,20 @@
         assert buf.node is not None
         return buf.node.get_layout()

+    def update_zero_dim_cpu_tensor(self) -> None:
+        for node in self.nodes:
+            if node.get_device() and is_gpu(node.get_device().type):
+                for read in node.read_writes.reads:
+                    buffer = V.graph.name_to_buffer.get(read.name)
+                    if (
+                        buffer
+                        and buffer.get_device()
+                        and buffer.get_device().type == "cpu"
+                        and not isinstance(buffer.layout, MultiOutputLayout)
+                        and buffer.get_size() == []
+                    ):
+                        V.graph.zero_dim_cpu_tensor_list.add(read.name)
+

 class BaseScheduling:
     @classmethod
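As a reading aid (not part of the patch), here is a self-contained sketch of the predicate that `update_zero_dim_cpu_tensor` applies to each buffer read by a GPU node; `Buf` is a hypothetical stand-in for inductor's buffer objects, introduced only for illustration:

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Buf:
    """Hypothetical stand-in for an inductor buffer: device and symbolic size."""
    device_type: Optional[str]
    size: List[int]
    is_multi_output: bool = False  # stands in for the MultiOutputLayout check

def is_zero_dim_cpu_tensor(buf: Optional[Buf]) -> bool:
    # Mirrors the condition in update_zero_dim_cpu_tensor(): a real buffer,
    # resident on CPU, not a MultiOutputLayout, with an empty (0-dim) size.
    return (
        buf is not None
        and buf.device_type == "cpu"
        and not buf.is_multi_output
        and buf.size == []
    )

assert is_zero_dim_cpu_tensor(Buf("cpu", []))       # scalar CPU tensor -> recorded
assert not is_zero_dim_cpu_tensor(Buf("cpu", [4]))  # 1-D CPU tensor -> stays a tensor arg
assert not is_zero_dim_cpu_tensor(Buf("cuda", []))  # GPU scalar -> not affected
```

Names recorded this way then satisfy the extended check in graph.py above, presumably so that codegen treats them like the numel-1 CPU graph inputs already handled there, passing the value to the kernel as a scalar rather than as a tensor argument.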