[PyTorch] AOTI: add CPU fast path in aoti_torch_empty_strided (#110877)
For CPU tensors, the shim now constructs the tensor via at::detail::empty_strided_cpu directly instead of building full TensorOptions and going through the dispatched at::empty_strided. This seems to reduce benchmark time by 15-20%. Supersedes D49835545.
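
For illustration only, a minimal sketch of how a caller might exercise this entry point through the C shim. It assumes the declarations in torch/csrc/inductor/aoti_torch/c/shim.h (aoti_torch_empty_strided, the aoti_torch_device_type_cpu / aoti_torch_dtype_float32 helpers, AOTI_TORCH_SUCCESS, and aoti_torch_delete_tensor_object); the exact helper names and argument order follow the shim header and are not part of this change:

    #include <torch/csrc/inductor/aoti_torch/c/shim.h>

    void make_cpu_buffer() {
      // Allocate a contiguous 2x3 float32 tensor on CPU; with this change the
      // shim takes the empty_strided_cpu fast path for the CPU device type.
      const int64_t sizes[] = {2, 3};
      const int64_t strides[] = {3, 1};
      AtenTensorHandle handle = nullptr;
      AOTITorchError err = aoti_torch_empty_strided(
          /*ndim=*/2,
          sizes,
          strides,
          aoti_torch_dtype_float32(),
          aoti_torch_device_type_cpu(),
          /*device_index=*/0,
          &handle);
      if (err == AOTI_TORCH_SUCCESS) {
        // ... use the tensor through other shim calls, then release it
        aoti_torch_delete_tensor_object(handle);
      }
    }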
Differential Revision: [D49974460](https://our.internmc.facebook.com/intern/diff/D49974460/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110877
Approved by: https://github.com/chenyang78, https://github.com/jansel, https://github.com/desertfire
ghstack dependencies: #110876
diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp
index 46ebe94..96dd856 100644
--- a/torch/csrc/inductor/aoti_torch/shim_common.cpp
+++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp
@@ -183,12 +183,18 @@
   AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
     c10::IntArrayRef sizes(sizes_ptr, ndim);
     c10::IntArrayRef strides(strides_ptr, ndim);
-    c10::Device device = c10_device(device_type, device_index);
-    c10::TensorOptions options = c10::TensorOptions().device(device).dtype(
-        static_cast<c10::ScalarType>(dtype));
-    at::Tensor* new_tensor =
-        new at::Tensor(at::empty_strided(sizes, strides, options));
-    *ret_new_tensor = tensor_pointer_to_tensor_handle(new_tensor);
+    if (c10::DeviceType(device_type) == c10::DeviceType::CPU) {
+      *ret_new_tensor = tensor_pointer_to_tensor_handle(
+          new at::Tensor(at::detail::empty_strided_cpu(
+              sizes, strides, static_cast<c10::ScalarType>(dtype))));
+    } else {
+      c10::Device device = c10_device(device_type, device_index);
+      c10::TensorOptions options = c10::TensorOptions().device(device).dtype(
+          static_cast<c10::ScalarType>(dtype));
+      at::Tensor* new_tensor =
+          new at::Tensor(at::empty_strided(sizes, strides, options));
+      *ret_new_tensor = tensor_pointer_to_tensor_handle(new_tensor);
+    }
   });
 }