[PyTorch] AOTI: add CPU fast path in aoti_torch_empty_strided (#110877)

This seems to reduce benchmark time by 15-20%. Supersedes D49835545.

Differential Revision: [D49974460](https://our.internmc.facebook.com/intern/diff/D49974460/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110877
Approved by: https://github.com/chenyang78, https://github.com/jansel, https://github.com/desertfire
ghstack dependencies: #110876
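
For context (not part of the patch), below is a minimal sketch of how AOTI-generated code might call this shim and hit the new CPU branch. The exact argument order and the aoti_torch_dtype_float32() / aoti_torch_device_type_cpu() helpers are assumptions based on the parameter names visible in the hunk, not something introduced by this PR.

    // Sketch only: allocate a 2x3 contiguous float32 CPU tensor through the
    // C shim, which now takes the at::detail::empty_strided_cpu fast path.
    #include <torch/csrc/inductor/aoti_torch/c/shim.h>

    #include <array>
    #include <cstdint>

    int main() {
      std::array<int64_t, 2> sizes{2, 3};
      std::array<int64_t, 2> strides{3, 1};  // contiguous layout for {2, 3}

      AtenTensorHandle handle = nullptr;
      AOTITorchError err = aoti_torch_empty_strided(
          /*ndim=*/2,
          sizes.data(),
          strides.data(),
          /*dtype=*/aoti_torch_dtype_float32(),          // assumed helper
          /*device_type=*/aoti_torch_device_type_cpu(),  // assumed helper
          /*device_index=*/0,
          &handle);
      if (err != AOTI_TORCH_SUCCESS) {
        return 1;
      }

      // The shim returns an owning handle; release it when done.
      aoti_torch_delete_tensor_object(handle);
      return 0;
    }

On the CPU branch, at::detail::empty_strided_cpu allocates directly, skipping the TensorOptions construction and the dispatcher round trip that at::empty_strided incurs; non-CPU devices keep the original path.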
diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp
index 46ebe94..96dd856 100644
--- a/torch/csrc/inductor/aoti_torch/shim_common.cpp
+++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp
@@ -183,12 +183,18 @@
   AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
     c10::IntArrayRef sizes(sizes_ptr, ndim);
     c10::IntArrayRef strides(strides_ptr, ndim);
-    c10::Device device = c10_device(device_type, device_index);
-    c10::TensorOptions options = c10::TensorOptions().device(device).dtype(
-        static_cast<c10::ScalarType>(dtype));
-    at::Tensor* new_tensor =
-        new at::Tensor(at::empty_strided(sizes, strides, options));
-    *ret_new_tensor = tensor_pointer_to_tensor_handle(new_tensor);
+    if (c10::DeviceType(device_type) == c10::DeviceType::CPU) {
+      *ret_new_tensor = tensor_pointer_to_tensor_handle(
+          new at::Tensor(at::detail::empty_strided_cpu(
+              sizes, strides, static_cast<c10::ScalarType>(dtype))));
+    } else {
+      c10::Device device = c10_device(device_type, device_index);
+      c10::TensorOptions options = c10::TensorOptions().device(device).dtype(
+          static_cast<c10::ScalarType>(dtype));
+      at::Tensor* new_tensor =
+          new at::Tensor(at::empty_strided(sizes, strides, options));
+      *ret_new_tensor = tensor_pointer_to_tensor_handle(new_tensor);
+    }
   });
 }