Revert "fix Invalid call to aoti_torch_tensor_copy_ #123039 (#124037)"

This reverts commit f9379ebbbf1369aad8179cac4a2eb7d72f25739e.

Reverted https://github.com/pytorch/pytorch/pull/124037 on behalf of https://github.com/jeanschmidt because it introduced benchmark regressions; see D56623194 for more details ([comment](https://github.com/pytorch/pytorch/pull/124037#issuecomment-2079574308))
diff --git a/test/inductor/test_cuda_cpp_wrapper.py b/test/inductor/test_cuda_cpp_wrapper.py
index fa717ab..b662e24 100644
--- a/test/inductor/test_cuda_cpp_wrapper.py
+++ b/test/inductor/test_cuda_cpp_wrapper.py
@@ -109,7 +109,9 @@
         test_failures_cuda_wrapper[
             f"{test_name}_dynamic_shapes"
         ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=False)
-    skip_list = []
+    skip_list = [
+        "test_multi_device_cuda",
+    ]
     for test_name in skip_list:
         test_failures_cuda_wrapper[test_name] = test_torchinductor.TestFailure(
             ("cuda_wrapper",), is_skip=True
diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py
index f50e258..95e4ef3 100644
--- a/torch/_inductor/codegen/cpp_wrapper_cpu.py
+++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py
@@ -896,11 +896,9 @@
     @cache_on_self
     def get_output_refs(self):
         return [
-            (
-                f"torch::tensor({x.codegen_reference(self.wrapper_call)})"
-                if isinstance(x, ir.ShapeAsConstantBuffer) and not config.abi_compatible
-                else x.codegen_reference(self.wrapper_call)
-            )
+            f"torch::tensor({x.codegen_reference(self.wrapper_call)})"
+            if isinstance(x, ir.ShapeAsConstantBuffer) and not config.abi_compatible
+            else x.codegen_reference(self.wrapper_call)
             for x in V.graph.graph_outputs
         ]
 
@@ -1100,11 +1098,9 @@
             outputs_str = "output_tensors"
         else:
             outputs = [
-                (
-                    f"output_tensors[{i}]"
-                    if self.output_is_tensor[i]
-                    else f"output_tensors[{i}].item()"
-                )
+                f"output_tensors[{i}]"
+                if self.output_is_tensor[i]
+                else f"output_tensors[{i}].item()"
                 for i in range(len(V.graph.graph_outputs))
             ]
             outputs_str = f"[{', '.join(outputs)}]"
@@ -1430,7 +1426,6 @@
             and ir.is_contiguous_strides_for_shape(
                 buffer.get_stride(), buffer.get_size()
             )
-            and not buffer.is_extern()
         )
 
     def make_buffer_free(self, buffer):