Speed up fusion compiler tensor allocation (#13914)

Summary:
Previously the fusion compiler allocated an empty tensor for each output and then
resized it to the correct shape. This PR changes the fusion compiler to allocate
each output tensor at the correct size up front. The difference between the two
approaches for a single tensor is around 400ns; for something like LSTMCell's
FusionGroup, which emits 8 outputs, this is theoretically a ~3us win.
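
As a rough sketch of the two allocation patterns (not the executor code itself;
`sizes` and `opts` are illustrative placeholders), the old path went through a
zero-element tensor plus `resize_`, while the new path allocates at the final
size in one step:

```cpp
#include <ATen/ATen.h>

// Old pattern: allocate a zero-element tensor, then resize it to the real shape.
at::Tensor allocate_then_resize(at::IntArrayRef sizes, const at::TensorOptions& opts) {
  auto t = at::empty({0}, opts);
  t.resize_(sizes);  // second step that this PR removes
  return t;
}

// New pattern: allocate the tensor at its final size directly.
at::Tensor allocate_directly(at::IntArrayRef sizes, const at::TensorOptions& opts) {
  return at::empty(sizes, opts);
}
```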
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13914

Differential Revision: D13046728

Pulled By: zou3519

fbshipit-source-id: e2f28c0dc2ee5bcfee0efe10610039694691415c
diff --git a/torch/csrc/jit/fuser/executor.cpp b/torch/csrc/jit/fuser/executor.cpp
index 808994e..8f4cd70 100644
--- a/torch/csrc/jit/fuser/executor.cpp
+++ b/torch/csrc/jit/fuser/executor.cpp
@@ -168,16 +168,6 @@
 , const int device
 , const at::ArrayRef<at::Tensor>& inputs
 , std::vector<at::Tensor>& outputs) {
-  // Allocates tensors for outputs
-  auto& ref_type = inputs[0].type();
-  outputs.reserve(fusion.outputDesc().size());
-  for (const auto& od : fusion.outputDesc()) {
-    if (device >= 0) // GPU
-      outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type).device_index(device)));
-    else // CPU
-      outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type).device(at::Device{at::DeviceType::CPU})));
-  }
-
   // Fails if fusion and given inputs disagree
   JIT_ASSERT(inputs.size() == fusion.inputDesc().size());
 
@@ -262,17 +252,19 @@
   }
 
   // Adds (flattened) output arguments
+  outputs.reserve(fusion.outputDesc().size());
+  const auto& ref_options = inputs[0].options();
   for (size_t i = 0; i < fusion.outputDesc().size(); ++i) {
     const auto& c = fusion.concatDesc()[i];
-    auto& o = outputs[i];
     if (c.isNoop()) {
-      o.resize_(map_size);
+      outputs.push_back(at::empty(map_size, ref_options));
       addTensorInfo(fusion.outputDesc()[i], outputs[i]);
     } else {
       size_t small_size = map_size[c.dim()];
       std::vector<int64_t> concat_size(map_size.begin(), map_size.end());
       concat_size[c.dim()] = small_size * c.nSubTensors();
-      o.resize_(concat_size);
+      outputs.push_back(at::empty(concat_size, ref_options));
+      const auto& o = outputs[i];
       size_t offset = 0;
       for (size_t j = 0; j < c.nSubTensors(); ++j) {
         // because the concatenated_output stays live, the underlying data