Speed up fusion compiler tensor allocation (#13914)

Summary:
Previously the fusion compiler allocated an empty tensor for each output and then
resized it to the correct shape. This PR changes the fusion compiler to allocate
each output tensor at the correct size up front. The difference between the two
approaches for a single tensor is around 400ns; for something like LSTMCell's
FusionGroup, which emits 8 outputs, this is theoretically a ~3us win.
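
As a rough sketch of the two allocation patterns (not the executor code itself;
`sizes` and `opts` are illustrative placeholders), the old path went through a
zero-element tensor plus `resize_`, while the new path allocates at the final
size in one step:

```cpp
#include <ATen/ATen.h>

// Old pattern: allocate a zero-element tensor, then resize it to the real shape.
at::Tensor allocate_then_resize(at::IntArrayRef sizes, const at::TensorOptions& opts) {
  auto t = at::empty({0}, opts);
  t.resize_(sizes);  // second step that this PR removes
  return t;
}

// New pattern: allocate the tensor at its final size directly.
at::Tensor allocate_directly(at::IntArrayRef sizes, const at::TensorOptions& opts) {
  return at::empty(sizes, opts);
}
```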
Pull Request resolved: https://github.com/pytorch/pytorch/pull/13914

Differential Revision: D13046728

Pulled By: zou3519

fbshipit-source-id: e2f28c0dc2ee5bcfee0efe10610039694691415c
diff --git a/torch/csrc/jit/fuser/executor.cpp b/torch/csrc/jit/fuser/executor.cpp
index 808994e..8f4cd70 100644
--- a/torch/csrc/jit/fuser/executor.cpp
+++ b/torch/csrc/jit/fuser/executor.cpp
@@ -168,16 +168,6 @@
 , const int device
 , const at::ArrayRef<at::Tensor>& inputs
 , std::vector<at::Tensor>& outputs) {
-  // Allocates tensors for outputs
-  auto& ref_type = inputs[0].type();
-  outputs.reserve(fusion.outputDesc().size());
-  for (const auto& od : fusion.outputDesc()) {
-    if (device >= 0) // GPU
-      outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type).device_index(device)));
-    else // CPU
-      outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type).device(at::Device{at::DeviceType::CPU})));
-  }
-
   // Fails if fusion and given inputs disagree
   JIT_ASSERT(inputs.size() == fusion.inputDesc().size());
 
@@ -262,17 +252,19 @@
   }
 
   // Adds (flattened) output arguments
+  outputs.reserve(fusion.outputDesc().size());
+  const auto& ref_options = inputs[0].options();
   for (size_t i = 0; i < fusion.outputDesc().size(); ++i) {
     const auto& c = fusion.concatDesc()[i];
-    auto& o = outputs[i];
     if (c.isNoop()) {
-      o.resize_(map_size);
+      outputs.push_back(at::empty(map_size, ref_options));
       addTensorInfo(fusion.outputDesc()[i], outputs[i]);
     } else {
       size_t small_size = map_size[c.dim()];
       std::vector<int64_t> concat_size(map_size.begin(), map_size.end());
       concat_size[c.dim()] = small_size * c.nSubTensors();
-      o.resize_(concat_size);
+      outputs.push_back(at::empty(concat_size, ref_options));
+      const auto& o = outputs[i];
       size_t offset = 0;
       for (size_t j = 0; j < c.nSubTensors(); ++j) {
         // because the concatenated_output stays live, the underlying data