Fix perf bug in TransposeOp for CUDA

Summary: The op always allocated a new TensorCPU, which caused a mutex to be acquired in PinnedCPUAllocator. Not much impact, as everyone should use the CUDNN transpose, but good to fix anyway.

Reviewed By: jamesr66a
Differential Revision: D5332858
fbshipit-source-id: 287643df623b7cd59ab1028ed8b2ed1d3c1da44e

commit: ee1f21a53e499a4804a9ed5a3274d03edbaecb03 [log] [tgz]
author: Aapo Kyrola <akyrola@fb.com> Tue Jun 27 15:04:29 2017 -0700
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> Tue Jun 27 15:27:28 2017 -0700
tree: 81c5919c1abac490804b0875e7ccf139a1daf8f7
parent: 81f539a28307aca4d92feb3843f7a650138ab45c [diff]
diff --git a/caffe2/operators/transpose_op.cu b/caffe2/operators/transpose_op.cu
index 275d4fb..ddd2448 100644
--- a/caffe2/operators/transpose_op.cu
+++ b/caffe2/operators/transpose_op.cu

@@ -52,8 +52,8 @@
   // (1) the dimenions of the inputs
   // (2) the dimension of the outputs
   // (3) the axis mapping from inputs to outputs
-  TensorCPU buffer_cpu(vector<int>{3 * ndim});
-  int* buffer_data = buffer_cpu.mutable_data<int>();
+  buffer_cpu_.Resize(3 * ndim);
+  int* buffer_data = buffer_cpu_.mutable_data<int>();
   for (int i = 0; i < ndim; ++i) {
     *(buffer_data++) = input.dim32(i);
   }
@@ -64,7 +64,7 @@
     *(buffer_data++) = axes_[i];
   }
   // Copy the dimension information to GPU.
-  buffer_.CopyFrom(buffer_cpu, &context_);
+  buffer_.CopyFrom(buffer_cpu_, &context_);
   transpose_gpu<T><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS,
                      0, context_.cuda_stream()>>>(
       count, input.template data<T>(), output->template mutable_data<T>(),

diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h
index 14403fe..baefc5f 100644
--- a/caffe2/operators/transpose_op.h
+++ b/caffe2/operators/transpose_op.h

@@ -58,6 +58,7 @@
   // buffer_ is used in TransposeOp<CUDAContext> so we can obtain a consistent
   // buffer on the GPU. It is not used in the CPUContext implementation.
   Tensor<Context> buffer_;
+  TensorCPU buffer_cpu_;
 };
 
 } // namespace caffe2
commit	ee1f21a53e499a4804a9ed5a3274d03edbaecb03	[log] [tgz]
author	Aapo Kyrola <akyrola@fb.com>	Tue Jun 27 15:04:29 2017 -0700
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	Tue Jun 27 15:27:28 2017 -0700
tree	81c5919c1abac490804b0875e7ccf139a1daf8f7
parent	81f539a28307aca4d92feb3843f7a650138ab45c [diff]