fix perf bug in TransposeOp for CUDA

Summary:
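The CUDA TransposeOp always allocated a new TensorCPU as its staging buffer,
which caused the mutex in PinnedCPUAllocator to be acquired. The staging buffer
is now a member (buffer_cpu_) that is resized and reused instead.
The impact is small, since everyone should be using the CUDNN transpose, but it
is good to fix anyway.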

Reviewed By: jamesr66a

Differential Revision: D5332858

fbshipit-source-id: 287643df623b7cd59ab1028ed8b2ed1d3c1da44e
diff --git a/caffe2/operators/transpose_op.cu b/caffe2/operators/transpose_op.cu
index 275d4fb..ddd2448 100644
--- a/caffe2/operators/transpose_op.cu
+++ b/caffe2/operators/transpose_op.cu
@@ -52,8 +52,8 @@
   // (1) the dimenions of the inputs
   // (2) the dimension of the outputs
   // (3) the axis mapping from inputs to outputs
-  TensorCPU buffer_cpu(vector<int>{3 * ndim});
-  int* buffer_data = buffer_cpu.mutable_data<int>();
+  buffer_cpu_.Resize(3 * ndim);
+  int* buffer_data = buffer_cpu_.mutable_data<int>();
   for (int i = 0; i < ndim; ++i) {
     *(buffer_data++) = input.dim32(i);
   }
@@ -64,7 +64,7 @@
     *(buffer_data++) = axes_[i];
   }
   // Copy the dimension information to GPU.
-  buffer_.CopyFrom(buffer_cpu, &context_);
+  buffer_.CopyFrom(buffer_cpu_, &context_);
   transpose_gpu<T><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS,
                      0, context_.cuda_stream()>>>(
       count, input.template data<T>(), output->template mutable_data<T>(),
diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h
index 14403fe..baefc5f 100644
--- a/caffe2/operators/transpose_op.h
+++ b/caffe2/operators/transpose_op.h
@@ -58,6 +58,7 @@
   // buffer_ is used in TransposeOp<CUDAContext> so we can obtain a consistent
   // buffer on the GPU. It is not used in the CPUContext implementation.
   Tensor<Context> buffer_;
+  TensorCPU buffer_cpu_;
 };
 
 } // namespace caffe2