fix perf bug in TransposeOp for CUDA
Summary:
It always allocated a new TensorCPU on each call, which caused the mutex in PinnedCPUAllocator to be acquired every time.
The impact is small because everyone should be using the CUDNN transpose, but it is good to fix anyway.
Reviewed By: jamesr66a
Differential Revision: D5332858
fbshipit-source-id: 287643df623b7cd59ab1028ed8b2ed1d3c1da44e
diff --git a/caffe2/operators/transpose_op.cu b/caffe2/operators/transpose_op.cu
index 275d4fb..ddd2448 100644
--- a/caffe2/operators/transpose_op.cu
+++ b/caffe2/operators/transpose_op.cu
@@ -52,8 +52,8 @@
// (1) the dimenions of the inputs
// (2) the dimension of the outputs
// (3) the axis mapping from inputs to outputs
- TensorCPU buffer_cpu(vector<int>{3 * ndim});
- int* buffer_data = buffer_cpu.mutable_data<int>();
+ buffer_cpu_.Resize(3 * ndim);
+ int* buffer_data = buffer_cpu_.mutable_data<int>();
for (int i = 0; i < ndim; ++i) {
*(buffer_data++) = input.dim32(i);
}
@@ -64,7 +64,7 @@
*(buffer_data++) = axes_[i];
}
// Copy the dimension information to GPU.
- buffer_.CopyFrom(buffer_cpu, &context_);
+ buffer_.CopyFrom(buffer_cpu_, &context_);
transpose_gpu<T><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
count, input.template data<T>(), output->template mutable_data<T>(),
diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h
index 14403fe..baefc5f 100644
--- a/caffe2/operators/transpose_op.h
+++ b/caffe2/operators/transpose_op.h
@@ -58,6 +58,7 @@
// buffer_ is used in TransposeOp<CUDAContext> so we can obtain a consistent
// buffer on the GPU. It is not used in the CPUContext implementation.
Tensor<Context> buffer_;
+ TensorCPU buffer_cpu_;
};
} // namespace caffe2