Clean up CUDA code base: remove unnecessary device synchronizations
diff --git a/THCTensorConv.cu b/THCTensorConv.cu
index 6dc615c..ba5cbb8 100644
--- a/THCTensorConv.cu
+++ b/THCTensorConv.cu
@@ -352,9 +352,6 @@
dim3 blocks(nOutputPlane,yblocks);
dim3 threads(32,8);
- // sync any previous kernel exec
- cudaDeviceSynchronize();
-
// convolution: xcorr2 or conv2
if (type[1] == 'x') {
if ((nKernelCols == 3) && (nKernelRows == 3))
@@ -485,8 +482,7 @@
srow, scol);
}
- // sync & clean
- cudaDeviceSynchronize();
+ // clean
if (*type != 'f') THCudaTensor_free(input);
THCudaTensor_free(kernel);
@@ -588,9 +584,6 @@
dim3 blocks(nOutputPlane*nbatch,yblocks);
dim3 threads(32,8);
- // sync any previous kernel exec
- cudaDeviceSynchronize();
-
// convolution: xcorr2 or conv2
if (type[1] == 'x') {
if ((nKernelCols == 3) && (nKernelRows == 3))
@@ -721,8 +714,7 @@
srow, scol);
}
- // sync & clean
- cudaDeviceSynchronize();
+ // clean
if (*type != 'f') THCudaTensor_free(input);
THCudaTensor_free(kernel);
@@ -797,17 +789,13 @@
dim3 blocks(nKernelPlane, nInputPlane);
dim3 threads(128/nOutputRows, nOutputRows);
- // sync previous jobs
- cudaDeviceSynchronize();
-
// compute rev conv
conv2genericrev <<<blocks, threads>>> (input_data, kernel_data, output_data,
nInputPlane, nInputRows, nInputCols,
nKernelPlane, nKernelRows, nKernelCols,
alpha, srow, scol);
- // sync & clean
- cudaDeviceSynchronize();
+ // clean
THCudaTensor_free(input);
THCudaTensor_free(kernel);
@@ -869,9 +857,6 @@
float *kernel_data = THCudaTensor_data(kernel);
float *output_data = THCudaTensor_data(output);
- // sync previous jobs
- cudaDeviceSynchronize();
-
// kernel is called multiple times
// (the arbitrary split below is just here to make sure we dont go over 256 threads)
for (int sl=0; sl<nbatch; sl+=6) {
@@ -891,8 +876,7 @@
alpha, srow, scol);
}
- // sync & clean
- cudaDeviceSynchronize();
+ // clean
THCudaTensor_free(input);
THCudaTensor_free(kernel);
@@ -1137,8 +1121,7 @@
block_height = 1;
dim3 blocks(nOutputPlane,block_height);
dim3 threads(nthreads_x,nthreads_y);
- // sync any previous kernel exec
- cudaDeviceSynchronize();
+
if ((nKernelCols == 3) && (nKernelRows == 3))
conv2mapgeneric <false, 3, 3> <<<blocks, threads>>> (input_data,
kernel_data,
@@ -1320,9 +1303,8 @@
stride_y,
table_data,
fanin);
- // sync & clean
- cudaDeviceSynchronize();
+ // clean
THCudaTensor_free(input);
THCudaTensor_free(kernel);
THCudaTensor_free(table);
diff --git a/THCTensorCopy.c b/THCTensorCopy.c
index 8eada23..3d25025 100644
--- a/THCTensorCopy.c
+++ b/THCTensorCopy.c
@@ -7,8 +7,6 @@
{
THArgCheck(THCudaTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");
- cudaDeviceSynchronize();
-
{
THCudaTensor *selfc = THCudaTensor_newContiguous(self);
src = THFloatTensor_newContiguous(src);
@@ -26,8 +24,6 @@
{ \
THArgCheck(THCudaTensor_nElement(self) == TH##TYPEC##Tensor_nElement(src), 2, "sizes do not match"); \
\
- cudaDeviceSynchronize(); \
- \
{ \
THLongStorage *size = TH##TYPEC##Tensor_newSizeOf(src); \
THFloatTensor *srcf = THFloatTensor_newWithSize(size, NULL); \
@@ -53,8 +49,6 @@
{
THArgCheck(THFloatTensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match");
- cudaDeviceSynchronize();
-
{
THFloatTensor *selfc = THFloatTensor_newContiguous(self);
src = THCudaTensor_newContiguous(src);
@@ -71,8 +65,6 @@
{ \
THArgCheck(TH##TYPEC##Tensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match"); \
\
- cudaDeviceSynchronize(); \
- \
{ \
THLongStorage *size = THCudaTensor_newSizeOf(src); \
THFloatTensor *srcf = THFloatTensor_newWithSize(size, NULL); \
diff --git a/THCTensorCopy.cu b/THCTensorCopy.cu
index 555f9d0..ca3c7c1 100644
--- a/THCTensorCopy.cu
+++ b/THCTensorCopy.cu
@@ -86,8 +86,6 @@
if(errcode != cudaSuccess)
THError(cudaGetErrorString(errcode));
- cudaThreadSynchronize();
-
THCudaCheck(cudaFree(d_self_sz));
THCudaCheck(cudaFree(d_self_st));
THCudaCheck(cudaFree(d_src_sz));
diff --git a/THCTensorMath.cu b/THCTensorMath.cu
index ddcd0eb..426ea1c 100644
--- a/THCTensorMath.cu
+++ b/THCTensorMath.cu
@@ -154,8 +154,6 @@
if(errcode != cudaSuccess)
THError(cudaGetErrorString(errcode));
- cudaThreadSynchronize();
-
THCudaTensor_free(src1);
THCudaTensor_free(src2);
THCudaTensor_freeCopyTo(self, self_);
@@ -193,8 +191,6 @@
if(errcode != cudaSuccess)
THError(cudaGetErrorString(errcode));
- cudaThreadSynchronize();
-
THCudaTensor_free(src1);
THCudaTensor_free(src2);
THCudaTensor_freeCopyTo(self, self_);