Clean up CUDA code base: remove redundant device syncs (cudaDeviceSynchronize / cudaThreadSynchronize)
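
Rationale: kernel launches issued to the same CUDA stream (here, the default stream) execute in issue order, so a cudaDeviceSynchronize() between two launches adds latency without changing semantics. A blocking cudaMemcpy on the default stream likewise waits for all preceding device work before copying, so the syncs ahead of host-side copies were redundant as well. A minimal sketch of these default-stream semantics (the producer/consumer kernels below are hypothetical, not code from this tree):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void producer(float *buf, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) buf[i] = (float)i;
    }

    __global__ void consumer(float *buf, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) buf[i] *= 2.0f;
    }

    int main() {
      const int n = 1024;
      float *d_buf;
      cudaMalloc(&d_buf, n * sizeof(float));

      // Two launches on the default stream: the device runs them in
      // issue order, so no cudaDeviceSynchronize() is needed between them.
      producer<<<(n + 255) / 256, 256>>>(d_buf, n);
      consumer<<<(n + 255) / 256, 256>>>(d_buf, n);

      // A blocking cudaMemcpy on the default stream waits for the prior
      // kernels, so the host sees finished results without an explicit sync.
      float h_buf[n];
      cudaMemcpy(h_buf, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
      printf("h_buf[1] = %f\n", h_buf[1]);  // prints 2.000000

      cudaFree(d_buf);
      return 0;
    }

An explicit sync remains useful only when the host needs a completion guarantee without a blocking copy in between, e.g. for wall-clock timing or for surfacing asynchronous kernel errors at a known point; the call sites below do not need one.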
diff --git a/THCTensorConv.cu b/THCTensorConv.cu
index 6dc615c..ba5cbb8 100644
--- a/THCTensorConv.cu
+++ b/THCTensorConv.cu
@@ -352,9 +352,6 @@
   dim3 blocks(nOutputPlane,yblocks);
   dim3 threads(32,8);
 
-  // sync any previous kernel exec
-  cudaDeviceSynchronize();
-
   // convolution: xcorr2 or conv2
   if (type[1] == 'x') {
     if ((nKernelCols == 3) && (nKernelRows == 3))
@@ -485,8 +482,7 @@
                                                         srow, scol);
   }
 
-  // sync & clean
-  cudaDeviceSynchronize();
+  // clean
   if (*type != 'f') THCudaTensor_free(input);
   THCudaTensor_free(kernel);
 
@@ -588,9 +584,6 @@
   dim3 blocks(nOutputPlane*nbatch,yblocks);
   dim3 threads(32,8);
 
-  // sync any previous kernel exec
-  cudaDeviceSynchronize();
-
   // convolution: xcorr2 or conv2
   if (type[1] == 'x') {
     if ((nKernelCols == 3) && (nKernelRows == 3))
@@ -721,8 +714,7 @@
 							     srow, scol);
   }
 
-  // sync & clean
-  cudaDeviceSynchronize();
+  // clean
   if (*type != 'f') THCudaTensor_free(input);
   THCudaTensor_free(kernel);
 
@@ -797,17 +789,13 @@
   dim3 blocks(nKernelPlane, nInputPlane);
   dim3 threads(128/nOutputRows, nOutputRows);
 
-  // sync previous jobs
-  cudaDeviceSynchronize();
-
   // compute rev conv
   conv2genericrev <<<blocks, threads>>> (input_data, kernel_data, output_data,
                                          nInputPlane, nInputRows, nInputCols,
                                          nKernelPlane, nKernelRows, nKernelCols,
                                          alpha, srow, scol);
 
-  // sync & clean
-  cudaDeviceSynchronize();
+  // clean
   THCudaTensor_free(input);
   THCudaTensor_free(kernel);
 
@@ -869,9 +857,6 @@
   float *kernel_data = THCudaTensor_data(kernel);
   float *output_data = THCudaTensor_data(output);
 
-  // sync previous jobs
-  cudaDeviceSynchronize();
-
   // kernel is called multiple times
   // (the arbitrary split below is just here to make sure we dont go over 256 threads)
   for (int sl=0; sl<nbatch; sl+=6) {
@@ -891,8 +876,7 @@
                                            alpha, srow, scol);
   }
 
-  // sync & clean
-  cudaDeviceSynchronize();
+  // clean
   THCudaTensor_free(input);
   THCudaTensor_free(kernel);
 
@@ -1137,8 +1121,7 @@
     block_height = 1;
   dim3 blocks(nOutputPlane,block_height);
   dim3 threads(nthreads_x,nthreads_y);
-  // sync any previous kernel exec
-  cudaDeviceSynchronize();
+
   if ((nKernelCols == 3) && (nKernelRows == 3))
     conv2mapgeneric <false, 3, 3> <<<blocks, threads>>> (input_data, 
                                                          kernel_data, 
@@ -1320,9 +1303,8 @@
                                                           stride_y, 
                                                           table_data, 
                                                           fanin);
-  // sync & clean
-  cudaDeviceSynchronize();
 
+  // clean
   THCudaTensor_free(input);
   THCudaTensor_free(kernel);
   THCudaTensor_free(table);
diff --git a/THCTensorCopy.c b/THCTensorCopy.c
index 8eada23..3d25025 100644
--- a/THCTensorCopy.c
+++ b/THCTensorCopy.c
@@ -7,8 +7,6 @@
 {
   THArgCheck(THCudaTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match"); 
 
-  cudaDeviceSynchronize();
-
   {
     THCudaTensor *selfc = THCudaTensor_newContiguous(self);
     src = THFloatTensor_newContiguous(src);
@@ -26,8 +24,6 @@
 {                                                                       \
   THArgCheck(THCudaTensor_nElement(self) == TH##TYPEC##Tensor_nElement(src), 2, "sizes do not match"); \
                                                                         \
-  cudaDeviceSynchronize();                                              \
-                                                                        \
   {                                                                     \
     THLongStorage *size = TH##TYPEC##Tensor_newSizeOf(src);             \
     THFloatTensor *srcf = THFloatTensor_newWithSize(size, NULL);        \
@@ -53,8 +49,6 @@
 {
   THArgCheck(THFloatTensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match"); 
 
-  cudaDeviceSynchronize();
-
   {
     THFloatTensor *selfc = THFloatTensor_newContiguous(self);
     src = THCudaTensor_newContiguous(src);
@@ -71,8 +65,6 @@
   {                                                                     \
     THArgCheck(TH##TYPEC##Tensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match"); \
                                                                         \
-    cudaDeviceSynchronize();                                            \
-                                                                        \
     {                                                                   \
       THLongStorage *size = THCudaTensor_newSizeOf(src);                \
       THFloatTensor *srcf = THFloatTensor_newWithSize(size, NULL);      \
diff --git a/THCTensorCopy.cu b/THCTensorCopy.cu
index 555f9d0..ca3c7c1 100644
--- a/THCTensorCopy.cu
+++ b/THCTensorCopy.cu
@@ -86,8 +86,6 @@
     if(errcode != cudaSuccess)
       THError(cudaGetErrorString(errcode));
 
-    cudaThreadSynchronize();
-
     THCudaCheck(cudaFree(d_self_sz));
     THCudaCheck(cudaFree(d_self_st));
     THCudaCheck(cudaFree(d_src_sz));
diff --git a/THCTensorMath.cu b/THCTensorMath.cu
index ddcd0eb..426ea1c 100644
--- a/THCTensorMath.cu
+++ b/THCTensorMath.cu
@@ -154,8 +154,6 @@
     if(errcode != cudaSuccess)
       THError(cudaGetErrorString(errcode));
 
-    cudaThreadSynchronize();
-
     THCudaTensor_free(src1);
     THCudaTensor_free(src2);
     THCudaTensor_freeCopyTo(self, self_);
@@ -193,8 +191,6 @@
     if(errcode != cudaSuccess)
       THError(cudaGetErrorString(errcode));
 
-    cudaThreadSynchronize();
-
     THCudaTensor_free(src1);
     THCudaTensor_free(src2);
     THCudaTensor_freeCopyTo(self, self_);