Use the current stream for the CatArrayBatchedCopy kernel launch
diff --git a/generic/THCTensorMath.cu b/generic/THCTensorMath.cu
index e9f697d..4c609ba 100644
--- a/generic/THCTensorMath.cu
+++ b/generic/THCTensorMath.cu
@@ -203,7 +203,7 @@
 
     // Template Declarations for dim = 1, 2, 3, 4
 #define HANDLE_CASE(DIMS) \
-  CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
+  CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock, 0, THCState_getCurrentStream(state)>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
 
     // Now we loop
     offset = 0;