Use the current THC stream (instead of the default stream) for the CatArrayBatchedCopy kernel launch
diff --git a/generic/THCTensorMath.cu b/generic/THCTensorMath.cu
index e9f697d..4c609ba 100644
--- a/generic/THCTensorMath.cu
+++ b/generic/THCTensorMath.cu
@@ -203,7 +203,7 @@
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
- CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
+ CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock, 0, THCState_getCurrentStream(state)>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
// Now we loop
offset = 0;