fix cuda GatherOp for empty batch

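Summary: Return early from the CUDA Gather operator when N <= 0 instead of launching GatherKernel with an empty grid, and extend the Python gather test to cover index_num == 0.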

Differential Revision: D5840432

fbshipit-source-id: 5d9021f152c21d24e91dc0cc3d95443782afc228
diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu
index 3e877fd..aed3511 100644
--- a/caffe2/operators/utility_ops.cu
+++ b/caffe2/operators/utility_ops.cu
@@ -215,6 +215,12 @@
   const Index* idxs = indices.template data<Index>();
   auto out = static_cast<float*>(output->raw_mutable_data(data.meta()));
 
+  // Return early when the input is empty: with N == 0 the launch below would
+  // request std::min(N, ...) == 0 blocks, which is an invalid CUDA launch config.
+  if (N <= 0) {
+    return true;
+  }
+
   GatherKernel<<<
       std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
       CAFFE_CUDA_NUM_THREADS,
diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py
index 4fde1d3..323082b 100644
--- a/caffe2/python/operator_test/gather_ops_test.py
+++ b/caffe2/python/operator_test/gather_ops_test.py
@@ -13,17 +13,20 @@
 
 class TestGatherOps(hu.HypothesisTestCase):
     @given(rows_num=st.integers(1, 10000),
-           index_num=st.integers(1, 5000),
+           index_num=st.integers(0, 5000),
            **hu.gcs)
     def test_gather_ops(self, rows_num, index_num, gc, dc):
         data = np.random.random((rows_num, 10, 20)).astype(np.float32)
-        ind = np.random.randint(rows_num, size=(index_num, 1)).astype('int32')
+        ind = np.random.randint(rows_num, size=(index_num, )).astype('int32')
         op = core.CreateOperator(
             'Gather',
             ['data', 'ind'],
             ['output'])
 
         def ref_gather(data, ind):
+            if ind.size == 0:
+                return [np.zeros((0, 10, 20)).astype(np.float32)]
+
             output = [r for r in [data[i] for i in ind]]
             return [output]