Fix SpatialSubSampling (backprop was doing non-atomic writes even when windows overlap, i.e. kH > dH or kW > dW).

Also includes some changes to the test to make it less flaky:
- Decrease some output counts to reduce out-of-memory issues.
- Decrease some tolerances.
- Increase precision of random seed so that multiple test launches at
  the same time start with different seeds.
diff --git a/SpatialSubSampling.cu b/SpatialSubSampling.cu
index 91d6beb..5d22aa4 100644
--- a/SpatialSubSampling.cu
+++ b/SpatialSubSampling.cu
@@ -342,9 +342,15 @@
     dim3 threads(32,8);
 
     // run updateGradInput kernel
-    subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-      gradInput_data, gradOutput_data, weight_data,
-      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    if (kH <= dH && kW <= dW) {
+      subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data, weight_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    } else {
+      subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data, weight_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    }
   } else {
     long nInputCols = input->size[3];
     long nInputRows = input->size[2];
@@ -365,15 +371,14 @@
     dim3 threads(32,8);
 
     // run updateGradInput kernel
-    if (kH == dH && kW == dW) {
+    if (kH <= dH && kW <= dW) {
       subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
         gradInput_data, gradOutput_data, weight_data,
         nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
     } else {
       subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
         gradInput_data, gradOutput_data, weight_data,
-        nInputPlane, nInputRows, nInputCols,
-        kH, kW, dH, dW);
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
     }
   }