Rollback of https://github.com/tensorflow/tensorflow/commit/8649852b75ed43fd62e1429086e4a8f5dd6d38ee.

The commit slowed down testGradientTensor4D and broke the macOS nightly build.

PiperOrigin-RevId: 396901985
Change-Id: I7d06790b814fc6dba9c32b6403dace4f63972fc5
diff --git a/tensorflow/core/kernels/redux_functor.h b/tensorflow/core/kernels/redux_functor.h
index 6681bec..e07fa53 100644
--- a/tensorflow/core/kernels/redux_functor.h
+++ b/tensorflow/core/kernels/redux_functor.h
@@ -230,6 +230,11 @@
           input.template flat<InputT>().template cast<OutputT>().reshape(
               output_dims);
       return;
+    } else if (1 == inner_dim) {
+      // Equivalent to ReduceOuterDimensions.
+      const ReduceOuterDimensions<InputT, AccumT, OutputT, BinaryFunctor> redux;
+      redux(device, input_dims, input, output);
+      return;
     }
 
     // Compute block size along the outer dimension for efficiency.
diff --git a/tensorflow/python/kernel_tests/bias_op_base.py b/tensorflow/python/kernel_tests/bias_op_base.py
index b629123..a3b2ad5 100644
--- a/tensorflow/python/kernel_tests/bias_op_base.py
+++ b/tensorflow/python/kernel_tests/bias_op_base.py
@@ -254,7 +254,7 @@
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
   def testGradientTensor4D(self):
-    for (data_format, use_gpu) in [("NHWC", False), ("NCHW", False)]:
+    for (data_format, use_gpu) in [("NHWC", False)]:
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         np_input = np.arange(
             1.0, 49.0,
@@ -273,13 +273,6 @@
         self._testGradient(np_input,
                            np.random.rand(64).astype(dtype.as_numpy_dtype),
                            dtype, data_format, use_gpu)
-        np_input = np.arange(
-            1.0, 129.0,
-            dtype=dtype.as_numpy_dtype).reshape([4, 1, 1,
-                                                 32]).astype(np.float32)
-        self._testGradient(np_input,
-                           np.random.rand(32).astype(dtype.as_numpy_dtype),
-                           dtype, data_format, use_gpu)
 
   def testGradientTensor5D(self):
     for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),