Mini-optimization on MeanStdDevNormalization: don't reload tmp[local_id]; it still holds the same value written in the previous iteration.

Measured ~2% speedup on Intel HD500, <0.5% everywhere else.

Move a large comment out of the kernel string; the OpenCL compiler doesn't need to parse it.

Also fix the test tolerance: the test was already failing (independently of this algorithm change) with the old per-precision bounds.

PiperOrigin-RevId: 332297724
Change-Id: Id1600edcb3d2df6d61d47520ac541f8e9a517ca6
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
index ff69dbe..039db8e 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
@@ -36,8 +36,16 @@
 
 std::string GetReduceCode() {
   // If it is supported, use the built-in work_group_reduce_add function.
-  // Otherwise, implement a reduction using __local memory. Note this only works
-  // with power-of-two work group sizes.
+  // Otherwise, implement a reduction using __local memory.
+
+  // In the reduction step add upper half of the still-to-be-summed vector to
+  // the lower half, while taking care of odd sizes and rounding. E.g.:
+  // Number of items still to be summed before: 5
+  // Local memory before: [a, b, c, d, e];
+  // Local memory after: [a+d, b+e, c, d, e];
+  // Threads doing work: id < 2 = floor(5/2)
+  // Offset to the added items: 3 = ceil(5/2)
+  // Number of items still to be summed after: 3 = ceil(5/2)
   return R"(
 #if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \
   !defined(__opencl_c_work_group_collective_functions)
@@ -54,18 +62,11 @@
   // The number of items still need to be summed
   int reduction_size = get_local_size(0);
   while (reduction_size > 1) {
-    // Reduction step: add upper half of the still-to-be-summed vector to the
-    // lower half, while taking care of odd sizes and rounding. E.g.:
-    // Number of items still to be summed before: 5
-    // Local memory before: [a, b, c, d, e];
-    // Local memory after: [a+d, b+e, c, d, e];
-    // Threads doing work: id < 2 = floor(5/2)
-    // Offset to the added items: 3 = ceil(5/2)
-    // Number of items still to be summed after: 3 = ceil(5/2)
     const int active_thread_limit = reduction_size / 2;
     const int offset = (reduction_size + 1) / 2;
     if (local_id < active_thread_limit) {
-      tmp[local_id] += tmp[local_id + offset];
+      item += tmp[local_id + offset];
+      tmp[local_id] = item;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
     reduction_size = offset;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc
index 995a803..7ceaf96 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc
@@ -104,6 +104,8 @@
   };
   for (auto storage : env_.GetSupportedStorages()) {
     for (auto precision : env_.GetSupportedPrecisions()) {
+      const float eps =
+          precision == CalculationsPrecision::F32 ? 2.53e-05f : 3.57e-4f;
       OperationDef op_def;
       op_def.precision = precision;
       auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -128,8 +130,8 @@
           -ksqrt16, -ksqrt04, ksqrt04, ksqrt16,  // large mean, small variance
           -ksqrt16, -ksqrt04, ksqrt04, ksqrt16,  // large mean, large variance
       };
-      EXPECT_THAT(dst_tensor.data,
-                  Pointwise(FloatNear(3.57e-4f), expected_output));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected_output))
+          << "Failed using precision " << ToString(precision);
     }
   }
 }
@@ -153,6 +155,8 @@
 
   for (auto storage : env_.GetSupportedStorages()) {
     for (auto precision : env_.GetSupportedPrecisions()) {
+      const float eps =
+          precision == CalculationsPrecision::F32 ? 0.0f : 8.60e-4f;
       OperationDef op_def;
       op_def.precision = precision;
       auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -175,8 +179,8 @@
         expected_output[i + 0] = +expected_elem;
         expected_output[i + 1] = -expected_elem;
       }
-      EXPECT_THAT(dst_tensor.data,
-                  Pointwise(FloatNear(1.17e-4f), expected_output));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected_output))
+          << "Failed using precision " << ToString(precision);
     }
   }
 }