Mini-optimization on MeanStdDevNormalization: don't reload tmp[local_id]; it already holds the same value it had in the previous iteration.
Measured ~2% speedup on Intel HD500, <0.5% everywhere else.
Move a large comment out of the kernel string; the OpenCL compiler doesn't need to parse it.
Also fix the test tolerances; the test was already failing (independently of this algorithm change) with the old tolerances.
PiperOrigin-RevId: 332297724
Change-Id: Id1600edcb3d2df6d61d47520ac541f8e9a517ca6
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
index ff69dbe..039db8e 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc
@@ -36,8 +36,16 @@
std::string GetReduceCode() {
// If it is supported, use the built-in work_group_reduce_add function.
- // Otherwise, implement a reduction using __local memory. Note this only works
- // with power-of-two work group sizes.
+ // Otherwise, implement a reduction using __local memory.
+
+ // In the reduction step add upper half of the still-to-be-summed vector to
+ // the lower half, while taking care of odd sizes and rounding. E.g.:
+ // Number of items still to be summed before: 5
+ // Local memory before: [a, b, c, d, e];
+ // Local memory after: [a+d, b+e, c, d, e];
+ // Threads doing work: id < 2 = floor(5/2)
+ // Offset to the added items: 3 = ceil(5/2)
+ // Number of items still to be summed after: 3 = ceil(5/2)
return R"(
#if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \
!defined(__opencl_c_work_group_collective_functions)
@@ -54,18 +62,11 @@
// The number of items still need to be summed
int reduction_size = get_local_size(0);
while (reduction_size > 1) {
- // Reduction step: add upper half of the still-to-be-summed vector to the
- // lower half, while taking care of odd sizes and rounding. E.g.:
- // Number of items still to be summed before: 5
- // Local memory before: [a, b, c, d, e];
- // Local memory after: [a+d, b+e, c, d, e];
- // Threads doing work: id < 2 = floor(5/2)
- // Offset to the added items: 3 = ceil(5/2)
- // Number of items still to be summed after: 3 = ceil(5/2)
const int active_thread_limit = reduction_size / 2;
const int offset = (reduction_size + 1) / 2;
if (local_id < active_thread_limit) {
- tmp[local_id] += tmp[local_id + offset];
+ item += tmp[local_id + offset];
+ tmp[local_id] = item;
}
barrier(CLK_LOCAL_MEM_FENCE);
reduction_size = offset;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc
index 995a803..7ceaf96 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc
@@ -104,6 +104,8 @@
};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
+ const float eps =
+ precision == CalculationsPrecision::F32 ? 2.53e-05f : 3.57e-4f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -128,8 +130,8 @@
-ksqrt16, -ksqrt04, ksqrt04, ksqrt16, // large mean, small variance
-ksqrt16, -ksqrt04, ksqrt04, ksqrt16, // large mean, large variance
};
- EXPECT_THAT(dst_tensor.data,
- Pointwise(FloatNear(3.57e-4f), expected_output));
+ EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected_output))
+ << "Failed using precision " << ToString(precision);
}
}
}
@@ -153,6 +155,8 @@
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
+ const float eps =
+ precision == CalculationsPrecision::F32 ? 0.0f : 8.60e-4f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -175,8 +179,8 @@
expected_output[i + 0] = +expected_elem;
expected_output[i + 1] = -expected_elem;
}
- EXPECT_THAT(dst_tensor.data,
- Pointwise(FloatNear(1.17e-4f), expected_output));
+ EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected_output))
+ << "Failed using precision " << ToString(precision);
}
}
}