Fix performance when reading or writing large buffers.

The Blur intrinsic, which uses ~25 MB of data, would spill
the L2 cache, even though a smarter walking pattern could
reduce this hit.  We now vary the chunk size based on both
the processor count and the data size.

N7: execution time drops from 1959ms to 930ms.
Mako: execution time drops from 470ms to 385ms.
Manta: no change.

Change-Id: I57e36e2fec3bb51033a9bd9f3040963f5b071eb0
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index ce43331..460967e 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -342,16 +342,39 @@
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
 
     if ((dc->mWorkers.mCount >= 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
         dc->mInForEach = true;
         if (mtls->fep.dimY > 1) {
-            mtls->mSliceSize = mtls->fep.dimY / (dc->mWorkers.mCount * 4);
+            uint32_t s1 = mtls->fep.dimY / ((dc->mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
             if(mtls->mSliceSize < 1) {
                 mtls->mSliceSize = 1;
             }
 
             rsdLaunchThreads(mrsc, wc_xy, mtls);
         } else {
-            mtls->mSliceSize = mtls->fep.dimX / (dc->mWorkers.mCount * 4);
+            uint32_t s1 = mtls->fep.dimX / ((dc->mWorkers.mCount + 1) * 4);
+            uint32_t s2 = 0;
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            } else {
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
             if(mtls->mSliceSize < 1) {
                 mtls->mSliceSize = 1;
             }