Revert "In SkImageBlurFilter, use ScaledDividerU32"

This reverts commit 8021f21b132ce5b90ed1d885311ec7f936466ede.

Reason for revert: Missing change

Original change's description:
> In SkImageBlurFilter, use ScaledDividerU32
>
> Bug: skia:12522
>
> Change-Id: I6608e66b44710f3633635d3621bdae2a5523f28b
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/458456
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: Herb Derby <herb@google.com>

Bug: skia:12522
Change-Id: I5c677ce435c9964be648acbda17d694c12894b3b
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/458719
Auto-Submit: Herb Derby <herb@google.com>
Commit-Queue: Rubber Stamper <rubber-stamper@appspot.gserviceaccount.com>
Bot-Commit: Rubber Stamper <rubber-stamper@appspot.gserviceaccount.com>
diff --git a/src/effects/imagefilters/SkBlurImageFilter.cpp b/src/effects/imagefilters/SkBlurImageFilter.cpp
index 12c38e5..059a076 100644
--- a/src/effects/imagefilters/SkBlurImageFilter.cpp
+++ b/src/effects/imagefilters/SkBlurImageFilter.cpp
@@ -306,7 +306,22 @@
         int window2 = window * window;
         int window3 = window2 * window;
         int divisor = (window & 1) == 1 ? window3 : window3 + window2;
-        return alloc->make<GaussPass>(buffer0, buffer1, buffer2, buffersEnd, border, divisor);
+
+        // NB the sums in the blur code use the following technique to avoid
+        // adding 1/2 to round the divide.
+        //
+        //   Sum/d + 1/2 == (Sum + h) / d
+        //   Sum + d(1/2) ==  Sum + h
+        //     h == (1/2)d
+        //
+        // But the d/2 itself should be rounded.
+        //    h == d/2 + 1/2 == (d + 1) / 2
+        //
+        // divisorFactor = (1 / d) * 2 ^ 32
+        auto divisorFactor = static_cast<uint32_t>(round((1.0 / divisor) * (1ull << 32)));
+        auto half = static_cast<uint32_t>((divisor + 1) / 2);
+        return alloc->make<GaussPass>(
+                buffer0, buffer1, buffer2, buffersEnd, border, divisorFactor, half);
     }
 
     GaussPass(skvx::Vec<4, uint32_t>* buffer0,
@@ -314,21 +329,22 @@
               skvx::Vec<4, uint32_t>* buffer2,
               skvx::Vec<4, uint32_t>* buffersEnd,
               int border,
-              int divisor)
+              uint32_t divisorFactor,
+              uint32_t half)
         : Pass{border}
         , fBuffer0{buffer0}
         , fBuffer1{buffer1}
         , fBuffer2{buffer2}
         , fBuffersEnd{buffersEnd}
-        , fDivider(divisor) {}
+        , fDivisorFactor{divisorFactor}
+        , fHalf{half} {}
 
 private:
     void startBlur() override {
         skvx::Vec<4, uint32_t> zero = {0u, 0u, 0u, 0u};
         zero.store(fSum0);
         zero.store(fSum1);
-        auto half = fDivider.half();
-        skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum2);
+        skvx::Vec<4, uint32_t>{fHalf, fHalf, fHalf, fHalf}.store(fSum2);
         sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
 
         fBuffer0Cursor = fBuffer0;
@@ -389,7 +405,8 @@
             sum1 += sum0;
             sum2 += sum1;
 
-            skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum2);
+            skvx::Vec<4, uint64_t> w = skvx::cast<uint64_t>(sum2) * fDivisorFactor;
+            skvx::Vec<4, uint32_t> value = skvx::cast<uint32_t>(w >> 32);
 
             sum2 -= *buffer2Cursor;
             *buffer2Cursor = sum1;
@@ -401,7 +418,7 @@
             *buffer0Cursor = leadingEdge;
             buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
 
-            return skvx::cast<uint8_t>(blurred);
+            return skvx::cast<uint8_t>(value);
         };
 
         auto loadEdge = [&](const uint32_t* srcCursor) {
@@ -444,7 +461,8 @@
     skvx::Vec<4, uint32_t>* const fBuffer1;
     skvx::Vec<4, uint32_t>* const fBuffer2;
     skvx::Vec<4, uint32_t>* const fBuffersEnd;
-    const skvx::ScaledDividerU32 fDivider;
+    const uint32_t fDivisorFactor;
+    const uint32_t fHalf;
 
     // blur state
     char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
@@ -559,25 +577,40 @@
         int border = window - 1;
 
         int divisor = window * window;
-        return alloc->make<TentPass>(buffer0, buffer1, buffersEnd, border, divisor);
+
+        // NB the sums in the blur code use the following technique to avoid
+        // adding 1/2 to round the divide.
+        //
+        //   Sum/d + 1/2 == (Sum + h) / d
+        //   Sum + d(1/2) ==  Sum + h
+        //     h == (1/2)d
+        //
+        // But the d/2 itself should be rounded.
+        //    h == d/2 + 1/2 == (d + 1) / 2
+        //
+        // divisorFactor = (1 / d) * 2 ^ 32
+        auto divisorFactor = static_cast<uint32_t>(round((1.0 / divisor) * (1ull << 32)));
+        auto half = static_cast<uint32_t>((divisor + 1) / 2);
+        return alloc->make<TentPass>(buffer0, buffer1, buffersEnd, border, divisorFactor, half);
     }
 
     TentPass(skvx::Vec<4, uint32_t>* buffer0,
              skvx::Vec<4, uint32_t>* buffer1,
              skvx::Vec<4, uint32_t>* buffersEnd,
              int border,
-             int divisor)
+             uint32_t divisorFactor,
+             uint32_t half)
          : Pass{border}
          , fBuffer0{buffer0}
          , fBuffer1{buffer1}
          , fBuffersEnd{buffersEnd}
-         , fDivider(divisor) {}
+         , fDivisorFactor{divisorFactor}
+         , fHalf{half} {}
 
 private:
     void startBlur() override {
         skvx::Vec<4, uint32_t>{0u, 0u, 0u, 0u}.store(fSum0);
-        auto half = fDivider.half();
-        skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum1);
+        skvx::Vec<4, uint32_t>{fHalf, fHalf, fHalf, fHalf}.store(fSum1);
         sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
 
         fBuffer0Cursor = fBuffer0;
@@ -630,7 +663,8 @@
             sum0 += leadingEdge;
             sum1 += sum0;
 
-            skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum1);
+            skvx::Vec<4, uint64_t> w = skvx::cast<uint64_t>(sum1) * fDivisorFactor;
+            skvx::Vec<4, uint32_t> value = skvx::cast<uint32_t>(w >> 32);
 
             sum1 -= *buffer1Cursor;
             *buffer1Cursor = sum0;
@@ -639,7 +673,7 @@
             *buffer0Cursor = leadingEdge;
             buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
 
-            return skvx::cast<uint8_t>(blurred);
+            return skvx::cast<uint8_t>(value);
         };
 
         auto loadEdge = [&](const uint32_t* srcCursor) {
@@ -678,7 +712,8 @@
     skvx::Vec<4, uint32_t>* const fBuffer0;
     skvx::Vec<4, uint32_t>* const fBuffer1;
     skvx::Vec<4, uint32_t>* const fBuffersEnd;
-    const skvx::ScaledDividerU32 fDivider;
+    const uint32_t fDivisorFactor;
+    const uint32_t fHalf;
 
     // blur state
     char fSum0[sizeof(skvx::Vec<4, uint32_t>)];