Revert "Support large kernels on GPU in matrix convolution effect"

This reverts commit 1ed4391fe7d218caf21754de0932b5e96043621e.

Reason for revert: It looks like some bad images showed up at gold.skia.org, and ProcessorCloneTest is crashing on the Windows bots:
https://logs.chromium.org/logs/skia/4bfabe0bad476911/+/steps/dm/0/stdout

Original change's description:
> Support large kernels on GPU in matrix convolution effect
> 
> Currently matrix convolution falls back to CPU execution for large kernels, due to the argument limit for fragment shaders.
> 
> Now for large kernels, we store them in a texture and sample them in a shader to sidestep the limit.
> 
> Change-Id: Icc069a701ea8e9cd0adf75f4bfd149fd22e31afd
> Bug: skia:8449
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/263495
> Reviewed-by: Michael Ludwig <michaelludwig@google.com>
> Commit-Queue: Adlai Holler <adlai@google.com>

TBR=robertphillips@google.com,michaelludwig@google.com,adlai@google.com

Change-Id: Iaf4858131046a343481bcf0fd9cc3919d9fc2bda
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: skia:8449
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/287736
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Brian Salomon <bsalomon@google.com>
diff --git a/gm/matrixconvolution.cpp b/gm/matrixconvolution.cpp
index 6164dbe..5897ffb 100644
--- a/gm/matrixconvolution.cpp
+++ b/gm/matrixconvolution.cpp
@@ -79,7 +79,7 @@
                 return SkImageFilters::MatrixConvolution({3,3}, kernel.data(), /* gain */ 0.3f, /* bias */ SkIntToScalar(100), kernelOffset, tileMode, convolveAlpha, nullptr, cropRect);
             }
             case kLarge_KernelFixture: {
-                // Intentionally go over the uniform kernel size limit of 25.
+                // Intentionally go over the MAX_KERNEL_SIZE limit and trigger CPU fallback.
                 // All 1s except center value, which is -47 (sum of 1).
                 std::vector<SkScalar> kernel(49, SkIntToScalar(1));
                 kernel[24] = SkIntToScalar(-47);
diff --git a/src/core/SkGpuBlurUtils.cpp b/src/core/SkGpuBlurUtils.cpp
index 8dd06d1..7ff294c 100644
--- a/src/core/SkGpuBlurUtils.cpp
+++ b/src/core/SkGpuBlurUtils.cpp
@@ -128,9 +128,8 @@
     SkIPoint kernelOffset = SkIPoint::Make(radiusX, radiusY);
     GrPaint paint;
     auto wm = SkTileModeToWrapMode(mode);
-    auto conv = GrMatrixConvolutionEffect::MakeGaussian(context, std::move(srcView), srcBounds,
-                                                        size, 1.0, 0.0, kernelOffset, wm, true,
-                                                        sigmaX, sigmaY,
+    auto conv = GrMatrixConvolutionEffect::MakeGaussian(std::move(srcView), srcBounds, size, 1.0,
+                                                        0.0, kernelOffset, wm, true, sigmaX, sigmaY,
                                                         *renderTargetContext->caps());
     paint.addColorFragmentProcessor(std::move(conv));
     paint.setPorterDuffXPFactory(SkBlendMode::kSrc);
@@ -448,8 +447,7 @@
     if (scaleFactorX == 1 && scaleFactorY == 1) {
         // For really small blurs (certainly no wider than 5x5 on desktop GPUs) it is faster to just
         // launch a single non separable kernel vs two launches.
-        const int kernelSize = (2 * radiusX + 1) * (2 * radiusY + 1);
-        if (sigmaX > 0 && sigmaY > 0 && kernelSize <= GrMatrixConvolutionEffect::kMaxUniformSize) {
+        if (sigmaX > 0 && sigmaY > 0 && (2 * radiusX + 1) * (2 * radiusY + 1) <= MAX_KERNEL_SIZE) {
             // Apply the proxy offset to src bounds and offset directly
             return convolve_gaussian_2d(context, std::move(srcView), srcColorType, srcBounds,
                                         dstBounds, radiusX, radiusY, sigmaX, sigmaY, mode,
diff --git a/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp b/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp
index 0bac7eb..8b242c8 100644
--- a/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp
+++ b/src/effects/imagefilters/SkMatrixConvolutionImageFilter.cpp
@@ -391,7 +391,9 @@
     }
 
 #if SK_SUPPORT_GPU
-    if (ctx.gpuBacked()) {
+    // Note: if the kernel is too big, the GPU path falls back to SW
+    if (ctx.gpuBacked() &&
+        fKernelSize.width() * fKernelSize.height() <= MAX_KERNEL_SIZE) {
         auto context = ctx.getContext();
 
         // Ensure the input is in the destination color space. Typically applyCropRect will have
@@ -412,8 +414,7 @@
         // Map srcBounds from input's logical image domain to that of the proxy
         srcBounds.offset(input->subset().x(), input->subset().y());
 
-        auto fp = GrMatrixConvolutionEffect::Make(context,
-                                                  std::move(inputView),
+        auto fp = GrMatrixConvolutionEffect::Make(std::move(inputView),
                                                   srcBounds,
                                                   fKernelSize,
                                                   fKernel,
diff --git a/src/gpu/GrFragmentProcessor.h b/src/gpu/GrFragmentProcessor.h
index efc4c9a..194bb29 100644
--- a/src/gpu/GrFragmentProcessor.h
+++ b/src/gpu/GrFragmentProcessor.h
@@ -490,8 +490,6 @@
 
     TextureSampler(GrSurfaceProxyView, GrSamplerState = {});
 
-    TextureSampler(TextureSampler&&) = default;
-    TextureSampler& operator=(TextureSampler&&) = default;
     TextureSampler& operator=(const TextureSampler&) = delete;
 
     bool operator==(const TextureSampler& that) const {
diff --git a/src/gpu/effects/GrMatrixConvolutionEffect.cpp b/src/gpu/effects/GrMatrixConvolutionEffect.cpp
index c108d34..8bb5a56 100644
--- a/src/gpu/effects/GrMatrixConvolutionEffect.cpp
+++ b/src/gpu/effects/GrMatrixConvolutionEffect.cpp
@@ -6,10 +6,6 @@
  */
 #include "src/gpu/effects/GrMatrixConvolutionEffect.h"
 
-#include "src/gpu/GrBitmapTextureMaker.h"
-#include "src/gpu/GrContextPriv.h"
-#include "src/gpu/GrProxyProvider.h"
-#include "src/gpu/GrRecordingContextPriv.h"
 #include "src/gpu/GrTexture.h"
 #include "src/gpu/GrTextureProxy.h"
 #include "src/gpu/effects/GrTextureEffect.h"
@@ -34,151 +30,52 @@
     UniformHandle               fKernelOffsetUni;
     UniformHandle               fGainUni;
     UniformHandle               fBiasUni;
-    UniformHandle               fKernelBiasUni;
 
     typedef GrGLSLFragmentProcessor INHERITED;
 };
 
-GrMatrixConvolutionEffect::KernelWrapper GrMatrixConvolutionEffect::KernelWrapper::Make(
-        GrRecordingContext* context, SkISize size, const SkScalar* values) {
-    if (nullptr == context || nullptr == values || size.isEmpty()) {
-        return {};
-    }
-    const int length = size.area();
-    // Small kernel -> just fill the array.
-    KernelWrapper result(size);
-    if (length <= kMaxUniformSize) {
-        for (int i = 0; i < length; i++) {
-            result.fArray[i] = SkScalarToFloat(values[i]);
-        }
-        return result;
-    }
-
-    ScalableSampler& scalableSampler = result.fScalableSampler;
-    // Determine min and max values to figure out inner gain & bias.
-    SkScalar min = values[0];
-    SkScalar max = values[0];
-    for (int i = 1; i < length; i++) {
-        if (values[i] < min) {
-            min = values[i];
-        }
-        if (values[i] > max) {
-            max = values[i];
-        }
-    }
-    // Treat near-0 gain (i.e. box blur) as 1, and let the kernelBias
-    // move everything up to the final value.
-    const SkScalar computedGain = max - min;
-    scalableSampler.fGain = SkScalarNearlyZero(computedGain) ? 1.0f : SkScalarToFloat(computedGain);
-    // Inner bias is pre-inner-gain so we divide that out.
-    scalableSampler.fBias = SkScalarToFloat(min) / scalableSampler.fGain;
-
-    static const GrUniqueKey::Domain kDomain = GrUniqueKey::GenerateDomain();
-    GrUniqueKey key;
-    GrUniqueKey::Builder builder(&key, kDomain, length, "Matrix Convolution Kernel");
-    // Texture cache key is the exact content of the kernel.
-    static_assert(sizeof(float) == 4);
-    for (int i = 0; i < length; i++) {
-        builder[i] = *(const uint32_t*)&values[i];
-    }
-    builder.finish();
-
-    // Find or create a texture.
-    GrProxyProvider* proxyProvider = context->priv().proxyProvider();
-    GrSurfaceProxyView view;
-    if (sk_sp<GrTextureProxy> kernelProxy = proxyProvider->findOrCreateProxyByUniqueKey(key)) {
-        GrSwizzle swizzle =
-            context->priv().caps()->getReadSwizzle(kernelProxy->backendFormat(),
-                                                   GrColorType::kAlpha_8);
-        view = {std::move(kernelProxy), kTopLeft_GrSurfaceOrigin, swizzle};
-    } else {
-        SkBitmap bm;
-        if (!bm.tryAllocPixels(SkImageInfo::MakeA8(GrNextPow2(length), 1))) {
-            return {};
-        }
-        for (int i = 0; i < length; i++) {
-            *(bm.getAddr8(i, 0)) =
-                SkScalarRoundToInt((values[i] - min) / scalableSampler.fGain * 255);
-        }
-        bm.setImmutable();
-        GrBitmapTextureMaker maker(context, bm, GrImageTexGenPolicy::kNew_Uncached_Budgeted);
-        view = maker.view(GrMipMapped::kNo);
-        if (!view) {
-            return {};
-        }
-        proxyProvider->assignUniqueKeyToProxy(key, view.asTextureProxy());
-    }
-    scalableSampler.fSampler = { std::move(view) };
-    return result;
-}
-
-bool GrMatrixConvolutionEffect::KernelWrapper::operator==(const KernelWrapper& k) const {
-    return fSize == k.fSize &&
-           (this->isSampled() ? fScalableSampler == k.fScalableSampler : fArray == k.fArray);
-}
-
-bool GrMatrixConvolutionEffect::KernelWrapper::ScalableSampler::operator==(
-                                                                const ScalableSampler& k) const {
-    return fSampler == k.fSampler && fGain == k.fGain && fBias == k.fBias;
-}
-
 void GrGLMatrixConvolutionEffect::emitCode(EmitArgs& args) {
     const GrMatrixConvolutionEffect& mce = args.fFp.cast<GrMatrixConvolutionEffect>();
 
-    int kernelWidth = mce.kernelSize().width();
-    int kernelHeight = mce.kernelSize().height();
+    int kWidth = mce.kernelSize().width();
+    int kHeight = mce.kernelSize().height();
 
-    int arrayCount = (kernelWidth * kernelHeight + 3) / 4;
-    SkASSERT(4 * arrayCount >= kernelWidth * kernelHeight);
+    int arrayCount = (kWidth * kHeight + 3) / 4;
+    SkASSERT(4 * arrayCount >= kWidth * kHeight);
 
     GrGLSLUniformHandler* uniformHandler = args.fUniformHandler;
-    if (mce.kernelIsSampled()) {
-        fKernelBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag,
-                                                    kFloat_GrSLType, "KernelBias");
-    } else {
-        fKernelUni = uniformHandler->addUniformArray(&mce, kFragment_GrShaderFlag,
-                                                     kFloat4_GrSLType, "Kernel", arrayCount);
-    }
+    fKernelUni = uniformHandler->addUniformArray(&mce, kFragment_GrShaderFlag, kHalf4_GrSLType,
+                                                 "Kernel",
+                                                 arrayCount);
     fKernelOffsetUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf2_GrSLType,
                                                   "KernelOffset");
-    fGainUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kFloat_GrSLType, "Gain");
-    fBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kFloat_GrSLType, "Bias");
+    fGainUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf_GrSLType, "Gain");
+    fBiasUni = uniformHandler->addUniform(&mce, kFragment_GrShaderFlag, kHalf_GrSLType, "Bias");
 
     const char* kernelOffset = uniformHandler->getUniformCStr(fKernelOffsetUni);
+    const char* kernel = uniformHandler->getUniformCStr(fKernelUni);
     const char* gain = uniformHandler->getUniformCStr(fGainUni);
     const char* bias = uniformHandler->getUniformCStr(fBiasUni);
 
     GrGLSLFPFragmentBuilder* fragBuilder = args.fFragBuilder;
     SkString coords2D = fragBuilder->ensureCoords2D(args.fTransformedCoords[0].fVaryingPoint,
                                                     mce.sampleMatrix());
-    fragBuilder->codeAppend("float4 sum = float4(0, 0, 0, 0);");
+    fragBuilder->codeAppend("half4 sum = half4(0, 0, 0, 0);");
     fragBuilder->codeAppendf("float2 coord = %s - %s;", coords2D.c_str(), kernelOffset);
-    fragBuilder->codeAppend("float4 c;");
+    fragBuilder->codeAppend("half4 c;");
 
-    for (int y = 0; y < kernelHeight; y++) {
-        for (int x = 0; x < kernelWidth; x++) {
+    const char* kVecSuffix[4] = { ".x", ".y", ".z", ".w" };
+    for (int y = 0; y < kHeight; y++) {
+        for (int x = 0; x < kWidth; x++) {
             GrGLSLShaderBuilder::ShaderBlock block(fragBuilder);
-            int offset = y*kernelWidth + x;
+            int offset = y*kWidth + x;
 
-            if (mce.kernelIsSampled()) {
-                const char* kernelBias = uniformHandler->getUniformCStr(fKernelBiasUni);
-                float xCoord = offset / (float)GrNextPow2(mce.kernelSize().area());
-
-                fragBuilder->codeAppend("float k = ");
-                fragBuilder->appendTextureLookup(args.fTexSamplers[0],
-                                         SkSL::String::printf("half2(%f, 0.5)", xCoord).c_str());
-                fragBuilder->codeAppendf(".w + %s;", kernelBias);
-            } else {
-                static constexpr const char* kVecSuffix[4] = { ".x", ".y", ".z", ".w" };
-                const char* kernel = uniformHandler->getUniformCStr(fKernelUni);
-                fragBuilder->codeAppendf("float k = %s[%d]%s;", kernel, offset / 4,
-                                         kVecSuffix[offset & 0x3]);
-            }
-
+            fragBuilder->codeAppendf("half k = %s[%d]%s;", kernel, offset / 4,
+                                     kVecSuffix[offset & 0x3]);
             SkSL::String coord;
             coord.appendf("coord + half2(%d, %d)", x, y);
             auto sample = this->invokeChild(0, args, coord);
-            fragBuilder->codeAppendf("float4 c = %s;", sample.c_str());
+            fragBuilder->codeAppendf("half4 c = %s;", sample.c_str());
             if (!mce.convolveAlpha()) {
                 fragBuilder->codeAppend("c.rgb /= c.a;");
                 fragBuilder->codeAppend("c.rgb = saturate(c.rgb);");
@@ -187,16 +84,15 @@
         }
     }
     if (mce.convolveAlpha()) {
-        fragBuilder->codeAppendf("%s = half4(sum * %s + %s);", args.fOutputColor, gain, bias);
+        fragBuilder->codeAppendf("%s = sum * %s + %s;", args.fOutputColor, gain, bias);
         fragBuilder->codeAppendf("%s.a = saturate(%s.a);", args.fOutputColor, args.fOutputColor);
         fragBuilder->codeAppendf("%s.rgb = clamp(%s.rgb, 0.0, %s.a);",
                                  args.fOutputColor, args.fOutputColor, args.fOutputColor);
     } else {
         auto sample = this->invokeChild(0, args, coords2D.c_str());
         fragBuilder->codeAppendf("c = %s;", sample.c_str());
-        fragBuilder->codeAppendf("%s.a = half(c.a);", args.fOutputColor);
-        fragBuilder->codeAppendf("%s.rgb = half3(saturate(sum.rgb * %s + %s));",
-                                 args.fOutputColor, gain, bias);
+        fragBuilder->codeAppendf("%s.a = c.a;", args.fOutputColor);
+        fragBuilder->codeAppendf("%s.rgb = saturate(sum.rgb * %s + %s);", args.fOutputColor, gain, bias);
         fragBuilder->codeAppendf("%s.rgb *= %s.a;", args.fOutputColor, args.fOutputColor);
     }
     fragBuilder->codeAppendf("%s *= %s;\n", args.fOutputColor, args.fInputColor);
@@ -215,22 +111,17 @@
                                             const GrFragmentProcessor& processor) {
     const GrMatrixConvolutionEffect& conv = processor.cast<GrMatrixConvolutionEffect>();
     pdman.set2fv(fKernelOffsetUni, 1, conv.kernelOffset().ptr());
-    float totalGain = conv.gain();
-    if (conv.kernelIsSampled()) {
-        totalGain *= conv.kernelSampleGain();
-        pdman.set1f(fKernelBiasUni, conv.kernelSampleBias());
-    } else {
-        int kernelCount = conv.kernelSize().area();
-        int arrayCount = (kernelCount + 3) / 4;
-        SkASSERT(4 * arrayCount >= kernelCount);
-        pdman.set4fv(fKernelUni, arrayCount, conv.kernel());
-    }
+    int kernelCount = conv.kernelSize().width() * conv.kernelSize().height();
+    int arrayCount = (kernelCount + 3) / 4;
+    SkASSERT(4 * arrayCount >= kernelCount);
+    pdman.set4fv(fKernelUni, arrayCount, conv.kernel());
+    pdman.set1f(fGainUni, conv.gain());
     pdman.set1f(fBiasUni, conv.bias());
-    pdman.set1f(fGainUni, totalGain);
 }
 
 GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(std::unique_ptr<GrFragmentProcessor> child,
-                                                     KernelWrapper kernel,
+                                                     const SkISize& kernelSize,
+                                                     const SkScalar* kernel,
                                                      SkScalar gain,
                                                      SkScalar bias,
                                                      const SkIPoint& kernelOffset,
@@ -238,14 +129,14 @@
         // To advertise either the modulation or opaqueness optimizations we'd have to examine the
         // parameters.
         : INHERITED(kGrMatrixConvolutionEffect_ClassID, kNone_OptimizationFlags)
-        , fKernel(std::move(kernel))
+        , fKernelSize(kernelSize)
         , fGain(SkScalarToFloat(gain))
         , fBias(SkScalarToFloat(bias) / 255.0f)
         , fConvolveAlpha(convolveAlpha) {
     child->setSampledWithExplicitCoords();
     this->registerChildProcessor(std::move(child));
-    if (fKernel.isSampled()) {
-        this->setTextureSamplerCnt(1);
+    for (int i = 0; i < kernelSize.width() * kernelSize.height(); i++) {
+        fKernel[i] = SkScalarToFloat(kernel[i]);
     }
     fKernelOffset = {static_cast<float>(kernelOffset.x()),
                      static_cast<float>(kernelOffset.y())};
@@ -254,7 +145,7 @@
 
 GrMatrixConvolutionEffect::GrMatrixConvolutionEffect(const GrMatrixConvolutionEffect& that)
         : INHERITED(kGrMatrixConvolutionEffect_ClassID, kNone_OptimizationFlags)
-        , fKernel(that.fKernel)
+        , fKernelSize(that.fKernelSize)
         , fGain(that.fGain)
         , fBias(that.fBias)
         , fKernelOffset(that.fKernelOffset)
@@ -262,9 +153,7 @@
     auto child = that.childProcessor(0).clone();
     child->setSampledWithExplicitCoords();
     this->registerChildProcessor(std::move(child));
-    if (fKernel.isSampled()) {
-        this->setTextureSamplerCnt(1);
-    }
+    std::copy_n(that.fKernel, fKernelSize.width() * fKernelSize.height(), fKernel);
     this->addCoordTransform(&fCoordTransform);
 }
 
@@ -283,18 +172,14 @@
 
 bool GrMatrixConvolutionEffect::onIsEqual(const GrFragmentProcessor& sBase) const {
     const GrMatrixConvolutionEffect& s = sBase.cast<GrMatrixConvolutionEffect>();
-    return fKernel == s.fKernel &&
+    return fKernelSize == s.kernelSize() &&
+           std::equal(fKernel, fKernel + fKernelSize.area(), s.fKernel) &&
            fGain == s.gain() &&
            fBias == s.bias() &&
            fKernelOffset == s.kernelOffset() &&
            fConvolveAlpha == s.convolveAlpha();
 }
 
-const GrFragmentProcessor::TextureSampler& GrMatrixConvolutionEffect::onTextureSampler(
-        int index) const {
-    return IthTextureSampler(index, fKernel.scalableSampler().fSampler);
-}
-
 static void fill_in_1D_gaussian_kernel_with_stride(float* kernel, int size, int stride,
                                                    float twoSigmaSqrd) {
     SkASSERT(!SkScalarNearlyZero(twoSigmaSqrd, SK_ScalarNearlyZero));
@@ -319,6 +204,7 @@
 
 static void fill_in_2D_gaussian_kernel(float* kernel, int width, int height,
                                        SkScalar sigmaX, SkScalar sigmaY) {
+    SkASSERT(width * height <= MAX_KERNEL_SIZE);
     const float twoSigmaSqrdX = 2.0f * SkScalarToFloat(SkScalarSquare(sigmaX));
     const float twoSigmaSqrdY = 2.0f * SkScalarToFloat(SkScalarSquare(sigmaY));
 
@@ -374,8 +260,7 @@
     }
 }
 
-std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::Make(GrRecordingContext* context,
-                                                                     GrSurfaceProxyView srcView,
+std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::Make(GrSurfaceProxyView srcView,
                                                                      const SkIRect& srcBounds,
                                                                      const SkISize& kernelSize,
                                                                      const SkScalar* kernel,
@@ -385,19 +270,14 @@
                                                                      GrSamplerState::WrapMode wm,
                                                                      bool convolveAlpha,
                                                                      const GrCaps& caps) {
-    auto kw = KernelWrapper::Make(context, kernelSize, kernel);
-    if (!kw.isValid()) {
-        return nullptr;
-    }
     GrSamplerState sampler(wm, GrSamplerState::Filter::kNearest);
     auto child = GrTextureEffect::MakeSubset(std::move(srcView), kPremul_SkAlphaType, SkMatrix::I(),
                                              sampler, SkRect::Make(srcBounds), caps);
     return std::unique_ptr<GrFragmentProcessor>(new GrMatrixConvolutionEffect(
-            std::move(child), std::move(kw), gain, bias, kernelOffset, convolveAlpha));
+            std::move(child), kernelSize, kernel, gain, bias, kernelOffset, convolveAlpha));
 }
 
 std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::MakeGaussian(
-        GrRecordingContext* context,
         GrSurfaceProxyView srcView,
         const SkIRect& srcBounds,
         const SkISize& kernelSize,
@@ -409,11 +289,11 @@
         SkScalar sigmaX,
         SkScalar sigmaY,
         const GrCaps& caps) {
-    SkAutoSTMalloc<32, float> kernel(kernelSize.area());
-    fill_in_2D_gaussian_kernel(kernel.get(), kernelSize.width(), kernelSize.height(),
-                               sigmaX, sigmaY);
-    return Make(context, std::move(srcView), srcBounds, kernelSize, kernel.get(),
-                gain, bias, kernelOffset, wm, convolveAlpha, caps);
+    float kernel[MAX_KERNEL_SIZE];
+
+    fill_in_2D_gaussian_kernel(kernel, kernelSize.width(), kernelSize.height(), sigmaX, sigmaY);
+    return Make(std::move(srcView), srcBounds, kernelSize, kernel, gain, bias, kernelOffset, wm,
+                convolveAlpha, caps);
 }
 
 GR_DEFINE_FRAGMENT_PROCESSOR_TEST(GrMatrixConvolutionEffect);
@@ -422,9 +302,8 @@
 std::unique_ptr<GrFragmentProcessor> GrMatrixConvolutionEffect::TestCreate(GrProcessorTestData* d) {
     auto [view, ct, at] = d->randomView();
 
-    static constexpr size_t kMaxTestKernelSize = 2 * kMaxUniformSize;
-    int width = d->fRandom->nextRangeU(1, kMaxTestKernelSize);
-    int height = d->fRandom->nextRangeU(1, kMaxTestKernelSize / width);
+    int width = d->fRandom->nextRangeU(1, MAX_KERNEL_SIZE);
+    int height = d->fRandom->nextRangeU(1, MAX_KERNEL_SIZE / width);
     SkISize kernelSize = SkISize::Make(width, height);
     std::unique_ptr<SkScalar[]> kernel(new SkScalar[width * height]);
     for (int i = 0; i < width * height; i++) {
@@ -446,8 +325,8 @@
     auto wm = static_cast<GrSamplerState::WrapMode>(
             d->fRandom->nextULessThan(GrSamplerState::kWrapModeCount));
     bool convolveAlpha = d->fRandom->nextBool();
-    return GrMatrixConvolutionEffect::Make(d->context()->priv().asRecordingContext(),
-                                           std::move(view),
+
+    return GrMatrixConvolutionEffect::Make(std::move(view),
                                            bounds,
                                            kernelSize,
                                            kernel.get(),
diff --git a/src/gpu/effects/GrMatrixConvolutionEffect.h b/src/gpu/effects/GrMatrixConvolutionEffect.h
index 125fa7f..5ff61cc 100644
--- a/src/gpu/effects/GrMatrixConvolutionEffect.h
+++ b/src/gpu/effects/GrMatrixConvolutionEffect.h
@@ -9,17 +9,14 @@
 #define GrMatrixConvolutionEffect_DEFINED
 
 #include "src/gpu/GrFragmentProcessor.h"
-#include <array>
-#include <new>
+
+// A little bit less than the minimum # uniforms required by DX9SM2 (32).
+// Allows for a 5x5 kernel (or 25x1, for that matter).
+#define MAX_KERNEL_SIZE 25
 
 class GrMatrixConvolutionEffect : public GrFragmentProcessor {
 public:
-    // A little bit less than the minimum # uniforms required by DX9SM2 (32).
-    // Allows for a 5x5 kernel (or 25x1, for that matter).
-    static constexpr int kMaxUniformSize = 25;
-
-    static std::unique_ptr<GrFragmentProcessor> Make(GrRecordingContext*,
-                                                     GrSurfaceProxyView srcView,
+    static std::unique_ptr<GrFragmentProcessor> Make(GrSurfaceProxyView srcView,
                                                      const SkIRect& srcBounds,
                                                      const SkISize& kernelSize,
                                                      const SkScalar* kernel,
@@ -30,8 +27,7 @@
                                                      bool convolveAlpha,
                                                      const GrCaps&);
 
-    static std::unique_ptr<GrFragmentProcessor> MakeGaussian(GrRecordingContext*,
-                                                             GrSurfaceProxyView srcView,
+    static std::unique_ptr<GrFragmentProcessor> MakeGaussian(GrSurfaceProxyView srcView,
                                                              const SkIRect& srcBounds,
                                                              const SkISize& kernelSize,
                                                              SkScalar gain,
@@ -44,12 +40,9 @@
                                                              const GrCaps&);
 
     const SkIRect& bounds() const { return fBounds; }
-    SkISize kernelSize() const { return fKernel.size(); }
+    const SkISize& kernelSize() const { return fKernelSize; }
     const SkV2 kernelOffset() const { return fKernelOffset; }
-    bool kernelIsSampled() const { return fKernel.isSampled(); }
-    const float *kernel() const { return fKernel.array().data(); }
-    float kernelSampleGain() const { return fKernel.scalableSampler().fGain; }
-    float kernelSampleBias() const { return fKernel.scalableSampler().fBias; }
+    const float* kernel() const { return fKernel; }
     float gain() const { return fGain; }
     float bias() const { return fBias; }
     bool convolveAlpha() const { return fConvolveAlpha; }
@@ -59,72 +52,11 @@
     std::unique_ptr<GrFragmentProcessor> clone() const override;
 
 private:
-    /**
-     * Small kernels are represented as float-arrays and uploaded as uniforms.
-     * Large kernels go over the uniform limit and are uploaded as textures and sampled.
-     */
-    class KernelWrapper {
-    public:
-        struct ScalableSampler {
-            TextureSampler fSampler;
-            // Applied before any other math.
-            float fBias = 0.0f;
-            // Premultiplied in with user gain to save time.
-            float fGain = 1.0f;
-            bool operator==(const ScalableSampler&) const;
-        };
-        static KernelWrapper Make(GrRecordingContext*, SkISize, const float* values);
-
-        KernelWrapper(KernelWrapper&& that) : fSize(that.fSize) {
-            if (that.isSampled()) {
-                new (&fScalableSampler) ScalableSampler(std::move(that.fScalableSampler));
-            } else {
-                new (&fArray) std::array<float, kMaxUniformSize>(std::move(that.fArray));
-            }
-        }
-        KernelWrapper(const KernelWrapper& that) : fSize(that.fSize) {
-            if (that.isSampled()) {
-                new (&fScalableSampler) ScalableSampler(that.fScalableSampler);
-            } else {
-                new (&fArray) std::array<float, kMaxUniformSize>(that.fArray);
-            }
-        }
-        ~KernelWrapper() {
-            if (this->isSampled()) {
-                fScalableSampler.~ScalableSampler();
-            }
-        }
-
-        bool isValid() const { return !fSize.isEmpty(); }
-        SkISize size() const { return fSize; }
-        bool isSampled() const { return fSize.area() > kMaxUniformSize; }
-        const std::array<float, kMaxUniformSize>& array() const {
-            SkASSERT(!this->isSampled());
-            return fArray;
-        }
-        const ScalableSampler& scalableSampler() const {
-            SkASSERT(this->isSampled());
-            return fScalableSampler;
-        }
-        bool operator==(const KernelWrapper&) const;
-
-    private:
-        KernelWrapper() : fSize({}) {}
-        KernelWrapper(SkISize size) : fSize(size) {
-            if (this->isSampled()) {
-                new (&fScalableSampler) ScalableSampler;
-            }
-        }
-
-        SkISize fSize;
-        union {
-            std::array<float, kMaxUniformSize> fArray;
-            ScalableSampler fScalableSampler;
-        };
-    };
-
-    GrMatrixConvolutionEffect(std::unique_ptr<GrFragmentProcessor> child,
-                              KernelWrapper kernel,
+    // srcProxy is the texture that is going to be convolved
+    // srcBounds is the subset of 'srcProxy' that will be used (e.g., for clamp mode)
+    GrMatrixConvolutionEffect(std::unique_ptr<GrFragmentProcessor>,
+                              const SkISize& kernelSize,
+                              const SkScalar* kernel,
                               SkScalar gain,
                               SkScalar bias,
                               const SkIPoint& kernelOffset,
@@ -138,13 +70,12 @@
 
     bool onIsEqual(const GrFragmentProcessor&) const override;
 
-    const GrFragmentProcessor::TextureSampler& onTextureSampler(int index) const override;
-
     // We really just want the unaltered local coords, but the only way to get that right now is
     // an identity coord transform.
     GrCoordTransform fCoordTransform = {};
     SkIRect          fBounds;
-    KernelWrapper    fKernel;
+    SkISize          fKernelSize;
+    float            fKernel[MAX_KERNEL_SIZE];
     float            fGain;
     float            fBias;
     SkV2             fKernelOffset;