Add a 'pathMatrix' to GrPathTessellator::prepare

The pathMatrix is applied on the CPU while the geometry is being
written out. It is a tool for batching, and is applied in addition to
the shader's on-GPU matrix. This CL also updates GrPathStencilCoverOp
to do all its path transformations with pathMatrix on the CPU side.
The next step will be for atlases to use the pathMatrix instead of
creating uber paths.

Bug: skia:12258
Change-Id: Ib924dfb06a2c0eed8f9045adc6ae9eefad510082
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/433236
Reviewed-by: Brian Salomon <bsalomon@google.com>
Commit-Queue: Chris Dalton <csmartdalton@google.com>
diff --git a/bench/TessellateBench.cpp b/bench/TessellateBench.cpp
index 7cb634c..25468aa 100644
--- a/bench/TessellateBench.cpp
+++ b/bench/TessellateBench.cpp
@@ -110,7 +110,12 @@
     DEF_BENCH( return new PathTessellateBenchmark_##NAME(); ); \
     void PathTessellateBenchmark_##NAME::runBench()
 
-DEF_PATH_TESS_BENCH(GrPathOuterCurveTessellator, make_cubic_path(8), SkMatrix::I()) {
+static const SkMatrix gAlmostIdentity = SkMatrix::MakeAll(
+        1.0001f, 0.0001f, 0.0001f,
+        -.0001f, 0.9999f, -.0001f,
+              0,       0,       1);
+
+DEF_PATH_TESS_BENCH(GrPathCurveTessellator, make_cubic_path(8), SkMatrix::I()) {
     SkArenaAlloc arena(1024);
     GrPipeline noVaryingsPipeline(GrScissorTest::kDisabled, SkBlendMode::kSrcOver,
                                   GrSwizzle::RGBA());
@@ -118,7 +123,7 @@
                                              GrPathCurveTessellator::DrawInnerFan::kNo,
                                              fTarget->caps().minPathVerbsForHwTessellation(),
                                              noVaryingsPipeline, fTarget->caps());
-    tess->prepare(fTarget.get(), SkRectPriv::MakeLargest(), fPath, nullptr);
+    tess->prepare(fTarget.get(), SkRectPriv::MakeLargest(), gAlmostIdentity, fPath, nullptr);
 }
 
 DEF_PATH_TESS_BENCH(GrPathWedgeTessellator, make_cubic_path(8), SkMatrix::I()) {
@@ -128,7 +133,7 @@
     auto tess = GrPathWedgeTessellator::Make(&arena, fMatrix, SK_PMColor4fTRANSPARENT,
                                              fTarget->caps().minPathVerbsForHwTessellation(),
                                              noVaryingsPipeline, fTarget->caps());
-    tess->prepare(fTarget.get(), SkRectPriv::MakeLargest(), fPath, nullptr);
+    tess->prepare(fTarget.get(), SkRectPriv::MakeLargest(), gAlmostIdentity, fPath, nullptr);
 }
 
 static void benchmark_wangs_formula_cubic_log2(const SkMatrix& matrix, const SkPath& path) {
@@ -203,8 +208,8 @@
     GrVertexWriter vertexWriter = static_cast<SkPoint*>(fTarget->makeVertexSpace(
             sizeof(SkPoint), kNumCubicsInChalkboard, &buffer, &baseVertex));
     int numTrianglesWritten;
-    GrMiddleOutPolygonTriangulator::WritePathInnerFan(std::move(vertexWriter), 0, 0, fPath,
-                                                      &numTrianglesWritten);
+    GrMiddleOutPolygonTriangulator::WritePathInnerFan(std::move(vertexWriter), 0, 0,
+                                                      gAlmostIdentity, fPath, &numTrianglesWritten);
 }
 
 using PathStrokeList = GrStrokeTessellator::PathStrokeList;
diff --git a/gn/gpu.gni b/gn/gpu.gni
index 27f7fb7..9eba609 100644
--- a/gn/gpu.gni
+++ b/gn/gpu.gni
@@ -392,6 +392,7 @@
   "$_src/gpu/tessellate/GrPathTessellator.h",
   "$_src/gpu/tessellate/GrPathWedgeTessellator.cpp",
   "$_src/gpu/tessellate/GrPathWedgeTessellator.h",
+  "$_src/gpu/tessellate/GrPathXform.h",
   "$_src/gpu/tessellate/GrStrokeFixedCountTessellator.cpp",
   "$_src/gpu/tessellate/GrStrokeFixedCountTessellator.h",
   "$_src/gpu/tessellate/GrStrokeHardwareTessellator.cpp",
diff --git a/samplecode/SamplePathTessellators.cpp b/samplecode/SamplePathTessellators.cpp
index 8b847c1..b355592 100644
--- a/samplecode/SamplePathTessellators.cpp
+++ b/samplecode/SamplePathTessellators.cpp
@@ -71,6 +71,8 @@
     void onPrepare(GrOpFlushState* flushState) override {
         constexpr static SkPMColor4f kCyan = {0,1,1,1};
         auto alloc = flushState->allocator();
+        const SkMatrix& shaderMatrix = SkMatrix::I();
+        const SkMatrix& pathMatrix = fMatrix;
         const GrCaps& caps = flushState->caps();
         int numVerbsToGetMiddleOut = 0;
         int numVerbsToGetTessellation = caps.minPathVerbsForHwTessellation();
@@ -79,29 +81,29 @@
         switch (fMode) {
             using DrawInnerFan = GrPathCurveTessellator::DrawInnerFan;
             case Mode::kWedgeMiddleOut:
-                fTessellator = GrPathWedgeTessellator::Make(alloc, fMatrix, kCyan,
+                fTessellator = GrPathWedgeTessellator::Make(alloc, shaderMatrix, kCyan,
                                                             numVerbsToGetMiddleOut, *pipeline,
                                                             caps);
                 break;
             case Mode::kCurveMiddleOut:
-                fTessellator = GrPathCurveTessellator::Make(alloc, fMatrix, kCyan,
+                fTessellator = GrPathCurveTessellator::Make(alloc, shaderMatrix, kCyan,
                                                             DrawInnerFan::kYes,
                                                             numVerbsToGetMiddleOut, *pipeline,
                                                             caps);
                 break;
             case Mode::kWedgeTessellate:
-                fTessellator = GrPathWedgeTessellator::Make(alloc, fMatrix, kCyan,
+                fTessellator = GrPathWedgeTessellator::Make(alloc, shaderMatrix, kCyan,
                                                             numVerbsToGetTessellation, *pipeline,
                                                             caps);
                 break;
             case Mode::kCurveTessellate:
-                fTessellator = GrPathCurveTessellator::Make(alloc, fMatrix, kCyan,
+                fTessellator = GrPathCurveTessellator::Make(alloc, shaderMatrix, kCyan,
                                                             DrawInnerFan::kYes,
                                                             numVerbsToGetTessellation, *pipeline,
                                                             caps);
                 break;
         }
-        fTessellator->prepare(flushState, this->bounds(), fPath);
+        fTessellator->prepare(flushState, this->bounds(), pathMatrix, fPath);
         fProgram = GrTessellationShader::MakeProgram({alloc, flushState->writeView(),
                                                      &flushState->dstProxyView(),
                                                      flushState->renderPassBarriers(),
diff --git a/src/gpu/tessellate/GrMiddleOutPolygonTriangulator.h b/src/gpu/tessellate/GrMiddleOutPolygonTriangulator.h
index b925181..2fc1806 100644
--- a/src/gpu/tessellate/GrMiddleOutPolygonTriangulator.h
+++ b/src/gpu/tessellate/GrMiddleOutPolygonTriangulator.h
@@ -14,6 +14,7 @@
 #include "src/core/SkMathPriv.h"
 #include "src/core/SkPathPriv.h"
 #include "src/gpu/GrVertexWriter.h"
+#include "src/gpu/tessellate/GrPathXform.h"
 
 // This class emits a polygon triangulation with a "middle-out" topology. Conceptually, middle-out
 // emits one large triangle with vertices on both endpoints and a middle point, then recurses on
@@ -127,20 +128,25 @@
     static GrVertexWriter WritePathInnerFan(GrVertexWriter&& vertexWriter,
                                             int pad32Count,
                                             uint32_t pad32Value,
+                                            const GrPathXform& pathXform,
                                             const SkPath& path,
                                             int* numTrianglesWritten) {
-        GrMiddleOutPolygonTriangulator middleOut(std::move(vertexWriter), pad32Count, pad32Value,
+        GrMiddleOutPolygonTriangulator middleOut(std::move(vertexWriter),
+                                                 pad32Count,
+                                                 pad32Value,
                                                  path.countVerbs());
         for (auto [verb, pts, w] : SkPathPriv::Iterate(path)) {
             switch (verb) {
+                SkPoint pt;
                 case SkPathVerb::kMove:
-                    middleOut.closeAndMove(pts[0]);
+                    middleOut.closeAndMove(pathXform.mapPoint(pts[0]));
                     break;
                 case SkPathVerb::kLine:
                 case SkPathVerb::kQuad:
                 case SkPathVerb::kConic:
                 case SkPathVerb::kCubic:
-                    middleOut.pushVertex(pts[SkPathPriv::PtsInIter((unsigned)verb) - 1]);
+                    pt = pts[SkPathPriv::PtsInIter((unsigned)verb) - 1];
+                    middleOut.pushVertex(pathXform.mapPoint(pt));
                     break;
                 case SkPathVerb::kClose:
                     break;
diff --git a/src/gpu/tessellate/GrPathCurveTessellator.cpp b/src/gpu/tessellate/GrPathCurveTessellator.cpp
index a79cca8..e1c48ce 100644
--- a/src/gpu/tessellate/GrPathCurveTessellator.cpp
+++ b/src/gpu/tessellate/GrPathCurveTessellator.cpp
@@ -13,6 +13,7 @@
 #include "src/gpu/geometry/GrWangsFormula.h"
 #include "src/gpu/tessellate/GrCullTest.h"
 #include "src/gpu/tessellate/GrMiddleOutPolygonTriangulator.h"
+#include "src/gpu/tessellate/GrPathXform.h"
 #include "src/gpu/tessellate/shaders/GrPathTessellationShader.h"
 
 namespace {
@@ -23,23 +24,29 @@
 // supported by the hardware.
 class CurveWriter {
 public:
-    CurveWriter(const SkRect& cullBounds, const SkMatrix& viewMatrix, int maxSegments)
-            : fCullTest(cullBounds, viewMatrix)
-            , fVectorXform(viewMatrix)
+    CurveWriter(const SkRect& cullBounds,
+                const SkMatrix& totalMatrix,  // shaderMatrix * pathMatrix
+                const SkMatrix& pathMatrix,
+                int maxSegments)
+            : fCullTest(cullBounds, totalMatrix)
+            , fTotalVectorXform(totalMatrix)
+            , fPathXform(pathMatrix)
             , fMaxSegments_pow2(maxSegments * maxSegments)
             , fMaxSegments_pow4(fMaxSegments_pow2 * fMaxSegments_pow2) {
     }
 
+    const GrPathXform& pathXform() const { return fPathXform; }
+
     SK_ALWAYS_INLINE void writeQuadratic(const GrShaderCaps& shaderCaps,
                                          GrVertexChunkBuilder* chunker, const SkPoint p[3]) {
-        float numSegments_pow4 = GrWangsFormula::quadratic_pow4(kPrecision, p, fVectorXform);
+        float numSegments_pow4 = GrWangsFormula::quadratic_pow4(kPrecision, p, fTotalVectorXform);
         if (numSegments_pow4 > fMaxSegments_pow4) {
             this->chopAndWriteQuadratic(shaderCaps, chunker, p);
             return;
         }
         if (numSegments_pow4 > 1) {
             if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-                GrPathUtils::writeQuadAsCubic(p, &vertexWriter);
+                fPathXform.mapQuadToCubic(&vertexWriter, p);
                 vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                       GrTessellationShader::kCubicCurveType));
             }
@@ -49,14 +56,14 @@
 
     SK_ALWAYS_INLINE void writeConic(const GrShaderCaps& shaderCaps, GrVertexChunkBuilder* chunker,
                                      const SkPoint p[3], float w) {
-        float numSegments_pow2 = GrWangsFormula::conic_pow2(kPrecision, p, w, fVectorXform);
+        float numSegments_pow2 = GrWangsFormula::conic_pow2(kPrecision, p, w, fTotalVectorXform);
         if (numSegments_pow2 > fMaxSegments_pow2) {
             this->chopAndWriteConic(shaderCaps, chunker, {p, w});
             return;
         }
         if (numSegments_pow2 > 1) {
             if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-                GrTessellationShader::WriteConicPatch(p, w, &vertexWriter);
+                fPathXform.mapConicToPatch(&vertexWriter, p, w);
                 vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                       GrTessellationShader::kConicCurveType));
             }
@@ -67,14 +74,14 @@
 
     SK_ALWAYS_INLINE void writeCubic(const GrShaderCaps& shaderCaps, GrVertexChunkBuilder* chunker,
                                      const SkPoint p[4]) {
-        float numSegments_pow4 = GrWangsFormula::cubic_pow4(kPrecision, p, fVectorXform);
+        float numSegments_pow4 = GrWangsFormula::cubic_pow4(kPrecision, p, fTotalVectorXform);
         if (numSegments_pow4 > fMaxSegments_pow4) {
             this->chopAndWriteCubic(shaderCaps, chunker, p);
             return;
         }
         if (numSegments_pow4 > 1) {
             if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-                vertexWriter.writeArray(p, 4);
+                fPathXform.map4Points(&vertexWriter, p);
                 vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                       GrTessellationShader::kCubicCurveType));
             }
@@ -131,7 +138,9 @@
     void writeTriangle(const GrShaderCaps& shaderCaps, GrVertexChunkBuilder* chunker, SkPoint p0,
                        SkPoint p1, SkPoint p2) {
         if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-            vertexWriter.write(p0, p1, p2);
+            vertexWriter.write(fPathXform.mapPoint(p0),
+                               fPathXform.mapPoint(p1),
+                               fPathXform.mapPoint(p2));
             // Mark this instance as a triangle by setting it to a conic with w=Inf.
             vertexWriter.fill(GrVertexWriter::kIEEE_32_infinity, 2);
             vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
@@ -140,7 +149,8 @@
     }
 
     GrCullTest fCullTest;
-    GrVectorXform fVectorXform;
+    GrVectorXform fTotalVectorXform;
+    GrPathXform fPathXform;
     const float fMaxSegments_pow2;
     const float fMaxSegments_pow4;
 
@@ -179,7 +189,9 @@
 GR_DECLARE_STATIC_UNIQUE_KEY(gFixedCountVertexBufferKey);
 GR_DECLARE_STATIC_UNIQUE_KEY(gFixedCountIndexBufferKey);
 
-void GrPathCurveTessellator::prepare(GrMeshDrawTarget* target, const SkRect& cullBounds,
+void GrPathCurveTessellator::prepare(GrMeshDrawTarget* target,
+                                     const SkRect& cullBounds,
+                                     const SkMatrix& pathMatrix,
                                      const SkPath& path,
                                      const BreadcrumbTriangleList* breadcrumbTriangleList) {
     SkASSERT(fVertexChunkArray.empty());
@@ -207,6 +219,21 @@
                                                                : fShader->instanceStride();
     GrVertexChunkBuilder chunker(target, &fVertexChunkArray, patchStride, patchAllocCount);
 
+    int maxSegments;
+    if (fShader->willUseTessellationShaders()) {
+        // The curve shader tessellates T=0..(1/2) on the first side of the canonical triangle and
+        // T=(1/2)..1 on the second side. This means we get double the max tessellation segments
+        // for the range T=0..1.
+        maxSegments = shaderCaps.maxTessellationSegments() * 2;
+    } else {
+        maxSegments = GrPathTessellationShader::kMaxFixedCountSegments;
+    }
+
+    CurveWriter curveWriter(cullBounds,
+                            SkMatrix::Concat(fShader->viewMatrix(), pathMatrix),
+                            pathMatrix,
+                            maxSegments);
+
     // Write out the triangles.
     if (maxTriangles) {
         GrVertexWriter vertexWriter = chunker.appendVertices(maxTriangles);
@@ -224,7 +251,12 @@
                     : sk_bit_cast<uint32_t>(GrTessellationShader::kTriangularConicCurveType);
             int numTrianglesWritten;
             vertexWriter = GrMiddleOutPolygonTriangulator::WritePathInnerFan(
-                    std::move(vertexWriter), pad32Count, pad32Value, path, &numTrianglesWritten);
+                    std::move(vertexWriter),
+                    pad32Count,
+                    pad32Value,
+                    curveWriter.pathXform(),
+                    path,
+                    &numTrianglesWritten);
             numRemainingTriangles -= numTrianglesWritten;
         }
         if (breadcrumbTriangleList) {
@@ -243,7 +275,7 @@
                     // introduce T-junctions.
                     continue;
                 }
-                vertexWriter.writeArray(tri->fPts, 3);
+                curveWriter.pathXform().map3Points(&vertexWriter, tri->fPts);
                 // Mark this instance as a triangle by setting it to a conic with w=Inf.
                 vertexWriter.fill(GrVertexWriter::kIEEE_32_infinity, 2);
                 vertexWriter.write(
@@ -257,17 +289,6 @@
         chunker.popVertices(numRemainingTriangles);
     }
 
-    int maxSegments;
-    if (fShader->willUseTessellationShaders()) {
-        // The curve shader tessellates T=0..(1/2) on the first side of the canonical triangle and
-        // T=(1/2)..1 on the second side. This means we get double the max tessellation segments
-        // for the range T=0..1.
-        maxSegments = shaderCaps.maxTessellationSegments() * 2;
-    } else {
-        maxSegments = GrPathTessellationShader::kMaxFixedCountSegments;
-    }
-
-    CurveWriter curveWriter(cullBounds, fShader->viewMatrix(), maxSegments);
     for (auto [verb, pts, w] : SkPathPriv::Iterate(path)) {
         switch (verb) {
             case SkPathVerb::kQuad:
diff --git a/src/gpu/tessellate/GrPathCurveTessellator.h b/src/gpu/tessellate/GrPathCurveTessellator.h
index 5035d8f..7d2601c 100644
--- a/src/gpu/tessellate/GrPathCurveTessellator.h
+++ b/src/gpu/tessellate/GrPathCurveTessellator.h
@@ -32,8 +32,13 @@
                                         const SkPMColor4f&, DrawInnerFan, int numPathVerbs,
                                         const GrPipeline&, const GrCaps&);
 
-    void prepare(GrMeshDrawTarget*, const SkRect& cullBounds, const SkPath&,
+
+    void prepare(GrMeshDrawTarget*,
+                 const SkRect& cullBounds,
+                 const SkMatrix& pathMatrix,
+                 const SkPath&,
                  const BreadcrumbTriangleList*) override;
+
     void draw(GrOpFlushState*) const override;
 
     // Draws a 4-point instance for each curve. This method is used for drawing convex hulls over
diff --git a/src/gpu/tessellate/GrPathInnerTriangulateOp.cpp b/src/gpu/tessellate/GrPathInnerTriangulateOp.cpp
index be9875a..0322191 100644
--- a/src/gpu/tessellate/GrPathInnerTriangulateOp.cpp
+++ b/src/gpu/tessellate/GrPathInnerTriangulateOp.cpp
@@ -400,7 +400,7 @@
 
     if (fTessellator) {
         // Must be called after polysToTriangles() in order for fFanBreadcrumbs to be complete.
-        fTessellator->prepare(flushState, this->bounds(), fPath, &fFanBreadcrumbs);
+        fTessellator->prepare(flushState, this->bounds(), SkMatrix::I(), fPath, &fFanBreadcrumbs);
     }
 
     if (!flushState->caps().shaderCaps()->vertexIDSupport()) {
diff --git a/src/gpu/tessellate/GrPathStencilCoverOp.cpp b/src/gpu/tessellate/GrPathStencilCoverOp.cpp
index 2b56078..a4d6621 100644
--- a/src/gpu/tessellate/GrPathStencilCoverOp.cpp
+++ b/src/gpu/tessellate/GrPathStencilCoverOp.cpp
@@ -139,6 +139,8 @@
     SkASSERT(!fStencilPathProgram);
     SkASSERT(!fCoverBBoxProgram);
 
+    // We transform paths on the CPU. This allows for better batching.
+    const SkMatrix& shaderMatrix = SkMatrix::I();
     const GrPipeline* stencilPipeline = GrPathTessellationShader::MakeStencilOnlyPipeline(
             args, fAAType, fPathFlags, appliedClip.hardClip());
     const GrUserStencilSettings* stencilPathSettings =
@@ -148,19 +150,27 @@
         // Large complex paths do better with a dedicated triangle shader for the inner fan.
         // This takes less PCI bus bandwidth (6 floats per triangle instead of 8) and allows us
         // to make sure it has an efficient middle-out topology.
-        auto shader = GrPathTessellationShader::MakeSimpleTriangleShader(
-                args.fArena, fViewMatrix, SK_PMColor4fTRANSPARENT);
-        fStencilFanProgram = GrTessellationShader::MakeProgram(args, shader, stencilPipeline,
+        auto shader = GrPathTessellationShader::MakeSimpleTriangleShader(args.fArena,
+                                                                         shaderMatrix,
+                                                                         SK_PMColor4fTRANSPARENT);
+        fStencilFanProgram = GrTessellationShader::MakeProgram(args,
+                                                               shader,
+                                                               stencilPipeline,
                                                                stencilPathSettings);
-        fTessellator = GrPathCurveTessellator::Make(args.fArena, fViewMatrix,
+        fTessellator = GrPathCurveTessellator::Make(args.fArena,
+                                                    shaderMatrix,
                                                     SK_PMColor4fTRANSPARENT,
                                                     GrPathCurveTessellator::DrawInnerFan::kNo,
-                                                    fPath.countVerbs(), *stencilPipeline,
+                                                    fPath.countVerbs(),
+                                                    *stencilPipeline,
                                                     *args.fCaps);
     } else {
-        fTessellator = GrPathWedgeTessellator::Make(args.fArena, fViewMatrix,
-                                                    SK_PMColor4fTRANSPARENT, fPath.countVerbs(),
-                                                    *stencilPipeline, *args.fCaps);
+        fTessellator = GrPathWedgeTessellator::Make(args.fArena,
+                                                    shaderMatrix,
+                                                    SK_PMColor4fTRANSPARENT,
+                                                    fPath.countVerbs(),
+                                                    *stencilPipeline,
+                                                    *args.fCaps);
     }
     fStencilPathProgram = GrTessellationShader::MakeProgram(args, fTessellator->shader(),
                                                             stencilPipeline, stencilPathSettings);
@@ -219,6 +229,9 @@
         }
     }
 
+    // We transform paths on the CPU. This allows for better batching.
+    const SkMatrix& pathMatrix = fViewMatrix;
+
     if (fStencilFanProgram) {
         // The inner fan isn't built into the tessellator. Generate a standard Redbook fan with a
         // middle-out topology.
@@ -226,14 +239,18 @@
         int maxFanTriangles = fPath.countVerbs() - 2;  // n - 2 triangles make an n-gon.
         GrVertexWriter triangleVertexWriter = vertexAlloc.lock<SkPoint>(maxFanTriangles * 3);
         int numTrianglesWritten;
-        GrMiddleOutPolygonTriangulator::WritePathInnerFan(std::move(triangleVertexWriter), 0, 0,
-                                                          fPath, &numTrianglesWritten);
+        GrMiddleOutPolygonTriangulator::WritePathInnerFan(std::move(triangleVertexWriter),
+                                                          0,
+                                                          0,
+                                                          pathMatrix,
+                                                          fPath,
+                                                          &numTrianglesWritten);
         fFanVertexCount = 3 * numTrianglesWritten;
         SkASSERT(fFanVertexCount <= maxFanTriangles * 3);
         vertexAlloc.unlock(fFanVertexCount);
     }
 
-    fTessellator->prepare(flushState, this->bounds(), fPath);
+    fTessellator->prepare(flushState, this->bounds(), pathMatrix, fPath);
 
     if (fCoverBBoxProgram) {
         GrVertexWriter vertexWriter = flushState->makeVertexSpace(sizeof(SkRect), 1, &fBBoxBuffer,
diff --git a/src/gpu/tessellate/GrPathTessellateOp.cpp b/src/gpu/tessellate/GrPathTessellateOp.cpp
index 6230a6a..173e39f 100644
--- a/src/gpu/tessellate/GrPathTessellateOp.cpp
+++ b/src/gpu/tessellate/GrPathTessellateOp.cpp
@@ -57,7 +57,7 @@
                                   flushState->detachAppliedClip());
         SkASSERT(fTessellator);
     }
-    fTessellator->prepare(flushState, this->bounds(), fPath);
+    fTessellator->prepare(flushState, this->bounds(), SkMatrix::I(), fPath);
 }
 
 void GrPathTessellateOp::onExecute(GrOpFlushState* flushState, const SkRect& chainBounds) {
diff --git a/src/gpu/tessellate/GrPathTessellator.h b/src/gpu/tessellate/GrPathTessellator.h
index b4e7009..2e0a566 100644
--- a/src/gpu/tessellate/GrPathTessellator.h
+++ b/src/gpu/tessellate/GrPathTessellator.h
@@ -27,10 +27,17 @@
 
     const GrPathTessellationShader* shader() const { return fShader; }
 
-    // Called before draw(). Prepares GPU buffers containing the geometry to tessellate. If the
-    // given BreadcrumbTriangleList is non-null, then this class will also include the breadcrumb
-    // triangles in its draw.
-    virtual void prepare(GrMeshDrawTarget*, const SkRect& cullBounds, const SkPath&,
+    // Called before draw(). Prepares GPU buffers containing the geometry to tessellate.
+    //
+    // 'pathMatrix' is applied on the CPU while the geometry is being written out. This is a tool
+    // for batching, and is applied in addition to the shader's on-GPU matrix.
+    //
+    // If the given BreadcrumbTriangleList is non-null, then we also emit geometry for the
+    // breadcrumb triangles.
+    virtual void prepare(GrMeshDrawTarget*,
+                         const SkRect& cullBounds,
+                         const SkMatrix& pathMatrix,
+                         const SkPath&,
                          const BreadcrumbTriangleList* = nullptr) = 0;
 
     // Issues draw calls for the tessellated geometry. The caller is responsible for binding its
diff --git a/src/gpu/tessellate/GrPathWedgeTessellator.cpp b/src/gpu/tessellate/GrPathWedgeTessellator.cpp
index 5949795..869fc33 100644
--- a/src/gpu/tessellate/GrPathWedgeTessellator.cpp
+++ b/src/gpu/tessellate/GrPathWedgeTessellator.cpp
@@ -11,6 +11,7 @@
 #include "src/gpu/geometry/GrPathUtils.h"
 #include "src/gpu/geometry/GrWangsFormula.h"
 #include "src/gpu/tessellate/GrCullTest.h"
+#include "src/gpu/tessellate/GrPathXform.h"
 #include "src/gpu/tessellate/shaders/GrPathTessellationShader.h"
 
 namespace {
@@ -110,18 +111,24 @@
 // by the hardware.
 class WedgeWriter {
 public:
-    WedgeWriter(const SkRect& cullBounds, const SkMatrix& viewMatrix, int maxSegments)
-            : fCullTest(cullBounds, viewMatrix)
-            , fVectorXform(viewMatrix)
+    WedgeWriter(const SkRect& cullBounds,
+                const SkMatrix& totalMatrix,  // shaderMatrix * pathMatrix
+                const SkMatrix& pathMatrix,
+                int maxSegments)
+            : fCullTest(cullBounds, totalMatrix)
+            , fTotalVectorXform(totalMatrix)
+            , fPathXform(pathMatrix)
             , fMaxSegments_pow2(maxSegments * maxSegments)
             , fMaxSegments_pow4(fMaxSegments_pow2 * fMaxSegments_pow2) {
     }
 
+    const GrPathXform& pathXform() const { return fPathXform; }
+
     SK_ALWAYS_INLINE void writeFlatWedge(const GrShaderCaps& shaderCaps,
                                          GrVertexChunkBuilder* chunker, SkPoint p0, SkPoint p1,
                                          SkPoint midpoint) {
         if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-            GrPathUtils::writeLineAsCubic(p0, p1, &vertexWriter);
+            fPathXform.mapLineToCubic(&vertexWriter, p0, p1);
             vertexWriter.write(midpoint);
             vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                   GrTessellationShader::kCubicCurveType));
@@ -131,13 +138,13 @@
     SK_ALWAYS_INLINE void writeQuadraticWedge(const GrShaderCaps& shaderCaps,
                                               GrVertexChunkBuilder* chunker, const SkPoint p[3],
                                               SkPoint midpoint) {
-        float numSegments_pow4 = GrWangsFormula::quadratic_pow4(kPrecision, p, fVectorXform);
+        float numSegments_pow4 = GrWangsFormula::quadratic_pow4(kPrecision, p, fTotalVectorXform);
         if (numSegments_pow4 > fMaxSegments_pow4) {
             this->chopAndWriteQuadraticWedges(shaderCaps, chunker, p, midpoint);
             return;
         }
         if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-            GrPathUtils::writeQuadAsCubic(p, &vertexWriter);
+            fPathXform.mapQuadToCubic(&vertexWriter, p);
             vertexWriter.write(midpoint);
             vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                   GrTessellationShader::kCubicCurveType));
@@ -148,13 +155,13 @@
     SK_ALWAYS_INLINE void writeConicWedge(const GrShaderCaps& shaderCaps,
                                           GrVertexChunkBuilder* chunker, const SkPoint p[3],
                                           float w, SkPoint midpoint) {
-        float numSegments_pow2 = GrWangsFormula::conic_pow2(kPrecision, p, w, fVectorXform);
-        if (GrWangsFormula::conic_pow2(kPrecision, p, w, fVectorXform) > fMaxSegments_pow2) {
+        float numSegments_pow2 = GrWangsFormula::conic_pow2(kPrecision, p, w, fTotalVectorXform);
+        if (numSegments_pow2 > fMaxSegments_pow2) {
             this->chopAndWriteConicWedges(shaderCaps, chunker, {p, w}, midpoint);
             return;
         }
         if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-            GrTessellationShader::WriteConicPatch(p, w, &vertexWriter);
+            fPathXform.mapConicToPatch(&vertexWriter, p, w);
             vertexWriter.write(midpoint);
             vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                   GrTessellationShader::kConicCurveType));
@@ -166,13 +173,13 @@
     SK_ALWAYS_INLINE void writeCubicWedge(const GrShaderCaps& shaderCaps,
                                           GrVertexChunkBuilder* chunker, const SkPoint p[4],
                                           SkPoint midpoint) {
-        float numSegments_pow4 = GrWangsFormula::cubic_pow4(kPrecision, p, fVectorXform);
+        float numSegments_pow4 = GrWangsFormula::cubic_pow4(kPrecision, p, fTotalVectorXform);
         if (numSegments_pow4 > fMaxSegments_pow4) {
             this->chopAndWriteCubicWedges(shaderCaps, chunker, p, midpoint);
             return;
         }
         if (GrVertexWriter vertexWriter = chunker->appendVertex()) {
-            vertexWriter.writeArray(p, 4);
+            fPathXform.map4Points(&vertexWriter, p);
             vertexWriter.write(midpoint);
             vertexWriter.write(GrVertexWriter::If(!shaderCaps.infinitySupport(),
                                                   GrTessellationShader::kCubicCurveType));
@@ -228,7 +235,8 @@
     }
 
     GrCullTest fCullTest;
-    GrVectorXform fVectorXform;
+    GrVectorXform fTotalVectorXform;
+    GrPathXform fPathXform;
     const float fMaxSegments_pow2;
     const float fMaxSegments_pow4;
 
@@ -263,7 +271,9 @@
 GR_DECLARE_STATIC_UNIQUE_KEY(gFixedCountVertexBufferKey);
 GR_DECLARE_STATIC_UNIQUE_KEY(gFixedCountIndexBufferKey);
 
-void GrPathWedgeTessellator::prepare(GrMeshDrawTarget* target, const SkRect& cullBounds,
+void GrPathWedgeTessellator::prepare(GrMeshDrawTarget* target,
+                                     const SkRect& cullBounds,
+                                     const SkMatrix& pathMatrix,
                                      const SkPath& path,
                                      const BreadcrumbTriangleList* breadcrumbTriangleList) {
     SkASSERT(!breadcrumbTriangleList);
@@ -288,10 +298,14 @@
         maxSegments = GrPathTessellationShader::kMaxFixedCountSegments;
     }
 
-    WedgeWriter wedgeWriter(cullBounds, fShader->viewMatrix(), maxSegments);
+    WedgeWriter wedgeWriter(cullBounds,
+                            SkMatrix::Concat(fShader->viewMatrix(), pathMatrix),
+                            pathMatrix,
+                            maxSegments);
+
     MidpointContourParser parser(path);
     while (parser.parseNextContour()) {
-        SkPoint midpoint = parser.currentMidpoint();
+        SkPoint midpoint = wedgeWriter.pathXform().mapPoint(parser.currentMidpoint());
         SkPoint startPoint = {0, 0};
         SkPoint lastPoint = startPoint;
         for (auto [verb, pts, w] : parser.currentContour()) {
diff --git a/src/gpu/tessellate/GrPathWedgeTessellator.h b/src/gpu/tessellate/GrPathWedgeTessellator.h
index 3f887fb..50490e4 100644
--- a/src/gpu/tessellate/GrPathWedgeTessellator.h
+++ b/src/gpu/tessellate/GrPathWedgeTessellator.h
@@ -25,8 +25,12 @@
     static GrPathTessellator* Make(SkArenaAlloc*, const SkMatrix& viewMatrix, const SkPMColor4f&,
                                    int numPathVerbs, const GrPipeline&, const GrCaps&);
 
-    void prepare(GrMeshDrawTarget*, const SkRect& cullBounds, const SkPath&,
+    void prepare(GrMeshDrawTarget*,
+                 const SkRect& cullBounds,
+                 const SkMatrix& pathMatrix,
+                 const SkPath&,
                  const BreadcrumbTriangleList*) override;
+
     void draw(GrOpFlushState*) const override;
 
 private:
diff --git a/src/gpu/tessellate/GrPathXform.h b/src/gpu/tessellate/GrPathXform.h
new file mode 100644
index 0000000..2203df1
--- /dev/null
+++ b/src/gpu/tessellate/GrPathXform.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2021 Google LLC.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrPathXform_DEFINED
+#define GrPathXform_DEFINED
+
+#include "include/core/SkMatrix.h"
+#include "src/gpu/GrVertexWriter.h"
+#include "src/gpu/GrVx.h"
+
+// Applies an affine 2D transformation (scale, skew, translate; no perspective) to points and path
+// components, and converts path components to tessellation patches. Uses SIMD, but takes care to
+// map points identically, regardless of which method is called.
+//
+// This class stores redundant data (each matrix term is kept twice, filling a float4), so it is
+// best used only as a stack-allocated object at the point of use.
+class GrPathXform {
+    using float2 = grvx::float2;
+    using float4 = grvx::float4;
+
+public:
+    GrPathXform(const SkMatrix& m)  // Caches m's terms as {x, y, x, y} lanes; m must be affine.
+            : fScale{m.getScaleX(), m.getScaleY(), m.getScaleX(), m.getScaleY()}
+            , fSkew{m.getSkewX(), m.getSkewY(), m.getSkewX(), m.getSkewY()}
+            , fTrans{m.getTranslateX(), m.getTranslateY(), m.getTranslateX(), m.getTranslateY()} {
+        SkASSERT(!m.hasPerspective());
+    }
+    // Maps one point using the .lo halves of the terms: scale*p + (skew*(p.y, p.x) + trans).
+    SK_ALWAYS_INLINE float2 mapPoint(float2 p) const {
+        return fScale.lo * p + (fSkew.lo * skvx::shuffle<1,0>(p) + fTrans.lo);
+    }
+    // SkPoint overload: bit-puns to float2, calls the float2 overload, and bit-puns the result.
+    SK_ALWAYS_INLINE SkPoint mapPoint(SkPoint p) const {
+        return skvx::bit_pun<SkPoint>(this->mapPoint(skvx::bit_pun<float2>(p)));
+    }
+    // Loads two points into one float4, maps both with the same formula, and writes the result.
+    SK_ALWAYS_INLINE void map2Points(GrVertexWriter* writer, const SkPoint pts[2]) const {
+        float4 p = float4::Load(pts);
+        writer->write(fScale * p + (fSkew * skvx::shuffle<1,0,3,2>(p) + fTrans));
+    }
+    // Maps and writes three points: pts[0] via mapPoint(), then pts[1..2] via map2Points().
+    SK_ALWAYS_INLINE void map3Points(GrVertexWriter* writer, const SkPoint pts[3]) const {
+        writer->write(this->mapPoint(pts[0]));
+        this->map2Points(writer, pts + 1);
+    }
+    // Maps and writes four points as two consecutive map2Points() pairs.
+    SK_ALWAYS_INLINE void map4Points(GrVertexWriter* writer, const SkPoint pts[4]) const {
+        this->map2Points(writer, pts);
+        this->map2Points(writer, pts + 2);
+    }
+
+    // Emits a degenerate, 4-point transformed cubic bezier equal to a line.
+    SK_ALWAYS_INLINE void mapLineToCubic(GrVertexWriter* writer,
+                                         SkPoint startPt,
+                                         SkPoint endPt) const {
+        float2 p0 = this->mapPoint(skvx::bit_pun<float2>(startPt));
+        float2 p1 = this->mapPoint(skvx::bit_pun<float2>(endPt));
+        float2 v = (p1 - p0) * (1/3.f);  // One third of the mapped line's vector.
+        writer->write(p0, p0 + v, p1 - v, p1);
+    }
+
+    // Emits a degenerate, 4-point transformed cubic bezier equal to the quadratic pts[0..2].
+    SK_ALWAYS_INLINE void mapQuadToCubic(GrVertexWriter* writer, const SkPoint pts[3]) const {
+        float2 p0 = this->mapPoint(skvx::bit_pun<float2>(pts[0]));
+        float2 p1 = this->mapPoint(skvx::bit_pun<float2>(pts[1]));
+        float2 p2 = this->mapPoint(skvx::bit_pun<float2>(pts[2]));
+        float2 c = p1 * (2/3.f);  // Middle-control-point term shared by both inner cubic points.
+        writer->write(p0, p0 * 1/3.f + c, p2 * 1/3.f + c, p2);  // NB: "x * 1/3.f" is (x*1)/3.f.
+    }
+
+    // Writes out the 3 conic points transformed, plus a 4th point with the conic weight in x and
+    // infinity in y. Infinite y flags the 4-point patch as a conic.
+    SK_ALWAYS_INLINE void mapConicToPatch(GrVertexWriter* writer,
+                                          const SkPoint pts[3],
+                                          float w) const {
+        this->map3Points(writer, pts);
+        writer->write(w, GrVertexWriter::kIEEE_32_infinity);
+    }
+
+private:
+    float4 fScale;  // {scaleX, scaleY, scaleX, scaleY}
+    float4 fSkew;   // {skewX, skewY, skewX, skewY}
+    float4 fTrans;  // {transX, transY, transX, transY}
+};
+
+#endif