Use packed contexts for small swizzles in SkRP.

Small (1- to 4-element) swizzles now use 8-bit component offsets and a
32-bit SkRPOffset from the base pointer, so they fit in a 64-bit context.

"Shuffles" (16-element swizzles) continue to use an unpacked
context and 16-bit offsets, but can still share an implementation;
a template papers over the difference.

Small swizzles are very common in SkRP since they are also used to
splat values, so this should be a useful optimization.

Change-Id: I52e6744da39aa59c7d9e49ba941d9631caba51cf
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/677716
Auto-Submit: John Stiles <johnstiles@google.com>
Reviewed-by: Arman Uguray <armansito@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
diff --git a/src/core/SkRasterPipelineOpContexts.h b/src/core/SkRasterPipelineOpContexts.h
index 3b30291..f6ba259 100644
--- a/src/core/SkRasterPipelineOpContexts.h
+++ b/src/core/SkRasterPipelineOpContexts.h
@@ -177,8 +177,12 @@
 };
 
 struct SkRasterPipeline_SwizzleCtx {
-    float *ptr;
-    uint16_t offsets[4];  // values must be byte offsets (4 * highp-stride * component-index)
+    // If we are processing more than 16 pixels at a time, an 8-bit offset won't be sufficient and
+    // `offsets` will need to use uint16_t (or dial down the premultiplication).
+    static_assert(SkRasterPipeline_kMaxStride_highp <= 16);
+
+    SkRPOffset dst;
+    uint8_t offsets[4];  // values must be byte offsets (4 * highp-stride * component-index)
 };
 
 struct SkRasterPipeline_ShuffleCtx {
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 32ea7d9..7ce03cd 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -3588,18 +3588,18 @@
     copy_n_slots_masked_fn<4>(packed, base, execution_mask());
 }
 
-template <int LoopCount>
-SI void shuffle_fn(F* dst, uint16_t* offsets, int numSlots) {
+template <int LoopCount, typename OffsetType>
+SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
     F scratch[16];
-    std::byte* src = (std::byte*)dst;
     for (int count = 0; count < LoopCount; ++count) {
-        scratch[count] = *(F*)(src + offsets[count]);
+        scratch[count] = *(F*)(ptr + offsets[count]);
     }
     // Surprisingly, this switch generates significantly better code than a memcpy (on x86-64) when
     // the number of slots is unknown at compile time, and generates roughly identical code when the
     // number of slots is hardcoded. Using a switch allows `scratch` to live in ymm0-ymm15 instead
     // of being written out to the stack and then read back in. Also, the intrinsic memcpy assumes
     // that `numSlots` could be arbitrarily large, and so it emits more code than we need.
+    F* dst = (F*)ptr;
     switch (numSlots) {
         case 16: dst[15] = scratch[15]; [[fallthrough]];
         case 15: dst[14] = scratch[14]; [[fallthrough]];
@@ -3620,20 +3620,26 @@
     }
 }
 
-STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* ctx) {
-    shuffle_fn<1>((F*)ctx->ptr, ctx->offsets, 1);
+template <int N>
+SI void small_swizzle_fn(SkRasterPipeline_SwizzleCtx* packed, std::byte* base) {
+    auto ctx = SkRPCtxUtils::Unpack(packed);
+    shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
 }
-STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* ctx) {
-    shuffle_fn<2>((F*)ctx->ptr, ctx->offsets, 2);
+
+STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* packed) {
+    small_swizzle_fn<1>(packed, base);
 }
-STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* ctx) {
-    shuffle_fn<3>((F*)ctx->ptr, ctx->offsets, 3);
+STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* packed) {
+    small_swizzle_fn<2>(packed, base);
 }
-STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* ctx) {
-    shuffle_fn<4>((F*)ctx->ptr, ctx->offsets, 4);
+STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* packed) {
+    small_swizzle_fn<3>(packed, base);
+}
+STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* packed) {
+    small_swizzle_fn<4>(packed, base);
 }
 STAGE_TAIL(shuffle, SkRasterPipeline_ShuffleCtx* ctx) {
-    shuffle_fn<16>((F*)ctx->ptr, ctx->offsets, ctx->count);
+    shuffle_fn<16>((std::byte*)ctx->ptr, ctx->offsets, ctx->count);
 }
 
 template <int NumSlots>
diff --git a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
index 6192d9f..894c92b 100644
--- a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
+++ b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
@@ -824,7 +824,8 @@
     return packed;
 }
 
-static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<uint16_t> offsets) {
+template <typename T>
+static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<T> offsets) {
     // Unpack component nybbles into byte-offsets pointing at stack slots.
     for (size_t index = 0; index < offsets.size(); ++index) {
         offsets[index] = (components & 0xF) * SkOpts::raster_pipeline_highp_stride * sizeof(float);
@@ -1629,11 +1630,11 @@
             case BuilderOp::swizzle_2:
             case BuilderOp::swizzle_3:
             case BuilderOp::swizzle_4: {
-                auto* ctx = alloc->make<SkRasterPipeline_SwizzleCtx>();
-                ctx->ptr = tempStackPtr - (N * inst.fImmA);
+                SkRasterPipeline_SwizzleCtx ctx;
+                ctx.dst = OffsetFromBase(tempStackPtr - (N * inst.fImmA));
                 // Unpack component nybbles into byte-offsets pointing at stack slots.
-                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
-                pipeline->push_back({(ProgramOp)inst.fOp, ctx});
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx.offsets));
+                pipeline->push_back({(ProgramOp)inst.fOp, SkRPCtxUtils::Pack(ctx, alloc)});
                 break;
             }
             case BuilderOp::shuffle: {
@@ -2223,9 +2224,14 @@
             return "ExternalPtr(" + AsRange(0, numSlots) + ")";
         };
 
+        // Converts an RP offset to a pointer.
+        auto OffsetToPtr = [&](SkRPOffset offset) -> std::byte* {
+            return (std::byte*)slots.values.data() + offset;
+        };
+
         // Interprets a slab offset as a slot range.
         auto OffsetCtx = [&](SkRPOffset offset, int numSlots) -> std::string {
-            return PtrCtx((std::byte*)slots.values.data() + offset, numSlots);
+            return PtrCtx(OffsetToPtr(offset), numSlots);
         };
 
         // Interpret the context value as a pointer to two adjacent values.
@@ -2287,7 +2293,7 @@
         };
 
         // Stringize a span of swizzle offsets to the textual equivalent (`xyzw`).
-        auto SwizzleOffsetSpan = [&](SkSpan<const uint16_t> offsets) {
+        auto SwizzleOffsetSpan = [&](const auto offsets) {
             std::string src;
             for (uint16_t offset : offsets) {
                 if (offset == (0 * N * sizeof(float))) {
@@ -2308,7 +2314,7 @@
         // When we decode a swizzle, we don't know the slot width of the original value; that's not
         // preserved in the instruction encoding. (e.g., myFloat4.y would be indistinguishable from
         // myFloat2.y.) We do our best to make a readable dump using the data we have.
-        auto SwizzleWidth = [&](SkSpan<const uint16_t> offsets) {
+        auto SwizzleWidth = [&](const auto offsets) {
             size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
                                       (N * sizeof(float));
             size_t swizzleWidth = offsets.size();
@@ -2316,17 +2322,18 @@
         };
 
         // Stringize a swizzled pointer.
-        auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
-            return "(" + PtrCtx(ptr, SwizzleWidth(offsets)) + ")." + SwizzleOffsetSpan(offsets);
+        auto SwizzlePtr = [&](const void* ptr, const auto offsets) {
+            return "(" + PtrCtx(ptr, SwizzleWidth(SkSpan(offsets))) + ")." +
+                   SwizzleOffsetSpan(SkSpan(offsets));
         };
 
         // Interpret the context value as a Swizzle structure.
         auto SwizzleCtx = [&](ProgramOp op, const void* v) -> std::tuple<std::string, std::string> {
-            const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCtx*>(v);
+            auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_SwizzleCtx*)v);
             int destSlots = (int)op - (int)BuilderOp::swizzle_1 + 1;
-
-            return std::make_tuple(PtrCtx(ctx->ptr, destSlots),
-                                   SwizzlePtr(ctx->ptr, SkSpan(ctx->offsets, destSlots)));
+            return std::make_tuple(
+                    OffsetCtx(ctx.dst, destSlots),
+                    SwizzlePtr(OffsetToPtr(ctx.dst), SkSpan(ctx.offsets, destSlots)));
         };
 
         // Interpret the context value as a SwizzleCopy structure.
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index da9ff66..459f146 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -1288,8 +1288,8 @@
 
     struct TestPattern {
         SkRasterPipelineOp stage;
-        uint16_t swizzle[4];
-        uint16_t expectation[4];
+        uint8_t swizzle[4];
+        uint8_t expectation[4];
     };
     static const TestPattern kPatterns[] = {
         {SkRasterPipelineOp::swizzle_1, {3},          {3, 1, 2, 3}}, // (1,2,3,4).w    = (4)
@@ -1307,11 +1307,12 @@
         SkArenaAlloc alloc(/*firstHeapAllocation=*/256);
         SkRasterPipeline p(&alloc);
         SkRasterPipeline_SwizzleCtx ctx;
-        ctx.ptr = slots;
+        ctx.dst = 0;
         for (size_t index = 0; index < std::size(ctx.offsets); ++index) {
             ctx.offsets[index] = pattern.swizzle[index] * N * sizeof(float);
         }
-        p.append(pattern.stage, &ctx);
+        p.append(SkRasterPipelineOp::set_base_pointer, &slots[0]);
+        p.append(pattern.stage, SkRPCtxUtils::Pack(ctx, &alloc));
         p.run(0,0,1,1);
 
         // Verify that the swizzle has been applied in each slot.