Use packed contexts for small swizzles in SkRP.
Small (1- to 4-element) swizzles now use 8-bit offsets and a 32-bit
SkRPOffset (relative to the base pointer) so that the entire context
fits in 64 bits.
"Shuffles" (swizzles of up to 16 elements) continue to use an unpacked
context with 16-bit offsets. Both paths still share one implementation:
shuffle_fn is templated on the offset type to paper over the difference.
Small swizzles are very common in SkRP since they are also used to
splat values, so this should be a useful optimization.
Change-Id: I52e6744da39aa59c7d9e49ba941d9631caba51cf
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/677716
Auto-Submit: John Stiles <johnstiles@google.com>
Reviewed-by: Arman Uguray <armansito@google.com>
Commit-Queue: John Stiles <johnstiles@google.com>
diff --git a/src/core/SkRasterPipelineOpContexts.h b/src/core/SkRasterPipelineOpContexts.h
index 3b30291..f6ba259 100644
--- a/src/core/SkRasterPipelineOpContexts.h
+++ b/src/core/SkRasterPipelineOpContexts.h
@@ -177,8 +177,12 @@
};
struct SkRasterPipeline_SwizzleCtx {
- float *ptr;
- uint16_t offsets[4]; // values must be byte offsets (4 * highp-stride * component-index)
+ // If we are processing more than 16 pixels at a time, an 8-bit offset won't be sufficient and
+ // `offsets` will need to use uint16_t (or dial down the premultiplication).
+ static_assert(SkRasterPipeline_kMaxStride_highp <= 16);
+
+ SkRPOffset dst;
+ uint8_t offsets[4]; // values must be byte offsets (4 * highp-stride * component-index)
};
struct SkRasterPipeline_ShuffleCtx {
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 32ea7d9..7ce03cd 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -3588,18 +3588,18 @@
copy_n_slots_masked_fn<4>(packed, base, execution_mask());
}
-template <int LoopCount>
-SI void shuffle_fn(F* dst, uint16_t* offsets, int numSlots) {
+template <int LoopCount, typename OffsetType>
+SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
F scratch[16];
- std::byte* src = (std::byte*)dst;
for (int count = 0; count < LoopCount; ++count) {
- scratch[count] = *(F*)(src + offsets[count]);
+ scratch[count] = *(F*)(ptr + offsets[count]);
}
// Surprisingly, this switch generates significantly better code than a memcpy (on x86-64) when
// the number of slots is unknown at compile time, and generates roughly identical code when the
// number of slots is hardcoded. Using a switch allows `scratch` to live in ymm0-ymm15 instead
// of being written out to the stack and then read back in. Also, the intrinsic memcpy assumes
// that `numSlots` could be arbitrarily large, and so it emits more code than we need.
+ F* dst = (F*)ptr;
switch (numSlots) {
case 16: dst[15] = scratch[15]; [[fallthrough]];
case 15: dst[14] = scratch[14]; [[fallthrough]];
@@ -3620,20 +3620,26 @@
}
}
-STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* ctx) {
- shuffle_fn<1>((F*)ctx->ptr, ctx->offsets, 1);
+template <int N>
+SI void small_swizzle_fn(SkRasterPipeline_SwizzleCtx* packed, std::byte* base) {
+ auto ctx = SkRPCtxUtils::Unpack(packed);
+ shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
}
-STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* ctx) {
- shuffle_fn<2>((F*)ctx->ptr, ctx->offsets, 2);
+
+STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* packed) {
+ small_swizzle_fn<1>(packed, base);
}
-STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* ctx) {
- shuffle_fn<3>((F*)ctx->ptr, ctx->offsets, 3);
+STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* packed) {
+ small_swizzle_fn<2>(packed, base);
}
-STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* ctx) {
- shuffle_fn<4>((F*)ctx->ptr, ctx->offsets, 4);
+STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* packed) {
+ small_swizzle_fn<3>(packed, base);
+}
+STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* packed) {
+ small_swizzle_fn<4>(packed, base);
}
STAGE_TAIL(shuffle, SkRasterPipeline_ShuffleCtx* ctx) {
- shuffle_fn<16>((F*)ctx->ptr, ctx->offsets, ctx->count);
+ shuffle_fn<16>((std::byte*)ctx->ptr, ctx->offsets, ctx->count);
}
template <int NumSlots>
diff --git a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
index 6192d9f..894c92b 100644
--- a/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
+++ b/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
@@ -824,7 +824,8 @@
return packed;
}
-static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<uint16_t> offsets) {
+template <typename T>
+static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<T> offsets) {
// Unpack component nybbles into byte-offsets pointing at stack slots.
for (size_t index = 0; index < offsets.size(); ++index) {
offsets[index] = (components & 0xF) * SkOpts::raster_pipeline_highp_stride * sizeof(float);
@@ -1629,11 +1630,11 @@
case BuilderOp::swizzle_2:
case BuilderOp::swizzle_3:
case BuilderOp::swizzle_4: {
- auto* ctx = alloc->make<SkRasterPipeline_SwizzleCtx>();
- ctx->ptr = tempStackPtr - (N * inst.fImmA);
+ SkRasterPipeline_SwizzleCtx ctx;
+ ctx.dst = OffsetFromBase(tempStackPtr - (N * inst.fImmA));
// Unpack component nybbles into byte-offsets pointing at stack slots.
- unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
- pipeline->push_back({(ProgramOp)inst.fOp, ctx});
+ unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx.offsets));
+ pipeline->push_back({(ProgramOp)inst.fOp, SkRPCtxUtils::Pack(ctx, alloc)});
break;
}
case BuilderOp::shuffle: {
@@ -2223,9 +2224,14 @@
return "ExternalPtr(" + AsRange(0, numSlots) + ")";
};
+ // Converts an RP offset to a pointer.
+ auto OffsetToPtr = [&](SkRPOffset offset) -> std::byte* {
+ return (std::byte*)slots.values.data() + offset;
+ };
+
// Interprets a slab offset as a slot range.
auto OffsetCtx = [&](SkRPOffset offset, int numSlots) -> std::string {
- return PtrCtx((std::byte*)slots.values.data() + offset, numSlots);
+ return PtrCtx(OffsetToPtr(offset), numSlots);
};
// Interpret the context value as a pointer to two adjacent values.
@@ -2287,7 +2293,7 @@
};
// Stringize a span of swizzle offsets to the textual equivalent (`xyzw`).
- auto SwizzleOffsetSpan = [&](SkSpan<const uint16_t> offsets) {
+ auto SwizzleOffsetSpan = [&](const auto offsets) {
std::string src;
for (uint16_t offset : offsets) {
if (offset == (0 * N * sizeof(float))) {
@@ -2308,7 +2314,7 @@
// When we decode a swizzle, we don't know the slot width of the original value; that's not
// preserved in the instruction encoding. (e.g., myFloat4.y would be indistinguishable from
// myFloat2.y.) We do our best to make a readable dump using the data we have.
- auto SwizzleWidth = [&](SkSpan<const uint16_t> offsets) {
+ auto SwizzleWidth = [&](const auto offsets) {
size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
(N * sizeof(float));
size_t swizzleWidth = offsets.size();
@@ -2316,17 +2322,18 @@
};
// Stringize a swizzled pointer.
- auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
- return "(" + PtrCtx(ptr, SwizzleWidth(offsets)) + ")." + SwizzleOffsetSpan(offsets);
+ auto SwizzlePtr = [&](const void* ptr, const auto offsets) {
+ return "(" + PtrCtx(ptr, SwizzleWidth(SkSpan(offsets))) + ")." +
+ SwizzleOffsetSpan(SkSpan(offsets));
};
// Interpret the context value as a Swizzle structure.
auto SwizzleCtx = [&](ProgramOp op, const void* v) -> std::tuple<std::string, std::string> {
- const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCtx*>(v);
+ auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_SwizzleCtx*)v);
int destSlots = (int)op - (int)BuilderOp::swizzle_1 + 1;
-
- return std::make_tuple(PtrCtx(ctx->ptr, destSlots),
- SwizzlePtr(ctx->ptr, SkSpan(ctx->offsets, destSlots)));
+ return std::make_tuple(
+ OffsetCtx(ctx.dst, destSlots),
+ SwizzlePtr(OffsetToPtr(ctx.dst), SkSpan(ctx.offsets, destSlots)));
};
// Interpret the context value as a SwizzleCopy structure.
diff --git a/tests/SkRasterPipelineTest.cpp b/tests/SkRasterPipelineTest.cpp
index da9ff66..459f146 100644
--- a/tests/SkRasterPipelineTest.cpp
+++ b/tests/SkRasterPipelineTest.cpp
@@ -1288,8 +1288,8 @@
struct TestPattern {
SkRasterPipelineOp stage;
- uint16_t swizzle[4];
- uint16_t expectation[4];
+ uint8_t swizzle[4];
+ uint8_t expectation[4];
};
static const TestPattern kPatterns[] = {
{SkRasterPipelineOp::swizzle_1, {3}, {3, 1, 2, 3}}, // (1,2,3,4).w = (4)
@@ -1307,11 +1307,12 @@
SkArenaAlloc alloc(/*firstHeapAllocation=*/256);
SkRasterPipeline p(&alloc);
SkRasterPipeline_SwizzleCtx ctx;
- ctx.ptr = slots;
+ ctx.dst = 0;
for (size_t index = 0; index < std::size(ctx.offsets); ++index) {
ctx.offsets[index] = pattern.swizzle[index] * N * sizeof(float);
}
- p.append(pattern.stage, &ctx);
+ p.append(SkRasterPipelineOp::set_base_pointer, &slots[0]);
+ p.append(pattern.stage, SkRPCtxUtils::Pack(ctx, &alloc));
p.run(0,0,1,1);
// Verify that the swizzle has been applied in each slot.