centralize and force-inline unaligned_{load,store}
We've got slightly different implementations of unaligned_load(), and
that's causing x86 Chromium Debug builds to go a bit haywire. A
single best implementation with an SK_ALWAYS_INLINE and explanation
should help things.
Might as well move its companion unaligned_store() too.
Bug: chromium:974542
Change-Id: If9f3eb4d33bfb8390c661f9e196122da0b9b84ca
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/221409
Reviewed-by: Nico Weber <thakis@chromium.org>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkUtils.h b/src/core/SkUtils.h
index f5d981e..cf2ee10 100644
--- a/src/core/SkUtils.h
+++ b/src/core/SkUtils.h
@@ -70,4 +70,29 @@
extern const char gLower[16]; // 0-9a-f
}
+///////////////////////////////////////////////////////////////////////////////
+
+// If T is an 8-byte GCC or Clang vector extension type, it would naturally
+// pass or return in the MMX mm0 register on 32-bit x86 builds. This has the
+// fun side effect of clobbering any state in the x87 st0 register. (There is
+// no ABI governing who should preserve mm?/st? registers, so no one does!)
+//
+// We force-inline sk_unaligned_load() and sk_unaligned_store() to avoid that,
+// making them safe to use for all types on all platforms, thus solving the
+// problem once and for all!
+
+template <typename T, typename P>
+static SK_ALWAYS_INLINE T sk_unaligned_load(const P* ptr) {
+ // TODO: static_assert desirable things about T here so as not to be totally abused.
+ T val;
+ memcpy(&val, ptr, sizeof(val));
+ return val;
+}
+
+template <typename T, typename P>
+static SK_ALWAYS_INLINE void sk_unaligned_store(P* ptr, T val) {
+ // TODO: ditto
+ memcpy(ptr, &val, sizeof(val));
+}
+
#endif
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
index 9df4db6..7880847 100644
--- a/src/opts/SkChecksum_opts.h
+++ b/src/opts/SkChecksum_opts.h
@@ -10,6 +10,7 @@
#include "include/core/SkTypes.h"
#include "include/private/SkChecksum.h"
+#include "src/core/SkUtils.h" // sk_unaligned_load
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
#include <immintrin.h>
@@ -19,13 +20,6 @@
namespace SK_OPTS_NS {
-template <typename T, typename P>
-static inline T unaligned_load(const P* p) {
- T v;
- memcpy(&v, p, sizeof(v));
- return v;
-}
-
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
// This is not a CRC32. It's Just A Hash that uses those instructions because they're fast.
/*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
@@ -42,9 +36,9 @@
c = hash;
size_t steps = bytes/24;
while (steps --> 0) {
- a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
- b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
- c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
+ a = _mm_crc32_u64(a, sk_unaligned_load<uint64_t>(data+ 0));
+ b = _mm_crc32_u64(b, sk_unaligned_load<uint64_t>(data+ 8));
+ c = _mm_crc32_u64(c, sk_unaligned_load<uint64_t>(data+16));
data += 24;
}
bytes %= 24;
@@ -53,14 +47,14 @@
SkASSERT(bytes < 24);
if (bytes >= 16) {
- hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+ hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
bytes -= 8;
data += 8;
}
SkASSERT(bytes < 16);
if (bytes & 8) {
- hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+ hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
data += 8;
}
@@ -69,15 +63,15 @@
auto hash32 = (uint32_t)hash;
if (bytes & 4) {
- hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
+ hash32 = _mm_crc32_u32(hash32, sk_unaligned_load<uint32_t>(data));
data += 4;
}
if (bytes & 2) {
- hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
+ hash32 = _mm_crc32_u16(hash32, sk_unaligned_load<uint16_t>(data));
data += 2;
}
if (bytes & 1) {
- hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
+ hash32 = _mm_crc32_u8(hash32, sk_unaligned_load<uint8_t>(data));
}
return hash32;
}
@@ -96,9 +90,9 @@
c = hash;
size_t steps = bytes/12;
while (steps --> 0) {
- a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
- b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
- c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
+ a = _mm_crc32_u32(a, sk_unaligned_load<uint32_t>(data+0));
+ b = _mm_crc32_u32(b, sk_unaligned_load<uint32_t>(data+4));
+ c = _mm_crc32_u32(c, sk_unaligned_load<uint32_t>(data+8));
data += 12;
}
bytes %= 12;
@@ -107,22 +101,22 @@
SkASSERT(bytes < 12);
if (bytes >= 8) {
- hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+ hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
bytes -= 4;
data += 4;
}
SkASSERT(bytes < 8);
if (bytes & 4) {
- hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+ hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
data += 4;
}
if (bytes & 2) {
- hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
+ hash = _mm_crc32_u16(hash, sk_unaligned_load<uint16_t>(data));
data += 2;
}
if (bytes & 1) {
- hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
+ hash = _mm_crc32_u8(hash, sk_unaligned_load<uint8_t>(data));
}
return hash;
}
@@ -136,9 +130,9 @@
c = hash;
size_t steps = bytes/24;
while (steps --> 0) {
- a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
- b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
- c = __crc32d(c, unaligned_load<uint64_t>(data+16));
+ a = __crc32d(a, sk_unaligned_load<uint64_t>(data+ 0));
+ b = __crc32d(b, sk_unaligned_load<uint64_t>(data+ 8));
+ c = __crc32d(c, sk_unaligned_load<uint64_t>(data+16));
data += 24;
}
bytes %= 24;
@@ -147,26 +141,26 @@
SkASSERT(bytes < 24);
if (bytes >= 16) {
- hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+ hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
bytes -= 8;
data += 8;
}
SkASSERT(bytes < 16);
if (bytes & 8) {
- hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+ hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
data += 8;
}
if (bytes & 4) {
- hash = __crc32w(hash, unaligned_load<uint32_t>(data));
+ hash = __crc32w(hash, sk_unaligned_load<uint32_t>(data));
data += 4;
}
if (bytes & 2) {
- hash = __crc32h(hash, unaligned_load<uint16_t>(data));
+ hash = __crc32h(hash, sk_unaligned_load<uint16_t>(data));
data += 2;
}
if (bytes & 1) {
- hash = __crc32b(hash, unaligned_load<uint8_t>(data));
+ hash = __crc32b(hash, sk_unaligned_load<uint8_t>(data));
}
return hash;
}
@@ -180,7 +174,7 @@
// Handle 4 bytes at a time while possible.
while (bytes >= 4) {
- uint32_t k = unaligned_load<uint32_t>(data);
+ uint32_t k = sk_unaligned_load<uint32_t>(data);
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
k *= 0x1b873593;
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 9b24c85..f2a802a 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -9,6 +9,7 @@
#define SkRasterPipeline_opts_DEFINED
#include "include/core/SkTypes.h"
+#include "src/core/SkUtils.h"   // sk_unaligned_{load,store}
// Every function in this file should be marked static and inline using SI.
#if defined(__clang__)
@@ -17,23 +18,10 @@
#define SI static inline
#endif
-
-template <typename T, typename P>
-SI T unaligned_load(const P* p) { // const void* would work too, but const P* helps ARMv7 codegen.
- T v;
- memcpy(&v, p, sizeof(v));
- return v;
-}
-
-template <typename T, typename P>
-SI void unaligned_store(P* p, T v) {
- memcpy(p, &v, sizeof(v));
-}
-
template <typename Dst, typename Src>
SI Dst bit_cast(const Src& src) {
static_assert(sizeof(Dst) == sizeof(Src), "");
- return unaligned_load<Dst>(&src);
+ return sk_unaligned_load<Dst>(&src);
}
template <typename Dst, typename Src>
@@ -380,7 +368,7 @@
}
SI U8 pack(U16 v) {
auto r = _mm_packus_epi16(v,v);
- return unaligned_load<U8>(&r);
+ return sk_unaligned_load<U8>(&r);
}
SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
@@ -712,12 +700,12 @@
auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
p = _mm_packs_epi32(p,p);
#endif
- return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
+ return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
}
SI U8 pack(U16 v) {
auto r = widen_cast<__m128i>(v);
r = _mm_packus_epi16(r,r);
- return unaligned_load<U8>(&r);
+ return sk_unaligned_load<U8>(&r);
}
SI F if_then_else(I32 c, F t, F e) {
@@ -758,8 +746,8 @@
auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
- *r = unaligned_load<U16>(&R);
- *g = unaligned_load<U16>(&G);
+ *r = sk_unaligned_load<U16>(&R);
+ *g = sk_unaligned_load<U16>(&G);
}
SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
@@ -810,9 +798,9 @@
G = _mm_srli_si128(R, 8),
B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx
- *r = unaligned_load<U16>(&R);
- *g = unaligned_load<U16>(&G);
- *b = unaligned_load<U16>(&B);
+ *r = sk_unaligned_load<U16>(&R);
+ *g = sk_unaligned_load<U16>(&G);
+ *b = sk_unaligned_load<U16>(&B);
}
SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
@@ -834,10 +822,10 @@
auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
- *r = unaligned_load<U16>((uint16_t*)&rg + 0);
- *g = unaligned_load<U16>((uint16_t*)&rg + 4);
- *b = unaligned_load<U16>((uint16_t*)&ba + 0);
- *a = unaligned_load<U16>((uint16_t*)&ba + 4);
+ *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
+ *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
+ *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
+ *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
}
SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
@@ -945,7 +933,7 @@
// when generating code for SSE2 and SSE4.1. We'll do it manually...
auto v = widen_cast<__m128i>(x);
v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
- return unaligned_load<U16>(&v);
+ return sk_unaligned_load<U16>(&v);
#else
return (x<<8) | (x>>8);
#endif
@@ -1157,7 +1145,7 @@
return v;
}
#endif
- return unaligned_load<V>(src);
+ return sk_unaligned_load<V>(src);
}
template <typename V, typename T>
@@ -1177,7 +1165,7 @@
return;
}
#endif
- unaligned_store(dst, v);
+ sk_unaligned_store(dst, v);
}
SI F from_byte(U8 b) {
@@ -1274,7 +1262,7 @@
// It's important for speed to explicitly cast(dx) and cast(dy),
// which has the effect of splatting them to vectors before converting to floats.
// On Intel this breaks a data dependency on previous loop iterations' registers.
- r = cast(dx) + unaligned_load<F>(iota);
+ r = cast(dx) + sk_unaligned_load<F>(iota);
g = cast(dy) + 0.5f;
b = 1.0f;
a = 0;
@@ -1284,7 +1272,7 @@
STAGE(dither, const float* rate) {
// Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
uint32_t iota[] = {0,1,2,3,4,5,6,7};
- U32 X = dx + unaligned_load<U32>(iota),
+ U32 X = dx + sk_unaligned_load<U32>(iota),
Y = dy;
// We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
@@ -1339,34 +1327,34 @@
// load registers r,g,b,a from context (mirrors store_rgba)
STAGE(load_src, const float* ptr) {
- r = unaligned_load<F>(ptr + 0*N);
- g = unaligned_load<F>(ptr + 1*N);
- b = unaligned_load<F>(ptr + 2*N);
- a = unaligned_load<F>(ptr + 3*N);
+ r = sk_unaligned_load<F>(ptr + 0*N);
+ g = sk_unaligned_load<F>(ptr + 1*N);
+ b = sk_unaligned_load<F>(ptr + 2*N);
+ a = sk_unaligned_load<F>(ptr + 3*N);
}
// store registers r,g,b,a into context (mirrors load_rgba)
STAGE(store_src, float* ptr) {
- unaligned_store(ptr + 0*N, r);
- unaligned_store(ptr + 1*N, g);
- unaligned_store(ptr + 2*N, b);
- unaligned_store(ptr + 3*N, a);
+ sk_unaligned_store(ptr + 0*N, r);
+ sk_unaligned_store(ptr + 1*N, g);
+ sk_unaligned_store(ptr + 2*N, b);
+ sk_unaligned_store(ptr + 3*N, a);
}
// load registers dr,dg,db,da from context (mirrors store_dst)
STAGE(load_dst, const float* ptr) {
- dr = unaligned_load<F>(ptr + 0*N);
- dg = unaligned_load<F>(ptr + 1*N);
- db = unaligned_load<F>(ptr + 2*N);
- da = unaligned_load<F>(ptr + 3*N);
+ dr = sk_unaligned_load<F>(ptr + 0*N);
+ dg = sk_unaligned_load<F>(ptr + 1*N);
+ db = sk_unaligned_load<F>(ptr + 2*N);
+ da = sk_unaligned_load<F>(ptr + 3*N);
}
// store registers dr,dg,db,da into context (mirrors load_dst)
STAGE(store_dst, float* ptr) {
- unaligned_store(ptr + 0*N, dr);
- unaligned_store(ptr + 1*N, dg);
- unaligned_store(ptr + 2*N, db);
- unaligned_store(ptr + 3*N, da);
+ sk_unaligned_store(ptr + 0*N, dr);
+ sk_unaligned_store(ptr + 1*N, dg);
+ sk_unaligned_store(ptr + 2*N, db);
+ sk_unaligned_store(ptr + 3*N, da);
}
// Most blend modes apply the same logic to each channel.
@@ -1757,7 +1745,7 @@
a = lerp(da, a, *c);
}
STAGE(lerp_native, const float scales[]) {
- auto c = unaligned_load<F>(scales);
+ auto c = sk_unaligned_load<F>(scales);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
@@ -2205,20 +2193,20 @@
STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
auto w = ctx->limit_x;
- unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
+ sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
}
STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
auto h = ctx->limit_y;
- unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
+ sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
}
STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
auto w = ctx->limit_x;
auto h = ctx->limit_y;
- unaligned_store(ctx->mask,
+ sk_unaligned_store(ctx->mask,
cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h)));
}
STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
- auto mask = unaligned_load<U32>(ctx->mask);
+ auto mask = sk_unaligned_load<U32>(ctx->mask);
r = bit_cast<F>( bit_cast<U32>(r) & mask );
g = bit_cast<F>( bit_cast<U32>(g) & mask );
b = bit_cast<F>( bit_cast<U32>(b) & mask );
@@ -2433,18 +2421,18 @@
F& t = r;
auto is_degenerate = (t != t); // NaN
t = if_then_else(is_degenerate, F(0), t);
- unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
+ sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
}
STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
F& t = r;
auto is_degenerate = (t <= 0) | (t != t);
t = if_then_else(is_degenerate, F(0), t);
- unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
+ sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
}
STAGE(apply_vector_mask, const uint32_t* ctx) {
- const U32 mask = unaligned_load<U32>(ctx);
+ const U32 mask = sk_unaligned_load<U32>(ctx);
r = bit_cast<F>(bit_cast<U32>(r) & mask);
g = bit_cast<F>(bit_cast<U32>(g) & mask);
b = bit_cast<F>(bit_cast<U32>(b) & mask);
@@ -2459,17 +2447,17 @@
fy = fract(g + 0.5f);
// Samplers will need to load x and fx, or y and fy.
- unaligned_store(c->x, r);
- unaligned_store(c->y, g);
- unaligned_store(c->fx, fx);
- unaligned_store(c->fy, fy);
+ sk_unaligned_store(c->x, r);
+ sk_unaligned_store(c->y, g);
+ sk_unaligned_store(c->fx, fx);
+ sk_unaligned_store(c->fy, fy);
}
STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
// Bilinear and bicubic filters are both separable, so we produce independent contributions
// from x and y, multiplying them together here to get each pixel's total scale factor.
- auto scale = unaligned_load<F>(c->scalex)
- * unaligned_load<F>(c->scaley);
+ auto scale = sk_unaligned_load<F>(c->scalex)
+ * sk_unaligned_load<F>(c->scaley);
dr = mad(scale, r, dr);
dg = mad(scale, g, dg);
db = mad(scale, b, db);
@@ -2483,23 +2471,23 @@
template <int kScale>
SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
- *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f);
- F fx = unaligned_load<F>(ctx->fx);
+ *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
+ F fx = sk_unaligned_load<F>(ctx->fx);
F scalex;
if (kScale == -1) { scalex = 1.0f - fx; }
if (kScale == +1) { scalex = fx; }
- unaligned_store(ctx->scalex, scalex);
+ sk_unaligned_store(ctx->scalex, scalex);
}
template <int kScale>
SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
- *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f);
- F fy = unaligned_load<F>(ctx->fy);
+ *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
+ F fy = sk_unaligned_load<F>(ctx->fy);
F scaley;
if (kScale == -1) { scaley = 1.0f - fy; }
if (kScale == +1) { scaley = fy; }
- unaligned_store(ctx->scaley, scaley);
+ sk_unaligned_store(ctx->scaley, scaley);
}
STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
@@ -2525,27 +2513,27 @@
template <int kScale>
SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
- *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f);
- F fx = unaligned_load<F>(ctx->fx);
+ *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
+ F fx = sk_unaligned_load<F>(ctx->fx);
F scalex;
if (kScale == -3) { scalex = bicubic_far (1.0f - fx); }
if (kScale == -1) { scalex = bicubic_near(1.0f - fx); }
if (kScale == +1) { scalex = bicubic_near( fx); }
if (kScale == +3) { scalex = bicubic_far ( fx); }
- unaligned_store(ctx->scalex, scalex);
+ sk_unaligned_store(ctx->scalex, scalex);
}
template <int kScale>
SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
- *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f);
- F fy = unaligned_load<F>(ctx->fy);
+ *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
+ F fy = sk_unaligned_load<F>(ctx->fy);
F scaley;
if (kScale == -3) { scaley = bicubic_far (1.0f - fy); }
if (kScale == -1) { scaley = bicubic_near(1.0f - fy); }
if (kScale == +1) { scaley = bicubic_near( fy); }
if (kScale == +3) { scaley = bicubic_far ( fy); }
- unaligned_store(ctx->scaley, scaley);
+ sk_unaligned_store(ctx->scaley, scaley);
}
STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
@@ -2962,7 +2950,7 @@
0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
};
- x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
+ x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
y = cast<F>(I32(dy)) + 0.5f;
}
@@ -3492,28 +3480,28 @@
// ~~~~~~ Coverage scales / lerps ~~~~~~ //
STAGE_PP(load_src, const uint16_t* ptr) {
- r = unaligned_load<U16>(ptr + 0*N);
- g = unaligned_load<U16>(ptr + 1*N);
- b = unaligned_load<U16>(ptr + 2*N);
- a = unaligned_load<U16>(ptr + 3*N);
+ r = sk_unaligned_load<U16>(ptr + 0*N);
+ g = sk_unaligned_load<U16>(ptr + 1*N);
+ b = sk_unaligned_load<U16>(ptr + 2*N);
+ a = sk_unaligned_load<U16>(ptr + 3*N);
}
STAGE_PP(store_src, uint16_t* ptr) {
- unaligned_store(ptr + 0*N, r);
- unaligned_store(ptr + 1*N, g);
- unaligned_store(ptr + 2*N, b);
- unaligned_store(ptr + 3*N, a);
+ sk_unaligned_store(ptr + 0*N, r);
+ sk_unaligned_store(ptr + 1*N, g);
+ sk_unaligned_store(ptr + 2*N, b);
+ sk_unaligned_store(ptr + 3*N, a);
}
STAGE_PP(load_dst, const uint16_t* ptr) {
- dr = unaligned_load<U16>(ptr + 0*N);
- dg = unaligned_load<U16>(ptr + 1*N);
- db = unaligned_load<U16>(ptr + 2*N);
- da = unaligned_load<U16>(ptr + 3*N);
+ dr = sk_unaligned_load<U16>(ptr + 0*N);
+ dg = sk_unaligned_load<U16>(ptr + 1*N);
+ db = sk_unaligned_load<U16>(ptr + 2*N);
+ da = sk_unaligned_load<U16>(ptr + 3*N);
}
STAGE_PP(store_dst, uint16_t* ptr) {
- unaligned_store(ptr + 0*N, dr);
- unaligned_store(ptr + 1*N, dg);
- unaligned_store(ptr + 2*N, db);
- unaligned_store(ptr + 3*N, da);
+ sk_unaligned_store(ptr + 0*N, dr);
+ sk_unaligned_store(ptr + 1*N, dg);
+ sk_unaligned_store(ptr + 2*N, db);
+ sk_unaligned_store(ptr + 3*N, da);
}
// ~~~~~~ Coverage scales / lerps ~~~~~~ //
@@ -3533,7 +3521,7 @@
a = lerp(da, a, c);
}
STAGE_PP(lerp_native, const uint16_t scales[]) {
- auto c = unaligned_load<U16>(scales);
+ auto c = sk_unaligned_load<U16>(scales);
r = lerp(dr, r, c);
g = lerp(dg, g, c);
b = lerp(db, b, c);
@@ -3608,19 +3596,19 @@
STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
auto w = ctx->limit_x;
- unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
+ sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
}
STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
auto h = ctx->limit_y;
- unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
+ sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
}
STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
auto w = ctx->limit_x;
auto h = ctx->limit_y;
- unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
+ sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
}
STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
- auto mask = unaligned_load<U16>(ctx->mask);
+ auto mask = sk_unaligned_load<U16>(ctx->mask);
r = r & mask;
g = g & mask;
b = b & mask;
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
index 739eed4..8b9fc3e 100644
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@@ -9,6 +9,7 @@
#include "include/core/SkPoint3.h"
#include "include/private/SkVx.h"
+#include "src/core/SkUtils.h" // sk_unaligned_load
#include "src/sksl/SkSLByteCode.h"
#include "src/sksl/SkSLByteCodeGenerator.h"
#include "src/sksl/SkSLExternalValue.h"
@@ -24,16 +25,9 @@
using I32 = skvx::Vec<VecWidth, int32_t>;
using U32 = skvx::Vec<VecWidth, uint32_t>;
-template <typename T>
-static T unaligned_load(const void* ptr) {
- T val;
- memcpy(&val, ptr, sizeof(val));
- return val;
-}
-
#define READ8() (*(ip++))
-#define READ16() (ip += 2, unaligned_load<uint16_t>(ip - 2))
-#define READ32() (ip += 4, unaligned_load<uint32_t>(ip - 4))
+#define READ16() (ip += 2, sk_unaligned_load<uint16_t>(ip - 2))
+#define READ32() (ip += 4, sk_unaligned_load<uint32_t>(ip - 4))
#define VECTOR_DISASSEMBLE(op, text) \
case ByteCodeInstruction::op: printf(text); break; \