centralize and force-inline unaligned_{load,store}

We've got slightly different implementations of unaligned_load(), and
that's causing x86 Chromium Debug builds to go a bit haywire.  A single
best implementation with SK_ALWAYS_INLINE and an explanation should
help things.

Might as well move its companion unaligned_store() too.
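
Just for illustration, here's a minimal, self-contained sketch of how the
centralized helpers are meant to be used.  The ALWAYS_INLINE shim, buffer,
and offsets below are stand-ins made up for this example, not Skia code;
only the load/store bodies mirror the real helpers.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Stand-in for SK_ALWAYS_INLINE (Skia defines the real macro itself).
    // Forcing inlining keeps small vector types from ever crossing a call
    // boundary, where 32-bit x86 would route them through mm0.
    #if defined(_MSC_VER)
        #define ALWAYS_INLINE __forceinline
    #else
        #define ALWAYS_INLINE inline __attribute__((always_inline))
    #endif

    template <typename T, typename P>
    static ALWAYS_INLINE T sk_unaligned_load(const P* ptr) {
        T val;
        memcpy(&val, ptr, sizeof(val));   // lowers to an unaligned-safe load
        return val;
    }

    template <typename T, typename P>
    static ALWAYS_INLINE void sk_unaligned_store(P* ptr, T val) {
        memcpy(ptr, &val, sizeof(val));   // lowers to an unaligned-safe store
    }

    int main() {
        unsigned char buf[16] = {1,2,3,4,5,6,7,8};

        // Load a uint32_t from a deliberately misaligned offset...
        uint32_t v = sk_unaligned_load<uint32_t>(buf + 1);

        // ...and store it back at another misaligned offset.
        sk_unaligned_store(buf + 9, v);

        printf("%u\n", v);
        return 0;
    }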

Bug: chromium:974542
Change-Id: If9f3eb4d33bfb8390c661f9e196122da0b9b84ca
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/221409
Reviewed-by: Nico Weber <thakis@chromium.org>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkUtils.h b/src/core/SkUtils.h
index f5d981e..cf2ee10 100644
--- a/src/core/SkUtils.h
+++ b/src/core/SkUtils.h
@@ -70,4 +70,29 @@
     extern const char gLower[16];  // 0-9a-f
 }
 
+///////////////////////////////////////////////////////////////////////////////
+
+// If T is an 8-byte GCC or Clang vector extension type, it would naturally
+// pass or return in the MMX mm0 register on 32-bit x86 builds.  This has the
+// fun side effect of clobbering any state in the x87 st0 register.  (There is
+// no ABI governing who should preserve mm?/st? registers, so no one does!)
+//
+// We force-inline sk_unaligned_load() and sk_unaligned_store() to avoid that,
+// making them safe to use for all types on all platforms, thus solving the
+// problem once and for all!
+
+template <typename T, typename P>
+static SK_ALWAYS_INLINE T sk_unaligned_load(const P* ptr) {
+    // TODO: static_assert desirable things about T here so as not to be totally abused.
+    T val;
+    memcpy(&val, ptr, sizeof(val));
+    return val;
+}
+
+template <typename T, typename P>
+static SK_ALWAYS_INLINE void sk_unaligned_store(P* ptr, T val) {
+    // TODO: ditto
+    memcpy(ptr, &val, sizeof(val));
+}
+
 #endif
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h
index 9df4db6..7880847 100644
--- a/src/opts/SkChecksum_opts.h
+++ b/src/opts/SkChecksum_opts.h
@@ -10,6 +10,7 @@
 
 #include "include/core/SkTypes.h"
 #include "include/private/SkChecksum.h"
+#include "src/core/SkUtils.h"   // sk_unaligned_load
 
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
     #include <immintrin.h>
@@ -19,13 +20,6 @@
 
 namespace SK_OPTS_NS {
 
-template <typename T, typename P>
-static inline T unaligned_load(const P* p) {
-    T v;
-    memcpy(&v, p, sizeof(v));
-    return v;
-}
-
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
     // This is not a CRC32.  It's Just A Hash that uses those instructions because they're fast.
     /*not static*/ inline uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
@@ -42,9 +36,9 @@
                      c = hash;
             size_t steps = bytes/24;
             while (steps --> 0) {
-                a = _mm_crc32_u64(a, unaligned_load<uint64_t>(data+ 0));
-                b = _mm_crc32_u64(b, unaligned_load<uint64_t>(data+ 8));
-                c = _mm_crc32_u64(c, unaligned_load<uint64_t>(data+16));
+                a = _mm_crc32_u64(a, sk_unaligned_load<uint64_t>(data+ 0));
+                b = _mm_crc32_u64(b, sk_unaligned_load<uint64_t>(data+ 8));
+                c = _mm_crc32_u64(c, sk_unaligned_load<uint64_t>(data+16));
                 data += 24;
             }
             bytes %= 24;
@@ -53,14 +47,14 @@
 
         SkASSERT(bytes < 24);
         if (bytes >= 16) {
-            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
             bytes -= 8;
             data  += 8;
         }
 
         SkASSERT(bytes < 16);
         if (bytes & 8) {
-            hash = _mm_crc32_u64(hash, unaligned_load<uint64_t>(data));
+            hash = _mm_crc32_u64(hash, sk_unaligned_load<uint64_t>(data));
             data  += 8;
         }
 
@@ -69,15 +63,15 @@
         auto hash32 = (uint32_t)hash;
 
         if (bytes & 4) {
-            hash32 = _mm_crc32_u32(hash32, unaligned_load<uint32_t>(data));
+            hash32 = _mm_crc32_u32(hash32, sk_unaligned_load<uint32_t>(data));
             data += 4;
         }
         if (bytes & 2) {
-            hash32 = _mm_crc32_u16(hash32, unaligned_load<uint16_t>(data));
+            hash32 = _mm_crc32_u16(hash32, sk_unaligned_load<uint16_t>(data));
             data += 2;
         }
         if (bytes & 1) {
-            hash32 = _mm_crc32_u8(hash32, unaligned_load<uint8_t>(data));
+            hash32 = _mm_crc32_u8(hash32, sk_unaligned_load<uint8_t>(data));
         }
         return hash32;
     }
@@ -96,9 +90,9 @@
                      c = hash;
             size_t steps = bytes/12;
             while (steps --> 0) {
-                a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
-                b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
-                c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
+                a = _mm_crc32_u32(a, sk_unaligned_load<uint32_t>(data+0));
+                b = _mm_crc32_u32(b, sk_unaligned_load<uint32_t>(data+4));
+                c = _mm_crc32_u32(c, sk_unaligned_load<uint32_t>(data+8));
                 data += 12;
             }
             bytes %= 12;
@@ -107,22 +101,22 @@
 
         SkASSERT(bytes < 12);
         if (bytes >= 8) {
-            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
             bytes -= 4;
             data  += 4;
         }
 
         SkASSERT(bytes < 8);
         if (bytes & 4) {
-            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            hash = _mm_crc32_u32(hash, sk_unaligned_load<uint32_t>(data));
             data += 4;
         }
         if (bytes & 2) {
-            hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
+            hash = _mm_crc32_u16(hash, sk_unaligned_load<uint16_t>(data));
             data += 2;
         }
         if (bytes & 1) {
-            hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
+            hash = _mm_crc32_u8(hash, sk_unaligned_load<uint8_t>(data));
         }
         return hash;
     }
@@ -136,9 +130,9 @@
                      c = hash;
             size_t steps = bytes/24;
             while (steps --> 0) {
-                a = __crc32d(a, unaligned_load<uint64_t>(data+ 0));
-                b = __crc32d(b, unaligned_load<uint64_t>(data+ 8));
-                c = __crc32d(c, unaligned_load<uint64_t>(data+16));
+                a = __crc32d(a, sk_unaligned_load<uint64_t>(data+ 0));
+                b = __crc32d(b, sk_unaligned_load<uint64_t>(data+ 8));
+                c = __crc32d(c, sk_unaligned_load<uint64_t>(data+16));
                 data += 24;
             }
             bytes %= 24;
@@ -147,26 +141,26 @@
 
         SkASSERT(bytes < 24);
         if (bytes >= 16) {
-            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
             bytes -= 8;
             data  += 8;
         }
 
         SkASSERT(bytes < 16);
         if (bytes & 8) {
-            hash = __crc32d(hash, unaligned_load<uint64_t>(data));
+            hash = __crc32d(hash, sk_unaligned_load<uint64_t>(data));
             data += 8;
         }
         if (bytes & 4) {
-            hash = __crc32w(hash, unaligned_load<uint32_t>(data));
+            hash = __crc32w(hash, sk_unaligned_load<uint32_t>(data));
             data += 4;
         }
         if (bytes & 2) {
-            hash = __crc32h(hash, unaligned_load<uint16_t>(data));
+            hash = __crc32h(hash, sk_unaligned_load<uint16_t>(data));
             data += 2;
         }
         if (bytes & 1) {
-            hash = __crc32b(hash, unaligned_load<uint8_t>(data));
+            hash = __crc32b(hash, sk_unaligned_load<uint8_t>(data));
         }
         return hash;
     }
@@ -180,7 +174,7 @@
 
         // Handle 4 bytes at a time while possible.
         while (bytes >= 4) {
-            uint32_t k = unaligned_load<uint32_t>(data);
+            uint32_t k = sk_unaligned_load<uint32_t>(data);
             k *= 0xcc9e2d51;
             k = (k << 15) | (k >> 17);
             k *= 0x1b873593;
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 9b24c85..f2a802a 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -9,6 +9,7 @@
 #define SkRasterPipeline_opts_DEFINED
 
 #include "include/core/SkTypes.h"
+#include "src/core/SkUtils.h"  // unaligned_{load,store}
 
 // Every function in this file should be marked static and inline using SI.
 #if defined(__clang__)
@@ -17,23 +18,10 @@
     #define SI static inline
 #endif
 
-
-template <typename T, typename P>
-SI T unaligned_load(const P* p) {  // const void* would work too, but const P* helps ARMv7 codegen.
-    T v;
-    memcpy(&v, p, sizeof(v));
-    return v;
-}
-
-template <typename T, typename P>
-SI void unaligned_store(P* p, T v) {
-    memcpy(p, &v, sizeof(v));
-}
-
 template <typename Dst, typename Src>
 SI Dst bit_cast(const Src& src) {
     static_assert(sizeof(Dst) == sizeof(Src), "");
-    return unaligned_load<Dst>(&src);
+    return sk_unaligned_load<Dst>(&src);
 }
 
 template <typename Dst, typename Src>
@@ -380,7 +368,7 @@
     }
     SI U8 pack(U16 v) {
         auto r = _mm_packus_epi16(v,v);
-        return unaligned_load<U8>(&r);
+        return sk_unaligned_load<U8>(&r);
     }
 
     SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
@@ -712,12 +700,12 @@
         auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
         p = _mm_packs_epi32(p,p);
     #endif
-        return unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
+        return sk_unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
     }
     SI U8 pack(U16 v) {
         auto r = widen_cast<__m128i>(v);
         r = _mm_packus_epi16(r,r);
-        return unaligned_load<U8>(&r);
+        return sk_unaligned_load<U8>(&r);
     }
 
     SI F if_then_else(I32 c, F t, F e) {
@@ -758,8 +746,8 @@
 
         auto R = _mm_shuffle_epi32(rg, 0x88);  // r0 r1 r2 r3 r0 r1 r2 r3
         auto G = _mm_shuffle_epi32(rg, 0xDD);  // g0 g1 g2 g3 g0 g1 g2 g3
-        *r = unaligned_load<U16>(&R);
-        *g = unaligned_load<U16>(&G);
+        *r = sk_unaligned_load<U16>(&R);
+        *g = sk_unaligned_load<U16>(&G);
     }
     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
         U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
@@ -810,9 +798,9 @@
              G = _mm_srli_si128(R, 8),
              B = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 xx xx xx xx
 
-        *r = unaligned_load<U16>(&R);
-        *g = unaligned_load<U16>(&G);
-        *b = unaligned_load<U16>(&B);
+        *r = sk_unaligned_load<U16>(&R);
+        *g = sk_unaligned_load<U16>(&G);
+        *b = sk_unaligned_load<U16>(&B);
     }
 
     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
@@ -834,10 +822,10 @@
         auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
              ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
 
-        *r = unaligned_load<U16>((uint16_t*)&rg + 0);
-        *g = unaligned_load<U16>((uint16_t*)&rg + 4);
-        *b = unaligned_load<U16>((uint16_t*)&ba + 0);
-        *a = unaligned_load<U16>((uint16_t*)&ba + 4);
+        *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
+        *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
+        *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
+        *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
     }
 
     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
@@ -945,7 +933,7 @@
     // when generating code for SSE2 and SSE4.1.  We'll do it manually...
     auto v = widen_cast<__m128i>(x);
     v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8);
-    return unaligned_load<U16>(&v);
+    return sk_unaligned_load<U16>(&v);
 #else
     return (x<<8) | (x>>8);
 #endif
@@ -1157,7 +1145,7 @@
         return v;
     }
 #endif
-    return unaligned_load<V>(src);
+    return sk_unaligned_load<V>(src);
 }
 
 template <typename V, typename T>
@@ -1177,7 +1165,7 @@
         return;
     }
 #endif
-    unaligned_store(dst, v);
+    sk_unaligned_store(dst, v);
 }
 
 SI F from_byte(U8 b) {
@@ -1274,7 +1262,7 @@
     // It's important for speed to explicitly cast(dx) and cast(dy),
     // which has the effect of splatting them to vectors before converting to floats.
     // On Intel this breaks a data dependency on previous loop iterations' registers.
-    r = cast(dx) + unaligned_load<F>(iota);
+    r = cast(dx) + sk_unaligned_load<F>(iota);
     g = cast(dy) + 0.5f;
     b = 1.0f;
     a = 0;
@@ -1284,7 +1272,7 @@
 STAGE(dither, const float* rate) {
     // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
     uint32_t iota[] = {0,1,2,3,4,5,6,7};
-    U32 X = dx + unaligned_load<U32>(iota),
+    U32 X = dx + sk_unaligned_load<U32>(iota),
         Y = dy;
 
     // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
@@ -1339,34 +1327,34 @@
 
 // load registers r,g,b,a from context (mirrors store_rgba)
 STAGE(load_src, const float* ptr) {
-    r = unaligned_load<F>(ptr + 0*N);
-    g = unaligned_load<F>(ptr + 1*N);
-    b = unaligned_load<F>(ptr + 2*N);
-    a = unaligned_load<F>(ptr + 3*N);
+    r = sk_unaligned_load<F>(ptr + 0*N);
+    g = sk_unaligned_load<F>(ptr + 1*N);
+    b = sk_unaligned_load<F>(ptr + 2*N);
+    a = sk_unaligned_load<F>(ptr + 3*N);
 }
 
 // store registers r,g,b,a into context (mirrors load_rgba)
 STAGE(store_src, float* ptr) {
-    unaligned_store(ptr + 0*N, r);
-    unaligned_store(ptr + 1*N, g);
-    unaligned_store(ptr + 2*N, b);
-    unaligned_store(ptr + 3*N, a);
+    sk_unaligned_store(ptr + 0*N, r);
+    sk_unaligned_store(ptr + 1*N, g);
+    sk_unaligned_store(ptr + 2*N, b);
+    sk_unaligned_store(ptr + 3*N, a);
 }
 
 // load registers dr,dg,db,da from context (mirrors store_dst)
 STAGE(load_dst, const float* ptr) {
-    dr = unaligned_load<F>(ptr + 0*N);
-    dg = unaligned_load<F>(ptr + 1*N);
-    db = unaligned_load<F>(ptr + 2*N);
-    da = unaligned_load<F>(ptr + 3*N);
+    dr = sk_unaligned_load<F>(ptr + 0*N);
+    dg = sk_unaligned_load<F>(ptr + 1*N);
+    db = sk_unaligned_load<F>(ptr + 2*N);
+    da = sk_unaligned_load<F>(ptr + 3*N);
 }
 
 // store registers dr,dg,db,da into context (mirrors load_dst)
 STAGE(store_dst, float* ptr) {
-    unaligned_store(ptr + 0*N, dr);
-    unaligned_store(ptr + 1*N, dg);
-    unaligned_store(ptr + 2*N, db);
-    unaligned_store(ptr + 3*N, da);
+    sk_unaligned_store(ptr + 0*N, dr);
+    sk_unaligned_store(ptr + 1*N, dg);
+    sk_unaligned_store(ptr + 2*N, db);
+    sk_unaligned_store(ptr + 3*N, da);
 }
 
 // Most blend modes apply the same logic to each channel.
@@ -1757,7 +1745,7 @@
     a = lerp(da, a, *c);
 }
 STAGE(lerp_native, const float scales[]) {
-    auto c = unaligned_load<F>(scales);
+    auto c = sk_unaligned_load<F>(scales);
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
@@ -2205,20 +2193,20 @@
 
 STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
     auto w = ctx->limit_x;
-    unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
+    sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w)));
 }
 STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
     auto h = ctx->limit_y;
-    unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
+    sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h)));
 }
 STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
     auto w = ctx->limit_x;
     auto h = ctx->limit_y;
-    unaligned_store(ctx->mask,
+    sk_unaligned_store(ctx->mask,
                     cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h)));
 }
 STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
-    auto mask = unaligned_load<U32>(ctx->mask);
+    auto mask = sk_unaligned_load<U32>(ctx->mask);
     r = bit_cast<F>( bit_cast<U32>(r) & mask );
     g = bit_cast<F>( bit_cast<U32>(g) & mask );
     b = bit_cast<F>( bit_cast<U32>(b) & mask );
@@ -2433,18 +2421,18 @@
     F& t = r;
     auto is_degenerate = (t != t); // NaN
     t = if_then_else(is_degenerate, F(0), t);
-    unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
+    sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
 }
 
 STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
     F& t = r;
     auto is_degenerate = (t <= 0) | (t != t);
     t = if_then_else(is_degenerate, F(0), t);
-    unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
+    sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
 }
 
 STAGE(apply_vector_mask, const uint32_t* ctx) {
-    const U32 mask = unaligned_load<U32>(ctx);
+    const U32 mask = sk_unaligned_load<U32>(ctx);
     r = bit_cast<F>(bit_cast<U32>(r) & mask);
     g = bit_cast<F>(bit_cast<U32>(g) & mask);
     b = bit_cast<F>(bit_cast<U32>(b) & mask);
@@ -2459,17 +2447,17 @@
       fy = fract(g + 0.5f);
 
     // Samplers will need to load x and fx, or y and fy.
-    unaligned_store(c->x,  r);
-    unaligned_store(c->y,  g);
-    unaligned_store(c->fx, fx);
-    unaligned_store(c->fy, fy);
+    sk_unaligned_store(c->x,  r);
+    sk_unaligned_store(c->y,  g);
+    sk_unaligned_store(c->fx, fx);
+    sk_unaligned_store(c->fy, fy);
 }
 
 STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
     // Bilinear and bicubic filters are both separable, so we produce independent contributions
     // from x and y, multiplying them together here to get each pixel's total scale factor.
-    auto scale = unaligned_load<F>(c->scalex)
-               * unaligned_load<F>(c->scaley);
+    auto scale = sk_unaligned_load<F>(c->scalex)
+               * sk_unaligned_load<F>(c->scaley);
     dr = mad(scale, r, dr);
     dg = mad(scale, g, dg);
     db = mad(scale, b, db);
@@ -2483,23 +2471,23 @@
 
 template <int kScale>
 SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
-    *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f);
-    F fx = unaligned_load<F>(ctx->fx);
+    *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
+    F fx = sk_unaligned_load<F>(ctx->fx);
 
     F scalex;
     if (kScale == -1) { scalex = 1.0f - fx; }
     if (kScale == +1) { scalex =        fx; }
-    unaligned_store(ctx->scalex, scalex);
+    sk_unaligned_store(ctx->scalex, scalex);
 }
 template <int kScale>
 SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
-    *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f);
-    F fy = unaligned_load<F>(ctx->fy);
+    *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
+    F fy = sk_unaligned_load<F>(ctx->fy);
 
     F scaley;
     if (kScale == -1) { scaley = 1.0f - fy; }
     if (kScale == +1) { scaley =        fy; }
-    unaligned_store(ctx->scaley, scaley);
+    sk_unaligned_store(ctx->scaley, scaley);
 }
 
 STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
@@ -2525,27 +2513,27 @@
 
 template <int kScale>
 SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
-    *x = unaligned_load<F>(ctx->x) + (kScale * 0.5f);
-    F fx = unaligned_load<F>(ctx->fx);
+    *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
+    F fx = sk_unaligned_load<F>(ctx->fx);
 
     F scalex;
     if (kScale == -3) { scalex = bicubic_far (1.0f - fx); }
     if (kScale == -1) { scalex = bicubic_near(1.0f - fx); }
     if (kScale == +1) { scalex = bicubic_near(       fx); }
     if (kScale == +3) { scalex = bicubic_far (       fx); }
-    unaligned_store(ctx->scalex, scalex);
+    sk_unaligned_store(ctx->scalex, scalex);
 }
 template <int kScale>
 SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
-    *y = unaligned_load<F>(ctx->y) + (kScale * 0.5f);
-    F fy = unaligned_load<F>(ctx->fy);
+    *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
+    F fy = sk_unaligned_load<F>(ctx->fy);
 
     F scaley;
     if (kScale == -3) { scaley = bicubic_far (1.0f - fy); }
     if (kScale == -1) { scaley = bicubic_near(1.0f - fy); }
     if (kScale == +1) { scaley = bicubic_near(       fy); }
     if (kScale == +3) { scaley = bicubic_far (       fy); }
-    unaligned_store(ctx->scaley, scaley);
+    sk_unaligned_store(ctx->scaley, scaley);
 }
 
 STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
@@ -2962,7 +2950,7 @@
         0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
         8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
     };
-    x = cast<F>(I32(dx)) + unaligned_load<F>(iota);
+    x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota);
     y = cast<F>(I32(dy)) + 0.5f;
 }
 
@@ -3492,28 +3480,28 @@
 // ~~~~~~ Coverage scales / lerps ~~~~~~ //
 
 STAGE_PP(load_src, const uint16_t* ptr) {
-    r = unaligned_load<U16>(ptr + 0*N);
-    g = unaligned_load<U16>(ptr + 1*N);
-    b = unaligned_load<U16>(ptr + 2*N);
-    a = unaligned_load<U16>(ptr + 3*N);
+    r = sk_unaligned_load<U16>(ptr + 0*N);
+    g = sk_unaligned_load<U16>(ptr + 1*N);
+    b = sk_unaligned_load<U16>(ptr + 2*N);
+    a = sk_unaligned_load<U16>(ptr + 3*N);
 }
 STAGE_PP(store_src, uint16_t* ptr) {
-    unaligned_store(ptr + 0*N, r);
-    unaligned_store(ptr + 1*N, g);
-    unaligned_store(ptr + 2*N, b);
-    unaligned_store(ptr + 3*N, a);
+    sk_unaligned_store(ptr + 0*N, r);
+    sk_unaligned_store(ptr + 1*N, g);
+    sk_unaligned_store(ptr + 2*N, b);
+    sk_unaligned_store(ptr + 3*N, a);
 }
 STAGE_PP(load_dst, const uint16_t* ptr) {
-    dr = unaligned_load<U16>(ptr + 0*N);
-    dg = unaligned_load<U16>(ptr + 1*N);
-    db = unaligned_load<U16>(ptr + 2*N);
-    da = unaligned_load<U16>(ptr + 3*N);
+    dr = sk_unaligned_load<U16>(ptr + 0*N);
+    dg = sk_unaligned_load<U16>(ptr + 1*N);
+    db = sk_unaligned_load<U16>(ptr + 2*N);
+    da = sk_unaligned_load<U16>(ptr + 3*N);
 }
 STAGE_PP(store_dst, uint16_t* ptr) {
-    unaligned_store(ptr + 0*N, dr);
-    unaligned_store(ptr + 1*N, dg);
-    unaligned_store(ptr + 2*N, db);
-    unaligned_store(ptr + 3*N, da);
+    sk_unaligned_store(ptr + 0*N, dr);
+    sk_unaligned_store(ptr + 1*N, dg);
+    sk_unaligned_store(ptr + 2*N, db);
+    sk_unaligned_store(ptr + 3*N, da);
 }
 
 // ~~~~~~ Coverage scales / lerps ~~~~~~ //
@@ -3533,7 +3521,7 @@
     a = lerp(da, a, c);
 }
 STAGE_PP(lerp_native, const uint16_t scales[]) {
-    auto c = unaligned_load<U16>(scales);
+    auto c = sk_unaligned_load<U16>(scales);
     r = lerp(dr, r, c);
     g = lerp(dg, g, c);
     b = lerp(db, b, c);
@@ -3608,19 +3596,19 @@
 
 STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
     auto w = ctx->limit_x;
-    unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
+    sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
 }
 STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
     auto h = ctx->limit_y;
-    unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
+    sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
 }
 STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
     auto w = ctx->limit_x;
     auto h = ctx->limit_y;
-    unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
+    sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
 }
 STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
-    auto mask = unaligned_load<U16>(ctx->mask);
+    auto mask = sk_unaligned_load<U16>(ctx->mask);
     r = r & mask;
     g = g & mask;
     b = b & mask;
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
index 739eed4..8b9fc3e 100644
--- a/src/sksl/SkSLByteCode.cpp
+++ b/src/sksl/SkSLByteCode.cpp
@@ -9,6 +9,7 @@
 
 #include "include/core/SkPoint3.h"
 #include "include/private/SkVx.h"
+#include "src/core/SkUtils.h"   // sk_unaligned_load
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLByteCodeGenerator.h"
 #include "src/sksl/SkSLExternalValue.h"
@@ -24,16 +25,9 @@
 using I32 = skvx::Vec<VecWidth, int32_t>;
 using U32 = skvx::Vec<VecWidth, uint32_t>;
 
-template <typename T>
-static T unaligned_load(const void* ptr) {
-    T val;
-    memcpy(&val, ptr, sizeof(val));
-    return val;
-}
-
 #define READ8() (*(ip++))
-#define READ16() (ip += 2, unaligned_load<uint16_t>(ip - 2))
-#define READ32() (ip += 4, unaligned_load<uint32_t>(ip - 4))
+#define READ16() (ip += 2, sk_unaligned_load<uint16_t>(ip - 2))
+#define READ32() (ip += 4, sk_unaligned_load<uint32_t>(ip - 4))
 
 #define VECTOR_DISASSEMBLE(op, text)                          \
     case ByteCodeInstruction::op: printf(text); break;        \