Switch back to stack-based SkSL interpreter

It's slower, but code size is quite a bit smaller, memory usage is
smaller, and we think that mapping it to SkVM is just as easy.

This effectively reverts all of the following commits:

"Fix gcc9 warning around size of memset."
https://skia-review.googlesource.com/c/skia/+/279861

"Remove unused (and misleading) 'instruction' from SkSLInterpreter.h"
https://skia-review.googlesource.com/c/skia/+/278177

"Interpreter: Fix intrinsics when called with vector types"
https://skia-review.googlesource.com/c/skia/+/272721

"Make it easier to add vector versions of byte code instructions"
https://skia-review.googlesource.com/c/skia/+/272527

"Interpreter: Support returns from runStriped"
https://skia-review.googlesource.com/c/skia/+/268941

"add SkSLInterpreter vector instructions"
https://skia-review.googlesource.com/c/skia/+/266560

"Fix crash when editing particle scripts"
https://skia-review.googlesource.com/c/skia/+/269487

"Revert "Revert "Complete rewrite of the SkSL interpreter"""
https://skia-review.googlesource.com/c/skia/+/266205

Change-Id: I4258596399c4ca94489d4faf8aacfec88afeee13
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/281205
Reviewed-by: Mike Klein <mtklein@google.com>
Reviewed-by: Ethan Nicholas <ethannicholas@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
diff --git a/bench/SkSLInterpreterBench.cpp b/bench/SkSLInterpreterBench.cpp
index 5684955..22afd28 100644
--- a/bench/SkSLInterpreterBench.cpp
+++ b/bench/SkSLInterpreterBench.cpp
@@ -9,7 +9,6 @@
 #include "include/utils/SkRandom.h"
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
-#include "src/sksl/SkSLInterpreter.h"
 
 // Without this build flag, this bench isn't runnable.
 #if defined(SK_ENABLE_SKSL_INTERPRETER)
@@ -23,8 +22,6 @@
         , fCount(pixels) {}
 
 protected:
-    static constexpr int VecWidth = 16;
-
     const char* onGetName() override {
         return fName.c_str();
     }
@@ -38,10 +35,9 @@
         SkSL::Program::Settings settings;
         auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, fSrc, settings);
         SkASSERT(compiler.errorCount() == 0);
-        std::unique_ptr<SkSL::ByteCode> byteCode = compiler.toByteCode(*program);
-        fMain = byteCode->getFunction("main");
-        fInterpreter.reset(new SkSL::Interpreter<VecWidth>(std::move(byteCode)));
+        fByteCode = compiler.toByteCode(*program);
         SkASSERT(compiler.errorCount() == 0);
+        fMain = fByteCode->getFunction("main");
 
         SkRandom rnd;
         fPixels.resize(fCount * 4);
@@ -59,14 +55,14 @@
                 fPixels.data() + 3 * fCount,
             };
 
-            fInterpreter->runStriped(fMain, fCount, (float**) args);
+            SkAssertResult(fByteCode->runStriped(fMain, fCount, args, 4, nullptr, 0, nullptr, 0));
         }
     }
 
 private:
     SkString fName;
     SkSL::String fSrc;
-    std::unique_ptr<SkSL::Interpreter<VecWidth>> fInterpreter;
+    std::unique_ptr<SkSL::ByteCode> fByteCode;
     const SkSL::ByteCodeFunction* fMain;
 
     int fCount;
diff --git a/gn/sksl.gni b/gn/sksl.gni
index d6c1891..e816e62 100644
--- a/gn/sksl.gni
+++ b/gn/sksl.gni
@@ -8,6 +8,7 @@
 
 skia_sksl_sources = [
   "$_src/sksl/SkSLASTNode.cpp",
+  "$_src/sksl/SkSLByteCode.cpp",
   "$_src/sksl/SkSLByteCodeGenerator.cpp",
   "$_src/sksl/SkSLCFGGenerator.cpp",
   "$_src/sksl/SkSLCompiler.cpp",
diff --git a/modules/particles/include/SkParticleEffect.h b/modules/particles/include/SkParticleEffect.h
index b28eac8..b19ce2f 100644
--- a/modules/particles/include/SkParticleEffect.h
+++ b/modules/particles/include/SkParticleEffect.h
@@ -16,7 +16,6 @@
 #include "include/private/SkTemplates.h"
 #include "include/utils/SkRandom.h"
 #include "modules/particles/include/SkParticleData.h"
-#include "src/sksl/SkSLInterpreter.h"
 
 #include <memory>
 
@@ -26,8 +25,6 @@
 class SkParticleDrawable;
 class SkParticleExternalValue;
 
-static constexpr int INTERPRETER_WIDTH = 8;
-
 namespace skresources {
     class ResourceProvider;
 }
@@ -125,16 +122,13 @@
     friend class SkParticleEffect;
 
     // Cached
-    template<int width>
     struct Program {
-        std::unique_ptr<SkSL::Interpreter<width>> fInterpreter;
+        std::unique_ptr<SkSL::ByteCode> fByteCode;
         SkTArray<std::unique_ptr<SkParticleExternalValue>> fExternalValues;
     };
 
-    // for performance it would be better to run this with a Program<1>, but for code-size reasons
-    // we stick to INTERPRETER_WIDTH
-    Program<INTERPRETER_WIDTH> fEffectProgram;
-    Program<INTERPRETER_WIDTH> fParticleProgram;
+    Program fEffectProgram;
+    Program fParticleProgram;
 };
 
 class SkParticleEffect : public SkRefCnt {
@@ -189,17 +183,8 @@
     void setFrame   (float     f) { fState.fFrame    = f; }
     void setFlags   (uint32_t  f) { fState.fFlags    = f; }
 
-    const SkSL::ByteCode* effectCode() const {
-        return fParams->fEffectProgram.fInterpreter ?
-               &fParams->fEffectProgram.fInterpreter->getCode() :
-               nullptr;
-    }
-
-    const SkSL::ByteCode* particleCode() const {
-        return fParams->fParticleProgram.fInterpreter ?
-               &fParams->fParticleProgram.fInterpreter->getCode() :
-               nullptr;
-    }
+    const SkSL::ByteCode* effectCode() const { return fParams->fEffectProgram.fByteCode.get(); }
+    const SkSL::ByteCode* particleCode() const { return fParams->fParticleProgram.fByteCode.get(); }
 
     float* effectUniforms() { return fEffectUniforms.data(); }
     float* particleUniforms() { return fParticleUniforms.data(); }
diff --git a/modules/particles/src/SkParticleEffect.cpp b/modules/particles/src/SkParticleEffect.cpp
index 133222d..baf465c 100644
--- a/modules/particles/src/SkParticleEffect.cpp
+++ b/modules/particles/src/SkParticleEffect.cpp
@@ -119,9 +119,7 @@
         fDrawable->prepare(resourceProvider);
     }
 
-    auto buildProgram = [this](const SkSL::String& code) ->
-                                     std::pair<std::unique_ptr<SkSL::ByteCode>,
-                                               SkTArray<std::unique_ptr<SkParticleExternalValue>>> {
+    auto buildProgram = [this](const SkSL::String& code, Program* p) {
         SkSL::Compiler compiler;
         SkSL::Program::Settings settings;
 
@@ -142,15 +140,17 @@
         auto program = compiler.convertProgram(SkSL::Program::kGeneric_Kind, code, settings);
         if (!program) {
             SkDebugf("%s\n", compiler.errorText().c_str());
-            return std::make_pair(nullptr, std::move(externalValues));
+            return;
         }
 
         auto byteCode = compiler.toByteCode(*program);
         if (!byteCode) {
             SkDebugf("%s\n", compiler.errorText().c_str());
-            return std::make_pair(nullptr, std::move(externalValues));
+            return;
         }
-        return std::make_pair(std::move(byteCode), std::move(externalValues));
+
+        p->fByteCode = std::move(byteCode);
+        p->fExternalValues.swap(externalValues);
     };
 
     SkSL::String effectCode(kCommonHeader);
@@ -160,19 +160,8 @@
     particleCode.append(kParticleHeader);
     particleCode.append(fParticleCode.c_str());
 
-    auto effectProgram = buildProgram(effectCode);
-    if (effectProgram.first) {
-        fEffectProgram.fInterpreter.reset(
-                new SkSL::Interpreter<INTERPRETER_WIDTH>(std::move(effectProgram.first)));
-        fEffectProgram.fExternalValues.swap(effectProgram.second);
-    }
-
-    auto particleProgram = buildProgram(particleCode);
-    if (particleProgram.first) {
-        fParticleProgram.fInterpreter.reset(
-                new SkSL::Interpreter<INTERPRETER_WIDTH>(std::move(particleProgram.first)));
-        fParticleProgram.fExternalValues.swap(particleProgram.second);
-    }
+    buildProgram(effectCode, &fEffectProgram);
+    buildProgram(particleCode, &fParticleProgram);
 }
 
 SkParticleEffect::SkParticleEffect(sk_sp<SkParticleEffectParams> params, const SkRandom& random)
@@ -233,22 +222,15 @@
 }
 
 void SkParticleEffect::runEffectScript(double now, const char* entry) {
-    SkSL::Interpreter<INTERPRETER_WIDTH>* interpreter = fParams->fEffectProgram.fInterpreter.get();
-    if (interpreter) {
-        const auto& byteCode = interpreter->getCode();
-        if (auto fun = byteCode.getFunction(entry)) {
+    if (const auto& byteCode = fParams->fEffectProgram.fByteCode) {
+        if (auto fun = byteCode->getFunction(entry)) {
             for (const auto& value : fParams->fEffectProgram.fExternalValues) {
                 value->setRandom(&fRandom);
                 value->setEffect(this);
             }
-            interpreter->setUniforms(fEffectUniforms.data());
-            static constexpr int numChannels = sizeof(EffectState) / sizeof(float);
-            SkASSERT(numChannels == fun->getParameterSlotCount());
-            float* args[numChannels];
-            for (int i = 0; i < numChannels; ++i) {
-                args[i] = &fState.fAge + i;
-            }
-            SkAssertResult(interpreter->runStriped(fun, 1, args));
+            SkAssertResult(byteCode->run(fun, &fState.fAge, sizeof(EffectState) / sizeof(float),
+                                         nullptr, 0,
+                                         fEffectUniforms.data(), fEffectUniforms.count()));
             this->processEffectSpawnRequests(now);
         }
     }
@@ -281,11 +263,8 @@
 }
 
 void SkParticleEffect::runParticleScript(double now, const char* entry, int start, int count) {
-    SkSL::Interpreter<INTERPRETER_WIDTH>* interpreter =
-                                                       fParams->fParticleProgram.fInterpreter.get();
-    if (interpreter) {
-        const auto& byteCode = interpreter->getCode();
-        if (auto fun = byteCode.getFunction(entry)) {
+    if (const auto& byteCode = fParams->fParticleProgram.fByteCode) {
+        if (auto fun = byteCode->getFunction(entry)) {
             float* args[SkParticles::kNumChannels];
             for (int i = 0; i < SkParticles::kNumChannels; ++i) {
                 args[i] = fParticles.fData[i].get() + start;
@@ -296,8 +275,10 @@
                 value->setEffect(this);
             }
             memcpy(&fParticleUniforms[1], &fState.fAge, sizeof(EffectState));
-            interpreter->setUniforms(fParticleUniforms.data());
-            SkAssertResult(interpreter->runStriped(fun, count, (float**) args));
+            SkAssertResult(byteCode->runStriped(fun, count, args, SkParticles::kNumChannels,
+                                                nullptr, 0,
+                                                fParticleUniforms.data(),
+                                                fParticleUniforms.count()));
             this->processParticleSpawnRequests(now, start);
         }
     }
diff --git a/src/core/SkRasterPipeline.h b/src/core/SkRasterPipeline.h
index 152cac4..cf6ac47 100644
--- a/src/core/SkRasterPipeline.h
+++ b/src/core/SkRasterPipeline.h
@@ -161,15 +161,12 @@
 };
 
 namespace SkSL {
+class ByteCode;
 class ByteCodeFunction;
-
-template<int width>
-class Interpreter;
 }
 
 struct SkRasterPipeline_InterpreterCtx {
-    static constexpr int VECTOR_WIDTH = 8;
-    SkSL::Interpreter<VECTOR_WIDTH>* interpreter;
+    const SkSL::ByteCode*         byteCode;
     const SkSL::ByteCodeFunction* fn;
 
     SkColor4f   paintColor;
diff --git a/src/core/SkRuntimeEffect.cpp b/src/core/SkRuntimeEffect.cpp
index be9edb0..518ff74 100644
--- a/src/core/SkRuntimeEffect.cpp
+++ b/src/core/SkRuntimeEffect.cpp
@@ -16,7 +16,6 @@
 #include "src/core/SkWriteBuffer.h"
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
-#include "src/sksl/SkSLInterpreter.h"
 #include "src/sksl/ir/SkSLFunctionDefinition.h"
 #include "src/sksl/ir/SkSLVarDeclarations.h"
 
@@ -357,8 +356,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static constexpr int kVectorWidth = SkRasterPipeline_InterpreterCtx::VECTOR_WIDTH;
-
 class SkRuntimeColorFilter : public SkColorFilter {
 public:
     SkRuntimeColorFilter(sk_sp<SkRuntimeEffect> effect, sk_sp<SkData> inputs,
@@ -390,18 +387,17 @@
         ctx->ninputs = fEffect->uniformSize() / 4;
         ctx->shaderConvention = false;
 
-        SkAutoMutexExclusive ama(fInterpreterMutex);
-        if (!fInterpreter) {
+        SkAutoMutexExclusive ama(fByteCodeMutex);
+        if (!fByteCode) {
             auto [byteCode, errorText] = fEffect->toByteCode(fInputs->data());
             if (!byteCode) {
                 SkDebugf("%s\n", errorText.c_str());
                 return false;
             }
-            fMain = byteCode->getFunction("main");
-            fInterpreter.reset(new SkSL::Interpreter<kVectorWidth>(std::move(byteCode)));
+            fByteCode = std::move(byteCode);
         }
-        ctx->fn = fMain;
-        ctx->interpreter = fInterpreter.get();
+        ctx->byteCode = fByteCode.get();
+        ctx->fn = ctx->byteCode->getFunction("main");
         rec.fPipeline->append(SkRasterPipeline::interpreter, ctx);
         return true;
     }
@@ -431,9 +427,8 @@
     sk_sp<SkData> fInputs;
     std::vector<sk_sp<SkColorFilter>> fChildren;
 
-    mutable SkMutex fInterpreterMutex;
-    mutable std::unique_ptr<SkSL::Interpreter<kVectorWidth>> fInterpreter;
-    mutable const SkSL::ByteCodeFunction* fMain;
+    mutable SkMutex fByteCodeMutex;
+    mutable std::unique_ptr<SkSL::ByteCode> fByteCode;
 };
 
 sk_sp<SkFlattenable> SkRuntimeColorFilter::CreateProc(SkReadBuffer& buffer) {
@@ -511,18 +506,17 @@
         ctx->ninputs = fEffect->uniformSize() / 4;
         ctx->shaderConvention = true;
 
-        SkAutoMutexExclusive ama(fInterpreterMutex);
-        if (!fInterpreter) {
+        SkAutoMutexExclusive ama(fByteCodeMutex);
+        if (!fByteCode) {
             auto[byteCode, errorText] = fEffect->toByteCode(fInputs->data());
             if (!byteCode) {
                 SkDebugf("%s\n", errorText.c_str());
                 return false;
             }
-            fMain = byteCode->getFunction("main");
-            fInterpreter.reset(new SkSL::Interpreter<kVectorWidth>(std::move(byteCode)));
+            fByteCode = std::move(byteCode);
         }
-        ctx->fn = fMain;
-        ctx->interpreter = fInterpreter.get();
+        ctx->byteCode = fByteCode.get();
+        ctx->fn = ctx->byteCode->getFunction("main");
 
         rec.fPipeline->append(SkRasterPipeline::seed_shader);
         rec.fPipeline->append_matrix(rec.fAlloc, inverse);
@@ -571,9 +565,8 @@
     sk_sp<SkData> fInputs;
     std::vector<sk_sp<SkShader>> fChildren;
 
-    mutable SkMutex fInterpreterMutex;
-    mutable std::unique_ptr<SkSL::Interpreter<kVectorWidth>> fInterpreter;
-    mutable const SkSL::ByteCodeFunction* fMain;
+    mutable SkMutex fByteCodeMutex;
+    mutable std::unique_ptr<SkSL::ByteCode> fByteCode;
 };
 
 sk_sp<SkFlattenable> SkRTShader::CreateProc(SkReadBuffer& buffer) {
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 9938667..32f9e8f 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -10,7 +10,7 @@
 
 #include "include/core/SkTypes.h"
 #include "src/core/SkUtils.h"  // unaligned_{load,store}
-#include "src/sksl/SkSLInterpreter.h"
+#include "src/sksl/SkSLByteCode.h"
 
 // Every function in this file should be marked static and inline using SI.
 #if defined(__clang__)
@@ -2721,6 +2721,7 @@
 
     float*  args[]  = { xx, yy, rr, gg, bb, aa };
     float** in_args = args;
+    int     in_count = 6;
 
     if (c->shaderConvention) {
         // our caller must have called seed_shader to set these
@@ -2732,14 +2733,15 @@
         sk_unaligned_store(aa, F(c->paintColor.fA));
     } else {
         in_args += 2;   // skip x,y
+        in_count = 4;
         sk_unaligned_store(rr, r);
         sk_unaligned_store(gg, g);
         sk_unaligned_store(bb, b);
         sk_unaligned_store(aa, a);
     }
 
-    c->interpreter->setUniforms((float*) c->inputs);
-    SkAssertResult(c->interpreter->runStriped(c->fn, tail ? tail : N, (float**) in_args));
+    SkAssertResult(c->byteCode->runStriped(c->fn, tail ? tail : N, in_args, in_count,
+                                           nullptr, 0, (const float*)c->inputs, c->ninputs));
 
     r = sk_unaligned_load<F>(rr);
     g = sk_unaligned_load<F>(gg);
diff --git a/src/sksl/SkSLByteCode.cpp b/src/sksl/SkSLByteCode.cpp
new file mode 100644
index 0000000..a9c3480
--- /dev/null
+++ b/src/sksl/SkSLByteCode.cpp
@@ -0,0 +1,1760 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SKSL_STANDALONE
+
+#include "include/core/SkPoint3.h"
+#include "include/private/SkVx.h"
+#include "src/core/SkUtils.h"   // sk_unaligned_load
+#include "src/sksl/SkSLByteCode.h"
+#include "src/sksl/SkSLByteCodeGenerator.h"
+#include "src/sksl/SkSLExternalValue.h"
+
+#include <vector>
+
+namespace SkSL {
+
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+
+constexpr int VecWidth = ByteCode::kVecWidth;
+
+struct Interpreter {
+
+using F32 = skvx::Vec<VecWidth, float>;
+using I32 = skvx::Vec<VecWidth, int32_t>;
+using U32 = skvx::Vec<VecWidth, uint32_t>;
+
+#define READ8() (*(ip++))  // yields the byte at ip, then advances ip by one
+#define READ16() (ip += 2, sk_unaligned_load<uint16_t>(ip - 2))  // skip 2 bytes, load them unaligned
+#define READ32() (ip += 4, sk_unaligned_load<uint32_t>(ip - 4))  // skip 4 bytes, load them unaligned
+#define READ_INST() (ip += sizeof(instruction), \
+                     sk_unaligned_load<instruction>(ip - sizeof(instruction)))  // same, opcode-sized
+
+#define VECTOR_DISASSEMBLE(op, text) /* cases op, op2, op3, op4; each skips one operand byte */ \
+    case ByteCodeInstruction::op: printf(text); ++ip; break;        \
+    case ByteCodeInstruction::op##2: printf(text "2"); ++ip; break; \
+    case ByteCodeInstruction::op##3: printf(text "3"); ++ip; break; \
+    case ByteCodeInstruction::op##4: printf(text "4"); ++ip; break;
+
+#define VECTOR_DISASSEMBLE_NO_COUNT(op, text) /* same four cases, but ip is not advanced */ \
+    case ByteCodeInstruction::op: printf(text); break;        \
+    case ByteCodeInstruction::op##2: printf(text "2"); break; \
+    case ByteCodeInstruction::op##3: printf(text "3"); break; \
+    case ByteCodeInstruction::op##4: printf(text "4"); break;
+
+#define VECTOR_MATRIX_DISASSEMBLE(op, text) /* adds the opN case, which prints its operand byte */ \
+    VECTOR_DISASSEMBLE(op, text)            \
+    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
+
+#define VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(op, text) /* NO_COUNT cases plus the opN case */ \
+    VECTOR_DISASSEMBLE_NO_COUNT(op, text)            \
+    case ByteCodeInstruction::op##N: printf(text "N %d", READ8()); break;
+
+static const uint8_t* DisassembleInstruction(const uint8_t* ip) {  // printf's the instruction at ip; returns ip just past it
+    switch ((ByteCodeInstruction) (intptr_t) READ_INST()) {  // fetching the opcode advances ip by sizeof(instruction)
+        VECTOR_MATRIX_DISASSEMBLE(kAddF, "addf")  // expands to the kAddF, kAddF2..4 and kAddFN cases
+        VECTOR_DISASSEMBLE(kAddI, "addi")
+        case ByteCodeInstruction::kAndB: printf("andb"); break;
+        case ByteCodeInstruction::kBranch: printf("branch %d", READ16()); break;
+        case ByteCodeInstruction::kCall: printf("call %d", READ8()); break;
+        case ByteCodeInstruction::kCallExternal: {
+            int argumentCount = READ8();
+            int returnCount = READ8();
+            int externalValue = READ8();
+            printf("callexternal %d, %d, %d", argumentCount, returnCount, externalValue);
+            break;
+        }
+        case ByteCodeInstruction::kClampIndex: printf("clampindex %d", READ8()); break;
+        VECTOR_DISASSEMBLE(kCompareIEQ, "compareieq")
+        VECTOR_DISASSEMBLE(kCompareINEQ, "compareineq")
+        VECTOR_MATRIX_DISASSEMBLE(kCompareFEQ, "comparefeq")
+        VECTOR_MATRIX_DISASSEMBLE(kCompareFNEQ, "comparefneq")
+        VECTOR_DISASSEMBLE(kCompareFGT, "comparefgt")
+        VECTOR_DISASSEMBLE(kCompareFGTEQ, "comparefgteq")
+        VECTOR_DISASSEMBLE(kCompareFLT, "compareflt")
+        VECTOR_DISASSEMBLE(kCompareFLTEQ, "compareflteq")
+        VECTOR_DISASSEMBLE(kCompareSGT, "comparesgt")
+        VECTOR_DISASSEMBLE(kCompareSGTEQ, "comparesgteq")
+        VECTOR_DISASSEMBLE(kCompareSLT, "compareslt")
+        VECTOR_DISASSEMBLE(kCompareSLTEQ, "compareslteq")
+        VECTOR_DISASSEMBLE(kCompareUGT, "compareugt")
+        VECTOR_DISASSEMBLE(kCompareUGTEQ, "compareugteq")
+        VECTOR_DISASSEMBLE(kCompareULT, "compareult")
+        VECTOR_DISASSEMBLE(kCompareULTEQ, "compareulteq")
+        VECTOR_DISASSEMBLE_NO_COUNT(kConvertFtoI, "convertftoi")
+        VECTOR_DISASSEMBLE_NO_COUNT(kConvertStoF, "convertstof")
+        VECTOR_DISASSEMBLE_NO_COUNT(kConvertUtoF, "convertutof")
+        VECTOR_DISASSEMBLE(kCos, "cos")
+        VECTOR_MATRIX_DISASSEMBLE(kDivideF, "dividef")
+        VECTOR_DISASSEMBLE(kDivideS, "divides")  // lowercase, matching every other mnemonic
+        VECTOR_DISASSEMBLE(kDivideU, "divideu")
+        VECTOR_MATRIX_DISASSEMBLE(kDup, "dup")
+        case ByteCodeInstruction::kInverse2x2: printf("inverse2x2"); break;
+        case ByteCodeInstruction::kInverse3x3: printf("inverse3x3"); break;
+        case ByteCodeInstruction::kInverse4x4: printf("inverse4x4"); break;
+        case ByteCodeInstruction::kLoad: printf("load %d", READ16() >> 8); break;  // 16-bit operand; only its upper 8 bits are printed
+        case ByteCodeInstruction::kLoad2: printf("load2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoad3: printf("load3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoad4: printf("load4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal: printf("loadglobal %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal2: printf("loadglobal2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal3: printf("loadglobal3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadGlobal4: printf("loadglobal4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform: printf("loaduniform %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform2: printf("loaduniform2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform3: printf("loaduniform3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadUniform4: printf("loaduniform4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kLoadSwizzle: {
+            int target = READ8();
+            int count = READ8();
+            printf("loadswizzle %d %d", target, count);
+            for (int i = 0; i < count; ++i) {  // 'count' component bytes follow the header
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kLoadSwizzleGlobal: {
+            int target = READ8();
+            int count = READ8();
+            printf("loadswizzleglobal %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kLoadSwizzleUniform: {
+            int target = READ8();
+            int count = READ8();
+            printf("loadswizzleuniform %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kLoadExtended: printf("loadextended %d", READ8()); break;
+        case ByteCodeInstruction::kLoadExtendedGlobal: printf("loadextendedglobal %d", READ8());
+            break;
+        case ByteCodeInstruction::kLoadExtendedUniform: printf("loadextendeduniform %d", READ8());
+            break;
+        case ByteCodeInstruction::kMatrixToMatrix: {
+            int srcCols = READ8();
+            int srcRows = READ8();
+            int dstCols = READ8();
+            int dstRows = READ8();
+            printf("matrixtomatrix %dx%d %dx%d", srcCols, srcRows, dstCols, dstRows);
+            break;
+        }
+        case ByteCodeInstruction::kMatrixMultiply: {
+            int lCols = READ8();
+            int lRows = READ8();
+            int rCols = READ8();
+            printf("matrixmultiply %dx%d %dx%d", lCols, lRows, rCols, lCols);  // second matrix is printed as rCols x lCols
+            break;
+        }
+        VECTOR_MATRIX_DISASSEMBLE(kMultiplyF, "multiplyf")
+        VECTOR_DISASSEMBLE(kMultiplyI, "multiplyi")
+        VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(kNegateF, "negatef")
+        VECTOR_DISASSEMBLE_NO_COUNT(kNegateI, "negatei")
+        case ByteCodeInstruction::kNotB: printf("notb"); break;
+        case ByteCodeInstruction::kOrB: printf("orb"); break;
+        VECTOR_MATRIX_DISASSEMBLE_NO_COUNT(kPop, "pop")
+        case ByteCodeInstruction::kPushImmediate: {
+            uint32_t v = READ32();
+            union { uint32_t u; float f; } pun = { v };  // same 32 bits shown as integer and as float
+            printf("pushimmediate %s", (to_string(v) + "(" + to_string(pun.f) + ")").c_str());
+            break;
+        }
+        case ByteCodeInstruction::kReadExternal: printf("readexternal %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kReadExternal2: printf("readexternal2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kReadExternal3: printf("readexternal3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kReadExternal4: printf("readexternal4 %d", READ16() >> 8); break;
+        VECTOR_DISASSEMBLE(kRemainderF, "remainderf")
+        VECTOR_DISASSEMBLE(kRemainderS, "remainders")
+        VECTOR_DISASSEMBLE(kRemainderU, "remainderu")
+        case ByteCodeInstruction::kReserve: printf("reserve %d", READ8()); break;
+        case ByteCodeInstruction::kReturn: printf("return %d", READ8()); break;
+        case ByteCodeInstruction::kScalarToMatrix: {
+            int cols = READ8();
+            int rows = READ8();
+            printf("scalartomatrix %dx%d", cols, rows);
+            break;
+        }
+        case ByteCodeInstruction::kShiftLeft: printf("shl %d", READ8()); break;
+        case ByteCodeInstruction::kShiftRightS: printf("shrs %d", READ8()); break;
+        case ByteCodeInstruction::kShiftRightU: printf("shru %d", READ8()); break;
+        VECTOR_DISASSEMBLE(kSin, "sin")
+        VECTOR_DISASSEMBLE_NO_COUNT(kSqrt, "sqrt")
+        case ByteCodeInstruction::kStore: printf("store %d", READ8()); break;
+        case ByteCodeInstruction::kStore2: printf("store2 %d", READ8()); break;
+        case ByteCodeInstruction::kStore3: printf("store3 %d", READ8()); break;
+        case ByteCodeInstruction::kStore4: printf("store4 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal: printf("storeglobal %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal2: printf("storeglobal2 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal3: printf("storeglobal3 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreGlobal4: printf("storeglobal4 %d", READ8()); break;
+        case ByteCodeInstruction::kStoreSwizzle: {
+            int target = READ8();
+            int count = READ8();
+            printf("storeswizzle %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreSwizzleGlobal: {
+            int target = READ8();
+            int count = READ8();
+            printf("storeswizzleglobal %d %d", target, count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreSwizzleIndirect: {
+            int count = READ8();
+            printf("storeswizzleindirect %d", count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
+            int count = READ8();
+            printf("storeswizzleindirectglobal %d", count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        case ByteCodeInstruction::kStoreExtended: printf("storeextended %d", READ8()); break;
+        case ByteCodeInstruction::kStoreExtendedGlobal: printf("storeextendedglobal %d", READ8());
+            break;
+        VECTOR_MATRIX_DISASSEMBLE(kSubtractF, "subtractf")
+        VECTOR_DISASSEMBLE(kSubtractI, "subtracti")
+        case ByteCodeInstruction::kSwizzle: {
+            printf("swizzle %d, ", READ8());
+            int count = READ8();
+            printf("%d", count);
+            for (int i = 0; i < count; ++i) {
+                printf(", %d", READ8());
+            }
+            break;
+        }
+        VECTOR_DISASSEMBLE(kTan, "tan")
+        case ByteCodeInstruction::kWriteExternal: printf("writeexternal %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kWriteExternal2: printf("writeexternal2 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kWriteExternal3: printf("writeexternal3 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kWriteExternal4: printf("writeexternal4 %d", READ16() >> 8); break;
+        case ByteCodeInstruction::kXorB: printf("xorb"); break;
+        case ByteCodeInstruction::kMaskPush: printf("maskpush"); break;
+        case ByteCodeInstruction::kMaskPop: printf("maskpop"); break;
+        case ByteCodeInstruction::kMaskNegate: printf("masknegate"); break;
+        case ByteCodeInstruction::kMaskBlend: printf("maskblend %d", READ8()); break;
+        case ByteCodeInstruction::kBranchIfAllFalse:
+            printf("branchifallfalse %d", READ16());
+            break;
+        case ByteCodeInstruction::kLoopBegin: printf("loopbegin"); break;
+        case ByteCodeInstruction::kLoopNext: printf("loopnext"); break;
+        case ByteCodeInstruction::kLoopMask: printf("loopmask"); break;
+        case ByteCodeInstruction::kLoopEnd: printf("loopend"); break;
+        case ByteCodeInstruction::kLoopContinue: printf("loopcontinue"); break;
+        case ByteCodeInstruction::kLoopBreak: printf("loopbreak"); break;
+        default:
+            ip -= sizeof(instruction);  // rewind so the READ_INST() below re-reads the unrecognized opcode
+            printf("unknown(%d)\n", (int) (intptr_t) READ_INST());
+            SkASSERT(false);
+    }
+    return ip;  // now points at the next instruction
+}
+
+#ifdef SKSLC_THREADED_CODE  // selects how opcode handlers are entered: goto labels vs. switch cases
+    #define LABEL(name) name:  // each handler is a plain goto label
+    #ifdef TRACE
+        #define NEXT() goto next  // go through a shared 'next' label (defined outside this view)
+    #else
+        #define NEXT() goto *READ_INST()  // jump directly to the address read from the instruction stream
+    #endif
+#else
+    #define LABEL(name) case ByteCodeInstruction::name:  // each handler is a switch case
+    #define NEXT() continue  // NOTE(review): presumably restarts an enclosing dispatch loop; loop not visible here
+#endif
+
+#define VECTOR_BINARY_OP(base, field, op) /* handlers base4, base3, base2, base */ \
+    LABEL(base ## 4)                                  \
+        sp[-4] = sp[-4].field op sp[0].field; /* 4-wide entry uses the fixed offset 4 */ \
+        POP();                                        \
+        /* fall through: each wider form does one step, then continues as the next narrower one */ \
+    LABEL(base ## 3) {                                \
+        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; /* offset comes from the operand byte */ \
+        POP();                                        \
+    }   /* fall through */                            \
+    LABEL(base ## 2) {                                \
+        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
+        POP();                                        \
+    }   /* fall through */                            \
+    LABEL(base) {                                     \
+        sp[-ip[0]] = sp[-ip[0]].field op sp[0].field; \
+        POP();                                        \
+        ++ip; /* the operand byte ip[0] is skipped only here, in the last step */ \
+        NEXT();                                       \
+    }
+
+// A naive implementation of / or % using skvx operations will likely crash with a divide by zero
+// in inactive vector lanes, so we need to be sure to avoid masked-off lanes.
+#define VECTOR_BINARY_MASKED_OP(base, field, op) /* handlers base4, base3, base2, base */ \
+    LABEL(base ## 4)                                        \
+        for (int i = 0; i < VecWidth; ++i) { /* one lane at a time */ \
+            if (mask()[i]) { /* lanes whose mask entry is zero are left untouched */ \
+                sp[-4].field[i] op ## = sp[0].field[i]; /* op ## = pastes the compound assignment, e.g. /= */ \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+        /* fall through: each wider form does one step, then continues as the next narrower one */ \
+    LABEL(base ## 3) {                                      \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+    }   /* fall through */                                  \
+    LABEL(base ## 2) {                                      \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+    }   /* fall through */                                  \
+    LABEL(base) {                                           \
+        for (int i = 0; i < VecWidth; ++i) {                \
+            if (mask()[i]) {                                \
+                sp[-ip[0]].field[i] op ## = sp[0].field[i]; \
+            }                                               \
+        }                                                   \
+        POP();                                              \
+        ++ip; /* the operand byte ip[0] is skipped only here, in the last step */ \
+        NEXT();                                             \
+    }
+
+
+// VECTOR_BINARY_OP plus an extra baseN handler. baseN reads a count with READ8() and then
+// performs that many combine-and-pop steps, each one pairing the slot 'count' below the current
+// top of the stack with the top slot.
+#define VECTOR_MATRIX_BINARY_OP(base, field, op)          \
+    VECTOR_BINARY_OP(base, field, op)                     \
+    LABEL(base ## N) {                                    \
+        int count = READ8();                              \
+        for (int i = count; i > 0; --i) {                 \
+            sp[-count] = sp[-count].field op sp[0].field; \
+            POP();                                        \
+        }                                                 \
+        NEXT();                                           \
+    }
+
+// Function-call counterpart of VECTOR_BINARY_OP: the handlers have the same fall-through layout
+// and use of ip[0], but each step stores fn(lower.field, top.field) instead of applying an
+// infix operator.
+#define VECTOR_BINARY_FN(base, field, fn)               \
+    LABEL(base ## 4)                                    \
+        sp[-4] = fn(sp[-4].field, sp[0].field);         \
+        POP();                                          \
+        /* fall through */                              \
+    LABEL(base ## 3) {                                  \
+        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
+        POP();                                          \
+    }   /* fall through */                              \
+    LABEL(base ## 2) {                                  \
+        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
+        POP();                                          \
+    }   /* fall through */                              \
+    LABEL(base) {                                       \
+        sp[-ip[0]] = fn(sp[-ip[0]].field, sp[0].field); \
+        POP();                                          \
+        ++ip;                                           \
+        NEXT();                                         \
+    }
+
+// Emits base4, base3, base2 and base handlers for an in-place unary function. Entering at baseK
+// replaces each of the top K stack slots with fn(slot.field); the wider handlers simply fall
+// through into the narrower ones. The stack depth is unchanged.
+#define VECTOR_UNARY_FN(base, fn, field)         \
+    LABEL(base ## 4)  sp[-3] = fn(sp[-3].field); \
+    LABEL(base ## 3)  sp[-2] = fn(sp[-2].field); \
+    LABEL(base ## 2)  sp[-1] = fn(sp[-1].field); \
+    LABEL(base)       sp[ 0] = fn(sp[ 0].field); \
+                      NEXT();
+
+// Emits a single shared handler for base4, base3, base2 and base that applies the scalar
+// function 'fn' to every lane of the top 'count' stack slots, where 'count' is read with
+// READ8(). The slots are walked as one flat float array of VecWidth * count elements.
+// The start address must be computed in VValue units (sp - count + 1) *before* casting to
+// float*: casting first would step back by single floats instead of whole slots, so for
+// count > 1 the walk would begin inside the wrong slot and run past the top of the stack.
+#define VECTOR_UNARY_FN_VEC(base, fn)                     \
+    LABEL(base ## 4)                                      \
+    LABEL(base ## 3)                                      \
+    LABEL(base ## 2)                                      \
+    LABEL(base) {                                         \
+        int count = READ8();                              \
+        float* v = (float*)(sp - count + 1);              \
+        for (int i = VecWidth * count; i > 0; --i, ++v) { \
+            *v = fn(*v);                                  \
+        }                                                 \
+        NEXT();                                           \
+    }
+
+// Expands to the four label addresses of a vector opcode family, in the order base4, base3,
+// base2, base. Uses the '&&label' labels-as-values syntax, so it is only usable in the
+// threaded-code build.
+#define VECTOR_LABELS(base) \
+    &&base ## 4,            \
+    &&base ## 3,            \
+    &&base ## 2,            \
+    &&base
+
+// VECTOR_LABELS followed by the address of the baseN label.
+#define VECTOR_MATRIX_LABELS(base) \
+    VECTOR_LABELS(base),           \
+    &&base ## N
+
+// If you trip this assert, it means that the order of the opcodes listed in ByteCodeInstruction
+// does not match the order of the opcodes listed in the 'labels' array in InnerRun().
+#define CHECK_LABEL(name) \
+    SkASSERT(labels[(int) ByteCodeInstruction::name] == &&name)
+
+// Checks the four labels of a vector opcode family (name4, name3, name2, name).
+#define CHECK_VECTOR_LABELS(name) \
+    CHECK_LABEL(name ## 4);       \
+    CHECK_LABEL(name ## 3);       \
+    CHECK_LABEL(name ## 2);       \
+    CHECK_LABEL(name)
+
+// Checks the four vector labels plus the nameN label.
+#define CHECK_VECTOR_MATRIX_LABELS(name) \
+    CHECK_VECTOR_LABELS(name);           \
+    CHECK_LABEL(name ## N)
+
+// One interpreter stack slot. The same storage can be viewed as float, signed or unsigned
+// lanes; the code in this file indexes each view per lane up to VecWidth. The converting
+// constructors are intentionally implicit so handlers can assign an F32/I32/U32 expression
+// straight into a slot.
+union VValue {
+    // Leaves the storage uninitialized.
+    VValue() {}
+    VValue(F32 f) : fFloat(f) {}
+    VValue(I32 s) : fSigned(s) {}
+    VValue(U32 u) : fUnsigned(u) {}
+
+    F32 fFloat;
+    I32 fSigned;
+    U32 fUnsigned;
+};
+
+// Interpreter state saved by a kCall and restored by the matching kReturn.
+struct StackFrame {
+    const uint8_t* fCode;   // caller's bytecode base ('code')
+    const uint8_t* fIP;     // caller's instruction pointer to resume at
+    VValue* fStack;         // caller's stack base ('stack')
+    int fParameterCount;    // parameter count of the called function
+};
+
+// Per-lane floating point remainder: a minus b times the truncated quotient a / b.
+static F32 VecMod(F32 a, F32 b) {
+    F32 wholeQuotient = skvx::trunc(a / b);
+    return a - wholeQuotient * b;
+}
+
+// Shorthand for the float view of the stack slot at offset 'index' from sp.
+#define spf(index)  sp[index].fFloat
+
+// Calls an external value's function once for every lane whose mask entry is non-zero.
+// Three operand bytes are consumed from 'ip': the argument count, the return count and the
+// index into byteCode->fExternalValues. The arguments are taken from the top 'argumentCount'
+// stack slots; the results overwrite the stack starting at the first argument's slot, and on
+// exit 'sp' points at the last result slot. Lanes that are masked off are left untouched.
+static void CallExternal(const ByteCode* byteCode, const uint8_t*& ip, VValue*& sp,
+                          int baseIndex, I32 mask) {
+    int argumentCount = READ8();
+    int returnCount = READ8();
+    int target = READ8();
+    ExternalValue* external = byteCode->fExternalValues[target];
+
+    // Scalar staging buffers for a single lane's arguments and results.
+    float laneArgs[4];
+    float laneResults[4];
+    SkASSERT(argumentCount <= (int)SK_ARRAY_COUNT(laneArgs));
+    SkASSERT(returnCount <= (int)SK_ARRAY_COUNT(laneResults));
+
+    // Rewind so that sp[0] is the first argument (and the first result slot).
+    sp -= argumentCount - 1;
+
+    for (int lane = 0; lane < VecWidth; ++lane) {
+        if (!mask[lane]) {
+            continue;
+        }
+        for (int arg = 0; arg < argumentCount; ++arg) {
+            laneArgs[arg] = sp[arg].fFloat[lane];
+        }
+        external->call(baseIndex + lane, laneArgs, laneResults);
+        for (int ret = 0; ret < returnCount; ++ret) {
+            sp[ret].fFloat[lane] = laneResults[ret];
+        }
+    }
+
+    // Leave sp on the last returned value.
+    sp += returnCount - 1;
+}
+
+// Inverts, in place, the 2x2 float matrix stored in the top four stack slots
+// (sp[-3] .. sp[0]). No check is made for a zero determinant.
+static void Inverse2x2(VValue* sp) {
+    F32 m0 = sp[-3].fFloat;
+    F32 m1 = sp[-2].fFloat;
+    F32 m2 = sp[-1].fFloat;
+    F32 m3 = sp[ 0].fFloat;
+
+    F32 invDet = F32(1) / (m0*m3 - m1*m2);
+
+    // Swap the outer two elements, negate the inner two, and scale by 1/det.
+    sp[-3].fFloat = m3 * invDet;
+    sp[-2].fFloat = -m1 * invDet;
+    sp[-1].fFloat = -m2 * invDet;
+    sp[ 0].fFloat = m0 * invDet;
+}
+
+// Inverts, in place, the 3x3 float matrix stored in the top nine stack slots
+// (sp[-8] .. sp[0]) using the cofactor expansion. No check is made for a zero determinant.
+static void Inverse3x3(VValue* sp) {
+    // Element mRC is read from stack offset 3 * (C - 1) + (R - 1) - 8.
+    F32 m11 = sp[-8].fFloat;
+    F32 m21 = sp[-7].fFloat;
+    F32 m31 = sp[-6].fFloat;
+    F32 m12 = sp[-5].fFloat;
+    F32 m22 = sp[-4].fFloat;
+    F32 m32 = sp[-3].fFloat;
+    F32 m13 = sp[-2].fFloat;
+    F32 m23 = sp[-1].fFloat;
+    F32 m33 = sp[ 0].fFloat;
+
+    F32 det = m11 * m22 * m33 + m12 * m23 * m31 + m13 * m21 * m32 -
+              m11 * m23 * m32 - m12 * m21 * m33 - m13 * m22 * m31;
+    F32 invDet = F32(1) / det;
+
+    // Each output element is a 2x2 cofactor scaled by the reciprocal determinant.
+    sp[-8].fFloat = (m22 * m33 - m23 * m32) * invDet;
+    sp[-7].fFloat = (m23 * m31 - m21 * m33) * invDet;
+    sp[-6].fFloat = (m21 * m32 - m22 * m31) * invDet;
+    sp[-5].fFloat = (m13 * m32 - m12 * m33) * invDet;
+    sp[-4].fFloat = (m11 * m33 - m13 * m31) * invDet;
+    sp[-3].fFloat = (m12 * m31 - m11 * m32) * invDet;
+    sp[-2].fFloat = (m12 * m23 - m13 * m22) * invDet;
+    sp[-1].fFloat = (m13 * m21 - m11 * m23) * invDet;
+    sp[ 0].fFloat = (m11 * m22 - m12 * m21) * invDet;
+}
+
+// Inverts, in place, the 4x4 float matrix stored in the top sixteen stack slots
+// (sp[-15] .. sp[0]). The determinant and the result are both built from twelve 2x2
+// sub-determinants (b00 .. b11). No check is made for a zero determinant.
+static void Inverse4x4(VValue* sp) {
+    // aXY is read from stack offset 4 * X + Y - 15.
+    F32 a00 = spf(-15), a10 = spf(-11), a20 = spf( -7), a30 = spf( -3),
+        a01 = spf(-14), a11 = spf(-10), a21 = spf( -6), a31 = spf( -2),
+        a02 = spf(-13), a12 = spf( -9), a22 = spf( -5), a32 = spf( -1),
+        a03 = spf(-12), a13 = spf( -8), a23 = spf( -4), a33 = spf(  0);
+
+    // 2x2 sub-determinants: b00..b05 use a0*/a1*, b06..b11 use a2*/a3*.
+    F32 b00 = a00 * a11 - a01 * a10,
+        b01 = a00 * a12 - a02 * a10,
+        b02 = a00 * a13 - a03 * a10,
+        b03 = a01 * a12 - a02 * a11,
+        b04 = a01 * a13 - a03 * a11,
+        b05 = a02 * a13 - a03 * a12,
+        b06 = a20 * a31 - a21 * a30,
+        b07 = a20 * a32 - a22 * a30,
+        b08 = a20 * a33 - a23 * a30,
+        b09 = a21 * a32 - a22 * a31,
+        b10 = a21 * a33 - a23 * a31,
+        b11 = a22 * a33 - a23 * a32;
+
+    F32 idet = F32(1) /
+               (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
+
+    // Fold the reciprocal determinant into the sub-determinants once, so the sixteen output
+    // expressions below need no further scaling.
+    b00 *= idet;
+    b01 *= idet;
+    b02 *= idet;
+    b03 *= idet;
+    b04 *= idet;
+    b05 *= idet;
+    b06 *= idet;
+    b07 *= idet;
+    b08 *= idet;
+    b09 *= idet;
+    b10 *= idet;
+    b11 *= idet;
+
+    spf(-15) = a11 * b11 - a12 * b10 + a13 * b09;
+    spf(-14) = a02 * b10 - a01 * b11 - a03 * b09;
+    spf(-13) = a31 * b05 - a32 * b04 + a33 * b03;
+    spf(-12) = a22 * b04 - a21 * b05 - a23 * b03;
+    spf(-11) = a12 * b08 - a10 * b11 - a13 * b07;
+    spf(-10) = a00 * b11 - a02 * b08 + a03 * b07;
+    spf( -9) = a32 * b02 - a30 * b05 - a33 * b01;
+    spf( -8) = a20 * b05 - a22 * b02 + a23 * b01;
+    spf( -7) = a10 * b10 - a11 * b08 + a13 * b06;
+    spf( -6) = a01 * b08 - a00 * b10 - a03 * b06;
+    spf( -5) = a30 * b04 - a31 * b02 + a33 * b00;
+    spf( -4) = a21 * b02 - a20 * b04 - a23 * b00;
+    spf( -3) = a11 * b07 - a10 * b09 - a12 * b06;
+    spf( -2) = a00 * b09 - a01 * b07 + a02 * b06;
+    spf( -1) = a31 * b01 - a30 * b03 - a32 * b00;
+    spf(  0) = a20 * b03 - a21 * b01 + a22 * b00;
+}
+
+static bool InnerRun(const ByteCode* byteCode, const ByteCodeFunction* f, VValue* stack,
+                     float* outReturn[], VValue globals[], const float uniforms[],
+                     bool stripedOutput, int N, int baseIndex) {
+#ifdef SKSLC_THREADED_CODE
+    static const void* labels[] = {
+        // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as values"
+        // extension. If you add anything to this array, be sure to add the corresponding
+        // CHECK_LABEL() or CHECK_*_LABELS() assert below.
+        VECTOR_MATRIX_LABELS(kAddF),
+        VECTOR_LABELS(kAddI),
+        &&kAndB,
+        &&kBranch,
+        &&kCall,
+        &&kCallExternal,
+        &&kClampIndex,
+        VECTOR_LABELS(kCompareIEQ),
+        VECTOR_LABELS(kCompareINEQ),
+        VECTOR_MATRIX_LABELS(kCompareFEQ),
+        VECTOR_MATRIX_LABELS(kCompareFNEQ),
+        VECTOR_LABELS(kCompareFGT),
+        VECTOR_LABELS(kCompareFGTEQ),
+        VECTOR_LABELS(kCompareFLT),
+        VECTOR_LABELS(kCompareFLTEQ),
+        VECTOR_LABELS(kCompareSGT),
+        VECTOR_LABELS(kCompareSGTEQ),
+        VECTOR_LABELS(kCompareSLT),
+        VECTOR_LABELS(kCompareSLTEQ),
+        VECTOR_LABELS(kCompareUGT),
+        VECTOR_LABELS(kCompareUGTEQ),
+        VECTOR_LABELS(kCompareULT),
+        VECTOR_LABELS(kCompareULTEQ),
+        VECTOR_LABELS(kConvertFtoI),
+        VECTOR_LABELS(kConvertStoF),
+        VECTOR_LABELS(kConvertUtoF),
+        VECTOR_LABELS(kCos),
+        VECTOR_MATRIX_LABELS(kDivideF),
+        VECTOR_LABELS(kDivideS),
+        VECTOR_LABELS(kDivideU),
+        VECTOR_MATRIX_LABELS(kDup),
+        &&kInverse2x2,
+        &&kInverse3x3,
+        &&kInverse4x4,
+        VECTOR_LABELS(kLoad),
+        VECTOR_LABELS(kLoadGlobal),
+        VECTOR_LABELS(kLoadUniform),
+        &&kLoadSwizzle,
+        &&kLoadSwizzleGlobal,
+        &&kLoadSwizzleUniform,
+        &&kLoadExtended,
+        &&kLoadExtendedGlobal,
+        &&kLoadExtendedUniform,
+        &&kMatrixToMatrix,
+        &&kMatrixMultiply,
+        VECTOR_MATRIX_LABELS(kNegateF),
+        VECTOR_LABELS(kNegateI),
+        VECTOR_MATRIX_LABELS(kMultiplyF),
+        VECTOR_LABELS(kMultiplyI),
+        &&kNotB,
+        &&kOrB,
+        VECTOR_MATRIX_LABELS(kPop),
+        &&kPushImmediate,
+        VECTOR_LABELS(kReadExternal),
+        VECTOR_LABELS(kRemainderF),
+        VECTOR_LABELS(kRemainderS),
+        VECTOR_LABELS(kRemainderU),
+        &&kReserve,
+        &&kReturn,
+        &&kScalarToMatrix,
+        &&kShiftLeft,
+        &&kShiftRightS,
+        &&kShiftRightU,
+        VECTOR_LABELS(kSin),
+        VECTOR_LABELS(kSqrt),
+        VECTOR_LABELS(kStore),
+        VECTOR_LABELS(kStoreGlobal),
+        &&kStoreExtended,
+        &&kStoreExtendedGlobal,
+        &&kStoreSwizzle,
+        &&kStoreSwizzleGlobal,
+        &&kStoreSwizzleIndirect,
+        &&kStoreSwizzleIndirectGlobal,
+        &&kSwizzle,
+        VECTOR_MATRIX_LABELS(kSubtractF),
+        VECTOR_LABELS(kSubtractI),
+        VECTOR_LABELS(kTan),
+        VECTOR_LABELS(kWriteExternal),
+        &&kXorB,
+
+        &&kMaskPush,
+        &&kMaskPop,
+        &&kMaskNegate,
+        &&kMaskBlend,
+        &&kBranchIfAllFalse,
+
+        &&kLoopBegin,
+        &&kLoopNext,
+        &&kLoopMask,
+        &&kLoopEnd,
+        &&kLoopBreak,
+        &&kLoopContinue,
+    };
+    // Verify that the order of the labels array matches the order of the ByteCodeInstruction enum.
+    CHECK_VECTOR_MATRIX_LABELS(kAddF);
+    CHECK_VECTOR_LABELS(kAddI);
+    CHECK_LABEL(kAndB);
+    CHECK_LABEL(kBranch);
+    CHECK_LABEL(kCall);
+    CHECK_LABEL(kCallExternal);
+    CHECK_LABEL(kClampIndex);
+    CHECK_VECTOR_LABELS(kCompareIEQ);
+    CHECK_VECTOR_LABELS(kCompareINEQ);
+    CHECK_VECTOR_MATRIX_LABELS(kCompareFEQ);
+    CHECK_VECTOR_MATRIX_LABELS(kCompareFNEQ);
+    CHECK_VECTOR_LABELS(kCompareFGT);
+    CHECK_VECTOR_LABELS(kCompareFGTEQ);
+    CHECK_VECTOR_LABELS(kCompareFLT);
+    CHECK_VECTOR_LABELS(kCompareFLTEQ);
+    CHECK_VECTOR_LABELS(kCompareSGT);
+    CHECK_VECTOR_LABELS(kCompareSGTEQ);
+    CHECK_VECTOR_LABELS(kCompareSLT);
+    CHECK_VECTOR_LABELS(kCompareSLTEQ);
+    CHECK_VECTOR_LABELS(kCompareUGT);
+    CHECK_VECTOR_LABELS(kCompareUGTEQ);
+    CHECK_VECTOR_LABELS(kCompareULT);
+    CHECK_VECTOR_LABELS(kCompareULTEQ);
+    CHECK_VECTOR_LABELS(kConvertFtoI);
+    CHECK_VECTOR_LABELS(kConvertStoF);
+    CHECK_VECTOR_LABELS(kConvertUtoF);
+    CHECK_VECTOR_LABELS(kCos);
+    CHECK_VECTOR_MATRIX_LABELS(kDivideF);
+    CHECK_VECTOR_LABELS(kDivideS);
+    CHECK_VECTOR_LABELS(kDivideU);
+    CHECK_VECTOR_MATRIX_LABELS(kDup);
+    CHECK_LABEL(kInverse2x2);
+    CHECK_LABEL(kInverse3x3);
+    CHECK_LABEL(kInverse4x4);
+    CHECK_VECTOR_LABELS(kLoad);
+    CHECK_VECTOR_LABELS(kLoadGlobal);
+    CHECK_VECTOR_LABELS(kLoadUniform);
+    CHECK_LABEL(kLoadSwizzle);
+    CHECK_LABEL(kLoadSwizzleGlobal);
+    CHECK_LABEL(kLoadSwizzleUniform);
+    CHECK_LABEL(kLoadExtended);
+    CHECK_LABEL(kLoadExtendedGlobal);
+    CHECK_LABEL(kLoadExtendedUniform);
+    CHECK_LABEL(kMatrixToMatrix);
+    CHECK_LABEL(kMatrixMultiply);
+    CHECK_VECTOR_MATRIX_LABELS(kNegateF);
+    CHECK_VECTOR_LABELS(kNegateI);
+    CHECK_VECTOR_MATRIX_LABELS(kMultiplyF);
+    CHECK_VECTOR_LABELS(kMultiplyI);
+    CHECK_LABEL(kNotB);
+    CHECK_LABEL(kOrB);
+    CHECK_VECTOR_MATRIX_LABELS(kPop);
+    CHECK_LABEL(kPushImmediate);
+    CHECK_VECTOR_LABELS(kReadExternal);
+    CHECK_VECTOR_LABELS(kRemainderF);
+    CHECK_VECTOR_LABELS(kRemainderS);
+    CHECK_VECTOR_LABELS(kRemainderU);
+    CHECK_LABEL(kReserve);
+    CHECK_LABEL(kReturn);
+    CHECK_LABEL(kScalarToMatrix);
+    CHECK_LABEL(kShiftLeft);
+    CHECK_LABEL(kShiftRightS);
+    CHECK_LABEL(kShiftRightU);
+    CHECK_VECTOR_LABELS(kSin);
+    CHECK_VECTOR_LABELS(kSqrt);
+    CHECK_VECTOR_LABELS(kStore);
+    CHECK_VECTOR_LABELS(kStoreGlobal);
+    CHECK_LABEL(kStoreExtended);
+    CHECK_LABEL(kStoreExtendedGlobal);
+    CHECK_LABEL(kStoreSwizzle);
+    CHECK_LABEL(kStoreSwizzleGlobal);
+    CHECK_LABEL(kStoreSwizzleIndirect);
+    CHECK_LABEL(kStoreSwizzleIndirectGlobal);
+    CHECK_LABEL(kSwizzle);
+    CHECK_VECTOR_MATRIX_LABELS(kSubtractF);
+    CHECK_VECTOR_LABELS(kSubtractI);
+    CHECK_VECTOR_LABELS(kTan);
+    CHECK_VECTOR_LABELS(kWriteExternal);
+    CHECK_LABEL(kXorB);
+    CHECK_LABEL(kMaskPush);
+    CHECK_LABEL(kMaskPop);
+    CHECK_LABEL(kMaskNegate);
+    CHECK_LABEL(kMaskBlend);
+    CHECK_LABEL(kBranchIfAllFalse);
+    CHECK_LABEL(kLoopBegin);
+    CHECK_LABEL(kLoopNext);
+    CHECK_LABEL(kLoopMask);
+    CHECK_LABEL(kLoopEnd);
+    CHECK_LABEL(kLoopBreak);
+    CHECK_LABEL(kLoopContinue);
+    f->fPreprocessOnce([f] { ((ByteCodeFunction*)f)->preprocess(labels); });
+#endif
+
+    // Needs to be the first N non-negative integers, at least as large as VecWidth
+    static const Interpreter::I32 gLanes = {
+        0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+    };
+
+    VValue* sp = stack + f->fParameterCount + f->fLocalCount - 1;
+
+    #define POP() (*(sp--))
+    #define PUSH(v) (sp[1] = v, ++sp)
+
+    const uint8_t* code = f->fCode.data();
+    const uint8_t* ip = code;
+    std::vector<StackFrame> frames;
+
+    I32 condStack[16];  // Independent condition masks
+    I32 maskStack[16];  // Combined masks (eg maskStack[0] & maskStack[1] & ...)
+    I32 contStack[16];  // Continue flags for loops
+    I32 loopStack[16];  // Loop execution masks
+    condStack[0] = maskStack[0] = (gLanes < N);
+    contStack[0] = I32( 0);
+    loopStack[0] = I32(~0);
+    I32* condPtr = condStack;
+    I32* maskPtr = maskStack;
+    I32* contPtr = contStack;
+    I32* loopPtr = loopStack;
+
+    if (f->fConditionCount + 1 > (int)SK_ARRAY_COUNT(condStack) ||
+        f->fLoopCount + 1 > (int)SK_ARRAY_COUNT(loopStack)) {
+        return false;
+    }
+
+    auto mask = [&]() { return *maskPtr & *loopPtr; };
+
+#ifdef SKSLC_THREADED_CODE
+    // If the "labels as values" extension is available, we implement this using threaded code.
+    // Instead of opcodes, the code directly contains the addresses of the labels to jump to. Then
+    // the code for each opcode simply grabs the address of the next opcode and uses a goto to jump
+    // there.
+    NEXT();
+#else
+    // Otherwise, we have to use a switch statement and a loop to execute the right label.
+    for (;;) {
+        #ifdef TRACE
+            printf("at %3d ", (int) (ip - code));
+            disassemble_instruction(ip);
+            printf(" (stack: %d)\n", (int) (sp - stack) + 1);
+        #endif
+        switch ((ByteCodeInstruction) READ16()) {
+#endif
+
+    VECTOR_MATRIX_BINARY_OP(kAddF, fFloat, +)
+    VECTOR_BINARY_OP(kAddI, fSigned, +)
+
+    // Booleans are integer masks: 0/~0 for false/true. So bitwise ops do what we want:
+    LABEL(kAndB)
+        sp[-1] = sp[-1].fSigned & sp[0].fSigned;
+        POP();
+        NEXT();
+    LABEL(kNotB)
+        sp[0] = ~sp[0].fSigned;
+        NEXT();
+    LABEL(kOrB)
+        sp[-1] = sp[-1].fSigned | sp[0].fSigned;
+        POP();
+        NEXT();
+    LABEL(kXorB)
+        sp[-1] = sp[-1].fSigned ^ sp[0].fSigned;
+        POP();
+        NEXT();
+
+    LABEL(kBranch)
+        ip = code + READ16();
+        NEXT();
+
+    LABEL(kCall) {
+        // Precursor code reserved space for the return value, and pushed all parameters to
+        // the stack. Update our bottom of stack to point at the first parameter, and our
+        // sp to point past those parameters (plus space for locals).
+        int target = READ8();
+        const ByteCodeFunction* fun = byteCode->fFunctions[target].get();
+#ifdef SKSLC_THREADED_CODE
+        fun->fPreprocessOnce([fun] { ((ByteCodeFunction*)fun)->preprocess(labels); });
+#endif
+        if (skvx::any(mask())) {
+            frames.push_back({ code, ip, stack, fun->fParameterCount });
+            ip = code = fun->fCode.data();
+            stack = sp - fun->fParameterCount + 1;
+            sp = stack + fun->fParameterCount + fun->fLocalCount - 1;
+        }
+        NEXT();
+    }
+
+    LABEL(kCallExternal) {
+        CallExternal(byteCode, ip, sp, baseIndex, mask());
+        NEXT();
+    }
+
+    LABEL(kClampIndex) {
+        int length = READ8();
+        if (skvx::any(mask() & ((sp[0].fSigned < 0) | (sp[0].fSigned >= length)))) {
+            return false;
+        }
+        NEXT();
+    }
+
+    VECTOR_BINARY_OP(kCompareIEQ, fSigned, ==)
+    VECTOR_MATRIX_BINARY_OP(kCompareFEQ, fFloat, ==)
+    VECTOR_BINARY_OP(kCompareINEQ, fSigned, !=)
+    VECTOR_MATRIX_BINARY_OP(kCompareFNEQ, fFloat, !=)
+    VECTOR_BINARY_OP(kCompareSGT, fSigned, >)
+    VECTOR_BINARY_OP(kCompareUGT, fUnsigned, >)
+    VECTOR_BINARY_OP(kCompareFGT, fFloat, >)
+    VECTOR_BINARY_OP(kCompareSGTEQ, fSigned, >=)
+    VECTOR_BINARY_OP(kCompareUGTEQ, fUnsigned, >=)
+    VECTOR_BINARY_OP(kCompareFGTEQ, fFloat, >=)
+    VECTOR_BINARY_OP(kCompareSLT, fSigned, <)
+    VECTOR_BINARY_OP(kCompareULT, fUnsigned, <)
+    VECTOR_BINARY_OP(kCompareFLT, fFloat, <)
+    VECTOR_BINARY_OP(kCompareSLTEQ, fSigned, <=)
+    VECTOR_BINARY_OP(kCompareULTEQ, fUnsigned, <=)
+    VECTOR_BINARY_OP(kCompareFLTEQ, fFloat, <=)
+
+    LABEL(kConvertFtoI4) sp[-3] = skvx::cast<int>(sp[-3].fFloat);
+    LABEL(kConvertFtoI3) sp[-2] = skvx::cast<int>(sp[-2].fFloat);
+    LABEL(kConvertFtoI2) sp[-1] = skvx::cast<int>(sp[-1].fFloat);
+    LABEL(kConvertFtoI)  sp[ 0] = skvx::cast<int>(sp[ 0].fFloat);
+                         NEXT();
+
+    LABEL(kConvertStoF4) sp[-3] = skvx::cast<float>(sp[-3].fSigned);
+    LABEL(kConvertStoF3) sp[-2] = skvx::cast<float>(sp[-2].fSigned);
+    LABEL(kConvertStoF2) sp[-1] = skvx::cast<float>(sp[-1].fSigned);
+    LABEL(kConvertStoF)  sp[ 0] = skvx::cast<float>(sp[ 0].fSigned);
+                         NEXT();
+
+    LABEL(kConvertUtoF4) sp[-3] = skvx::cast<float>(sp[-3].fUnsigned);
+    LABEL(kConvertUtoF3) sp[-2] = skvx::cast<float>(sp[-2].fUnsigned);
+    LABEL(kConvertUtoF2) sp[-1] = skvx::cast<float>(sp[-1].fUnsigned);
+    LABEL(kConvertUtoF)  sp[ 0] = skvx::cast<float>(sp[ 0].fUnsigned);
+                         NEXT();
+
+    VECTOR_UNARY_FN_VEC(kCos, cosf)
+
+    VECTOR_BINARY_MASKED_OP(kDivideS, fSigned, /)
+    VECTOR_BINARY_MASKED_OP(kDivideU, fUnsigned, /)
+    VECTOR_MATRIX_BINARY_OP(kDivideF, fFloat, /)
+
+    LABEL(kDup4) PUSH(sp[1 - ip[0]]);
+    LABEL(kDup3) PUSH(sp[1 - ip[0]]);
+    LABEL(kDup2) PUSH(sp[1 - ip[0]]);
+    LABEL(kDup)  PUSH(sp[1 - ip[0]]);
+                 ++ip;
+                 NEXT();
+
+    LABEL(kDupN) {
+        int count = READ8();
+        memcpy(sp + 1, sp - count + 1, count * sizeof(VValue));
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kInverse2x2) {
+        Inverse2x2(sp);
+        NEXT();
+    }
+    LABEL(kInverse3x3) {
+        Inverse3x3(sp);
+        NEXT();
+    }
+    LABEL(kInverse4x4) {
+        Inverse4x4(sp);
+        NEXT();
+    }
+
+    LABEL(kLoad4) sp[4] = stack[ip[1] + 3];
+    LABEL(kLoad3) sp[3] = stack[ip[1] + 2];
+    LABEL(kLoad2) sp[2] = stack[ip[1] + 1];
+    LABEL(kLoad)  sp[1] = stack[ip[1] + 0];
+                  sp += ip[0];
+                  ip += 2;
+                  NEXT();
+
+    LABEL(kLoadGlobal4) sp[4] = globals[ip[1] + 3];
+    LABEL(kLoadGlobal3) sp[3] = globals[ip[1] + 2];
+    LABEL(kLoadGlobal2) sp[2] = globals[ip[1] + 1];
+    LABEL(kLoadGlobal)  sp[1] = globals[ip[1] + 0];
+                        sp += ip[0];
+                        ip += 2;
+                        NEXT();
+
+    LABEL(kLoadUniform4) sp[4].fFloat = uniforms[ip[1] + 3];
+    LABEL(kLoadUniform3) sp[3].fFloat = uniforms[ip[1] + 2];
+    LABEL(kLoadUniform2) sp[2].fFloat = uniforms[ip[1] + 1];
+    LABEL(kLoadUniform)  sp[1].fFloat = uniforms[ip[1] + 0];
+                        sp += ip[0];
+                        ip += 2;
+                        NEXT();
+
+    LABEL(kLoadExtended) {
+        int count = READ8();
+        I32 src = POP().fSigned;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    sp[i + 1].fSigned[j] = stack[src[j] + i].fSigned[j];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kLoadExtendedGlobal) {
+        int count = READ8();
+        I32 src = POP().fSigned;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    sp[i + 1].fSigned[j] = globals[src[j] + i].fSigned[j];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kLoadExtendedUniform) {
+        int count = READ8();
+        I32 src = POP().fSigned;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    sp[i + 1].fFloat[j] = uniforms[src[j] + i];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    LABEL(kLoadSwizzle) {
+        int src = READ8();
+        int count = READ8();
+        for (int i = 0; i < count; ++i) {
+            PUSH(stack[src + *(ip + i)]);
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kLoadSwizzleGlobal) {
+        int src = READ8();
+        int count = READ8();
+        for (int i = 0; i < count; ++i) {
+            PUSH(globals[src + *(ip + i)]);
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kLoadSwizzleUniform) {
+        int src = READ8();
+        int count = READ8();
+        for (int i = 0; i < count; ++i) {
+            PUSH(F32(uniforms[src + *(ip + i)]));
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kMatrixToMatrix) {
+        int srcCols = READ8();
+        int srcRows = READ8();
+        int dstCols = READ8();
+        int dstRows = READ8();
+        SkASSERT(srcCols >= 2 && srcCols <= 4);
+        SkASSERT(srcRows >= 2 && srcRows <= 4);
+        SkASSERT(dstCols >= 2 && dstCols <= 4);
+        SkASSERT(dstRows >= 2 && dstRows <= 4);
+        F32 tmp[16];
+        memset(tmp, 0, sizeof(tmp));
+        tmp[0] = tmp[5] = tmp[10] = tmp[15] = F32(1.0f);
+        for (int c = srcCols - 1; c >= 0; --c) {
+            for (int r = srcRows - 1; r >= 0; --r) {
+                tmp[c*4 + r] = POP().fFloat;
+            }
+        }
+        for (int c = 0; c < dstCols; ++c) {
+            for (int r = 0; r < dstRows; ++r) {
+                PUSH(tmp[c*4 + r]);
+            }
+        }
+        NEXT();
+    }
+
+    LABEL(kMatrixMultiply) {
+        int lCols = READ8();
+        int lRows = READ8();
+        int rCols = READ8();
+        int rRows = lCols;
+        F32 tmp[16] = { 0.0f };
+        F32* B = &(sp - (rCols * rRows) + 1)->fFloat;
+        F32* A = B - (lCols * lRows);
+        for (int c = 0; c < rCols; ++c) {
+            for (int r = 0; r < lRows; ++r) {
+                for (int j = 0; j < lCols; ++j) {
+                    tmp[c*lRows + r] += A[j*lRows + r] * B[c*rRows + j];
+                }
+            }
+        }
+        sp -= (lCols * lRows) + (rCols * rRows);
+        memcpy(sp + 1, tmp, rCols * lRows * sizeof(VValue));
+        sp += (rCols * lRows);
+        NEXT();
+    }
+
+    VECTOR_BINARY_OP(kMultiplyI, fSigned, *)
+    VECTOR_MATRIX_BINARY_OP(kMultiplyF, fFloat, *)
+
+    LABEL(kNegateF4) sp[-3] = -sp[-3].fFloat;
+    LABEL(kNegateF3) sp[-2] = -sp[-2].fFloat;
+    LABEL(kNegateF2) sp[-1] = -sp[-1].fFloat;
+    LABEL(kNegateF)  sp[ 0] = -sp[ 0].fFloat;
+                     NEXT();
+
+    LABEL(kNegateFN) {
+        int count = READ8();
+        for (int i = count - 1; i >= 0; --i) {
+            sp[-i] = -sp[-i].fFloat;
+        }
+        NEXT();
+    }
+
+    LABEL(kNegateI4) sp[-3] = -sp[-3].fSigned;
+    LABEL(kNegateI3) sp[-2] = -sp[-2].fSigned;
+    LABEL(kNegateI2) sp[-1] = -sp[-1].fSigned;
+    LABEL(kNegateI)  sp[ 0] = -sp[ 0].fSigned;
+                     NEXT();
+
+    LABEL(kPop4) POP();
+    LABEL(kPop3) POP();
+    LABEL(kPop2) POP();
+    LABEL(kPop)  POP();
+                 NEXT();
+
+    LABEL(kPopN)
+        sp -= READ8();
+        NEXT();
+
+    LABEL(kPushImmediate)
+        PUSH(U32(READ32()));
+        NEXT();
+
+    LABEL(kReadExternal)
+    LABEL(kReadExternal2)
+    LABEL(kReadExternal3)
+    LABEL(kReadExternal4) {
+        int count = READ8();
+        int src = READ8();
+        float tmp[4];
+        I32 m = mask();
+        for (int i = 0; i < VecWidth; ++i) {
+            if (m[i]) {
+                byteCode->fExternalValues[src]->read(baseIndex + i, tmp);
+                for (int j = 0; j < count; ++j) {
+                    sp[j + 1].fFloat[i] = tmp[j];
+                }
+            }
+        }
+        sp += count;
+        NEXT();
+    }
+
+    VECTOR_BINARY_FN(kRemainderF, fFloat, VecMod)
+    VECTOR_BINARY_MASKED_OP(kRemainderS, fSigned, %)
+    VECTOR_BINARY_MASKED_OP(kRemainderU, fUnsigned, %)
+
+    LABEL(kReserve)
+        sp += READ8();
+        NEXT();
+
+    LABEL(kReturn) {
+        int count = READ8();
+        if (frames.empty()) {
+            if (outReturn) {
+                VValue* src = sp - count + 1;
+                if (stripedOutput) {
+                    for (int i = 0; i < count; ++i) {
+                        memcpy(outReturn[i], &src->fFloat, N * sizeof(float));
+                        ++src;
+                    }
+                } else {
+                    float* outPtr = outReturn[0];
+                    for (int i = 0; i < count; ++i) {
+                        for (int j = 0; j < N; ++j) {
+                            outPtr[count * j] = src->fFloat[j];
+                        }
+                        ++outPtr;
+                        ++src;
+                    }
+                }
+            }
+            return true;
+        } else {
+            // When we were called, the caller reserved stack space for their copy of our
+            // return value, then 'stack' was positioned after that, where our parameters
+            // were placed. Copy our return values to their reserved area.
+            memcpy(stack - count, sp - count + 1, count * sizeof(VValue));
+
+            // Now move the stack pointer to the end of the passed-in parameters. This odd
+            // calling convention requires the caller to pop the arguments after calling,
+            // but allows them to store any out-parameters back during that unwinding.
+            // After that sequence finishes, the return value will be the top of the stack.
+            const StackFrame& frame(frames.back());
+            sp = stack + frame.fParameterCount - 1;
+            stack = frame.fStack;
+            code = frame.fCode;
+            ip = frame.fIP;
+            frames.pop_back();
+            NEXT();
+        }
+    }
+
+    LABEL(kScalarToMatrix) {
+        int cols = READ8();
+        int rows = READ8();
+        VValue v = POP();
+        for (int c = 0; c < cols; ++c) {
+            for (int r = 0; r < rows; ++r) {
+                PUSH(c == r ? v : F32(0.0f));
+            }
+        }
+        NEXT();
+    }
+
+    LABEL(kShiftLeft)
+        sp[0] = sp[0].fSigned << READ8();
+        NEXT();
+    LABEL(kShiftRightS)
+        sp[0] = sp[0].fSigned >> READ8();
+        NEXT();
+    LABEL(kShiftRightU)
+        sp[0] = sp[0].fUnsigned >> READ8();
+        NEXT();
+
+    VECTOR_UNARY_FN_VEC(kSin, sinf)
+    VECTOR_UNARY_FN(kSqrt, skvx::sqrt, fFloat)
+
+    LABEL(kStore4)
+        stack[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+3].fFloat);
+    LABEL(kStore3)
+        stack[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+2].fFloat);
+    LABEL(kStore2)
+        stack[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+1].fFloat);
+    LABEL(kStore)
+        stack[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, stack[*ip+0].fFloat);
+        ++ip;
+        NEXT();
+
+    LABEL(kStoreGlobal4)
+        globals[*ip+3] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+3].fFloat);
+    LABEL(kStoreGlobal3)
+        globals[*ip+2] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+2].fFloat);
+    LABEL(kStoreGlobal2)
+        globals[*ip+1] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+1].fFloat);
+    LABEL(kStoreGlobal)
+        globals[*ip+0] = skvx::if_then_else(mask(), POP().fFloat, globals[*ip+0].fFloat);
+        ++ip;
+        NEXT();
+
+    LABEL(kStoreExtended) {
+        int count = READ8();
+        I32 target = POP().fSigned;
+        VValue* src = sp - count + 1;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    stack[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                }
+            }
+        }
+        sp -= count;
+        NEXT();
+    }
+    LABEL(kStoreExtendedGlobal) {
+        int count = READ8();
+        I32 target = POP().fSigned;
+        VValue* src = sp - count + 1;
+        I32 m = mask();
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    globals[target[j] + i].fSigned[j] = src[i].fSigned[j];
+                }
+            }
+        }
+        sp -= count;
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzle) {
+        int target = READ8();
+        int count = READ8();
+        for (int i = count - 1; i >= 0; --i) {
+            stack[target + *(ip + i)] = skvx::if_then_else(
+                    mask(), POP().fFloat, stack[target + *(ip + i)].fFloat);
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzleGlobal) {
+        int target = READ8();
+        int count = READ8();
+        for (int i = count - 1; i >= 0; --i) {
+            globals[target + *(ip + i)] = skvx::if_then_else(
+                    mask(), POP().fFloat, globals[target + *(ip + i)].fFloat);
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzleIndirect) {
+        int count = READ8();
+        I32 target = POP().fSigned;
+        I32 m = mask();
+        for (int i = count - 1; i >= 0; --i) {
+            I32 v = POP().fSigned;
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    stack[target[j] + *(ip + i)].fSigned[j] = v[j];
+                }
+            }
+        }
+        ip += count;
+        NEXT();
+    }
+
+    LABEL(kStoreSwizzleIndirectGlobal) {
+        int count = READ8();
+        I32 target = POP().fSigned;
+        I32 m = mask();
+        for (int i = count - 1; i >= 0; --i) {
+            I32 v = POP().fSigned;
+            for (int j = 0; j < VecWidth; ++j) {
+                if (m[j]) {
+                    globals[target[j] + *(ip + i)].fSigned[j] = v[j];
+                }
+            }
+        }
+        ip += count;
+        NEXT();
+    }
+
+    VECTOR_BINARY_OP(kSubtractI, fSigned, -)
+    VECTOR_MATRIX_BINARY_OP(kSubtractF, fFloat, -)
+
+    LABEL(kSwizzle) {
+        VValue tmp[4];
+        for (int i = READ8() - 1; i >= 0; --i) {
+            tmp[i] = POP();
+        }
+        for (int i = READ8() - 1; i >= 0; --i) {
+            PUSH(tmp[READ8()]);
+        }
+        NEXT();
+    }
+
+    VECTOR_UNARY_FN_VEC(kTan, tanf)
+
+    LABEL(kWriteExternal4)
+    LABEL(kWriteExternal3)
+    LABEL(kWriteExternal2)
+    LABEL(kWriteExternal) {
+        int count = READ8();
+        int target = READ8();
+        float tmp[4];
+        I32 m = mask();
+        sp -= count;
+        for (int i = 0; i < VecWidth; ++i) {
+            if (m[i]) {
+                for (int j = 0; j < count; ++j) {
+                    tmp[j] = sp[j + 1].fFloat[i];
+                }
+                byteCode->fExternalValues[target]->write(baseIndex + i, tmp);
+            }
+        }
+        NEXT();
+    }
+
+    LABEL(kMaskPush)
+        condPtr[1] = POP().fSigned;
+        maskPtr[1] = maskPtr[0] & condPtr[1];
+        ++condPtr; ++maskPtr;
+        NEXT();
+    LABEL(kMaskPop)
+        --condPtr; --maskPtr;
+        NEXT();
+    LABEL(kMaskNegate)
+        maskPtr[0] = maskPtr[-1] & ~condPtr[0];
+        NEXT();
+    LABEL(kMaskBlend) {
+        int count = READ8();
+        I32 m = condPtr[0];
+        --condPtr; --maskPtr;
+        for (int i = 0; i < count; ++i) {
+            sp[-count] = skvx::if_then_else(m, sp[-count].fFloat, sp[0].fFloat);
+            --sp;
+        }
+        NEXT();
+    }
+    LABEL(kBranchIfAllFalse) {
+        int target = READ16();
+        if (!skvx::any(mask())) {
+            ip = code + target;
+        }
+        NEXT();
+    }
+
+    LABEL(kLoopBegin)
+        contPtr[1] = 0;
+        loopPtr[1] = loopPtr[0];
+        ++contPtr; ++loopPtr;
+        NEXT();
+    LABEL(kLoopNext)
+        *loopPtr |= *contPtr;
+        *contPtr = 0;
+        NEXT();
+    LABEL(kLoopMask)
+        *loopPtr &= POP().fSigned;
+        NEXT();
+    LABEL(kLoopEnd)
+        --contPtr; --loopPtr;
+        NEXT();
+    LABEL(kLoopBreak)
+        *loopPtr &= ~mask();
+        NEXT();
+    LABEL(kLoopContinue) {
+        I32 m = mask();
+        *contPtr |=  m;
+        *loopPtr &= ~m;
+        NEXT();
+    }
+#ifdef SKSLC_THREADED_CODE
+    #ifdef TRACE
+        next:
+            printf("at %3d (stack: %d) (disable threaded code for disassembly)\n",
+                   (int) (ip - code), (int) (sp - stack) + 1);
+            goto *READ_INST();
+    #endif
+#else
+        }
+    }
+#endif
+}
+
+}; // class Interpreter
+
+#endif // SK_ENABLE_SKSL_INTERPRETER
+
+#undef spf
+
+void ByteCodeFunction::disassemble() const {  // prints one line per instruction via printf
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+    const uint8_t* ip = fCode.data();
+    while (ip < fCode.data() + fCode.size()) {
+        printf("%d: ", (int)(ip - fCode.data()));  // byte offset of this instruction in fCode
+        ip = Interpreter::DisassembleInstruction(ip);  // result is where the loop resumes
+        printf("\n");
+    }
+#endif  // without SK_ENABLE_SKSL_INTERPRETER this function does nothing
+}
+
+#define VECTOR_PREPROCESS(base)          \
+    case ByteCodeInstruction::base ## 4: \
+    case ByteCodeInstruction::base ## 3: \
+    case ByteCodeInstruction::base ## 2: \
+    case ByteCodeInstruction::base: READ8(); break;  // all four variants skip one operand byte
+
+#define VECTOR_PREPROCESS_NO_COUNT(base) \
+    case ByteCodeInstruction::base ## 4: \
+    case ByteCodeInstruction::base ## 3: \
+    case ByteCodeInstruction::base ## 2: \
+    case ByteCodeInstruction::base: break;  // all four variants have no operand bytes to skip
+
+#define VECTOR_MATRIX_PREPROCESS(base) \
+    VECTOR_PREPROCESS(base)            \
+    case ByteCodeInstruction::base ## N: READ8(); break;  // N variant also skips one byte
+
+#define VECTOR_MATRIX_PREPROCESS_NO_COUNT(base) \
+    VECTOR_PREPROCESS_NO_COUNT(base)            \
+    case ByteCodeInstruction::base ## N: READ8(); break;  // only the N variant skips a byte
+
+void ByteCodeFunction::preprocess(const void* labels[]) {  // rewrites fCode's opcodes in place
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+#ifdef TRACE
+    this->disassemble();
+#endif
+    uint8_t* ip = fCode.data();
+    while (ip < fCode.data() + fCode.size()) {
+        ByteCodeInstruction inst = (ByteCodeInstruction) (intptr_t) READ_INST();
+        const void* label = labels[(int) inst];  // labels is indexed by opcode value
+        memcpy(ip - sizeof(instruction), &label, sizeof(label));  // opcode -> label, in place
+        switch (inst) {  // skip this instruction's operands so ip lands on the next opcode
+            VECTOR_MATRIX_PREPROCESS(kAddF)
+            VECTOR_PREPROCESS(kAddI)
+            case ByteCodeInstruction::kAndB: break;
+            case ByteCodeInstruction::kBranch: READ16(); break;
+            case ByteCodeInstruction::kCall: READ8(); break;
+            case ByteCodeInstruction::kCallExternal: {
+                READ8();
+                READ8();
+                READ8();
+                break;
+            }
+            case ByteCodeInstruction::kClampIndex: READ8(); break;
+            VECTOR_PREPROCESS(kCompareIEQ)
+            VECTOR_PREPROCESS(kCompareINEQ)
+            VECTOR_MATRIX_PREPROCESS(kCompareFEQ)
+            VECTOR_MATRIX_PREPROCESS(kCompareFNEQ)
+            VECTOR_PREPROCESS(kCompareFGT)
+            VECTOR_PREPROCESS(kCompareFGTEQ)
+            VECTOR_PREPROCESS(kCompareFLT)
+            VECTOR_PREPROCESS(kCompareFLTEQ)
+            VECTOR_PREPROCESS(kCompareSGT)
+            VECTOR_PREPROCESS(kCompareSGTEQ)
+            VECTOR_PREPROCESS(kCompareSLT)
+            VECTOR_PREPROCESS(kCompareSLTEQ)
+            VECTOR_PREPROCESS(kCompareUGT)
+            VECTOR_PREPROCESS(kCompareUGTEQ)
+            VECTOR_PREPROCESS(kCompareULT)
+            VECTOR_PREPROCESS(kCompareULTEQ)
+            VECTOR_PREPROCESS_NO_COUNT(kConvertFtoI)
+            VECTOR_PREPROCESS_NO_COUNT(kConvertStoF)
+            VECTOR_PREPROCESS_NO_COUNT(kConvertUtoF)
+            VECTOR_PREPROCESS(kCos)
+            VECTOR_MATRIX_PREPROCESS(kDivideF)
+            VECTOR_PREPROCESS(kDivideS)
+            VECTOR_PREPROCESS(kDivideU)
+            VECTOR_MATRIX_PREPROCESS(kDup)
+
+            case ByteCodeInstruction::kInverse2x2:
+            case ByteCodeInstruction::kInverse3x3:
+            case ByteCodeInstruction::kInverse4x4: break;
+
+            case ByteCodeInstruction::kLoad:
+            case ByteCodeInstruction::kLoad2:
+            case ByteCodeInstruction::kLoad3:
+            case ByteCodeInstruction::kLoad4:
+            case ByteCodeInstruction::kLoadGlobal:
+            case ByteCodeInstruction::kLoadGlobal2:
+            case ByteCodeInstruction::kLoadGlobal3:
+            case ByteCodeInstruction::kLoadGlobal4:
+            case ByteCodeInstruction::kLoadUniform:
+            case ByteCodeInstruction::kLoadUniform2:
+            case ByteCodeInstruction::kLoadUniform3:
+            case ByteCodeInstruction::kLoadUniform4: READ16(); break;
+
+            case ByteCodeInstruction::kLoadSwizzle:
+            case ByteCodeInstruction::kLoadSwizzleGlobal:
+            case ByteCodeInstruction::kLoadSwizzleUniform: {
+                READ8();
+                int count = READ8();
+                ip += count;
+                break;
+            }
+
+            case ByteCodeInstruction::kLoadExtended:
+            case ByteCodeInstruction::kLoadExtendedGlobal:
+            case ByteCodeInstruction::kLoadExtendedUniform:
+                READ8();
+                break;
+
+            case ByteCodeInstruction::kMatrixToMatrix: {
+                READ8();
+                READ8();
+                READ8();
+                READ8();
+                break;
+            }
+            case ByteCodeInstruction::kMatrixMultiply: {
+                READ8();
+                READ8();
+                READ8();
+                break;
+            }
+            VECTOR_MATRIX_PREPROCESS(kMultiplyF)
+            VECTOR_PREPROCESS(kMultiplyI)
+            VECTOR_MATRIX_PREPROCESS_NO_COUNT(kNegateF)
+            VECTOR_PREPROCESS_NO_COUNT(kNegateI)
+            case ByteCodeInstruction::kNotB: break;
+            case ByteCodeInstruction::kOrB: break;
+            VECTOR_MATRIX_PREPROCESS_NO_COUNT(kPop)
+            case ByteCodeInstruction::kPushImmediate: READ32(); break;
+
+            case ByteCodeInstruction::kReadExternal:
+            case ByteCodeInstruction::kReadExternal2:
+            case ByteCodeInstruction::kReadExternal3:
+            case ByteCodeInstruction::kReadExternal4: READ16(); break;
+
+            VECTOR_PREPROCESS(kRemainderF)
+            VECTOR_PREPROCESS(kRemainderS)
+            VECTOR_PREPROCESS(kRemainderU)
+            case ByteCodeInstruction::kReserve: READ8(); break;
+            case ByteCodeInstruction::kReturn: READ8(); break;
+            case ByteCodeInstruction::kScalarToMatrix: READ8(); READ8(); break;
+            case ByteCodeInstruction::kShiftLeft: READ8(); break;
+            case ByteCodeInstruction::kShiftRightS: READ8(); break;
+            case ByteCodeInstruction::kShiftRightU: READ8(); break;
+            VECTOR_PREPROCESS(kSin)
+            VECTOR_PREPROCESS_NO_COUNT(kSqrt)
+
+            case ByteCodeInstruction::kStore:
+            case ByteCodeInstruction::kStore2:
+            case ByteCodeInstruction::kStore3:
+            case ByteCodeInstruction::kStore4:
+            case ByteCodeInstruction::kStoreGlobal:
+            case ByteCodeInstruction::kStoreGlobal2:
+            case ByteCodeInstruction::kStoreGlobal3:
+            case ByteCodeInstruction::kStoreGlobal4: READ8(); break;
+
+            case ByteCodeInstruction::kStoreSwizzle:
+            case ByteCodeInstruction::kStoreSwizzleGlobal: {
+                READ8();
+                int count = READ8();
+                ip += count;
+                break;
+            }
+
+            case ByteCodeInstruction::kStoreSwizzleIndirect:
+            case ByteCodeInstruction::kStoreSwizzleIndirectGlobal: {
+                int count = READ8();
+                ip += count;
+                break;
+            }
+
+            case ByteCodeInstruction::kStoreExtended: READ8(); break;
+            case ByteCodeInstruction::kStoreExtendedGlobal: READ8(); break;
+
+            VECTOR_MATRIX_PREPROCESS(kSubtractF)
+            VECTOR_PREPROCESS(kSubtractI)
+
+            case ByteCodeInstruction::kSwizzle: {
+                READ8();
+                int count = READ8();
+                ip += count;
+                break;
+            }
+            VECTOR_PREPROCESS(kTan)
+            case ByteCodeInstruction::kWriteExternal:
+            case ByteCodeInstruction::kWriteExternal2:
+            case ByteCodeInstruction::kWriteExternal3:
+            case ByteCodeInstruction::kWriteExternal4: READ16(); break;
+
+            case ByteCodeInstruction::kXorB: break;
+            case ByteCodeInstruction::kMaskPush: break;
+            case ByteCodeInstruction::kMaskPop: break;
+            case ByteCodeInstruction::kMaskNegate: break;
+            case ByteCodeInstruction::kMaskBlend: READ8(); break;
+            case ByteCodeInstruction::kBranchIfAllFalse: READ16(); break;
+            case ByteCodeInstruction::kLoopBegin: break;
+            case ByteCodeInstruction::kLoopNext: break;
+            case ByteCodeInstruction::kLoopMask: break;
+            case ByteCodeInstruction::kLoopEnd: break;
+            case ByteCodeInstruction::kLoopContinue:  break;
+            case ByteCodeInstruction::kLoopBreak: break;
+            default:
+                // The opcode was overwritten by its label above, so report the saved value.
+                printf("unknown(%d)\n", (int) inst);
+                SkASSERT(false);
+        }
+    }
+#endif
+}
+
+bool ByteCode::run(const ByteCodeFunction* f,
+                   float* args, int argCount,
+                   float* outReturn, int returnCount,
+                   const float* uniforms, int uniformCount) const {
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+    Interpreter::VValue stack[128];  // fixed capacity; functions needing more are rejected below
+    int stackNeeded = f->fParameterCount + f->fLocalCount + f->fStackCount;
+    if (stackNeeded > (int)SK_ARRAY_COUNT(stack)) {
+        return false;
+    }
+
+    if (argCount != f->fParameterCount ||
+        returnCount != f->fReturnCount ||
+        uniformCount != fUniformSlotCount) {  // caller-supplied counts must match exactly
+        return false;
+    }
+
+    Interpreter::VValue globals[32];  // fixed capacity, checked against fGlobalSlotCount
+    if (fGlobalSlotCount > (int)SK_ARRAY_COUNT(globals)) {
+        return false;
+    }
+
+    // Transpose args into stack: consecutive args land VecWidth floats apart
+    {
+        float* src = args;
+        float* dst = (float*)stack;
+        for (int i = 0; i < argCount; ++i) {
+            *dst = *src++;
+            dst += VecWidth;
+        }
+    }
+
+    bool stripedOutput = false;
+    float** outArray = outReturn ? &outReturn : nullptr;  // null when the caller passed no buffer
+    if (!Interpreter::InnerRun(this, f, stack, outArray, globals, uniforms, stripedOutput, 1, 0)) {
+        return false;
+    }
+
+    // Transpose out parameters back into args; slots of non-out parameters are skipped over
+    {
+        float* dst = args;
+        float* src = (float*)stack;
+        for (const auto& p : f->fParameters) {
+            if (p.fIsOutParameter) {
+                for (int i = p.fSlotCount; i > 0; --i) {
+                    *dst++ = *src;
+                    src += VecWidth;
+                }
+            } else {
+                dst += p.fSlotCount;
+                src += p.fSlotCount * VecWidth;
+            }
+        }
+    }
+
+    return true;
+#else
+    SkDEBUGFAIL("ByteCode interpreter not enabled");
+    return false;
+#endif
+}
+
+bool ByteCode::runStriped(const ByteCodeFunction* f, int N,
+                          float* args[], int argCount,
+                          float* outReturn[], int returnCount,
+                          const float* uniforms, int uniformCount) const {
+#if defined(SK_ENABLE_SKSL_INTERPRETER)
+    Interpreter::VValue stack[128];  // fixed capacity; functions needing more are rejected below
+    int stackNeeded = f->fParameterCount + f->fLocalCount + f->fStackCount;
+    if (stackNeeded > (int)SK_ARRAY_COUNT(stack)) {
+        return false;
+    }
+
+    if (argCount != f->fParameterCount ||
+        returnCount != f->fReturnCount ||
+        uniformCount != fUniformSlotCount) {  // caller-supplied counts must match exactly
+        return false;
+    }
+
+    Interpreter::VValue globals[32];  // fixed capacity, checked against fGlobalSlotCount
+    if (fGlobalSlotCount > (int)SK_ARRAY_COUNT(globals)) {
+        return false;
+    }
+
+    // InnerRun is handed outReturn as-is, so pass null when there are no return slots
+    if (returnCount == 0) {
+        outReturn = nullptr;
+    }
+
+    int baseIndex = 0;  // number of invocations completed by earlier batches
+
+    while (N) {  // consume the N invocations in batches of at most VecWidth
+        int w = std::min(N, VecWidth);  // size of this batch
+
+        // Copy args into stack: the next w values of argument i go to stack slot i
+        for (int i = 0; i < argCount; ++i) {
+            memcpy((void*)(stack + i), args[i], w * sizeof(float));
+        }
+
+        bool stripedOutput = true;
+        if (!Interpreter::InnerRun(this, f, stack, outReturn, globals, uniforms, stripedOutput, w,
+                                   baseIndex)) {
+            return false;
+        }
+
+        // Copy out parameters back, w values per slot
+        int slot = 0;  // first slot of the parameter currently being visited
+        for (const auto& p : f->fParameters) {
+            if (p.fIsOutParameter) {
+                for (int i = slot; i < slot + p.fSlotCount; ++i) {
+                    memcpy(args[i], stack + i, w * sizeof(float));
+                }
+            }
+            slot += p.fSlotCount;
+        }
+
+        // Step each argument pointer ahead (note: this modifies the caller's args[] entries)
+        for (int i = 0; i < argCount; ++i) {
+            args[i] += w;
+        }
+        N -= w;
+        baseIndex += w;
+    }
+
+    return true;
+#else
+    SkDEBUGFAIL("ByteCode interpreter not enabled");
+    return false;
+#endif
+}
+
+} // namespace SkSL
+
+#endif
diff --git a/src/sksl/SkSLByteCode.h b/src/sksl/SkSLByteCode.h
index d5a38e2..f917eec 100644
--- a/src/sksl/SkSLByteCode.h
+++ b/src/sksl/SkSLByteCode.h
@@ -9,59 +9,206 @@
 #define SKSL_BYTECODE
 
 #include "include/private/SkOnce.h"
-#include "include/private/SkVx.h"
 #include "src/sksl/SkSLString.h"
-#include "src/sksl/ir/SkSLFunctionDeclaration.h"
 
 #include <memory>
 #include <vector>
 
 namespace SkSL {
 
-class ByteCode;
-class ExternalValue;
+class  ExternalValue;
+struct FunctionDeclaration;
+
+// GCC and Clang support the "labels as values" extension which we need to implement the interpreter
+// using threaded code. Otherwise, we fall back to using a switch statement in a for loop.
+#if defined(__GNUC__) || defined(__clang__)
+    #define SKSLC_THREADED_CODE
+    using instruction = void*;  // pointer-sized, so an opcode can be replaced by a label
+#else
+    using instruction = uint16_t;  // same width as ByteCodeInstruction's underlying type
+#endif
+
+#define VECTOR(name) name ## 4, name ## 3, name ## 2, name  // name4, name3, name2, name
+#define VECTOR_MATRIX(name) name ## 4, name ## 3, name ## 2, name, name ## N  // ... and nameN
+
+enum class ByteCodeInstruction : uint16_t {
+    // B = bool, F = float, I = int, S = signed, U = unsigned
+    // All binary VECTOR instructions (kAddF, kSubtractI, kCompareIEQ, etc.) are followed by a byte
+    // indicating the count, even though it is redundant due to the count appearing in the opcode.
+    // This is because the original opcodes are lost after we preprocess it into threaded code, and
+    // we need to still be able to access the count so as to permit the implementation to use opcode
+    // fallthrough.
+    VECTOR_MATRIX(kAddF),
+    VECTOR(kAddI),
+    kAndB,
+    kBranch,  // Followed by a 16-bit operand (presumably the target address)
+    // Followed by a byte indicating the index of the function to call
+    kCall,
+    // Followed by three bytes indicating: the number of argument slots, the number of return slots,
+    // and the index of the external value to call
+    kCallExternal,
+    // For dynamic array access: Followed by byte indicating length of array
+    kClampIndex,
+    VECTOR(kCompareIEQ),
+    VECTOR(kCompareINEQ),
+    VECTOR_MATRIX(kCompareFEQ),
+    VECTOR_MATRIX(kCompareFNEQ),
+    VECTOR(kCompareFGT),
+    VECTOR(kCompareFGTEQ),
+    VECTOR(kCompareFLT),
+    VECTOR(kCompareFLTEQ),
+    VECTOR(kCompareSGT),
+    VECTOR(kCompareSGTEQ),
+    VECTOR(kCompareSLT),
+    VECTOR(kCompareSLTEQ),
+    VECTOR(kCompareUGT),
+    VECTOR(kCompareUGTEQ),
+    VECTOR(kCompareULT),
+    VECTOR(kCompareULTEQ),
+    VECTOR(kConvertFtoI),  // not followed by a count byte
+    VECTOR(kConvertStoF),  // not followed by a count byte
+    VECTOR(kConvertUtoF),  // not followed by a count byte
+    // Followed by a (redundant) byte indicating the count
+    VECTOR(kCos),
+    VECTOR_MATRIX(kDivideF),
+    VECTOR(kDivideS),
+    VECTOR(kDivideU),
+    // Duplicates the top stack value. Followed by a (redundant) byte indicating the count.
+    VECTOR_MATRIX(kDup),
+    kInverse2x2,
+    kInverse3x3,
+    kInverse4x4,
+    // kLoad/kLoadGlobal are followed by a byte indicating the count, and a byte indicating the
+    // local/global slot to load
+    VECTOR(kLoad),
+    VECTOR(kLoadGlobal),
+    VECTOR(kLoadUniform),
+    // Followed by a slot byte, then a count byte (1-4), then one byte per swizzle component (0-3).
+    kLoadSwizzle,
+    kLoadSwizzleGlobal,
+    kLoadSwizzleUniform,
+    // kLoadExtended* are fallback load ops when we lack a specialization. They are followed by a
+    // count byte, and get the slot to load from the top of the stack.
+    kLoadExtended,
+    kLoadExtendedGlobal,
+    kLoadExtendedUniform,
+    // Followed by four bytes: srcCols, srcRows, dstCols, dstRows. Consumes the src matrix from the
+    // stack, and replaces it with the dst matrix. Per GLSL rules, there are no restrictions on
+    // dimensions. Any overlapping values are copied, and any other values are filled in with the
+    // identity matrix.
+    kMatrixToMatrix,
+    // Followed by three bytes: leftCols (== rightRows), leftRows, rightCols
+    kMatrixMultiply,
+    VECTOR_MATRIX(kNegateF),  // only the N variant is followed by a byte
+    VECTOR(kNegateI),  // not followed by a count byte
+    VECTOR_MATRIX(kMultiplyF),
+    VECTOR(kMultiplyI),
+    kNotB,
+    kOrB,
+    VECTOR_MATRIX(kPop),  // only the N variant is followed by a byte
+    // Followed by a 32 bit value containing the value to push
+    kPushImmediate,
+    // Followed by two bytes: presumably a count, then the index of the external value to read
+    VECTOR(kReadExternal),
+    VECTOR(kRemainderF),
+    VECTOR(kRemainderS),
+    VECTOR(kRemainderU),
+    // Followed by a byte indicating the number of slots to reserve on the stack (for later return)
+    kReserve,
+    // Followed by a byte indicating the number of slots being returned
+    kReturn,
+    // Followed by two bytes indicating columns and rows of matrix (2, 3, or 4 each).
+    // Takes a single value from the top of the stack, and converts to a CxR matrix with that value
+    // replicated along the diagonal (and zero elsewhere), per the GLSL matrix construction rules.
+    kScalarToMatrix,
+    // Followed by a byte indicating the number of bits to shift
+    kShiftLeft,
+    kShiftRightS,
+    kShiftRightU,
+    // Followed by a (redundant) byte indicating the count
+    VECTOR(kSin),
+    VECTOR(kSqrt),  // unlike kSin, not followed by a count byte
+    // kStore/kStoreGlobal are followed by a byte indicating the local/global slot to store
+    VECTOR(kStore),
+    VECTOR(kStoreGlobal),
+    // Fallback stores. Followed by count byte, and get the slot to store from the top of the stack
+    kStoreExtended,
+    kStoreExtendedGlobal,
+    // As kStore/kStoreGlobal, then a count byte (1-4), then one byte per swizzle component (0-3).
+    // Expects the stack to look like: ... v1 v2 v3 v4, where the number of 'v's is equal to the
+    // number of swizzle components. After the store, all v's are popped from the stack.
+    kStoreSwizzle,
+    kStoreSwizzleGlobal,
+    // As above, but gets the store slot from the top of the stack (before values to be stored)
+    kStoreSwizzleIndirect,
+    kStoreSwizzleIndirectGlobal,
+    // Followed by two count bytes (1-4), and then one byte per swizzle component (0-3). The first
+    // count byte provides the current vector size (the vector is the top n stack elements), and the
+    // second count byte provides the swizzle component count.
+    kSwizzle,
+    VECTOR_MATRIX(kSubtractF),
+    VECTOR(kSubtractI),
+    // Followed by a (redundant) byte indicating the count
+    VECTOR(kTan),
+    // Followed by a count byte, then a byte indicating the external value to write
+    VECTOR(kWriteExternal),
+    kXorB,
+
+    kMaskPush,
+    kMaskPop,
+    kMaskNegate,
+    // Followed by count byte
+    kMaskBlend,
+    // Followed by a 16-bit address (an offset from the start of the code)
+    kBranchIfAllFalse,
+
+    kLoopBegin,
+    kLoopNext,
+    kLoopMask,
+    kLoopEnd,
+    kLoopBreak,
+    kLoopContinue,
+};
+#undef VECTOR
 
 class ByteCodeFunction {
 public:
-    // all counts are of 32-bit values, so a float4 counts as 4 parameter or return slots
+    int getParameterCount() const { return fParameterCount; }
+    int getReturnCount() const { return fReturnCount; }
+
+    /**
+     * Print bytecode disassembly to stdout.
+     */
+    void disassemble() const;
+
+private:
+    ByteCodeFunction(const FunctionDeclaration* declaration);
+
+    friend class ByteCode;
+    friend class ByteCodeGenerator;
+    friend struct Interpreter;
+
     struct Parameter {
         int fSlotCount;
         bool fIsOutParameter;
     };
 
-    /**
-     * Note that this is the actual number of parameters, not the number of parameter slots.
-     */
-    int getParameterCount() const { return fParameters.size(); }
-
-    Parameter getParameter(int idx) const { return fParameters[idx]; }
-
-    int getParameterSlotCount() const { return fParameterSlotCount; }
-
-    int getReturnSlotCount() const { return fReturnSlotCount; }
-
-    void disassemble() const { }
-
-private:
-    ByteCodeFunction(const FunctionDeclaration* declaration)
-        : fName(declaration->fName) {}
-
-    String fName;
-
+    SkSL::String fName;
     std::vector<Parameter> fParameters;
+    int fParameterCount;  // ByteCode::run() requires argCount to equal this
+    int fReturnCount = 0;  // ByteCode::run() requires returnCount to equal this
 
-    int fParameterSlotCount;
-
-    int fReturnSlotCount;
-
-    int fStackSlotCount;
-
+    int fLocalCount = 0;  // added to fParameterCount and fStackCount to size the stack
+    int fStackCount = 0;  // see fLocalCount
+    int fConditionCount = 0;
+    int fLoopCount = 0;
+    mutable SkOnce fPreprocessOnce;  // presumably guards a one-time preprocess() call
     std::vector<uint8_t> fCode;
 
-    friend class ByteCode;
-    friend class ByteCodeGenerator;
-    template<int width>
-    friend class Interpreter;
+    /**
+     * Replace each opcode with the corresponding entry from the labels array.
+     */
+    void preprocess(const void* labels[]);
 };
 
 enum class TypeCategory {
@@ -73,272 +220,9 @@
 
 class SK_API ByteCode {
 public:
+    static constexpr int kVecWidth = 8;
+
     ByteCode() = default;
-    ByteCode(ByteCode&&) = default;
-    ByteCode& operator =(ByteCode&&) = default;
-
-    template<int width>
-    union Vector {
-        skvx::Vec<width, float> fFloat;
-        skvx::Vec<width, int32_t> fInt;
-        skvx::Vec<width, uint32_t> fUInt;
-
-        Vector() = default;
-
-        Vector(skvx::Vec<width, float> f)
-            : fFloat(f) {}
-
-        Vector(skvx::Vec<width, int32_t> i)
-            : fInt(i) {}
-
-        Vector(skvx::Vec<width, uint32_t> u)
-            : fUInt(u) {}
-    };
-
-// All V(I) instructions have a second (vector) instruction, that is encoded with a uint8_t count
-// immediately following the instruction (and before any other arguments).
-#define V(Inst) Inst, Inst ## N
-
-    enum class Instruction : uint8_t {
-        // no parameters
-        kNop,
-        // no parameters
-        kAbort,
-        // Register target, Register src1, Register src2
-        V(kAddF),
-        // Register target, Register src1, Register src2
-        V(kAddI),
-        // Register target, Register src1, Register src2
-        kAnd,
-        // Register index, int arrayLength
-        kBoundsCheck,
-        // Pointer target
-        kBranch,
-        // Pointer target
-        kBranchIfAllFalse,
-        // no parameters
-        kBreak,
-        // Register target, uint8_t functionIndex, Register parameters
-        kCall,
-        // Register target, uint8_t externalValueIndex, uint8_t targetSize, Register arguments,
-        // uint8_t argumentSize
-        kCallExternal,
-        // Register target, Register src1, Register src2
-        kCompareEQF,
-        // Register target, Register src1, Register src2
-        kCompareEQI,
-        // Register target, Register src1, Register src2
-        kCompareNEQF,
-        // Register target, Register src1, Register src2
-        kCompareNEQI,
-        // Register target, Register src1, Register src2
-        kCompareGTF,
-        // Register target, Register src1, Register src2
-        kCompareGTS,
-        // Register target, Register src1, Register src2
-        kCompareGTU,
-        // Register target, Register src1, Register src2
-        kCompareGTEQF,
-        // Register target, Register src1, Register src2
-        kCompareGTEQS,
-        // Register target, Register src1, Register src2
-        kCompareGTEQU,
-        // Register target, Register src1, Register src2
-        kCompareLTF,
-        // Register target, Register src1, Register src2
-        kCompareLTS,
-        // Register target, Register src1, Register src2
-        kCompareLTU,
-        // Register target, Register src1, Register src2
-        kCompareLTEQF,
-        // Register target, Register src1, Register src2
-        kCompareLTEQS,
-        // Register target, Register src1, Register src2
-        kCompareLTEQU,
-        // no parameters
-        kContinue,
-        // Register target, Register src
-        kCopy,
-        // Register target, Register src,
-        kCos,
-        // Register target, Register src1, Register src2
-        V(kDivideF),
-        // Register target, Register src1, Register src2
-        V(kDivideS),
-        // Register target, Register src1, Register src2
-        V(kDivideU),
-        // Register target, Register src
-        kFloatToSigned,
-        // Register target, Register src
-        kFloatToUnsigned,
-        // Load a constant into a register
-        // Register target, Immediate value
-        kImmediate,
-        // Register target, Register src
-        kInverse2x2,
-        // Register target, Register src
-        kInverse3x3,
-        // Register target, Register src
-        kInverse4x4,
-        // Load the memory cell pointed to by srcPtr into a register
-        // Register target, Register srcPtr
-        V(kLoad),
-        // Load the memory cell pointed to by src into a register
-        // Register target, Pointer src
-        V(kLoadDirect),
-        // Load the parameter slot pointed to by srcPtr into a register
-        // Register target, Register srcPtr
-        V(kLoadParameter),
-        // Load the parameter slot pointed to by src into a register
-        // Register target, Pointer src
-        V(kLoadParameterDirect),
-        // Load the stack cell pointed to by srcPtr + sp into a register
-        // Register target, Register srcPtr
-        V(kLoadStack),
-        // Load the stack cell pointed to by src + sp into a register
-        // Register target, Pointer src
-        V(kLoadStackDirect),
-        // Pushes a new loop onto the loop and continue stacks
-        // no parameters
-        kLoopBegin,
-        // Pops the loop and continue stacks
-        // no parameters
-        kLoopEnd,
-        // Register mask
-        kLoopMask,
-        // no parameters
-        kLoopNext,
-        // no parameters
-        kMaskNegate,
-        // no parameters
-        kMaskPop,
-        // Register mask
-        kMaskPush,
-        // Register target, Register left, Register right, uint8_t leftColsAndRightRows,
-        // uint8_t leftRows, uint8_t rightCols
-        kMatrixMultiply,
-        // Register target, Register src, uint8_t srcColumns, uint8_t srcRows, uint8_t dstColumns,
-        // uint8_t dstRows
-        kMatrixToMatrix,
-        // Register target, Register src1, Register src2
-        V(kMultiplyF),
-        // Register target, Register src1, Register src2
-        V(kMultiplyI),
-        // Register target, Register src
-        kNegateF,
-        // Register target, Register src
-        kNegateS,
-        // Register target, Register src
-        kNot,
-        // Register target, Register src1, Register src2
-        kOr,
-        // Register src
-        kPrint,
-        // Register target, uint8_t count, uint8_t index
-        kReadExternal,
-        // Register target, Register src1, Register src2
-        V(kRemainderF),
-        // Register target, Register src1, Register src2
-        V(kRemainderS),
-        // Register target, Register src1, Register src2
-        V(kRemainderU),
-        // no parameters
-        kReturn,
-        // Register value
-        kReturnValue,
-        // Register target, Register src, uint8_t columns, uint8_t rows
-        kScalarToMatrix,
-        // Register target, Register test, Register ifTrue, Register ifFalse
-        kSelect,
-        // Register target, Register src, uint8_t count
-        kShiftLeft,
-        // Register target, Register src, uint8_t count
-        kShiftRightS,
-        // Register target, Register src, uint8_t count
-        kShiftRightU,
-        // Register target, Register src
-        kSignedToFloat,
-        // Register target, Register src,
-        kSin,
-        // Duplicates the src to <count> targets
-        // uint8_t count, Register target, Register src
-        kSplat,
-        // Register target, Register src,
-        kSqrt,
-        // Store to the memory cell pointed to by dstPtr
-        // Register dstPtr, Register src
-        V(kStore),
-        // Store to the memory cell pointed to by dst
-        // Pointer dst, Register src
-        V(kStoreDirect),
-        // Store to the parameter slot pointed to by dstPtr
-        // Register dstPtr, Register src
-        V(kStoreParameter),
-        // Store to the parameter slot pointed to by dst
-        // Pointer dst, Register src
-        V(kStoreParameterDirect),
-        // Stores a register into the stack cell pointed to by dst + sp
-        // Register dst, Register src
-        V(kStoreStack),
-        // Stores a register into the stack cell pointed to by dstPtr + sp
-        // Pointer dst, Register src
-        V(kStoreStackDirect),
-        // Register target, Register src1, Register src2
-        V(kSubtractF),
-        // Register target, Register src1, Register src2
-        V(kSubtractI),
-        // Register target, Register src,
-        kTan,
-        // Register target, Register src,
-        kUnsignedToFloat,
-        // uint8_t index, uint8_t count, Register src
-        kWriteExternal,
-        // Register target, Register src1, Register src2
-        kXor,
-    };
-
-#undef V
-
-    // Compound values like vectors span multiple Registers or Pointer addresses. We always refer to
-    // them by the address of their first slot, so for instance if you add two float4's together,
-    // the resulting Register contains the first channel of the result, with the other three
-    // channels following in the next three Registers.
-
-    struct Register {
-        uint16_t fIndex;
-
-        Register operator+(uint16_t offset) const {
-            return Register{(uint16_t) (fIndex + offset)};
-        }
-    };
-
-    struct Pointer {
-        uint16_t fAddress;
-
-        Pointer operator+(uint16_t offset) const {
-            return Pointer{(uint16_t) (fAddress + offset)};
-        }
-    };
-
-    union Immediate {
-        float fFloat;
-        int32_t fInt;
-        uint32_t fUInt;
-
-        Immediate() {}
-
-        Immediate(float f)
-            : fFloat(f) {}
-
-        Immediate(int32_t i)
-            : fInt(i) {}
-
-        Immediate(uint32_t u)
-            : fUInt(u) {}
-    };
-
-    static constexpr int kPointerMax = 65535;
-    static constexpr int kRegisterMax = 65535;
 
     const ByteCodeFunction* getFunction(const char* name) const {
         for (const auto& f : fFunctions) {
@@ -349,9 +233,36 @@
         return nullptr;
     }
 
-    int getGlobalSlotCount() const {
-        return fGlobalSlotCount;
-    }
+    /**
+     * Invokes the specified function once, with the given arguments.
+     * 'args', 'outReturn', and 'uniforms' are collections of 32-bit values (typically floats,
+     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
+     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
+     * The return value is stored in 'outReturn' (may be null, to discard the return value).
+     * 'uniforms' are mapped to 'uniform' globals, in order.
+     */
+    bool SKSL_WARN_UNUSED_RESULT run(const ByteCodeFunction*,
+                                     float* args, int argCount,
+                                     float* outReturn, int returnCount,
+                                     const float* uniforms, int uniformCount) const;
+
+    /**
+     * Invokes the specified function with the given arguments, 'N' times. 'args' and 'outReturn'
+     * are accepted and returned in structure-of-arrays form:
+     *   args[0] points to an array of N values, the first argument for each invocation
+     *   ...
+     *   args[argCount - 1] points to an array of N values, the last argument for each invocation
+     *
+     * All values in 'args', 'outReturn', and 'uniforms' are 32-bit values (typically floats,
+     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
+     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
+     * The return value is stored in 'outReturn' (may be null, to discard the return value).
+     * 'uniforms' are mapped to 'uniform' globals, in order.
+     */
+    bool SKSL_WARN_UNUSED_RESULT runStriped(const ByteCodeFunction*, int N,
+                                            float* args[], int argCount,
+                                            float* outReturn[], int returnCount,
+                                            const float* uniforms, int uniformCount) const;
 
     struct Uniform {
         SkSL::String fName;
@@ -377,19 +288,17 @@
     ByteCode(const ByteCode&) = delete;
     ByteCode& operator=(const ByteCode&) = delete;
 
-    std::vector<std::unique_ptr<ByteCodeFunction>> fFunctions;
-    std::vector<ExternalValue*> fExternalValues;
+    friend class ByteCodeGenerator;
+    friend struct Interpreter;
 
-    int fGlobalSlotCount;
-
+    int fGlobalSlotCount = 0;
     int fUniformSlotCount = 0;
     std::vector<Uniform> fUniforms;
 
-    friend class ByteCodeGenerator;
-    template<int width>
-    friend class Interpreter;
+    std::vector<std::unique_ptr<ByteCodeFunction>> fFunctions;
+    std::vector<ExternalValue*> fExternalValues;
 };
 
-} // namespace
+}
 
 #endif
diff --git a/src/sksl/SkSLByteCodeGenerator.cpp b/src/sksl/SkSLByteCodeGenerator.cpp
index 3a65d8c..d2e960a 100644
--- a/src/sksl/SkSLByteCodeGenerator.cpp
+++ b/src/sksl/SkSLByteCodeGenerator.cpp
@@ -7,24 +7,50 @@
 
 #include "src/sksl/SkSLByteCodeGenerator.h"
 
+#include <algorithm>
+
 namespace SkSL {
 
-ByteCodeGenerator::ByteCodeGenerator(const Program* program, ErrorReporter* errors,
-                                     ByteCode* output)
+static TypeCategory type_category(const Type& type) {
+    switch (type.kind()) {
+        case Type::Kind::kVector_Kind:
+        case Type::Kind::kMatrix_Kind:
+            return type_category(type.componentType());
+        default:
+            if (type.fName == "bool") {
+                return TypeCategory::kBool;
+            } else if (type.fName == "int" ||
+                       type.fName == "short" ||
+                       type.fName == "$intLiteral") {
+                return TypeCategory::kSigned;
+            } else if (type.fName == "uint" ||
+                       type.fName == "ushort") {
+                return TypeCategory::kUnsigned;
+            } else {
+                SkASSERT(type.fName == "float" ||
+                         type.fName == "half" ||
+                         type.fName == "$floatLiteral");
+                return TypeCategory::kFloat;
+            }
+            ABORT("unsupported type: %s\n", type.displayName().c_str());
+    }
+}
+
+
+ByteCodeGenerator::ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
+                  ByteCode* output)
     : INHERITED(program, errors, nullptr)
+    , fContext(*context)
     , fOutput(output)
     , fIntrinsics {
-        // "Normal" intrinsics are all $genType f($genType), mapped to a single instruction
-        { "cos",     ByteCode::Instruction::kCos },
-        { "sin",     ByteCode::Instruction::kSin },
-        { "sqrt",    ByteCode::Instruction::kSqrt },
-        { "tan",     ByteCode::Instruction::kTan },
-
-        // Special intrinsics have other signatures, or non-standard code-gen
+        { "cos",     ByteCodeInstruction::kCos },
         { "dot",     SpecialIntrinsic::kDot },
-        { "inverse", SpecialIntrinsic::kInverse },
-        { "print",   SpecialIntrinsic::kPrint },
-    } {}
+        { "inverse", ByteCodeInstruction::kInverse2x2 },
+        { "sin",     ByteCodeInstruction::kSin },
+        { "sqrt",    ByteCodeInstruction::kSqrt },
+        { "tan",     ByteCodeInstruction::kTan },
+      } {}
+
 
 int ByteCodeGenerator::SlotCount(const Type& type) {
     if (type.kind() == Type::kOther_Kind) {
@@ -54,74 +80,89 @@
 static inline bool is_in(const SkSL::Variable& var) {
     return var.fModifiers.fFlags & Modifiers::kIn_Flag;
 }
-ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
-    // given that we seldom have more than a couple of variables, linear search is probably the most
-    // efficient way to handle lookups
-    switch (var.fStorage) {
-        case Variable::kLocal_Storage: {
-            for (int i = fLocals.size() - 1; i >= 0; --i) {
-                if (fLocals[i] == &var) {
-                    return ByteCode::Pointer{(uint16_t) (i + fParameterCount)};
-                }
-            }
-            int result = fLocals.size() + fParameterCount;
-            fLocals.push_back(&var);
-            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
-                fLocals.push_back(nullptr);
-            }
-            SkASSERT(result <= ByteCode::kPointerMax);
-            return ByteCode::Pointer{(uint16_t) result};
+
+void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
+    if (type.kind() == Type::kOther_Kind) {
+        return;
+    } else if (type.kind() == Type::kStruct_Kind) {
+        for (const auto& f : type.fields()) {
+            this->gatherUniforms(*f.fType, name + "." + f.fName);
         }
-        case Variable::kParameter_Storage: {
-            int offset = 0;
-            for (const auto& p : fFunction->fDeclaration.fParameters) {
-                if (p == &var) {
-                    SkASSERT(offset <= ByteCode::kPointerMax);
-                    return ByteCode::Pointer{(uint16_t) offset};
-                }
-                offset += SlotCount(p->fType);
-            }
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
+    } else if (type.kind() == Type::kArray_Kind) {
+        for (int i = 0; i < type.columns(); ++i) {
+            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
         }
-        case Variable::kGlobal_Storage: {
-            if (is_in(var)) {
-                // If you see this error, it means the program is using raw 'in' variables. You
-                // should either specialize the program (Compiler::specialize) to bake in the final
-                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
-                // 'uniform' instead?).
-                fErrors.error(var.fOffset,
-                              "'in' variable is not specialized or has unsupported type");
-                return ByteCode::Pointer{0};
+    } else {
+        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
+                                       fOutput->fUniformSlotCount });
+        fOutput->fUniformSlotCount += type.columns() * type.rows();
+    }
+}
+
+bool ByteCodeGenerator::generateCode() {
+    for (const auto& e : fProgram) {
+        switch (e.fKind) {
+            case ProgramElement::kFunction_Kind: {
+                std::unique_ptr<ByteCodeFunction> f = this->writeFunction((FunctionDefinition&) e);
+                if (!f) {
+                    return false;
+                }
+                fOutput->fFunctions.push_back(std::move(f));
+                fFunctions.push_back(&(FunctionDefinition&)e);
+                break;
             }
-            bool isUniform = is_uniform(var);
-            int offset = isUniform ? fOutput->getGlobalSlotCount() : 0;
-            for (const auto& e : fProgram) {
-                if (e.fKind == ProgramElement::kVar_Kind) {
-                    VarDeclarations& decl = (VarDeclarations&) e;
-                    for (const auto& v : decl.fVars) {
-                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                            continue;
-                        }
-                        if (isUniform != is_uniform(*declVar)) {
-                            continue;
-                        }
-                        if (declVar == &var) {
-                            SkASSERT(offset <= ByteCode::kPointerMax);
-                            return ByteCode::Pointer{(uint16_t) offset};
-                        }
-                        offset += SlotCount(declVar->fType);
+            case ProgramElement::kVar_Kind: {
+                VarDeclarations& decl = (VarDeclarations&) e;
+                for (const auto& v : decl.fVars) {
+                    const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                    if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                        continue;
+                    }
+                    if (is_uniform(*declVar)) {
+                        this->gatherUniforms(declVar->fType, declVar->fName);
+                    } else {
+                        fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
                     }
                 }
+                break;
             }
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
+            default:
+                ; // ignore
         }
-        default:
-            SkASSERT(false);
-            return ByteCode::Pointer{0};
     }
+    return 0 == fErrors.errorCount();
+}
+
+std::unique_ptr<ByteCodeFunction> ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
+    fFunction = &f;
+    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
+    fParameterCount = result->fParameterCount;
+    fLoopCount = fMaxLoopCount = 0;
+    fConditionCount = fMaxConditionCount = 0;
+    fStackCount = fMaxStackCount = 0;
+    fCode = &result->fCode;
+
+    this->writeStatement(*f.fBody);
+    if (0 == fErrors.errorCount()) {
+        SkASSERT(fLoopCount == 0);
+        SkASSERT(fConditionCount == 0);
+        SkASSERT(fStackCount == 0);
+    }
+    this->write(ByteCodeInstruction::kReturn, 0);
+    this->write8(0);
+
+    result->fLocalCount     = fLocals.size();
+    result->fConditionCount = fMaxConditionCount;
+    result->fLoopCount      = fMaxLoopCount;
+    result->fStackCount     = fMaxStackCount;
+
+    const Type& returnType = f.fDeclaration.fReturnType;
+    if (returnType != *fContext.fVoid_Type) {
+        result->fReturnCount = SlotCount(returnType);
+    }
+    fLocals.clear();
+    fFunction = nullptr;
+    return result;
 }
 
 // A "simple" Swizzle is based on a variable (or a compound variable like a struct or array), and
@@ -145,439 +186,520 @@
     return true;
 }
 
+int ByteCodeGenerator::StackUsage(ByteCodeInstruction inst, int count_) {
+    // Ensures that we use count iff we're passed a non-default value. Most instructions have an
+    // implicit count, so the caller shouldn't need to worry about it (or count makes no sense).
+    // The asserts prevent callers from thinking they're supplying useful information in that
+    // scenario, or from failing to supply necessary information for the ops that need a count.
+    struct CountValue {
+        operator int() {
+            SkASSERT(val != ByteCodeGenerator::kUnusedStackCount);
+            SkDEBUGCODE(used = true);
+            return val;
+        }
+        ~CountValue() {
+            SkASSERT(used || val == ByteCodeGenerator::kUnusedStackCount);
+        }
+        int val;
+        SkDEBUGCODE(bool used = false;)
+    } count = { count_ };
+
+    switch (inst) {
+        // Unary functions/operators that don't change stack depth at all:
+#define VECTOR_UNARY_OP(base)                \
+        case ByteCodeInstruction::base:      \
+        case ByteCodeInstruction::base ## 2: \
+        case ByteCodeInstruction::base ## 3: \
+        case ByteCodeInstruction::base ## 4: \
+            return 0;
+
+        VECTOR_UNARY_OP(kConvertFtoI)
+        VECTOR_UNARY_OP(kConvertStoF)
+        VECTOR_UNARY_OP(kConvertUtoF)
+
+        VECTOR_UNARY_OP(kCos)
+        VECTOR_UNARY_OP(kSin)
+        VECTOR_UNARY_OP(kSqrt)
+        VECTOR_UNARY_OP(kTan)
+
+        VECTOR_UNARY_OP(kNegateF)
+        VECTOR_UNARY_OP(kNegateI)
+
+        case ByteCodeInstruction::kInverse2x2:
+        case ByteCodeInstruction::kInverse3x3:
+        case ByteCodeInstruction::kInverse4x4: return 0;
+
+        case ByteCodeInstruction::kClampIndex: return 0;
+        case ByteCodeInstruction::kNotB: return 0;
+        case ByteCodeInstruction::kNegateFN: return 0;
+        case ByteCodeInstruction::kShiftLeft: return 0;
+        case ByteCodeInstruction::kShiftRightS: return 0;
+        case ByteCodeInstruction::kShiftRightU: return 0;
+
+#undef VECTOR_UNARY_OP
+
+        // Binary functions/operators that do a 2 -> 1 reduction (possibly N times)
+#define VECTOR_BINARY_OP(base)                          \
+        case ByteCodeInstruction::base:      return -1; \
+        case ByteCodeInstruction::base ## 2: return -2; \
+        case ByteCodeInstruction::base ## 3: return -3; \
+        case ByteCodeInstruction::base ## 4: return -4;
+
+#define VECTOR_MATRIX_BINARY_OP(base)                   \
+        VECTOR_BINARY_OP(base)                          \
+        case ByteCodeInstruction::base ## N: return -count;
+
+        case ByteCodeInstruction::kAndB: return -1;
+        case ByteCodeInstruction::kOrB:  return -1;
+        case ByteCodeInstruction::kXorB: return -1;
+
+        VECTOR_BINARY_OP(kAddI)
+        VECTOR_MATRIX_BINARY_OP(kAddF)
+
+        VECTOR_BINARY_OP(kCompareIEQ)
+        VECTOR_MATRIX_BINARY_OP(kCompareFEQ)
+        VECTOR_BINARY_OP(kCompareINEQ)
+        VECTOR_MATRIX_BINARY_OP(kCompareFNEQ)
+        VECTOR_BINARY_OP(kCompareSGT)
+        VECTOR_BINARY_OP(kCompareUGT)
+        VECTOR_BINARY_OP(kCompareFGT)
+        VECTOR_BINARY_OP(kCompareSGTEQ)
+        VECTOR_BINARY_OP(kCompareUGTEQ)
+        VECTOR_BINARY_OP(kCompareFGTEQ)
+        VECTOR_BINARY_OP(kCompareSLT)
+        VECTOR_BINARY_OP(kCompareULT)
+        VECTOR_BINARY_OP(kCompareFLT)
+        VECTOR_BINARY_OP(kCompareSLTEQ)
+        VECTOR_BINARY_OP(kCompareULTEQ)
+        VECTOR_BINARY_OP(kCompareFLTEQ)
+
+        VECTOR_BINARY_OP(kDivideS)
+        VECTOR_BINARY_OP(kDivideU)
+        VECTOR_MATRIX_BINARY_OP(kDivideF)
+        VECTOR_BINARY_OP(kMultiplyI)
+        VECTOR_MATRIX_BINARY_OP(kMultiplyF)
+        VECTOR_BINARY_OP(kRemainderF)
+        VECTOR_BINARY_OP(kRemainderS)
+        VECTOR_BINARY_OP(kRemainderU)
+        VECTOR_BINARY_OP(kSubtractI)
+        VECTOR_MATRIX_BINARY_OP(kSubtractF)
+
+#undef VECTOR_BINARY_OP
+#undef VECTOR_MATRIX_BINARY_OP
+
+        // Ops that push or load data to grow the stack:
+        case ByteCodeInstruction::kDup:
+        case ByteCodeInstruction::kLoad:
+        case ByteCodeInstruction::kLoadGlobal:
+        case ByteCodeInstruction::kLoadUniform:
+        case ByteCodeInstruction::kReadExternal:
+        case ByteCodeInstruction::kPushImmediate:
+            return 1;
+
+        case ByteCodeInstruction::kDup2:
+        case ByteCodeInstruction::kLoad2:
+        case ByteCodeInstruction::kLoadGlobal2:
+        case ByteCodeInstruction::kLoadUniform2:
+        case ByteCodeInstruction::kReadExternal2:
+            return 2;
+
+        case ByteCodeInstruction::kDup3:
+        case ByteCodeInstruction::kLoad3:
+        case ByteCodeInstruction::kLoadGlobal3:
+        case ByteCodeInstruction::kLoadUniform3:
+        case ByteCodeInstruction::kReadExternal3:
+            return 3;
+
+        case ByteCodeInstruction::kDup4:
+        case ByteCodeInstruction::kLoad4:
+        case ByteCodeInstruction::kLoadGlobal4:
+        case ByteCodeInstruction::kLoadUniform4:
+        case ByteCodeInstruction::kReadExternal4:
+            return 4;
+
+        case ByteCodeInstruction::kDupN:
+        case ByteCodeInstruction::kLoadSwizzle:
+        case ByteCodeInstruction::kLoadSwizzleGlobal:
+        case ByteCodeInstruction::kLoadSwizzleUniform:
+            return count;
+
+        // Pushes 'count' values, minus one for the 'address' that's consumed first
+        case ByteCodeInstruction::kLoadExtended:
+        case ByteCodeInstruction::kLoadExtendedGlobal:
+        case ByteCodeInstruction::kLoadExtendedUniform:
+            return count - 1;
+
+        // Ops that pop or store data to shrink the stack:
+        case ByteCodeInstruction::kPop:
+        case ByteCodeInstruction::kStore:
+        case ByteCodeInstruction::kStoreGlobal:
+        case ByteCodeInstruction::kWriteExternal:
+            return -1;
+
+        case ByteCodeInstruction::kPop2:
+        case ByteCodeInstruction::kStore2:
+        case ByteCodeInstruction::kStoreGlobal2:
+        case ByteCodeInstruction::kWriteExternal2:
+            return -2;
+
+        case ByteCodeInstruction::kPop3:
+        case ByteCodeInstruction::kStore3:
+        case ByteCodeInstruction::kStoreGlobal3:
+        case ByteCodeInstruction::kWriteExternal3:
+            return -3;
+
+        case ByteCodeInstruction::kPop4:
+        case ByteCodeInstruction::kStore4:
+        case ByteCodeInstruction::kStoreGlobal4:
+        case ByteCodeInstruction::kWriteExternal4:
+            return -4;
+
+        case ByteCodeInstruction::kPopN:
+        case ByteCodeInstruction::kStoreSwizzle:
+        case ByteCodeInstruction::kStoreSwizzleGlobal:
+            return -count;
+
+        // Consumes 'count' values, plus one for the 'address'
+        case ByteCodeInstruction::kStoreExtended:
+        case ByteCodeInstruction::kStoreExtendedGlobal:
+        case ByteCodeInstruction::kStoreSwizzleIndirect:
+        case ByteCodeInstruction::kStoreSwizzleIndirectGlobal:
+            return -count - 1;
+
+        // Strange ops where the caller computes the delta for us:
+        case ByteCodeInstruction::kCallExternal:
+        case ByteCodeInstruction::kMatrixToMatrix:
+        case ByteCodeInstruction::kMatrixMultiply:
+        case ByteCodeInstruction::kReserve:
+        case ByteCodeInstruction::kReturn:
+        case ByteCodeInstruction::kScalarToMatrix:
+        case ByteCodeInstruction::kSwizzle:
+            return count;
+
+        // Miscellaneous
+
+        // kCall is net-zero. Max stack depth is adjusted in writeFunctionCall.
+        case ByteCodeInstruction::kCall:             return 0;
+        case ByteCodeInstruction::kBranch:           return 0;
+        case ByteCodeInstruction::kBranchIfAllFalse: return 0;
+
+        case ByteCodeInstruction::kMaskPush:         return -1;
+        case ByteCodeInstruction::kMaskPop:          return 0;
+        case ByteCodeInstruction::kMaskNegate:       return 0;
+        case ByteCodeInstruction::kMaskBlend:        return -count;
+
+        case ByteCodeInstruction::kLoopBegin:        return 0;
+        case ByteCodeInstruction::kLoopNext:         return 0;
+        case ByteCodeInstruction::kLoopMask:         return -1;
+        case ByteCodeInstruction::kLoopEnd:          return 0;
+        case ByteCodeInstruction::kLoopBreak:        return 0;
+        case ByteCodeInstruction::kLoopContinue:     return 0;
+
+        default:
+            ABORT("unsupported instruction %d\n", (int)inst);
+            return 0;
+    }
+}
+
+ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Variable& var) {
+    // given that we seldom have more than a couple of variables, linear search is probably the most
+    // efficient way to handle lookups
+    switch (var.fStorage) {
+        case Variable::kLocal_Storage: {
+            for (int i = fLocals.size() - 1; i >= 0; --i) {
+                if (fLocals[i] == &var) {
+                    SkASSERT(fParameterCount + i <= 255);
+                    return { fParameterCount + i, Storage::kLocal };
+                }
+            }
+            int result = fParameterCount + fLocals.size();
+            fLocals.push_back(&var);
+            for (int i = 0; i < SlotCount(var.fType) - 1; ++i) {
+                fLocals.push_back(nullptr);
+            }
+            SkASSERT(result <= 255);
+            return { result, Storage::kLocal };
+        }
+        case Variable::kParameter_Storage: {
+            int offset = 0;
+            for (const auto& p : fFunction->fDeclaration.fParameters) {
+                if (p == &var) {
+                    SkASSERT(offset <= 255);
+                    return { offset, Storage::kLocal };
+                }
+                offset += SlotCount(p->fType);
+            }
+            SkASSERT(false);
+            return Location::MakeInvalid();
+        }
+        case Variable::kGlobal_Storage: {
+            if (is_in(var)) {
+                // If you see this error, it means the program is using raw 'in' variables. You
+                // should either specialize the program (Compiler::specialize) to bake in the final
+                // values of the 'in' variables, or not use 'in' variables (maybe you meant to use
+                // 'uniform' instead?).
+                fErrors.error(var.fOffset,
+                              "'in' variable is not specialized or has unsupported type");
+                return Location::MakeInvalid();
+            }
+            int offset = 0;
+            bool isUniform = is_uniform(var);
+            for (const auto& e : fProgram) {
+                if (e.fKind == ProgramElement::kVar_Kind) {
+                    VarDeclarations& decl = (VarDeclarations&) e;
+                    for (const auto& v : decl.fVars) {
+                        const Variable* declVar = ((VarDeclaration&) *v).fVar;
+                        if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
+                            continue;
+                        }
+                        if (isUniform != is_uniform(*declVar)) {
+                            continue;
+                        }
+                        if (declVar == &var) {
+                            SkASSERT(offset <= 255);
+                            return  { offset, isUniform ? Storage::kUniform : Storage::kGlobal };
+                        }
+                        offset += SlotCount(declVar->fType);
+                    }
+                }
+            }
+            SkASSERT(false);
+            return Location::MakeInvalid();
+        }
+        default:
+            SkASSERT(false);
+            return Location::MakeInvalid();
+    }
+}
+
 ByteCodeGenerator::Location ByteCodeGenerator::getLocation(const Expression& expr) {
     switch (expr.fKind) {
         case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&) expr;
-            Location result = this->getLocation(*f.fBase);
+            const FieldAccess& f = (const FieldAccess&)expr;
+            Location baseLoc = this->getLocation(*f.fBase);
             int offset = 0;
             for (int i = 0; i < f.fFieldIndex; ++i) {
                 offset += SlotCount(*f.fBase->fType.fields()[i].fType);
             }
-            return result.offset(*this, offset);
-        }
-        case Expression::kIndex_Kind: {
-            const IndexExpression& idx = (const IndexExpression&) expr;
-            int stride = SlotCount(idx.fType);
-            int length = idx.fBase->fType.columns();
-            Location result = this->getLocation(*idx.fBase);
-            if (idx.fIndex->isConstant()) {
-                int64_t index = idx.fIndex->getConstantInt();
-                if (index < 0 || index >= length) {
-                    fErrors.error(idx.fIndex->fOffset, "Array index out of bounds");
-                    return result;
+            if (baseLoc.isOnStack()) {
+                if (offset != 0) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(offset);
+                    this->write(ByteCodeInstruction::kAddI);
+                    this->write8(1);
                 }
-                return result.offset(*this, index * stride);
+                return baseLoc;
             } else {
-                ByteCode::Register index = this->next(1);
-                this->writeExpression(*idx.fIndex, index);
-                this->write(ByteCode::Instruction::kBoundsCheck);
-                this->write(index);
-                this->write(length);
-                ByteCode::Register imm = this->next(1);
-                this->write(ByteCode::Instruction::kImmediate);
-                this->write(imm);
-                this->write(ByteCode::Immediate{stride});
-                ByteCode::Register offset = this->next(1);
-                this->write(ByteCode::Instruction::kMultiplyI);
-                this->write(offset);
-                this->write(index);
-                this->write(imm);
-                return result.offset(*this, offset);
+                return baseLoc + offset;
             }
         }
+        case Expression::kIndex_Kind: {
+            const IndexExpression& i = (const IndexExpression&)expr;
+            int stride = SlotCount(i.fType);
+            int length = i.fBase->fType.columns();
+            SkASSERT(length <= 255);
+            int offset = -1;
+            if (i.fIndex->isConstant()) {
+                int64_t index = i.fIndex->getConstantInt();
+                if (index < 0 || index >= length) {
+                    fErrors.error(i.fIndex->fOffset, "Array index out of bounds.");
+                    return Location::MakeInvalid();
+                }
+                offset = index * stride;
+            } else {
+                if (i.fIndex->hasSideEffects()) {
+                    // Having a side-effect in an indexer is technically safe for an rvalue,
+                    // but with lvalues we have to evaluate the indexer twice, so make it an error.
+                    fErrors.error(i.fIndex->fOffset,
+                            "Index expressions with side-effects not supported in byte code.");
+                    return Location::MakeInvalid();
+                }
+                this->writeExpression(*i.fIndex);
+                this->write(ByteCodeInstruction::kClampIndex);
+                this->write8(length);
+                if (stride != 1) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(stride);
+                    this->write(ByteCodeInstruction::kMultiplyI);
+                    this->write8(1);
+                }
+            }
+            Location baseLoc = this->getLocation(*i.fBase);
+
+            // Are both components known statically?
+            if (!baseLoc.isOnStack() && offset >= 0) {
+                return baseLoc + offset;
+            }
+
+            // At least one component is dynamic (and on the stack).
+
+            // If the other component is zero, we're done
+            if (baseLoc.fSlot == 0 || offset == 0) {
+                return baseLoc.makeOnStack();
+            }
+
+            // Push the non-dynamic component (if any) to the stack, then add the two
+            if (!baseLoc.isOnStack()) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(baseLoc.fSlot);
+            }
+            if (offset >= 0) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(offset);
+            }
+            this->write(ByteCodeInstruction::kAddI);
+            this->write8(1);
+            return baseLoc.makeOnStack();
+        }
         case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) expr;
+            const Swizzle& s = (const Swizzle&)expr;
             SkASSERT(swizzle_is_simple(s));
-            return this->getLocation(*s.fBase).offset(*this, s.fComponents[0]);
+            Location baseLoc = this->getLocation(*s.fBase);
+            int offset = s.fComponents[0];
+            if (baseLoc.isOnStack()) {
+                if (offset != 0) {
+                    this->write(ByteCodeInstruction::kPushImmediate);
+                    this->write32(offset);
+                    this->write(ByteCodeInstruction::kAddI);
+                    this->write8(1);
+                }
+                return baseLoc;
+            } else {
+                return baseLoc + offset;
+            }
         }
         case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&) expr).fVariable;
+            const Variable& var = ((const VariableReference&)expr).fVariable;
             return this->getLocation(var);
         }
         default:
             SkASSERT(false);
-            return ByteCode::Pointer{0};
+            return Location::MakeInvalid();
     }
 }
 
-Variable::Storage ByteCodeGenerator::getStorage(const Expression& expr) {
-    switch (expr.fKind) {
-        case Expression::kFieldAccess_Kind: {
-            const FieldAccess& f = (const FieldAccess&) expr;
-            return this->getStorage(*f.fBase);
-        }
-        case Expression::kIndex_Kind: {
-            const IndexExpression& idx = (const IndexExpression&) expr;
-            return this->getStorage(*idx.fBase);
-        }
-        case Expression::kSwizzle_Kind: {
-            const Swizzle& s = (const Swizzle&) expr;
-            return this->getStorage(*s.fBase);
-        }
-        case Expression::kVariableReference_Kind: {
-            const Variable& var = ((const VariableReference&) expr).fVariable;
-            return var.fStorage;
-        }
-        default:
-            SkASSERT(false);
-            return Variable::kLocal_Storage;
-    }
+void ByteCodeGenerator::write8(uint8_t b) {
+    fCode->push_back(b);
 }
 
-ByteCode::Instruction ByteCodeGenerator::getLoadInstruction(ByteCodeGenerator::Location location,
-                                                            Variable::Storage storage) {
-    switch (storage) {
-        case Variable::kGlobal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoad;
-            }
-        case Variable::kParameter_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadParameterDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadParameter;
-            }
-        case Variable::kLocal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kLoadStackDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kLoadStack;
-            }
-        default:
-            break;
-    }
-    SkASSERT(false);
-    return ByteCode::Instruction::kNop;
+void ByteCodeGenerator::write16(uint16_t i) {
+    size_t n = fCode->size();
+    fCode->resize(n+2);
+    memcpy(fCode->data() + n, &i, 2);
 }
 
-ByteCode::Instruction ByteCodeGenerator::getStoreInstruction(ByteCodeGenerator::Location location,
-                                                             Variable::Storage storage) {
-    switch (storage) {
-        case Variable::kGlobal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStore;
-            }
-        case Variable::kParameter_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreParameterDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStoreParameter;
-            }
-        case Variable::kLocal_Storage:
-            switch (location.fKind) {
-                case Location::kPointer_Kind: return ByteCode::Instruction::kStoreStackDirect;
-                case Location::kRegister_Kind: return ByteCode::Instruction::kStoreStack;
-            }
-        default:
-            break;
-    }
-    SkASSERT(false);
-    return ByteCode::Instruction::kNop;
+void ByteCodeGenerator::write32(uint32_t i) {
+    size_t n = fCode->size();
+    fCode->resize(n+4);
+    memcpy(fCode->data() + n, &i, 4);
 }
 
-#define VEC(inst) ((ByteCode::Instruction) ((uint16_t) inst + 1))
+void ByteCodeGenerator::write(ByteCodeInstruction i, int count) {
+    switch (i) {
+        case ByteCodeInstruction::kLoopBegin: this->enterLoop();      break;
+        case ByteCodeInstruction::kLoopEnd:   this->exitLoop();       break;
 
-class ByteCodeSimpleLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeSimpleLValue(ByteCodeGenerator* generator, ByteCodeGenerator::Location location,
-                         int count, ByteCode::Instruction load, ByteCode::Instruction store)
-        : INHERITED(*generator)
-        , fLocation(location)
-        , fCount((uint8_t) count)
-        , fLoad(load)
-        , fStore(store) {}
-
-    void load(ByteCode::Register result) override {
-        fGenerator.write(fLoad, fCount);
-        fGenerator.write(result);
-        fGenerator.write(fLocation);
+        case ByteCodeInstruction::kMaskPush:  this->enterCondition(); break;
+        case ByteCodeInstruction::kMaskPop:
+        case ByteCodeInstruction::kMaskBlend: this->exitCondition();  break;
+        default: /* Do nothing */ break;
     }
-
-    void store(ByteCode::Register src) override {
-        fGenerator.write(fStore, fCount);
-        fGenerator.write(fLocation);
-        fGenerator.write(src);
-    }
-
-private:
-    ByteCodeGenerator::Location fLocation;
-
-    uint8_t fCount;
-
-    ByteCode::Instruction fLoad;
-
-    ByteCode::Instruction fStore;
-
-    typedef ByteCodeGenerator::LValue INHERITED;
-};
-
-class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle* swizzle)
-        : INHERITED(*generator)
-        , fSwizzle(*swizzle) {}
-
-    void load(ByteCode::Register result) override {
-        fGenerator.writeSwizzle(fSwizzle, result);
-    }
-
-    void store(ByteCode::Register src) override {
-        ByteCodeGenerator::Location target = fGenerator.getLocation(*fSwizzle.fBase);
-        ByteCode::Instruction inst = fGenerator.getStoreInstruction(
-                                                            target,
-                                                            fGenerator.getStorage(*fSwizzle.fBase));
-        for (size_t i = 0; i < fSwizzle.fComponents.size(); ++i) {
-            ByteCodeGenerator::Location final = target.offset(fGenerator, fSwizzle.fComponents[i]);
-            fGenerator.write(inst);
-            fGenerator.write(final);
-            fGenerator.write(src + i);
-        }
-    }
-
-private:
-    const Swizzle& fSwizzle;
-
-    typedef ByteCodeGenerator::LValue INHERITED;
-};
-
-class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
-public:
-    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
-        : INHERITED(*generator)
-        , fIndex(index)
-        , fSlotCount(ByteCodeGenerator::SlotCount(value.type())) {
-        SkASSERT(fSlotCount <= 4);
-    }
-
-    void load(ByteCode::Register result) override {
-        fGenerator.write(ByteCode::Instruction::kReadExternal);
-        fGenerator.write(result);
-        fGenerator.write((uint8_t) fSlotCount);
-        fGenerator.write((uint8_t) fIndex);
-    }
-
-    void store(ByteCode::Register src) override {
-        fGenerator.write(ByteCode::Instruction::kWriteExternal);
-        fGenerator.write((uint8_t) fIndex);
-        fGenerator.write((uint8_t) fSlotCount);
-        fGenerator.write(src);
-    }
-
-private:
-    typedef LValue INHERITED;
-
-    int fIndex;
-
-    int fSlotCount;
-};
-
-std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& expr) {
-    switch (expr.fKind) {
-        case Expression::kExternalValue_Kind: {
-            ExternalValue* value = ((ExternalValueReference&) expr).fValue;
-            int index = fOutput->fExternalValues.size();
-            fOutput->fExternalValues.push_back(value);
-            SkASSERT(index <= 255);
-            return std::unique_ptr<LValue>(new ByteCodeExternalValueLValue(this, *value, index));
-        }
-        case Expression::kFieldAccess_Kind:
-        case Expression::kIndex_Kind:
-        case Expression::kVariableReference_Kind: {
-            Location location = this->getLocation(expr);
-            Variable::Storage storage = this->getStorage(expr);
-            ByteCode::Instruction loadInst = this->getLoadInstruction(location, storage);
-            ByteCode::Instruction storeInst = this->getStoreInstruction(location, storage);
-            return std::unique_ptr<LValue>(new ByteCodeSimpleLValue(this, location,
-                                                                    SlotCount(expr.fType),
-                                                                    loadInst, storeInst));
-        }
-        case Expression::kSwizzle_Kind:
-            return std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, &(Swizzle&) expr));
-        default:
-            ABORT("unsupported lvalue\n");
-    }
+    instruction val = (instruction) i;
+    size_t n = fCode->size();
+    fCode->resize(n + sizeof(val));
+    memcpy(fCode->data() + n, &val, sizeof(val));
+    fStackCount += StackUsage(i, count);
+    fMaxStackCount = std::max(fMaxStackCount, fStackCount);
 }
 
-ByteCode::Register ByteCodeGenerator::next(int count) {
-    SkASSERT(fNextRegister + count <= ByteCode::kRegisterMax);
-    fNextRegister += count;
-    return ByteCode::Register{(uint16_t) (fNextRegister - count)};
+static ByteCodeInstruction vector_instruction(ByteCodeInstruction base, int count) {
+    SkASSERT(count >= 1 && count <= 4);
+    return ((ByteCodeInstruction) ((int) base + 1 - count));
 }
 
-static TypeCategory type_category(const Type& type) {
-    switch (type.kind()) {
-        case Type::Kind::kVector_Kind:
-        case Type::Kind::kMatrix_Kind:
-            return type_category(type.componentType());
-        default:
-            String name = type.displayName();
-            if (name == "bool") {
-                return TypeCategory::kBool;
-            } else if (name == "int" || name == "short") {
-                return TypeCategory::kSigned;
-            } else if (name == "uint" || name == "ushort") {
-                return TypeCategory::kUnsigned;
-            } else {
-                SkASSERT(name == "float" || name == "half");
-                return TypeCategory::kFloat;
-            }
-            ABORT("unsupported type: %s\n", name.c_str());
-    }
-}
-
-void ByteCodeGenerator::write(ByteCode::Instruction inst, int count) {
-    SkASSERT(count <= 255);
-    if (count > 1) {
-        this->write(VEC(inst));
-        this->write((uint8_t) count);
-    }
-    else {
-        this->write(inst);
-    }
-}
-
-void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCode::Instruction s,
-                                              ByteCode::Instruction u, ByteCode::Instruction f) {
+void ByteCodeGenerator::writeTypedInstruction(const Type& type, ByteCodeInstruction s,
+                                              ByteCodeInstruction u, ByteCodeInstruction f,
+                                              int count, bool writeCount) {
     switch (type_category(type)) {
         case TypeCategory::kSigned:
-            this->write(s);
+            this->write(vector_instruction(s, count));
             break;
         case TypeCategory::kUnsigned:
-            this->write(u);
+            this->write(vector_instruction(u, count));
             break;
         case TypeCategory::kFloat: {
-            this->write(f);
+            if (count > 4) {
+                this->write((ByteCodeInstruction)((int)f + 1), count);
+            } else {
+                this->write(vector_instruction(f, count));
+            }
             break;
         }
         default:
             SkASSERT(false);
     }
-}
-
-void ByteCodeGenerator::writeVectorBinaryInstruction(const Type& operandType,
-                                                     ByteCode::Register left,
-                                                     ByteCode::Register right,
-                                                     ByteCode::Instruction s,
-                                                     ByteCode::Instruction u,
-                                                     ByteCode::Instruction f,
-                                                     ByteCode::Register result) {
-    uint8_t count = (uint8_t) SlotCount(operandType);
-    if (count == 1) {
-        this->writeTypedInstruction(operandType, s, u, f);
-    }
-    else {
-        this->writeTypedInstruction(operandType, VEC(s), VEC(u), VEC(f));
-        this->write(count);
-    }
-    this->write(result);
-    this->write(left);
-    this->write(right);
-}
-
-void ByteCodeGenerator::writeBinaryInstruction(const Type& operandType,
-                                               ByteCode::Register left,
-                                               ByteCode::Register right,
-                                               ByteCode::Instruction s,
-                                               ByteCode::Instruction u,
-                                               ByteCode::Instruction f,
-                                               ByteCode::Register result) {
-    for (int i = 0; i < SlotCount(operandType); ++i) {
-        this->writeTypedInstruction(operandType, s, u, f);
-        this->write(result + i);
-        this->write(left + i);
-        this->write(right + i);
+    if (writeCount) {
+        this->write8(count);
     }
 }
 
-void ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b,
-                                              ByteCode::Register result) {
+bool ByteCodeGenerator::writeBinaryExpression(const BinaryExpression& b, bool discard) {
     if (b.fOperator == Token::Kind::EQ) {
         std::unique_ptr<LValue> lvalue = this->getLValue(*b.fLeft);
-        this->writeExpression(*b.fRight, result);
-        lvalue->store(result);
-        return;
+        this->writeExpression(*b.fRight);
+        lvalue->store(discard);
+        discard = false;
+        return discard;
     }
     const Type& lType = b.fLeft->fType;
     const Type& rType = b.fRight->fType;
     bool lVecOrMtx = (lType.kind() == Type::kVector_Kind || lType.kind() == Type::kMatrix_Kind);
     bool rVecOrMtx = (rType.kind() == Type::kVector_Kind || rType.kind() == Type::kMatrix_Kind);
-    const Type* operandType;
-    if (!lVecOrMtx && rVecOrMtx) {
-        operandType = &rType;
-    } else {
-        operandType = &lType;
-    }
     Token::Kind op;
     std::unique_ptr<LValue> lvalue;
-    ByteCode::Register left;
-    switch (b.fOperator) {
-        case Token::Kind::LOGICALAND:
-        case Token::Kind::LOGICALANDEQ:
-        case Token::Kind::LOGICALOR:
-        case Token::Kind::LOGICALOREQ:
-            left = result;
-            break;
-        default:
-            left = this->next(SlotCount(*operandType));
-    }
     if (is_assignment(b.fOperator)) {
         lvalue = this->getLValue(*b.fLeft);
-        lvalue->load(left);
+        lvalue->load();
         op = remove_assignment(b.fOperator);
     } else {
-        this->writeExpression(*b.fLeft, left);
+        this->writeExpression(*b.fLeft);
         op = b.fOperator;
         if (!lVecOrMtx && rVecOrMtx) {
-            this->write(ByteCode::Instruction::kSplat);
-            this->write((uint8_t) (SlotCount(rType) - 1));
-            this->write(left + 1);
-            this->write(left);
+            for (int i = SlotCount(rType); i > 1; --i) {
+                this->write(ByteCodeInstruction::kDup);
+                this->write8(1);
+            }
         }
     }
-    SkDEBUGCODE(TypeCategory tc = type_category(lType));
     int count = std::max(SlotCount(lType), SlotCount(rType));
+    SkDEBUGCODE(TypeCategory tc = type_category(lType));
     switch (op) {
         case Token::Kind::LOGICALAND: {
-            SkASSERT(left.fIndex == result.fIndex);
-            this->write(ByteCode::Instruction::kMaskPush);
-            ++fConditionCount;
-            this->write(left);
-            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+            this->write(ByteCodeInstruction::kMaskPush);
+            this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
-            SkASSERT(SlotCount(b.fRight->fType) == 1);
-            ByteCode::Register right = this->next(1);
-            this->writeExpression(*b.fRight, right);
-            this->write(ByteCode::Instruction::kAnd);
-            this->write(result);
-            this->write(left);
-            this->write(right);
+            this->writeExpression(*b.fRight);
+            this->write(ByteCodeInstruction::kAndB);
             falseLocation.set();
-            --fConditionCount;
-            this->write(ByteCode::Instruction::kMaskPop);
-            return;
+            this->write(ByteCodeInstruction::kMaskPop);
+            return false;
         }
         case Token::Kind::LOGICALOR: {
-            SkASSERT(left.fIndex == result.fIndex);
-            ByteCode::Register mask = this->next(1);
-            this->write(ByteCode::Instruction::kNot);
-            this->write(mask);
-            this->write(left);
-            this->write(ByteCode::Instruction::kMaskPush);
-            ++fConditionCount;
-            this->write(mask);
-            this->write(ByteCode::Instruction::kBranchIfAllFalse);
+            SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+            this->write(ByteCodeInstruction::kNotB);
+            this->write(ByteCodeInstruction::kMaskPush);
+            this->write(ByteCodeInstruction::kBranchIfAllFalse);
             DeferredLocation falseLocation(this);
-            SkASSERT(SlotCount(b.fRight->fType) == 1);
-            ByteCode::Register right = this->next(1);
-            this->writeExpression(*b.fRight, right);
-            this->write(ByteCode::Instruction::kOr);
-            this->write(result);
-            this->write(left);
-            this->write(right);
+            this->writeExpression(*b.fRight);
+            this->write(ByteCodeInstruction::kOrB);
             falseLocation.set();
-            --fConditionCount;
-            this->write(ByteCode::Instruction::kMaskPop);
-            return;
+            this->write(ByteCodeInstruction::kMaskPop);
+            return false;
         }
         case Token::Kind::SHL:
         case Token::Kind::SHR: {
@@ -585,666 +707,775 @@
                                     tc == SkSL::TypeCategory::kUnsigned));
             if (!b.fRight->isConstant()) {
                 fErrors.error(b.fRight->fOffset, "Shift amounts must be constant");
-                return;
+                return false;
             }
             int64_t shift = b.fRight->getConstantInt();
             if (shift < 0 || shift > 31) {
                 fErrors.error(b.fRight->fOffset, "Shift amount out of range");
-                return;
+                return false;
             }
 
             if (op == Token::Kind::SHL) {
-                this->write(ByteCode::Instruction::kShiftLeft);
+                this->write(ByteCodeInstruction::kShiftLeft);
             } else {
                 this->write(type_category(lType) == TypeCategory::kSigned
-                                ? ByteCode::Instruction::kShiftRightS
-                                : ByteCode::Instruction::kShiftRightU);
+                                ? ByteCodeInstruction::kShiftRightS
+                                : ByteCodeInstruction::kShiftRightU);
             }
-            this->write(result);
-            this->write(left);
-            this->write((uint8_t) shift);
-            return;
+            this->write8(shift);
+            return false;
         }
-        case Token::Kind::STAR:
-            // Special case for M*V, V*M, M*M (but not V*V!)
-            if (lType.columns() > 1 && rType.columns() > 1 &&
-                (lType.rows() > 1 || rType.rows() > 1)) {
-                ByteCode::Register right = this->next(SlotCount(rType));
-                this->writeExpression(*b.fRight, right);
-                int rCols = rType.columns(),
-                    rRows = rType.rows(),
-                    lCols = lType.columns(),
-                    lRows = lType.rows();
-                // M*V treats the vector as a column
-                if (rType.kind() == Type::kVector_Kind) {
-                    std::swap(rCols, rRows);
-                }
-                SkASSERT(lCols == rRows);
-                SkASSERT(SlotCount(b.fType) == lRows * rCols);
-                this->write(ByteCode::Instruction::kMatrixMultiply);
-                this->write(result);
-                this->write(left);
-                this->write(right);
-                this->write((uint8_t) lCols);
-                this->write((uint8_t) lRows);
-                this->write((uint8_t) rCols);
-                return;
-            }
 
         default:
             break;
     }
-    ByteCode::Register right = this->next(SlotCount(*operandType));
-    this->writeExpression(*b.fRight, right);
+    this->writeExpression(*b.fRight);
     if (lVecOrMtx && !rVecOrMtx) {
-        this->write(ByteCode::Instruction::kSplat);
-        this->write((uint8_t) (SlotCount(*operandType) - 1));
-        this->write(right + 1);
-        this->write(right);
+        for (int i = SlotCount(lType); i > 1; --i) {
+            this->write(ByteCodeInstruction::kDup);
+            this->write8(1);
+        }
     }
-    switch (op) {
-        case Token::Kind::EQEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareEQI,
-                                         ByteCode::Instruction::kCompareEQI,
-                                         ByteCode::Instruction::kCompareEQF,
-                                         result);
-            // Collapse to a single bool
-            for (int i = 1; i < count; ++i) {
-                this->write(ByteCode::Instruction::kAnd);
-                this->write(result);
-                this->write(result);
-                this->write(result + i);
-            }
-            break;
-        case Token::Kind::GT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareGTS,
-                                         ByteCode::Instruction::kCompareGTU,
-                                         ByteCode::Instruction::kCompareGTF,
-                                         result);
-            break;
-        case Token::Kind::GTEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareGTEQS,
-                                         ByteCode::Instruction::kCompareGTEQU,
-                                         ByteCode::Instruction::kCompareGTEQF,
-                                         result);
-            break;
-        case Token::Kind::LT:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareLTS,
-                                         ByteCode::Instruction::kCompareLTU,
-                                         ByteCode::Instruction::kCompareLTF,
-                                         result);
-            break;
-        case Token::Kind::LTEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareLTEQS,
-                                         ByteCode::Instruction::kCompareLTEQU,
-                                         ByteCode::Instruction::kCompareLTEQF,
-                                         result);
-            break;
-        case Token::Kind::MINUS:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kSubtractI,
-                                               ByteCode::Instruction::kSubtractI,
-                                               ByteCode::Instruction::kSubtractF,
-                                               result);
-            break;
-        case Token::Kind::NEQ:
-            this->writeBinaryInstruction(*operandType, left, right,
-                                         ByteCode::Instruction::kCompareNEQI,
-                                         ByteCode::Instruction::kCompareNEQI,
-                                         ByteCode::Instruction::kCompareNEQF,
-                                         result);
-            // Collapse to a single bool
-            for (int i = 1; i < count; ++i) {
-                this->write(ByteCode::Instruction::kOr);
-                this->write(result);
-                this->write(result);
-                this->write(result + i);
-            }
-            break;
-        case Token::Kind::PERCENT:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kRemainderS,
-                                               ByteCode::Instruction::kRemainderU,
-                                               ByteCode::Instruction::kRemainderF,
-                                               result);
-            break;
-        case Token::Kind::PLUS:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kAddI,
-                                               ByteCode::Instruction::kAddI,
-                                               ByteCode::Instruction::kAddF,
-                                               result);
-            break;
-        case Token::Kind::SLASH:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kDivideS,
-                                               ByteCode::Instruction::kDivideU,
-                                               ByteCode::Instruction::kDivideF,
-                                               result);
-            break;
-        case Token::Kind::STAR:
-            this->writeVectorBinaryInstruction(*operandType, left, right,
-                                               ByteCode::Instruction::kMultiplyI,
-                                               ByteCode::Instruction::kMultiplyI,
-                                               ByteCode::Instruction::kMultiplyF,
-                                               result);
-            break;
-        case Token::Kind::LOGICALXOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kBool);
-            this->write(ByteCode::Instruction::kXor);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
+    // Special case for M*V, V*M, M*M (but not V*V!)
+    if (op == Token::Kind::STAR && lVecOrMtx && rVecOrMtx &&
+        !(lType.kind() == Type::kVector_Kind && rType.kind() == Type::kVector_Kind)) {
+        this->write(ByteCodeInstruction::kMatrixMultiply,
+                    SlotCount(b.fType) - (SlotCount(lType) + SlotCount(rType)));
+        int rCols = rType.columns(),
+            rRows = rType.rows(),
+            lCols = lType.columns(),
+            lRows = lType.rows();
+        // M*V treats the vector as a column
+        if (rType.kind() == Type::kVector_Kind) {
+            std::swap(rCols, rRows);
         }
-        case Token::Kind::BITWISEAND: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kAnd);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
+        SkASSERT(lCols == rRows);
+        SkASSERT(SlotCount(b.fType) == lRows * rCols);
+        this->write8(lCols);
+        this->write8(lRows);
+        this->write8(rCols);
+    } else {
+        switch (op) {
+            case Token::Kind::EQEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareIEQ,
+                                            ByteCodeInstruction::kCompareIEQ,
+                                            ByteCodeInstruction::kCompareFEQ,
+                                            count);
+                // Collapse to a single bool
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kAndB);
+                }
+                break;
+            case Token::Kind::GT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGT,
+                                            ByteCodeInstruction::kCompareUGT,
+                                            ByteCodeInstruction::kCompareFGT,
+                                            count);
+                break;
+            case Token::Kind::GTEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSGTEQ,
+                                            ByteCodeInstruction::kCompareUGTEQ,
+                                            ByteCodeInstruction::kCompareFGTEQ,
+                                            count);
+                break;
+            case Token::Kind::LT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLT,
+                                            ByteCodeInstruction::kCompareULT,
+                                            ByteCodeInstruction::kCompareFLT,
+                                            count);
+                break;
+            case Token::Kind::LTEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareSLTEQ,
+                                            ByteCodeInstruction::kCompareULTEQ,
+                                            ByteCodeInstruction::kCompareFLTEQ,
+                                            count);
+                break;
+            case Token::Kind::MINUS:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            count);
+                break;
+            case Token::Kind::NEQ:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kCompareINEQ,
+                                            ByteCodeInstruction::kCompareINEQ,
+                                            ByteCodeInstruction::kCompareFNEQ,
+                                            count);
+                // Collapse to a single bool
+                for (int i = count; i > 1; --i) {
+                    this->write(ByteCodeInstruction::kOrB);
+                }
+                break;
+            case Token::Kind::PERCENT:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kRemainderS,
+                                            ByteCodeInstruction::kRemainderU,
+                                            ByteCodeInstruction::kRemainderF,
+                                            count);
+                break;
+            case Token::Kind::PLUS:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            count);
+                break;
+            case Token::Kind::SLASH:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kDivideS,
+                                            ByteCodeInstruction::kDivideU,
+                                            ByteCodeInstruction::kDivideF,
+                                            count);
+                break;
+            case Token::Kind::STAR:
+                this->writeTypedInstruction(lType, ByteCodeInstruction::kMultiplyI,
+                                            ByteCodeInstruction::kMultiplyI,
+                                            ByteCodeInstruction::kMultiplyF,
+                                            count);
+                break;
+
+            case Token::Kind::LOGICALXOR:
+                SkASSERT(tc == SkSL::TypeCategory::kBool && count == 1);
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
+            case Token::Kind::BITWISEAND:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kAndB);
+                break;
+            case Token::Kind::BITWISEOR:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kOrB);
+                break;
+            case Token::Kind::BITWISEXOR:
+                SkASSERT(count == 1 && (tc == SkSL::TypeCategory::kSigned ||
+                                        tc == SkSL::TypeCategory::kUnsigned));
+                this->write(ByteCodeInstruction::kXorB);
+                break;
+
+            default:
+                fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
+                                                              Compiler::OperatorName(op)));
+                break;
         }
-        case Token::Kind::BITWISEOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kOr);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
-        }
-        case Token::Kind::BITWISEXOR: {
-            SkASSERT(tc == SkSL::TypeCategory::kSigned || tc == SkSL::TypeCategory::kUnsigned);
-            this->write(ByteCode::Instruction::kXor);
-            this->write(result);
-            this->write(left);
-            this->write(right);
-            break;
-        }
-        default:
-            fErrors.error(b.fOffset, SkSL::String::printf("Unsupported binary operator '%s'",
-                                                          Compiler::OperatorName(op)));
-            break;
     }
     if (lvalue) {
-        lvalue->store(result);
+        lvalue->store(discard);
+        discard = false;
     }
+    return discard;
 }
 
-void ByteCodeGenerator::writeConstructor(const Constructor& c, ByteCode::Register result) {
-    if (c.fType.rows() > 1) {
-        if (c.fArguments.size() == 1) {
-            if (SlotCount(c.fArguments[0]->fType) == 1) {
-                ByteCode::Register v = this->next(1);
-                this->writeExpression(*c.fArguments[0], v);
-                this->write(ByteCode::Instruction::kScalarToMatrix);
-                this->write(result);
-                this->write(v);
-                this->write((uint8_t) c.fType.columns());
-                this->write((uint8_t) c.fType.rows());
-                return;
-            } else if (c.fArguments[0]->fType.rows() > 1) {
-                ByteCode::Register v = this->next(SlotCount(c.fArguments[0]->fType));
-                this->writeExpression(*c.fArguments[0], v);
-                this->write(ByteCode::Instruction::kMatrixToMatrix);
-                this->write(result);
-                this->write(v);
-                this->write((uint8_t) c.fArguments[0]->fType.columns());
-                this->write((uint8_t) c.fArguments[0]->fType.rows());
-                this->write((uint8_t) c.fType.columns());
-                this->write((uint8_t) c.fType.rows());
-                return;
+void ByteCodeGenerator::writeBoolLiteral(const BoolLiteral& b) {  // push b as a 32-bit immediate
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(b.fValue ? ~0 : 0);  // true -> all bits set, false -> 0
+}
+
+void ByteCodeGenerator::writeConstructor(const Constructor& c) {  // emit args, then fix up
+    for (const auto& arg : c.fArguments) {  // every argument is emitted, in order
+        this->writeExpression(*arg);
+    }
+    if (c.fArguments.size() == 1) {  // only single-argument constructors get extra work
+        const Type& inType = c.fArguments[0]->fType;
+        const Type& outType = c.fType;
+        TypeCategory inCategory = type_category(inType);
+        TypeCategory outCategory = type_category(outType);
+        int inCount = SlotCount(inType);
+        int outCount = SlotCount(outType);
+        if (inCategory != outCategory) {  // category changes: emit a conversion
+            SkASSERT(inCount == outCount);
+            if (inCategory == TypeCategory::kFloat) {  // float -> signed/unsigned
+                SkASSERT(outCategory == TypeCategory::kSigned ||
+                         outCategory == TypeCategory::kUnsigned);
+                this->write(vector_instruction(ByteCodeInstruction::kConvertFtoI, outCount));
+            } else if (outCategory == TypeCategory::kFloat) {  // signed/unsigned -> float
+                if (inCategory == TypeCategory::kSigned) {
+                    this->write(vector_instruction(ByteCodeInstruction::kConvertStoF, outCount));
+                } else {
+                    SkASSERT(inCategory == TypeCategory::kUnsigned);
+                    this->write(vector_instruction(ByteCodeInstruction::kConvertUtoF, outCount));
+                }
+            } else {  // any other category pair is unexpected; asserts in debug builds
+                SkASSERT(false);
             }
         }
-        int offset = 0;
-        for (const auto& arg : c.fArguments) {
-            this->writeExpression(*arg, ByteCode::Register{(uint16_t) (result.fIndex + offset)});
-            offset += SlotCount(arg->fType);
-        }
-        return;
-    }
-    if (c.fArguments.size() == 1 && c.fArguments[0]->fType.columns() == 1 &&
-        c.fType.columns() > 1) {
-        SkASSERT(SlotCount(c.fArguments[0]->fType) == 1);
-        ByteCode::Register v = result;
-        this->writeExpression(*c.fArguments[0], v);
-        this->write(ByteCode::Instruction::kSplat);
-        this->write((uint8_t) (c.fType.columns() - 1));
-        this->write(v + 1);
-        this->write(v);
-        return;
-    }
-    ByteCode::Instruction inst;
-    switch (type_category(c.fArguments[0]->fType)) {
-        case TypeCategory::kSigned:
-            if (type_category(c.fType) == TypeCategory::kFloat) {
-                inst = ByteCode::Instruction::kSignedToFloat;
+        if (inType.kind() == Type::kMatrix_Kind && outType.kind() == Type::kMatrix_Kind) {
+            this->write(ByteCodeInstruction::kMatrixToMatrix,
+                        SlotCount(outType) - SlotCount(inType));  // output minus input slot count
+            this->write8(inType.columns());  // operands: source dims, then destination dims
+            this->write8(inType.rows());
+            this->write8(outType.columns());
+            this->write8(outType.rows());
+        } else if (inCount != outCount) {  // one slot widened to a matrix or vector
+            SkASSERT(inCount == 1);
+            if (outType.kind() == Type::kMatrix_Kind) {
+                this->write(ByteCodeInstruction::kScalarToMatrix, SlotCount(outType) - 1);
+                this->write8(outType.columns());
+                this->write8(outType.rows());
             } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        case TypeCategory::kUnsigned:
-            if (type_category(c.fType) == TypeCategory::kFloat) {
-                inst = ByteCode::Instruction::kUnsignedToFloat;
-            } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        case TypeCategory::kFloat:
-            if (type_category(c.fType) == TypeCategory::kSigned) {
-                inst = ByteCode::Instruction::kFloatToSigned;
-            } else if (type_category(c.fType) == TypeCategory::kUnsigned) {
-                inst = ByteCode::Instruction::kFloatToUnsigned;
-            } else {
-                inst = ByteCode::Instruction::kNop;
-            }
-            break;
-        default:
-            SkASSERT(false);
-            return;
-    }
-    ByteCode::Register values;
-    if (inst == ByteCode::Instruction::kNop) {
-        values = result;
-    } else {
-        values = this->next(SlotCount(c.fType));
-    }
-    ByteCode::Register v = values;
-    for (size_t i = 0; i < c.fArguments.size(); ++i) {
-        this->writeExpression(*c.fArguments[i], v);
-        v.fIndex += SlotCount(c.fArguments[i]->fType);
-    }
-    if (inst != ByteCode::Instruction::kNop) {
-        v = values;
-        ByteCode::Register target = result;
-        for (size_t i = 0; i < c.fArguments.size(); ++i) {
-            int count = SlotCount(c.fArguments[i]->fType);
-            for (int j = 0; j < count; ++j) {
-                this->write(inst);
-                this->write(target);
-                ++target.fIndex;
-                this->write(v + j);
+                SkASSERT(outType.kind() == Type::kVector_Kind);
+                for (; inCount != outCount; ++inCount) {  // one kDup per missing slot
+                    this->write(ByteCodeInstruction::kDup);
+                    this->write8(1);
+                }
             }
         }
     }
 }
 
-void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f,
-                                                  ByteCode::Register result) {
+void ByteCodeGenerator::writeExternalFunctionCall(const ExternalFunctionCall& f) {
     int argumentCount = 0;
     for (const auto& arg : f.fArguments) {
+        this->writeExpression(*arg);  // emit each argument while totalling the slot count
         argumentCount += SlotCount(arg->fType);
     }
-    ByteCode::Register args = this->next(argumentCount);
-    argumentCount = 0;
-    for (const auto& arg : f.fArguments) {
-        this->writeExpression(*arg, args + argumentCount);
-        argumentCount += SlotCount(arg->fType);
-    }
-    this->write(ByteCode::Instruction::kCallExternal);
-    this->write(result);
+    this->write(ByteCodeInstruction::kCallExternal, SlotCount(f.fType) - argumentCount);
+    SkASSERT(argumentCount <= 255);  // must fit the 8-bit operand written next
+    this->write8(argumentCount);
+    this->write8(SlotCount(f.fType));  // NOTE(review): return slot count has no <= 255 assert
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(f.fFunction);
     SkASSERT(index <= 255);
-    this->write((uint8_t) index);
-    SkASSERT(SlotCount(f.fType) <= 255);
-    this->write((uint8_t) SlotCount(f.fType));
-    this->write(args);
-    SkASSERT(argumentCount <= 255);
-    this->write((uint8_t) argumentCount);
+    this->write8(index);  // position of f.fFunction within fExternalValues
 }
 
-void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e,
-                                           ByteCode::Register result) {
-    this->write(ByteCode::Instruction::kReadExternal);
-    this->write(result);
-    this->write((uint8_t) SlotCount(e.fValue->type()));
+void ByteCodeGenerator::writeExternalValue(const ExternalValueReference& e) {
+    int count = SlotCount(e.fValue->type());  // slot count of the external value's type
+    this->write(vector_instruction(ByteCodeInstruction::kReadExternal, count));
+    this->write8(count);  // operands: count, then the external value's index
     int index = fOutput->fExternalValues.size();
     fOutput->fExternalValues.push_back(e.fValue);
     SkASSERT(index <= 255);
-    this->write((uint8_t) index);
+    this->write8(index);  // position of e.fValue within fExternalValues
 }
 
-void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic,
-                                           ByteCode::Register result) {
-    if (intrinsic.fIsSpecial) {
-        switch (intrinsic.fValue.fSpecial) {
+void ByteCodeGenerator::writeVariableExpression(const Expression& expr) {  // emit a load of expr
+    Location location = this->getLocation(expr);
+    int count = SlotCount(expr.fType);
+    if (location.isOnStack() || count > 4) {  // extended form: only count is an inline operand
+        if (!location.isOnStack()) {  // location is not on the stack, so push its slot here
+            this->write(ByteCodeInstruction::kPushImmediate);
+            this->write32(location.fSlot);
+        }
+        this->write(location.selectLoad(ByteCodeInstruction::kLoadExtended,
+                                        ByteCodeInstruction::kLoadExtendedGlobal,
+                                        ByteCodeInstruction::kLoadExtendedUniform),
+                    count);
+        this->write8(count);
+    } else {  // compact form (count <= 4): count and slot are both inline bytes
+        this->write(vector_instruction(location.selectLoad(ByteCodeInstruction::kLoad,
+                                                           ByteCodeInstruction::kLoadGlobal,
+                                                           ByteCodeInstruction::kLoadUniform),
+                                       count));
+        this->write8(count);
+        this->write8(location.fSlot);  // NOTE(review): no range assert on fSlot before write8
+    }
+}
+
+static inline uint32_t float_to_bits(float x) {  // returns the raw bytes of x as a uint32_t
+    uint32_t u;
+    memcpy(&u, &x, sizeof(uint32_t));  // byte copy instead of a pointer cast
+    return u;
+}
+
+void ByteCodeGenerator::writeFloatLiteral(const FloatLiteral& f) {  // push f as an immediate
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(float_to_bits(f.fValue));  // operand is the float's bit pattern
+}
+
+void ByteCodeGenerator::writeIntrinsicCall(const FunctionCall& c) {  // args are not emitted here
+    auto found = fIntrinsics.find(c.fFunction.fName);  // look the intrinsic up by name
+    if (found == fIntrinsics.end()) {  // unknown name: report an error and emit nothing
+        fErrors.error(c.fOffset, String::printf("Unsupported intrinsic: '%s'",
+                                                String(c.fFunction.fName).c_str()));
+        return;
+    }
+    int count = SlotCount(c.fArguments[0]->fType);  // NOTE(review): read before the size asserts
+    if (found->second.fIsSpecial) {
+        SpecialIntrinsic special = found->second.fValue.fSpecial;
+        switch (special) {
             case SpecialIntrinsic::kDot: {
                 SkASSERT(c.fArguments.size() == 2);
-                int count = SlotCount(c.fArguments[0]->fType);
-                ByteCode::Register left = this->next(count);
-                this->writeExpression(*c.fArguments[0], left);
-                ByteCode::Register right = this->next(count);
-                this->writeExpression(*c.fArguments[1], right);
-                ByteCode::Register product = this->next(count);
-                this->writeTypedInstruction(c.fType,
-                                            ByteCode::Instruction::kMultiplyIN,
-                                            ByteCode::Instruction::kMultiplyIN,
-                                            ByteCode::Instruction::kMultiplyFN);
-                this->write((uint8_t) count);
-                this->write(product);
-                this->write(left);
-                this->write(right);
-                ByteCode::Register total = product;
-                for (int i = 1; i < count; ++i) {
-                    this->writeTypedInstruction(c.fType,
-                                                ByteCode::Instruction::kAddI,
-                                                ByteCode::Instruction::kAddI,
-                                                ByteCode::Instruction::kAddF);
-                    ByteCode::Register sum = i == count - 1 ? result : this->next(1);
-                    this->write(sum);
-                    this->write(total);
-                    this->write(product + i);
-                    total = sum;
+                SkASSERT(count == SlotCount(c.fArguments[1]->fType));  // equal slot counts
+                this->write(vector_instruction(ByteCodeInstruction::kMultiplyF, count));
+                this->write8(count);  // one multiply, followed by the adds below
+                for (int i = count; i > 1; --i) {  // emits count - 1 kAddF instructions
+                    this->write(ByteCodeInstruction::kAddF);
+                    this->write8(1);
                 }
                 break;
             }
-            case SpecialIntrinsic::kInverse: {
-                SkASSERT(c.fArguments.size() == 1);
-                int count = SlotCount(c.fArguments[0]->fType);
-                ByteCode::Register arg = this->next(count);
-                this->writeExpression(*c.fArguments[0], arg);
-                switch (SlotCount(c.fArguments[0]->fType)) {
-                    case 4:  this->write(ByteCode::Instruction::kInverse2x2); break;
-                    case 9:  this->write(ByteCode::Instruction::kInverse3x3); break;
-                    case 16: this->write(ByteCode::Instruction::kInverse4x4); break;
-                    default: SkASSERT(false);
-                }
-                this->write(result);
-                this->write(arg);
-                break;
-            }
-            case SpecialIntrinsic::kPrint: {
-                SkASSERT(c.fArguments.size() == 1);
-                SkASSERT(SlotCount(c.fArguments[0]->fType) == 1);
-                ByteCode::Register arg = this->next(1);
-                this->writeExpression(*c.fArguments[0], arg);
-                this->write(ByteCode::Instruction::kPrint);
-                this->write(arg);
-                break;
-            }
+            default:  // any other special intrinsic is unexpected here
+                SkASSERT(false);
         }
     } else {
-        int count = SlotCount(c.fType);
-        std::vector<ByteCode::Register> argRegs;
-        for (const auto& expr : c.fArguments) {
-            SkASSERT(SlotCount(expr->fType) == count);
-            ByteCode::Register reg = this->next(count);
-            this->writeExpression(*expr, reg);
-            argRegs.push_back(reg);
-        }
-        for (int i = 0; i < count; ++i) {
-            this->write(intrinsic.fValue.fInstruction);
-            if (c.fType.fName != "void") {
-                this->write(result + i);
+        switch (found->second.fValue.fInstruction) {  // intrinsic maps to a single instruction
+            case ByteCodeInstruction::kCos:
+            case ByteCodeInstruction::kSin:
+            case ByteCodeInstruction::kTan:
+                SkASSERT(c.fArguments.size() > 0);
+                this->write(vector_instruction(found->second.fValue.fInstruction, count));
+                this->write8(count);  // these three are followed by a count byte
+                break;
+            case ByteCodeInstruction::kSqrt:  // unlike cos/sin/tan, no count byte is written
+                SkASSERT(c.fArguments.size() > 0);
+                this->write(vector_instruction(found->second.fValue.fInstruction, count));
+                break;
+            case ByteCodeInstruction::kInverse2x2: {
+                SkASSERT(c.fArguments.size() > 0);
+                auto op = ByteCodeInstruction::kInverse2x2;
+                switch (count) {  // choose the inverse variant from the argument's slot count
+                    case 4: break;  // float2x2
+                    case 9:  op = ByteCodeInstruction::kInverse3x3; break;
+                    case 16: op = ByteCodeInstruction::kInverse4x4; break;
+                    default: SkASSERT(false);
+                }
+                this->write(op);
+                break;
             }
-            for (ByteCode::Register arg : argRegs) {
-                this->write(arg + i);
-            }
+            default:  // table entries without a case above are unexpected
+                SkASSERT(false);
         }
     }
 }
 
-void ByteCodeGenerator::writeFunctionCall(const FunctionCall& c, ByteCode::Register result) {
-    auto found = fIntrinsics.find(c.fFunction.fName);
-    if (found != fIntrinsics.end()) {
-        return this->writeIntrinsicCall(c, found->second, result);
-    }
-    int argCount = c.fArguments.size();
-    std::vector<std::unique_ptr<LValue>> lvalues;
-    int parameterSlotCount = 0;
-    for (const auto& p : c.fFunction.fParameters) {
-        parameterSlotCount += SlotCount(p->fType);
-    }
-    ByteCode::Register argStart = this->next(parameterSlotCount);
-    ByteCode::Register nextArg = argStart;
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = c.fFunction.fParameters[i];
-        const auto& arg = c.fArguments[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            lvalues.emplace_back(this->getLValue(*arg));
-            lvalues.back()->load(nextArg);
-        } else {
-            this->writeExpression(*arg, nextArg);
-        }
-        nextArg.fIndex += SlotCount(arg->fType);
-    }
+void ByteCodeGenerator::writeFunctionCall(const FunctionCall& f) {
     // Find the index of the function we're calling. We explicitly do not allow calls to functions
     // before they're defined. This is an easy-to-understand rule that prevents recursion.
-    size_t idx;
-    for (idx = 0; idx < fFunctions.size(); ++idx) {
-        if (c.fFunction.matches(fFunctions[idx]->fDeclaration)) {
+    int idx = -1;  // stays -1 when no entry of fFunctions matches
+    for (size_t i = 0; i < fFunctions.size(); ++i) {
+        if (f.fFunction.matches(fFunctions[i]->fDeclaration)) {
+            idx = i;
             break;
         }
     }
-    if (idx > 255) {
-        fErrors.error(c.fOffset, "Function count limit exceeded");
-        return;
-    } else if (idx >= fOutput->fFunctions.size()) {
-        fErrors.error(c.fOffset, "Call to undefined function");
-        return;
-    }
-
-    this->write(ByteCode::Instruction::kCall);
-    this->write(result);
-    this->write((uint8_t) idx);
-    this->write(argStart);
-    nextArg = argStart;
-    auto lvalue = lvalues.begin();
-    for (int i = 0; i < argCount; ++i) {
-        const auto& param = c.fFunction.fParameters[i];
-        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
-            (*(lvalue++))->store(nextArg);
+    if (idx == -1) {  // no match: emit the arguments, then treat the call as an intrinsic
+        for (const auto& arg : f.fArguments) {
+            this->writeExpression(*arg);
         }
-        nextArg.fIndex += SlotCount(param->fType);
+        this->writeIntrinsicCall(f);
+        return;
     }
+
+
+    if (idx > 255) {  // idx is written below as a single byte
+        fErrors.error(f.fOffset, "Function count limit exceeded");
+        return;
+    } else if (idx >= (int) fFunctions.size()) {  // NOTE(review): idx < size here; looks unreachable
+        fErrors.error(f.fOffset, "Call to undefined function");
+        return;
+    }
+
+    // We may need to deal with out parameters, so the sequence is tricky
+    if (int returnCount = SlotCount(f.fType)) {  // skipped when the slot count is zero
+        this->write(ByteCodeInstruction::kReserve, returnCount);
+        this->write8(returnCount);
+    }
+
+    int argCount = f.fArguments.size();
+    std::vector<std::unique_ptr<LValue>> lvalues;  // one entry per out-parameter, in call order
+    for (int i = 0; i < argCount; ++i) {
+        const auto& param = f.fFunction.fParameters[i];
+        const auto& arg = f.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {  // out-param: load via its lvalue
+            lvalues.emplace_back(this->getLValue(*arg));
+            lvalues.back()->load();
+        } else {
+            this->writeExpression(*arg);
+        }
+    }
+
+    // The space used by the call is based on the callee, but it also unwinds all of that before
+    // we continue execution. We adjust our max stack depths below.
+    this->write(ByteCodeInstruction::kCall);
+    this->write8(idx);
+
+    const ByteCodeFunction* callee = fOutput->fFunctions[idx].get();  // NOTE(review): see below
+    fMaxLoopCount      = std::max(fMaxLoopCount,      fLoopCount      + callee->fLoopCount);
+    fMaxConditionCount = std::max(fMaxConditionCount, fConditionCount + callee->fConditionCount);
+    fMaxStackCount     = std::max(fMaxStackCount,     fStackCount     + callee->fLocalCount
+                                                                      + callee->fStackCount);
+
+    // After the called function returns, the stack will still contain our arguments. We have to
+    // pop them (storing any out parameters back to their lvalues as we go). We glob together slot
+    // counts for all parameters that aren't out-params, so we can pop them in one big chunk.
+    int popCount = 0;  // NOTE(review): idx indexes fFunctions above; assumes same order in fOutput
+    auto pop = [&]() {  // flushes the pending popCount and resets it to zero
+        if (popCount > 4) {  // more than 4 slots: kPopN with an explicit count byte
+            this->write(ByteCodeInstruction::kPopN, popCount);
+            this->write8(popCount);
+        } else if (popCount > 0) {  // 1-4 slots: vector form of kPop, no operand byte
+            this->write(vector_instruction(ByteCodeInstruction::kPop, popCount));
+        }
+        popCount = 0;
+    };
+
+    for (int i = argCount - 1; i >= 0; --i) {  // walk the parameters last to first
+        const auto& param = f.fFunction.fParameters[i];
+        const auto& arg = f.fArguments[i];
+        if (param->fModifiers.fFlags & Modifiers::kOut_Flag) {
+            pop();  // drop the batched non-out arguments before storing this out-param
+            lvalues.back()->store(true);
+            lvalues.pop_back();  // lvalues were appended in forward order, so consume from the back
+        } else {
+            popCount += SlotCount(arg->fType);
+        }
+    }
+    pop();  // flush whatever is still batched
 }
 
-void ByteCodeGenerator::incOrDec(Token::Kind op, Expression& operand, bool prefix,
-                                 ByteCode::Register result) {
-    SkASSERT(op == Token::Kind::PLUSPLUS || op == Token::Kind::MINUSMINUS);
-    std::unique_ptr<LValue> lvalue = this->getLValue(operand);
-    SkASSERT(SlotCount(operand.fType) == 1);
-    ByteCode::Register value;
-    if (prefix) {
-        value = this->next(1);
-    } else {
-        value = result;
-    }
-    lvalue->load(value);
-    ByteCode::Register one = this->next(1);
-    this->write(ByteCode::Instruction::kImmediate);
-    this->write(one);
-    if (type_category(operand.fType) == TypeCategory::kFloat) {
-        this->write(ByteCode::Immediate(1.0f));
-    } else {
-        this->write(ByteCode::Immediate((int32_t) 1));
-    }
-    if (op == Token::Kind::PLUSPLUS) {
-        this->writeTypedInstruction(operand.fType,
-                                    ByteCode::Instruction::kAddI,
-                                    ByteCode::Instruction::kAddI,
-                                    ByteCode::Instruction::kAddF);
-    } else {
-        this->writeTypedInstruction(operand.fType,
-                                    ByteCode::Instruction::kSubtractI,
-                                    ByteCode::Instruction::kSubtractI,
-                                    ByteCode::Instruction::kSubtractF);
-    }
-    if (prefix) {
-        this->write(result);
-        this->write(value);
-        this->write(one);
-        lvalue->store(result);
-    } else {
-        ByteCode::Register temp = this->next(1);
-        this->write(temp);
-        this->write(value);
-        this->write(one);
-        lvalue->store(temp);
-    }
+void ByteCodeGenerator::writeIntLiteral(const IntLiteral& i) {  // push i as a 32-bit immediate
+    this->write(ByteCodeInstruction::kPushImmediate);
+    this->write32(i.fValue);  // operand is the literal's value
 }
 
-void ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p,
-                                               ByteCode::Register result) {
-    this->incOrDec(p.fOperator, *p.fOperand, false, result);
+void ByteCodeGenerator::writeNullLiteral(const NullLiteral& n) {  // n is unused
+    // not yet implemented
+    abort();  // unconditional: reaching this function terminates the process
 }
 
-void ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p,
-                                              ByteCode::Register result) {
+bool ByteCodeGenerator::writePrefixExpression(const PrefixExpression& p, bool discard) {
     switch (p.fOperator) {
-        case Token::Kind::PLUSPLUS:
+        case Token::Kind::PLUSPLUS: // fall through
         case Token::Kind::MINUSMINUS: {
-            return this->incOrDec(p.fOperator, *p.fOperand, true, result);
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);  // only single-slot operands supported
+            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
+            lvalue->load();  // emit a load of the operand's current value
+            this->write(ByteCodeInstruction::kPushImmediate);  // the constant one, typed below
+            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
+            if (p.fOperator == Token::Kind::PLUSPLUS) {  // ++ adds the one, -- subtracts it
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            1);
+            } else {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            1);
+            }
+            lvalue->store(discard);  // the store receives the caller's discard flag
+            discard = false;  // the flag was handed to store(), so false is returned
+            break;
         }
         case Token::Kind::MINUS: {
-            ByteCode::Register src = this->next(SlotCount(p.fType));
-            this->writeExpression(*p.fOperand, src);
-            for (int i = 0; i < SlotCount(p.fType); ++i) {
-                this->writeTypedInstruction(p.fType,
-                                            ByteCode::Instruction::kNegateS,
-                                            ByteCode::Instruction::kNegateS,
-                                            ByteCode::Instruction::kNegateF);
-                this->write(result + i);
-                this->write(src + i);
-            }
+            this->writeExpression(*p.fOperand);  // emit the operand, then negate it
+            this->writeTypedInstruction(p.fType,
+                                        ByteCodeInstruction::kNegateI,
+                                        ByteCodeInstruction::kNegateI,
+                                        ByteCodeInstruction::kNegateF,
+                                        SlotCount(p.fOperand->fType),
+                                        false);
             break;
         }
         case Token::Kind::LOGICALNOT:
         case Token::Kind::BITWISENOT: {
-            ByteCode::Register src = this->next(SlotCount(p.fType));
-            this->writeExpression(*p.fOperand, src);
-            for (int i = 0; i < SlotCount(p.fType); ++i) {
-                this->write(ByteCode::Instruction::kNot);
-                this->write(result + i);
-                this->write(src + i);
-            }
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);  // only single-slot operands supported
+            SkDEBUGCODE(TypeCategory tc = type_category(p.fOperand->fType));
+            SkASSERT((p.fOperator == Token::Kind::LOGICALNOT && tc == TypeCategory::kBool) ||
+                     (p.fOperator == Token::Kind::BITWISENOT && (tc == TypeCategory::kSigned ||
+                                                                 tc == TypeCategory::kUnsigned)));
+            this->writeExpression(*p.fOperand);
+            this->write(ByteCodeInstruction::kNotB);  // one instruction serves both ! and ~
             break;
         }
         default:
             SkASSERT(false);
     }
+    return discard;  // unchanged for -, ! and ~; false after ++/--
 }
 
-void ByteCodeGenerator::writeSwizzle(const Swizzle& s, ByteCode::Register result) {
+bool ByteCodeGenerator::writePostfixExpression(const PostfixExpression& p, bool discard) {
+    switch (p.fOperator) {
+        case Token::Kind::PLUSPLUS: // fall through
+        case Token::Kind::MINUSMINUS: {
+            SkASSERT(SlotCount(p.fOperand->fType) == 1);  // only single-slot operands supported
+            std::unique_ptr<LValue> lvalue = this->getLValue(*p.fOperand);
+            lvalue->load();  // emit a load of the operand's current value
+            // If we're not supposed to discard the result, then make a copy *before* the +/-
+            if (!discard) {
+                this->write(ByteCodeInstruction::kDup);
+                this->write8(1);
+            }
+            this->write(ByteCodeInstruction::kPushImmediate);  // the constant one, typed below
+            this->write32(type_category(p.fType) == TypeCategory::kFloat ? float_to_bits(1.0f) : 1);
+            if (p.fOperator == Token::Kind::PLUSPLUS) {  // ++ adds the one, -- subtracts it
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddI,
+                                            ByteCodeInstruction::kAddF,
+                                            1);
+            } else {
+                this->writeTypedInstruction(p.fType,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractI,
+                                            ByteCodeInstruction::kSubtractF,
+                                            1);
+            }
+            // Always consume the result as part of the store
+            lvalue->store(true);
+            discard = false;  // presumably tells the caller not to pop again; confirm at call site
+            break;
+        }
+        default:  // only ++ and -- are handled as postfix operators
+            SkASSERT(false);
+    }
+    return discard;  // false after ++/--; the incoming value otherwise
+}
+
+void ByteCodeGenerator::writeSwizzle(const Swizzle& s) {  // emit code that produces s
     if (swizzle_is_simple(s)) {
-        this->writeVariableExpression(s, result);
+        this->writeVariableExpression(s);  // simple swizzles are emitted as a variable load
         return;
     }
-    ByteCode::Register base = this->writeExpression(*s.fBase);
-    for (int i = 0; i < (int) s.fComponents.size(); ++i) {
-        this->write(ByteCode::Instruction::kCopy);
-        this->write(result + i);
-        this->write(base + s.fComponents[i]);
+
+    switch (s.fBase->fKind) {
+        case Expression::kVariableReference_Kind: {  // base is a variable: load-swizzle from it
+            Location location = this->getLocation(*s.fBase);
+            this->write(location.selectLoad(ByteCodeInstruction::kLoadSwizzle,
+                                            ByteCodeInstruction::kLoadSwizzleGlobal,
+                                            ByteCodeInstruction::kLoadSwizzleUniform),
+                        s.fComponents.size());
+            this->write8(location.fSlot);  // operands: slot, component count, then components
+            this->write8(s.fComponents.size());
+            for (int c : s.fComponents) {  // one byte per selected component
+                this->write8(c);
+            }
+            break;
+        }
+        default:  // any other base: emit it first, then swizzle the emitted value
+            this->writeExpression(*s.fBase);
+            this->write(ByteCodeInstruction::kSwizzle,
+                        s.fComponents.size() - s.fBase->fType.columns());
+            this->write8(s.fBase->fType.columns());  // operands: base columns, count, components
+            this->write8(s.fComponents.size());
+            for (int c : s.fComponents) {  // one byte per selected component
+                this->write8(c);
+            }
     }
 }
 
-void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t,
-                                               ByteCode::Register result) {
+void ByteCodeGenerator::writeTernaryExpression(const TernaryExpression& t) {
     int count = SlotCount(t.fType);
     SkASSERT(count == SlotCount(t.fIfTrue->fType));
     SkASSERT(count == SlotCount(t.fIfFalse->fType));
 
-    ByteCode::Register test = this->writeExpression(*t.fTest);
-    this->write(ByteCode::Instruction::kMaskPush);
-    ++fConditionCount;
-    this->write(test);
-    ByteCode::Register ifTrue = this->writeExpression(*t.fIfTrue);
-    this->write(ByteCode::Instruction::kMaskNegate);
-    ByteCode::Register ifFalse = this->writeExpression(*t.fIfFalse);
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kMaskPop);
-    for (int i = 0; i < count; ++i) {
-        this->write(ByteCode::Instruction::kSelect);
-        this->write(result + i);
-        this->write(test);
-        this->write(ifTrue + i);
-        this->write(ifFalse + i);
-    }
+    this->writeExpression(*t.fTest);  // emit the condition first
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->writeExpression(*t.fIfTrue);  // both branches are always emitted, true branch first
+    this->write(ByteCodeInstruction::kMaskNegate);
+    this->writeExpression(*t.fIfFalse);
+    this->write(ByteCodeInstruction::kMaskBlend, count);
+    this->write8(count);  // operand: slot count shared by both branches (asserted above)
 }
 
-void ByteCodeGenerator::writeVariableExpression(const Expression& expr,
-                                                ByteCode::Register result) {
-    ByteCodeGenerator::Location location = this->getLocation(expr);
-    int count = SlotCount(expr.fType);
-    ByteCode::Instruction load = this->getLoadInstruction(location, this->getStorage(expr));
-    this->write(load, count);
-    this->write(result);
-    this->write(location);
-}
-
-void ByteCodeGenerator::writeExpression(const Expression& expr, ByteCode::Register result) {
-    switch (expr.fKind) {
-        case Expression::kBoolLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((int32_t) (((BoolLiteral&) expr).fValue ? -1 : 0)));
+void ByteCodeGenerator::writeExpression(const Expression& e, bool discard) {
+    switch (e.fKind) {
+        case Expression::kBinary_Kind:
+            discard = this->writeBinaryExpression((BinaryExpression&) e, discard);
             break;
-        }
-        case Expression::kBinary_Kind: {
-            this->writeBinaryExpression((BinaryExpression&) expr, result);
+        case Expression::kBoolLiteral_Kind:
+            this->writeBoolLiteral((BoolLiteral&) e);
             break;
-        }
-        case Expression::kConstructor_Kind: {
-            this->writeConstructor((Constructor&) expr, result);
+        case Expression::kConstructor_Kind:
+            this->writeConstructor((Constructor&) e);
             break;
-        }
         case Expression::kExternalFunctionCall_Kind:
-            this->writeExternalFunctionCall((ExternalFunctionCall&) expr, result);
+            this->writeExternalFunctionCall((ExternalFunctionCall&) e);
             break;
         case Expression::kExternalValue_Kind:
-            this->writeExternalValue((ExternalValueReference&) expr, result);
-            break;
-        case Expression::kFloatLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((float) ((FloatLiteral&) expr).fValue));
-            break;
-        }
-        case Expression::kFunctionCall_Kind: {
-            this->writeFunctionCall((FunctionCall&) expr, result);
-            break;
-        }
-        case Expression::kIntLiteral_Kind: {
-            this->write(ByteCode::Instruction::kImmediate);
-            this->write(result);
-            this->write(ByteCode::Immediate((int32_t) ((IntLiteral&) expr).fValue));
-            break;
-        }
-        case Expression::kPostfix_Kind:
-            this->writePostfixExpression((PostfixExpression&) expr, result);
-            break;
-        case Expression::kPrefix_Kind:
-            this->writePrefixExpression((PrefixExpression&) expr, result);
-            break;
-        case Expression::kSwizzle_Kind:
-            this->writeSwizzle((Swizzle&) expr, result);
-            break;
-        case Expression::kTernary_Kind:
-            this->writeTernaryExpression((TernaryExpression&) expr, result);
+            this->writeExternalValue((ExternalValueReference&) e);
             break;
         case Expression::kFieldAccess_Kind:
         case Expression::kIndex_Kind:
         case Expression::kVariableReference_Kind:
-            this->writeVariableExpression(expr, result);
+            this->writeVariableExpression(e);
+            break;
+        case Expression::kFloatLiteral_Kind:
+            this->writeFloatLiteral((FloatLiteral&) e);
+            break;
+        case Expression::kFunctionCall_Kind:
+            this->writeFunctionCall((FunctionCall&) e);
+            break;
+        case Expression::kIntLiteral_Kind:
+            this->writeIntLiteral((IntLiteral&) e);
+            break;
+        case Expression::kNullLiteral_Kind:
+            this->writeNullLiteral((NullLiteral&) e);
+            break;
+        case Expression::kPrefix_Kind:
+            discard = this->writePrefixExpression((PrefixExpression&) e, discard);
+            break;
+        case Expression::kPostfix_Kind:
+            discard = this->writePostfixExpression((PostfixExpression&) e, discard);
+            break;
+        case Expression::kSwizzle_Kind:
+            this->writeSwizzle((Swizzle&) e);
+            break;
+        case Expression::kTernary_Kind:
+            this->writeTernaryExpression((TernaryExpression&) e);
             break;
         default:
 #ifdef SK_DEBUG
-            ABORT("unsupported lvalue %s\n", expr.description().c_str());
+            printf("unsupported expression %s\n", e.description().c_str());
 #endif
-            break;
+            SkASSERT(false);
+    }
+    if (discard) {
+        int count = SlotCount(e.fType);
+        if (count > 4) {
+            this->write(ByteCodeInstruction::kPopN, count);
+            this->write8(count);
+        } else if (count != 0) {
+            this->write(vector_instruction(ByteCodeInstruction::kPop, count));
+        }
+        discard = false;
     }
 }
 
-ByteCode::Register ByteCodeGenerator::writeExpression(const Expression& expr) {
-    ByteCode::Register result = this->next(SlotCount(expr.fType));
-    this->writeExpression(expr, result);
-    return result;
+class ByteCodeExternalValueLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeExternalValueLValue(ByteCodeGenerator* generator, ExternalValue& value, int index)
+        : INHERITED(*generator)
+        , fCount(ByteCodeGenerator::SlotCount(value.type()))
+        , fIndex(index) {}
+
+    void load() override {
+        fGenerator.write(vector_instruction(ByteCodeInstruction::kReadExternal, fCount));
+        fGenerator.write8(fCount);
+        fGenerator.write8(fIndex);
+    }
+
+    void store(bool discard) override {
+        if (!discard) {
+            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, fCount));
+            fGenerator.write8(fCount);
+        }
+        fGenerator.write(vector_instruction(ByteCodeInstruction::kWriteExternal, fCount));
+        fGenerator.write8(fCount);
+        fGenerator.write8(fIndex);
+    }
+
+private:
+    typedef LValue INHERITED;
+
+    int fCount;
+
+    int fIndex;
+};
+
+class ByteCodeSwizzleLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeSwizzleLValue(ByteCodeGenerator* generator, const Swizzle& swizzle)
+        : INHERITED(*generator)
+        , fSwizzle(swizzle) {}
+
+    void load() override {
+        fGenerator.writeSwizzle(fSwizzle);
+    }
+
+    void store(bool discard) override {
+        int count = fSwizzle.fComponents.size();
+        if (!discard) {
+            fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+            fGenerator.write8(count);
+        }
+        ByteCodeGenerator::Location location = fGenerator.getLocation(*fSwizzle.fBase);
+        if (location.isOnStack()) {
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzleIndirect,
+                                                  ByteCodeInstruction::kStoreSwizzleIndirectGlobal),
+                             count);
+        } else {
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreSwizzle,
+                                                  ByteCodeInstruction::kStoreSwizzleGlobal),
+                             count);
+            fGenerator.write8(location.fSlot);
+        }
+        fGenerator.write8(count);
+        for (int c : fSwizzle.fComponents) {
+            fGenerator.write8(c);
+        }
+    }
+
+private:
+    const Swizzle& fSwizzle;
+
+    typedef LValue INHERITED;
+};
+
+class ByteCodeExpressionLValue : public ByteCodeGenerator::LValue {
+public:
+    ByteCodeExpressionLValue(ByteCodeGenerator* generator, const Expression& expr)
+        : INHERITED(*generator)
+        , fExpression(expr) {}
+
+    void load() override {
+        fGenerator.writeVariableExpression(fExpression);
+    }
+
+    void store(bool discard) override {
+        int count = ByteCodeGenerator::SlotCount(fExpression.fType);
+        if (!discard) {
+            if (count > 4) {
+                fGenerator.write(ByteCodeInstruction::kDupN, count);
+                fGenerator.write8(count);
+            } else {
+                fGenerator.write(vector_instruction(ByteCodeInstruction::kDup, count));
+                fGenerator.write8(count);
+            }
+        }
+        ByteCodeGenerator::Location location = fGenerator.getLocation(fExpression);
+        if (location.isOnStack() || count > 4) {
+            if (!location.isOnStack()) {
+                fGenerator.write(ByteCodeInstruction::kPushImmediate);
+                fGenerator.write32(location.fSlot);
+            }
+            fGenerator.write(location.selectStore(ByteCodeInstruction::kStoreExtended,
+                                                  ByteCodeInstruction::kStoreExtendedGlobal),
+                             count);
+            fGenerator.write8(count);
+        } else {
+            fGenerator.write(
+                    vector_instruction(location.selectStore(ByteCodeInstruction::kStore,
+                                                            ByteCodeInstruction::kStoreGlobal),
+                                       count));
+            fGenerator.write8(location.fSlot);
+        }
+    }
+
+private:
+    typedef LValue INHERITED;
+
+    const Expression& fExpression;
+};
+
+std::unique_ptr<ByteCodeGenerator::LValue> ByteCodeGenerator::getLValue(const Expression& e) {
+    switch (e.fKind) {
+        case Expression::kExternalValue_Kind: {
+            ExternalValue* value = ((ExternalValueReference&) e).fValue;
+            int index = fOutput->fExternalValues.size();
+            fOutput->fExternalValues.push_back(value);
+            SkASSERT(index <= 255);
+            return std::unique_ptr<LValue>(new ByteCodeExternalValueLValue(this, *value, index));
+        }
+        case Expression::kFieldAccess_Kind:
+        case Expression::kIndex_Kind:
+        case Expression::kVariableReference_Kind:
+            return std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e));
+        case Expression::kSwizzle_Kind: {
+            const Swizzle& s = (const Swizzle&) e;
+            return swizzle_is_simple(s)
+                    ? std::unique_ptr<LValue>(new ByteCodeExpressionLValue(this, e))
+                    : std::unique_ptr<LValue>(new ByteCodeSwizzleLValue(this, s));
+        }
+        case Expression::kTernary_Kind:
+        default:
+#ifdef SK_DEBUG
+            ABORT("unsupported lvalue %s\n", e.description().c_str());
+#endif
+            return nullptr;
+    }
 }
 
 void ByteCodeGenerator::writeBlock(const Block& b) {
@@ -1253,119 +1484,146 @@
     }
 }
 
+void ByteCodeGenerator::setBreakTargets() {
+    std::vector<DeferredLocation>& breaks = fBreakTargets.top();
+    for (DeferredLocation& b : breaks) {
+        b.set();
+    }
+    fBreakTargets.pop();
+}
+
+void ByteCodeGenerator::setContinueTargets() {
+    std::vector<DeferredLocation>& continues = fContinueTargets.top();
+    for (DeferredLocation& c : continues) {
+        c.set();
+    }
+    fContinueTargets.pop();
+}
+
+void ByteCodeGenerator::writeBreakStatement(const BreakStatement& b) {
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopBreak);
+}
+
+void ByteCodeGenerator::writeContinueStatement(const ContinueStatement& c) {
+    // TODO: Include BranchIfAllFalse to top-most LoopNext
+    this->write(ByteCodeInstruction::kLoopContinue);
+}
+
 void ByteCodeGenerator::writeDoStatement(const DoStatement& d) {
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    SkASSERT(fCode->size() < ByteCode::kPointerMax);
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t start = fCode->size();
     this->writeStatement(*d.fStatement);
-    ByteCode::Register test = this->writeExpression(*d.fTest);
-    this->write(ByteCode::Instruction::kLoopNext);
-    this->write(ByteCode::Instruction::kLoopMask);
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kLoopNext);
+    this->writeExpression(*d.fTest);
+    this->write(ByteCodeInstruction::kLoopMask);
+    // TODO: Could shorten this with kBranchIfAnyTrue
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeForStatement(const ForStatement& f) {
+    fContinueTargets.emplace();
+    fBreakTargets.emplace();
     if (f.fInitializer) {
         this->writeStatement(*f.fInitializer);
     }
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t start = fCode->size();
     if (f.fTest) {
-        ByteCode::Register test = this->writeExpression(*f.fTest);
-        this->write(ByteCode::Instruction::kLoopMask);
-        this->write(test);
+        this->writeExpression(*f.fTest);
+        this->write(ByteCodeInstruction::kLoopMask);
     }
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*f.fStatement);
-    this->write(ByteCode::Instruction::kLoopNext);
+    this->write(ByteCodeInstruction::kLoopNext);
     if (f.fNext) {
-        this->writeExpression(*f.fNext);
+        this->writeExpression(*f.fNext, true);
     }
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(start);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeIfStatement(const IfStatement& i) {
-    ByteCode::Register test = this->writeExpression(*i.fTest);
-    this->write(ByteCode::Instruction::kMaskPush);
-    ++fConditionCount;
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->writeExpression(*i.fTest);
+    this->write(ByteCodeInstruction::kMaskPush);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation falseLocation(this);
     this->writeStatement(*i.fIfTrue);
     falseLocation.set();
     if (i.fIfFalse) {
-        this->write(ByteCode::Instruction::kMaskNegate);
-        this->write(ByteCode::Instruction::kBranchIfAllFalse);
+        this->write(ByteCodeInstruction::kMaskNegate);
+        this->write(ByteCodeInstruction::kBranchIfAllFalse);
         DeferredLocation endLocation(this);
         this->writeStatement(*i.fIfFalse);
         endLocation.set();
     }
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kMaskPop);
+    this->write(ByteCodeInstruction::kMaskPop);
 }
 
-void ByteCodeGenerator::writeReturn(const ReturnStatement& r) {
-    if (fConditionCount) {
+void ByteCodeGenerator::writeReturnStatement(const ReturnStatement& r) {
+    if (fLoopCount || fConditionCount) {
         fErrors.error(r.fOffset, "return not allowed inside conditional or loop");
         return;
     }
-    if (r.fExpression) {
-        ByteCode::Register value = this->writeExpression(*r.fExpression);
-        this->write(ByteCode::Instruction::kReturnValue);
-        this->write(value);
-    }
-    else {
-        this->write(ByteCode::Instruction::kReturn);
-    }
+    int count = SlotCount(r.fExpression->fType);
+    this->writeExpression(*r.fExpression);
+
+    // Technically, kReturn also pops fOutput->fLocalCount values from the stack, but we haven't
+    // counted pushing those (they're outside the scope of our stack tracking). Instead, we
+    // account for those in writeFunction().
+
+    // This is all fine because we don't allow conditional returns, so we only return once anyway.
+    this->write(ByteCodeInstruction::kReturn, -count);
+    this->write8(count);
+}
+
+void ByteCodeGenerator::writeSwitchStatement(const SwitchStatement& r) {
+    // not yet implemented
+    abort();
 }
 
 void ByteCodeGenerator::writeVarDeclarations(const VarDeclarations& v) {
     for (const auto& declStatement : v.fVars) {
         const VarDeclaration& decl = (VarDeclaration&) *declStatement;
-        // we need to grab the location even if we don't use it, to ensure it
-        // has been allocated
-        ByteCodeGenerator::Location location = this->getLocation(*decl.fVar);
+        // we need to grab the location even if we don't use it, to ensure it has been allocated
+        Location location = this->getLocation(*decl.fVar);
         if (decl.fValue) {
-            ByteCode::Register src = this->writeExpression(*decl.fValue);
-            uint8_t count = (uint8_t) SlotCount(decl.fVar->fType);
-            this->write(ByteCode::Instruction::kStoreStackDirect, count);
-            this->write(location);
-            this->write(src);
+            this->writeExpression(*decl.fValue);
+            int count = SlotCount(decl.fValue->fType);
+            if (count > 4) {
+                this->write(ByteCodeInstruction::kPushImmediate);
+                this->write32(location.fSlot);
+                this->write(ByteCodeInstruction::kStoreExtended, count);
+                this->write8(count);
+            } else {
+                this->write(vector_instruction(ByteCodeInstruction::kStore, count));
+                this->write8(location.fSlot);
+            }
         }
     }
 }
 
 void ByteCodeGenerator::writeWhileStatement(const WhileStatement& w) {
-    this->write(ByteCode::Instruction::kLoopBegin);
-    ++fConditionCount;
-    SkASSERT(fCode->size() < ByteCode::kPointerMax);
-    ByteCode::Pointer start{(uint16_t) fCode->size()};
-    ByteCode::Register test = this->writeExpression(*w.fTest);
-    this->write(ByteCode::Instruction::kLoopMask);
-    this->write(test);
-    this->write(ByteCode::Instruction::kBranchIfAllFalse);
+    this->write(ByteCodeInstruction::kLoopBegin);
+    size_t cond = fCode->size();
+    this->writeExpression(*w.fTest);
+    this->write(ByteCodeInstruction::kLoopMask);
+    this->write(ByteCodeInstruction::kBranchIfAllFalse);
     DeferredLocation endLocation(this);
     this->writeStatement(*w.fStatement);
-    this->write(ByteCode::Instruction::kLoopNext);
-    this->write(ByteCode::Instruction::kBranch);
-    this->write(start);
+    this->write(ByteCodeInstruction::kLoopNext);
+    this->write(ByteCodeInstruction::kBranch);
+    this->write16(cond);
     endLocation.set();
-    --fConditionCount;
-    this->write(ByteCode::Instruction::kLoopEnd);
+    this->write(ByteCodeInstruction::kLoopEnd);
 }
 
 void ByteCodeGenerator::writeStatement(const Statement& s) {
@@ -1374,16 +1632,19 @@
             this->writeBlock((Block&) s);
             break;
         case Statement::kBreak_Kind:
-            this->write(ByteCode::Instruction::kBreak);
+            this->writeBreakStatement((BreakStatement&) s);
             break;
         case Statement::kContinue_Kind:
-            this->write(ByteCode::Instruction::kContinue);
+            this->writeContinueStatement((ContinueStatement&) s);
             break;
+        case Statement::kDiscard_Kind:
+            // not yet implemented
+            abort();
         case Statement::kDo_Kind:
             this->writeDoStatement((DoStatement&) s);
             break;
         case Statement::kExpression_Kind:
-            this->writeExpression(*((ExpressionStatement&) s).fExpression);
+            this->writeExpression(*((ExpressionStatement&) s).fExpression, true);
             break;
         case Statement::kFor_Kind:
             this->writeForStatement((ForStatement&) s);
@@ -1394,7 +1655,10 @@
         case Statement::kNop_Kind:
             break;
         case Statement::kReturn_Kind:
-            this->writeReturn((ReturnStatement&) s);
+            this->writeReturnStatement((ReturnStatement&) s);
+            break;
+        case Statement::kSwitch_Kind:
+            this->writeSwitchStatement((SwitchStatement&) s);
             break;
         case Statement::kVarDeclarations_Kind:
             this->writeVarDeclarations(*((VarDeclarationsStatement&) s).fDeclaration);
@@ -1403,80 +1667,18 @@
             this->writeWhileStatement((WhileStatement&) s);
             break;
         default:
-            ABORT("unsupported statement\n");
+            SkASSERT(false);
     }
 }
 
-void ByteCodeGenerator::writeFunction(const FunctionDefinition& f) {
-    fFunction = &f;
-    std::unique_ptr<ByteCodeFunction> result(new ByteCodeFunction(&f.fDeclaration));
-    result->fReturnSlotCount = SlotCount(f.fDeclaration.fReturnType);
+ByteCodeFunction::ByteCodeFunction(const FunctionDeclaration* declaration)
+        : fName(declaration->fName) {
     fParameterCount = 0;
-    fConditionCount = 0;
-    for (const auto& p : f.fDeclaration.fParameters) {
-        int count = SlotCount(p->fType);
-        bool isOut = ((p->fModifiers.fFlags & Modifiers::kOut_Flag) != 0);
-        result->fParameters.push_back(ByteCodeFunction::Parameter{count, isOut});
-        fParameterCount += count;
-    }
-    result->fParameterSlotCount = fParameterCount;
-    fCode = &result->fCode;
-    this->writeStatement(*f.fBody);
-    result->fStackSlotCount = fLocals.size();
-    if (f.fDeclaration.fReturnType.fName == "void") {
-        this->write(ByteCode::Instruction::kReturn);
-    } else {
-        this->write(ByteCode::Instruction::kAbort);
-    }
-    fOutput->fFunctions.push_back(std::move(result));
-    SkASSERT(fConditionCount == 0);
-}
-
-void ByteCodeGenerator::gatherUniforms(const Type& type, const String& name) {
-    if (type.kind() == Type::kOther_Kind) {
-        return;
-    } else if (type.kind() == Type::kStruct_Kind) {
-        for (const auto& f : type.fields()) {
-            this->gatherUniforms(*f.fType, name + "." + f.fName);
-        }
-    } else if (type.kind() == Type::kArray_Kind) {
-        for (int i = 0; i < type.columns(); ++i) {
-            this->gatherUniforms(type.componentType(), String::printf("%s[%d]", name.c_str(), i));
-        }
-    } else {
-        fOutput->fUniforms.push_back({ name, type_category(type), type.rows(), type.columns(),
-                                       fOutput->fUniformSlotCount });
-        fOutput->fUniformSlotCount += type.columns() * type.rows();
+    for (const auto& p : declaration->fParameters) {
+        int slots = ByteCodeGenerator::SlotCount(p->fType);
+        fParameters.push_back({ slots, (bool)(p->fModifiers.fFlags & Modifiers::kOut_Flag) });
+        fParameterCount += slots;
     }
 }
 
-bool ByteCodeGenerator::generateCode() {
-    fOutput->fGlobalSlotCount = 0;
-    fOutput->fUniformSlotCount = 0;
-    for (const auto& pe : fProgram) {
-        if (pe.fKind == ProgramElement::kVar_Kind) {
-            VarDeclarations& decl = (VarDeclarations&) pe;
-            for (const auto& v : decl.fVars) {
-                const Variable* declVar = ((VarDeclaration&) *v).fVar;
-                if (declVar->fModifiers.fLayout.fBuiltin >= 0 || is_in(*declVar)) {
-                    continue;
-                }
-                if (is_uniform(*declVar)) {
-                    this->gatherUniforms(declVar->fType, declVar->fName);
-                } else {
-                    fOutput->fGlobalSlotCount += SlotCount(declVar->fType);
-                }
-            }
-        }
-    }
-    for (const auto& pe : fProgram) {
-        if (pe.fKind == ProgramElement::kFunction_Kind) {
-            FunctionDefinition& f = (FunctionDefinition&) pe;
-            fFunctions.push_back(&f);
-            this->writeFunction(f);
-        }
-    }
-    return fErrors.errorCount() == 0;
 }
-
-} // namespace
diff --git a/src/sksl/SkSLByteCodeGenerator.h b/src/sksl/SkSLByteCodeGenerator.h
index 9afe1a5..4e3accd 100644
--- a/src/sksl/SkSLByteCodeGenerator.h
+++ b/src/sksl/SkSLByteCodeGenerator.h
@@ -54,20 +54,95 @@
 
 class ByteCodeGenerator : public CodeGenerator {
 public:
-    ByteCodeGenerator(const Program* program, ErrorReporter* errors, ByteCode* output);
+    class LValue {
+    public:
+        LValue(ByteCodeGenerator& generator)
+            : fGenerator(generator) {}
+
+        virtual ~LValue() {}
+
+        /**
+         * Stack before call: ... lvalue
+         * Stack after call: ... lvalue load
+         */
+        virtual void load() = 0;
+
+        /**
+         * Stack before call: ... lvalue value
+         * Stack after call: ...
+         */
+        virtual void store(bool discard) = 0;
+
+    protected:
+        ByteCodeGenerator& fGenerator;
+    };
+
+    ByteCodeGenerator(const Context* context, const Program* program, ErrorReporter* errors,
+                      ByteCode* output);
 
     bool generateCode() override;
 
+    void write8(uint8_t b);
+
+    void write16(uint16_t b);
+
+    void write32(uint32_t b);
+
+    void write(ByteCodeInstruction inst, int count = kUnusedStackCount);
+
+    /**
+     * Based on 'type', writes the s (signed), u (unsigned), or f (float) instruction.
+     */
+    void writeTypedInstruction(const Type& type, ByteCodeInstruction s, ByteCodeInstruction u,
+                               ByteCodeInstruction f, int count, bool writeCount = true);
+
+    static int SlotCount(const Type& type);
+
 private:
+    static constexpr int kUnusedStackCount = INT32_MAX;
+    static int StackUsage(ByteCodeInstruction, int count);
+
+    // reserves 16 bits in the output code, to be filled in later with an address once we determine
+    // it
+    class DeferredLocation {
+    public:
+        DeferredLocation(ByteCodeGenerator* generator)
+            : fGenerator(*generator)
+            , fOffset(generator->fCode->size()) {
+            generator->write16(0);
+        }
+
+#ifdef SK_DEBUG
+        ~DeferredLocation() {
+            SkASSERT(fSet);
+        }
+#endif
+
+        void set() {
+            int target = fGenerator.fCode->size();
+            SkASSERT(target <= 65535);
+            (*fGenerator.fCode)[fOffset] = target;
+            (*fGenerator.fCode)[fOffset + 1] = target >> 8;
+#ifdef SK_DEBUG
+            fSet = true;
+#endif
+        }
+
+    private:
+        ByteCodeGenerator& fGenerator;
+        size_t fOffset;
+#ifdef SK_DEBUG
+        bool fSet = false;
+#endif
+    };
+
     // Intrinsics which do not simply map to a single opcode
     enum class SpecialIntrinsic {
         kDot,
-        kInverse,
-        kPrint,
     };
 
     struct Intrinsic {
-        Intrinsic(ByteCode::Instruction instruction)
+        Intrinsic(ByteCodeInstruction instruction)
             : fIsSpecial(false)
             , fValue(instruction) {}
 
@@ -78,257 +153,201 @@
         bool fIsSpecial;
 
         union Value {
-            Value(ByteCode::Instruction instruction)
+            Value(ByteCodeInstruction instruction)
                 : fInstruction(instruction) {}
 
             Value(SpecialIntrinsic special)
                 : fSpecial(special) {}
 
-            ByteCode::Instruction fInstruction;
+            ByteCodeInstruction fInstruction;
             SpecialIntrinsic fSpecial;
         } fValue;
     };
 
-    class LValue {
-    public:
-        LValue(ByteCodeGenerator& generator)
-            : fGenerator(generator) {}
 
-        virtual ~LValue() {}
-
-        virtual void load(ByteCode::Register result) = 0;
-
-        virtual void store(ByteCode::Register src) = 0;
-
-    protected:
-        ByteCodeGenerator& fGenerator;
+    // Similar to Variable::Storage, but locals and parameters are grouped together, and globals
+    // are further subdivided into uniforms and other (writable) globals.
+    enum class Storage {
+        kLocal,    // includes parameters
+        kGlobal,   // non-uniform globals
+        kUniform,  // uniform globals
     };
 
     struct Location {
-        enum {
-            kPointer_Kind,
-            kRegister_Kind
-        } fKind;
+        int     fSlot;
+        Storage fStorage;
 
-        union {
-            ByteCode::Pointer fPointer;
-            ByteCode::Register fRegister;
-        };
+        // Not really invalid, but a "safe" placeholder to be more explicit at call-sites
+        static Location MakeInvalid() { return { 0, Storage::kLocal }; }
 
-        Location(ByteCode::Pointer p)
-            : fKind(kPointer_Kind)
-            , fPointer(p) {}
+        Location makeOnStack() { return { -1, fStorage }; }
+        bool isOnStack() const { return fSlot < 0; }
 
-        Location(ByteCode::Register r)
-            : fKind(kRegister_Kind)
-            , fRegister(r) {}
-
-        /**
-         * Returns this location offset by 'offset' bytes. For pointers, this is a compile-time
-         * operation, while for registers there will be CPU instructions output to handle the
-         * runtime calculation of the address.
-         */
-        Location offset(ByteCodeGenerator& generator, int offset) {
-            if (!offset) {
-                return *this;
-            }
-            if (fKind == kPointer_Kind) {
-                return Location(fPointer + offset);
-            }
-            ByteCode::Register a = generator.next(1);
-            generator.write(ByteCode::Instruction::kImmediate);
-            generator.write(a);
-            generator.write(ByteCode::Immediate{offset});
-            ByteCode::Register result = generator.next(1);
-            generator.write(ByteCode::Instruction::kAddI);
-            generator.write(result);
-            generator.write(fRegister);
-            generator.write(a);
-            return result;
+        Location operator+(int offset) {
+            SkASSERT(fSlot >= 0);
+            return { fSlot + offset, fStorage };
         }
 
-        /**
-         * Returns this location offset by the number of bytes stored in the 'offset' register. This
-         * will output the necessary CPU instructions to perform the math and return a new register
-         * location.
-         */
-        Location offset(ByteCodeGenerator& generator, ByteCode::Register offset) {
-            ByteCode::Register current;
-            switch (fKind) {
-                case kPointer_Kind:
-                    current = generator.next(1);
-                    generator.write(ByteCode::Instruction::kImmediate);
-                    generator.write(current);
-                    generator.write(ByteCode::Immediate{fPointer.fAddress});
-                    break;
-                case kRegister_Kind:
-                    current = fRegister;
+        ByteCodeInstruction selectLoad(ByteCodeInstruction local,
+                                       ByteCodeInstruction global,
+                                       ByteCodeInstruction uniform) const {
+            switch (fStorage) {
+                case Storage::kLocal:   return local;
+                case Storage::kGlobal:  return global;
+                case Storage::kUniform: return uniform;
             }
-            ByteCode::Register result = generator.next(1);
-            generator.write(ByteCode::Instruction::kAddI);
-            generator.write(result);
-            generator.write(current);
-            generator.write(offset);
-            return result;
+            SkUNREACHABLE;
+        }
+
+        ByteCodeInstruction selectStore(ByteCodeInstruction local,
+                                        ByteCodeInstruction global) const {
+            switch (fStorage) {
+                case Storage::kLocal:   return local;
+                case Storage::kGlobal:  return global;
+                case Storage::kUniform: ABORT("Trying to store to a uniform"); break;
+            }
+            return local;
         }
     };
 
-    // reserves 16 bits in the output code, to be filled in later with an address once we determine
-    // it
-    class DeferredLocation {
-    public:
-        explicit DeferredLocation(ByteCodeGenerator* generator)
-            : fGenerator(*generator)
-            , fOffset(generator->fCode->size()) {
-            generator->write(ByteCode::Pointer{65535});
-        }
-
-        void set() {
-            SkASSERT(fGenerator.fCode->size() <= ByteCode::kPointerMax);
-            static_assert(sizeof(ByteCode::Pointer) == 2,
-                          "failed assumption that ByteCode::Pointer is uint16_t");
-            void* dst = &(*fGenerator.fCode)[fOffset];
-            // ensure that the placeholder value 65535 hasn't been modified yet
-            SkASSERT(((uint8_t*) dst)[0] == 255 && ((uint8_t*) dst)[1] == 255);
-            ByteCode::Pointer target{(uint16_t) fGenerator.fCode->size()};
-            memcpy(dst, &target, sizeof(target));
-        }
-
-    private:
-        ByteCodeGenerator& fGenerator;
-        size_t fOffset;
-    };
-
-    template<typename T>
-    void write(T value) {
-        size_t n = fCode->size();
-        fCode->resize(n + sizeof(value));
-        memcpy(fCode->data() + n, &value, sizeof(value));
-    }
-
-    ByteCode::Register next(int slotCount);
-
-    void write(ByteCode::Instruction inst, int count);
-
     /**
-     * Based on 'type', writes the s (signed), u (unsigned), or f (float) instruction.
+     * Returns the local slot into which var should be stored, allocating a new slot if it has not
+     * already been assigned one. Compound variables (e.g. vectors) will consume more than one local
+     * slot, with the getLocation return value indicating where the first element should be stored.
      */
-    void writeTypedInstruction(const Type& type, ByteCode::Instruction s, ByteCode::Instruction u,
-                               ByteCode::Instruction f);
-
-    ByteCode::Instruction getLoadInstruction(Location location, Variable::Storage storage);
-
-    ByteCode::Instruction getStoreInstruction(Location location, Variable::Storage storage);
-
-    static int SlotCount(const Type& type);
-
     Location getLocation(const Variable& var);
 
+    /**
+     * As above, but computes the (possibly dynamic) address of an expression involving indexing &
+     * field access. If the address is known, it's returned. If not, an on-stack Location (slot
+     * -1, see isOnStack()) is returned, and the address will be left on the top of the stack.
+     */
     Location getLocation(const Expression& expr);
 
-    Variable::Storage getStorage(const Expression& expr);
-
-    std::unique_ptr<LValue> getLValue(const Expression& expr);
-
-    void writeFunction(const FunctionDefinition& f);
-
-    // For compound values, the result argument specifies the first component. Subsequent components
-    // will be in subsequent registers.
-
-    void writeBinaryInstruction(const Type& operandType, ByteCode::Register left,
-                                ByteCode::Register right, ByteCode::Instruction s,
-                                ByteCode::Instruction u, ByteCode::Instruction f,
-                                ByteCode::Register result);
-
-    void writeVectorBinaryInstruction(const Type& operandType, ByteCode::Register left,
-                                      ByteCode::Register right, ByteCode::Instruction s,
-                                      ByteCode::Instruction u, ByteCode::Instruction f,
-                                      ByteCode::Register result);
-
-    void writeBinaryExpression(const BinaryExpression& expr, ByteCode::Register result);
-
-    void writeConstructor(const Constructor& c, ByteCode::Register result);
-
-    void writeExternalFunctionCall(const ExternalFunctionCall& f, ByteCode::Register result);
-
-    void writeExternalValue(const ExternalValueReference& e, ByteCode::Register result);
-
-    void writeIntrinsicCall(const FunctionCall& c, Intrinsic intrinsic, ByteCode::Register result);
-
-    void writeFunctionCall(const FunctionCall& c, ByteCode::Register result);
-
-    void incOrDec(Token::Kind op, Expression& operand, bool prefix, ByteCode::Register result);
-
-    void writePostfixExpression(const PostfixExpression& p, ByteCode::Register result);
-
-    void writePrefixExpression(const PrefixExpression& p, ByteCode::Register result);
-
-    void writeSwizzle(const Swizzle& s, ByteCode::Register result);
-
-    void writeTernaryExpression(const TernaryExpression& t, ByteCode::Register result);
-
-    void writeVariableExpression(const Expression& e, ByteCode::Register result);
-
-    void writeExpression(const Expression& expr, ByteCode::Register result);
-
-    ByteCode::Register writeExpression(const Expression& expr);
-
-    void writeBlock(const Block& b);
-
-    void writeDoStatement(const DoStatement& d);
-
-    void writeForStatement(const ForStatement& f);
-
-    void writeIfStatement(const IfStatement& i);
-
-    void writeReturn(const ReturnStatement& r);
-
-    void writeVarDeclarations(const VarDeclarations& v);
-
-    void writeWhileStatement(const WhileStatement& w);
-
-    void writeStatement(const Statement& s);
-
     void gatherUniforms(const Type& type, const String& name);
 
+    std::unique_ptr<ByteCodeFunction> writeFunction(const FunctionDefinition& f);
+
+    void writeVarDeclarations(const VarDeclarations& decl);
+
+    void writeVariableExpression(const Expression& expr);
+
+    void writeExpression(const Expression& expr, bool discard = false);
+
+    /**
+     * Pushes whatever values are required by the lvalue onto the stack, and returns an LValue
+     * permitting loads and stores to it.
+     */
+    std::unique_ptr<LValue> getLValue(const Expression& expr);
+
+    void writeIntrinsicCall(const FunctionCall& c);
+
+    void writeFunctionCall(const FunctionCall& c);
+
+    void writeConstructor(const Constructor& c);
+
+    void writeExternalFunctionCall(const ExternalFunctionCall& c);
+
+    void writeExternalValue(const ExternalValueReference& r);
+
+    void writeSwizzle(const Swizzle& swizzle);
+
+    bool writeBinaryExpression(const BinaryExpression& b, bool discard);
+
+    void writeTernaryExpression(const TernaryExpression& t);
+
+    void writeNullLiteral(const NullLiteral& n);
+
+    bool writePrefixExpression(const PrefixExpression& p, bool discard);
+
+    bool writePostfixExpression(const PostfixExpression& p, bool discard);
+
+    void writeBoolLiteral(const BoolLiteral& b);
+
+    void writeIntLiteral(const IntLiteral& i);
+
+    void writeFloatLiteral(const FloatLiteral& f);
+
+    void writeStatement(const Statement& s);
+
+    void writeBlock(const Block& b);
+
+    void writeBreakStatement(const BreakStatement& b);
+
+    void writeContinueStatement(const ContinueStatement& c);
+
+    void writeIfStatement(const IfStatement& stmt);
+
+    void writeForStatement(const ForStatement& f);
+
+    void writeWhileStatement(const WhileStatement& w);
+
+    void writeDoStatement(const DoStatement& d);
+
+    void writeSwitchStatement(const SwitchStatement& s);
+
+    void writeReturnStatement(const ReturnStatement& r);
+
+    // updates the current set of breaks to branch to the current location
+    void setBreakTargets();
+
+    // updates the current set of continues to branch to the current location
+    void setContinueTargets();
+
+    void enterLoop() {
+        fLoopCount++;
+        fMaxLoopCount = std::max(fMaxLoopCount, fLoopCount);
+    }
+
+    void exitLoop() {
+        SkASSERT(fLoopCount > 0);
+        fLoopCount--;
+    }
+
+    void enterCondition() {
+        fConditionCount++;
+        fMaxConditionCount = std::max(fMaxConditionCount, fConditionCount);
+    }
+
+    void exitCondition() {
+        SkASSERT(fConditionCount > 0);
+        fConditionCount--;
+    }
+
+    const Context& fContext;
+
     ByteCode* fOutput;
 
-    int fNextRegister = 0;
-
     const FunctionDefinition* fFunction;
 
-    std::vector<const FunctionDefinition*> fFunctions;
-
     std::vector<uint8_t>* fCode;
 
     std::vector<const Variable*> fLocals;
 
-    int fParameterCount;
+    std::stack<std::vector<DeferredLocation>> fContinueTargets;
 
+    std::stack<std::vector<DeferredLocation>> fBreakTargets;
+
+    std::vector<const FunctionDefinition*> fFunctions;
+
+    int fParameterCount;
+    int fStackCount;
+    int fMaxStackCount;
+
+    int fLoopCount;
+    int fMaxLoopCount;
     int fConditionCount;
+    int fMaxConditionCount;
 
     const std::unordered_map<String, Intrinsic> fIntrinsics;
 
     friend class DeferredLocation;
-    friend class ByteCodeExternalValueLValue;
-    friend class ByteCodeSimpleLValue;
+    friend class ByteCodeExpressionLValue;
     friend class ByteCodeSwizzleLValue;
 
     typedef CodeGenerator INHERITED;
 };
 
-template<>
-inline void ByteCodeGenerator::write(ByteCodeGenerator::Location loc) {
-    switch (loc.fKind) {
-        case ByteCodeGenerator::Location::kPointer_Kind:
-            this->write(loc.fPointer);
-            break;
-        case ByteCodeGenerator::Location::kRegister_Kind:
-            this->write(loc.fRegister);
-            break;
-    }
-}
-
 }
 
 #endif
diff --git a/src/sksl/SkSLCompiler.cpp b/src/sksl/SkSLCompiler.cpp
index 92c0c03..08bac70 100644
--- a/src/sksl/SkSLCompiler.cpp
+++ b/src/sksl/SkSLCompiler.cpp
@@ -78,17 +78,14 @@
 namespace SkSL {
 
 static void grab_intrinsics(std::vector<std::unique_ptr<ProgramElement>>* src,
-               std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>>* target) {
-    for (auto iter = src->begin(); iter != src->end(); ) {
-        std::unique_ptr<ProgramElement>& element = *iter;
+               std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>>* target) {
+    for (auto& element : *src) {
         switch (element->fKind) {
             case ProgramElement::kFunction_Kind: {
                 FunctionDefinition& f = (FunctionDefinition&) *element;
-                SkASSERT(f.fDeclaration.fBuiltin);
-                String key = f.fDeclaration.declaration();
-                SkASSERT(target->find(key) == target->end());
-                (*target)[key] = std::make_pair(std::move(element), false);
-                iter = src->erase(iter);
+                StringFragment name = f.fDeclaration.fName;
+                SkASSERT(target->find(name) == target->end());
+                (*target)[name] = std::make_pair(std::move(element), false);
                 break;
             }
             case ProgramElement::kEnum_Kind: {
@@ -96,7 +93,6 @@
                 StringFragment name = e.fTypeName;
                 SkASSERT(target->find(name) == target->end());
                 (*target)[name] = std::make_pair(std::move(element), false);
-                iter = src->erase(iter);
                 break;
             }
             default:
@@ -283,13 +279,11 @@
     this->processIncludeFile(Program::kPipelineStage_Kind, SKSL_PIPELINE_INCLUDE,
                              strlen(SKSL_PIPELINE_INCLUDE), fGpuSymbolTable, &fPipelineInclude,
                              &fPipelineSymbolTable);
+    std::vector<std::unique_ptr<ProgramElement>> interpIntrinsics;
     this->processIncludeFile(Program::kGeneric_Kind, SKSL_INTERP_INCLUDE,
                              strlen(SKSL_INTERP_INCLUDE), symbols, &fInterpreterInclude,
                              &fInterpreterSymbolTable);
-    grab_intrinsics(&fInterpreterInclude, &fInterpreterIntrinsics);
-    // need to hang on to the source so that FunctionDefinition.fSource pointers in this file
-    // remain valid
-    fInterpreterIncludeSource = std::move(fIRGenerator->fFile);
+    grab_intrinsics(&interpIntrinsics, &fInterpreterIntrinsics);
 }
 
 Compiler::~Compiler() {
@@ -1639,8 +1633,10 @@
     }
     fSource = program.fSource.get();
     std::unique_ptr<ByteCode> result(new ByteCode());
-    ByteCodeGenerator cg(&program, this, result.get());
-    if (cg.generateCode()) {
+    ByteCodeGenerator cg(fContext.get(), &program, this, result.get());
+    bool success = cg.generateCode();
+    fSource = nullptr;
+    if (success) {
         return result;
     }
 #else
diff --git a/src/sksl/SkSLCompiler.h b/src/sksl/SkSLCompiler.h
index 6aebb78..65f7a70 100644
--- a/src/sksl/SkSLCompiler.h
+++ b/src/sksl/SkSLCompiler.h
@@ -215,8 +215,8 @@
 
     Position position(int offset);
 
-    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>> fGPUIntrinsics;
-    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>> fInterpreterIntrinsics;
+    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>> fGPUIntrinsics;
+    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>> fInterpreterIntrinsics;
     std::unique_ptr<ASTFile> fGpuIncludeSource;
     std::shared_ptr<SymbolTable> fGpuSymbolTable;
     std::vector<std::unique_ptr<ProgramElement>> fVertexInclude;
@@ -227,7 +227,6 @@
     std::shared_ptr<SymbolTable> fGeometrySymbolTable;
     std::vector<std::unique_ptr<ProgramElement>> fPipelineInclude;
     std::shared_ptr<SymbolTable> fPipelineSymbolTable;
-    std::unique_ptr<ASTFile> fInterpreterIncludeSource;
     std::vector<std::unique_ptr<ProgramElement>> fInterpreterInclude;
     std::shared_ptr<SymbolTable> fInterpreterSymbolTable;
 
diff --git a/src/sksl/SkSLIRGenerator.cpp b/src/sksl/SkSLIRGenerator.cpp
index 5e4a8d3..cec91c0 100644
--- a/src/sksl/SkSLIRGenerator.cpp
+++ b/src/sksl/SkSLIRGenerator.cpp
@@ -1773,7 +1773,7 @@
                                               const FunctionDeclaration& function,
                                               std::vector<std::unique_ptr<Expression>> arguments) {
     if (function.fBuiltin) {
-        auto found = fIntrinsics->find(function.declaration());
+        auto found = fIntrinsics->find(function.fName);
         if (found != fIntrinsics->end() && !found->second.second) {
             found->second.second = true;
             const FunctionDeclaration* old = fCurrentFunction;
@@ -2188,7 +2188,7 @@
         }
     }
     fErrors.error(base->fOffset, "type '" + base->fType.displayName() + "' does not have a "
-                                 "field named '" + field + "'");
+                                 "field named '" + field + "'");
     return nullptr;
 }
 
diff --git a/src/sksl/SkSLIRGenerator.h b/src/sksl/SkSLIRGenerator.h
index 2895a34..51381e0 100644
--- a/src/sksl/SkSLIRGenerator.h
+++ b/src/sksl/SkSLIRGenerator.h
@@ -158,7 +158,7 @@
     std::shared_ptr<SymbolTable> fSymbolTable;
     // Symbols which have definitions in the include files. The bool tells us whether this
     // intrinsic has been included already.
-    std::map<String, std::pair<std::unique_ptr<ProgramElement>, bool>>* fIntrinsics = nullptr;
+    std::map<StringFragment, std::pair<std::unique_ptr<ProgramElement>, bool>>* fIntrinsics = nullptr;
     int fLoopLevel;
     int fSwitchLevel;
     ErrorReporter& fErrors;
diff --git a/src/sksl/SkSLInterpreter.h b/src/sksl/SkSLInterpreter.h
deleted file mode 100644
index c4f889b..0000000
--- a/src/sksl/SkSLInterpreter.h
+++ /dev/null
@@ -1,1720 +0,0 @@
-/*
- * Copyright 2020 Google LLC
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "include/private/GrTypesPriv.h" // GrAlignTo
-#include "src/core/SkUtils.h" // sk_unaligned_load
-#include "src/sksl/SkSLByteCode.h"
-#include "src/sksl/SkSLExternalValue.h"
-
-#include <stack>
-
-#ifndef SKSL_INTERPRETER
-#define SKSL_INTERPRETER
-
-namespace SkSL {
-
-// GCC and Clang support the "labels as values" extension which we need to implement the interpreter
-// using threaded code. Otherwise, we fall back to using a switch statement in a for loop.
-#if defined(__GNUC__) || defined(__clang__)
-    #define SKSL_THREADED_CODE
-#endif
-
-#ifdef SKSL_THREADED_CODE
-    #define LABEL(name) name:
-    #ifdef TRACE
-        #define NEXT()                                   \
-            {                                            \
-                const uint8_t* trace_ip = ip;            \
-                printf("%d: ", (int) (trace_ip - code)); \
-                disassemble(&trace_ip);                  \
-            }                                            \
-            goto *labels[(int) read<ByteCode::Instruction>(&ip)]
-    #else
-        #define NEXT() goto *labels[(int) read<ByteCode::Instruction>(&ip)]
-    #endif
-#else
-    #define LABEL(name) case ByteCode::Instruction::name:
-    #define NEXT() continue
-#endif
-
-// If you trip this assert, it means that the order of the opcodes listed in ByteCodeInstruction
-// does not match the order of the opcodes listed in the 'labels' array in innerRun().
-#define CHECK_LABEL(name) \
-    SkASSERT(labels[(int) ByteCode::Instruction::name] == &&name)
-
-template<typename T>
-static T read(const uint8_t** ip) {
-    *ip += sizeof(T);
-    return sk_unaligned_load<T>(*ip - sizeof(T));
-}
-
-#define BINARY_OP(inst, src, result, op)                                  \
-    LABEL(inst) {                                                         \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);        \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);          \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);          \
-        fRegisters[target.fIndex].result = fRegisters[src1.fIndex].src op \
-                                           fRegisters[src2.fIndex].src;   \
-        NEXT();                                                           \
-    }
-
-#define MASKED_BINARY_OP(inst, src, result, op)                                         \
-    LABEL(inst) {                                                                       \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                      \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                        \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                        \
-        auto m = mask();                                                                \
-        for (int i = 0; i < width; ++i) {                                               \
-            if (m[i]) {                                                                 \
-                fRegisters[target.fIndex].result[i] = fRegisters[src1.fIndex].src[i] op \
-                                                   fRegisters[src2.fIndex].src[i];      \
-            }                                                                           \
-        }                                                                               \
-        NEXT();                                                                         \
-    }
-
-#define MASKED_VECTOR_BINARY_OP(inst, src, result, op)                                             \
-    LABEL(inst) {                                                                                  \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                                 \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                                   \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                                   \
-        auto m = mask();                                                                           \
-        for (int i = 0; i < width; ++i) {                                                          \
-            if (m[i]) {                                                                            \
-                fRegisters[target.fIndex].result[i] = fRegisters[src1.fIndex].src[i] op            \
-                                                      fRegisters[src2.fIndex].src[i];              \
-            }                                                                                      \
-        }                                                                                          \
-        NEXT();                                                                                    \
-    }                                                                                              \
-    LABEL(inst ## N) {                                                                             \
-        uint8_t count = read<uint8_t>(&ip);                                                        \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                                 \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                                   \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                                   \
-        auto m = mask();                                                                           \
-        for (int i = 0; i < width; ++i) {                                                          \
-            if (m[i]) {                                                                            \
-                for (int j = 0; j < count; ++j) {                                                  \
-                    fRegisters[target.fIndex + j].result[i] = fRegisters[src1.fIndex + j].src[i]   \
-                                                            op fRegisters[src2.fIndex + j].src[i]; \
-                }                                                                                  \
-            }                                                                                      \
-        }                                                                                          \
-        NEXT();                                                                                    \
-    }
-
-#define VECTOR_BINARY_OP(inst, src, result, op)                                       \
-    LABEL(inst) {                                                                     \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                    \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                      \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                      \
-        fRegisters[target.fIndex].result = fRegisters[src1.fIndex].src op             \
-                                               fRegisters[src2.fIndex].src;           \
-        NEXT();                                                                       \
-    }                                                                                 \
-    LABEL(inst ## N) {                                                                \
-        uint8_t count = read<uint8_t>(&ip);                                           \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                    \
-        ByteCode::Register src1 = read<ByteCode::Register>(&ip);                      \
-        ByteCode::Register src2 = read<ByteCode::Register>(&ip);                      \
-        for (int i = 0; i < count; ++i) {                                             \
-            fRegisters[target.fIndex + i].result = fRegisters[src1.fIndex + i].src op \
-                                                   fRegisters[src2.fIndex + i].src;   \
-        }                                                                             \
-        NEXT();                                                                       \
-    }
-
-#define VECTOR_UNARY_FN(inst, fn)                                                       \
-    LABEL(inst) {                                                                       \
-        ByteCode::Register target = read<ByteCode::Register>(&ip);                      \
-        ByteCode::Register src = read<ByteCode::Register>(&ip);                         \
-        for (int i = 0; i < width; ++ i) {                                              \
-            fRegisters[target.fIndex].fFloat[i] = fn(fRegisters[src.fIndex].fFloat[i]); \
-        }                                                                               \
-        NEXT();                                                                         \
-    }
-
-#define DISASSEMBLE_0(inst, name) \
-    case ByteCode::Instruction::inst: printf(name "\n"); break;
-
-#define DISASSEMBLE_1(inst, name)                                   \
-    case ByteCode::Instruction::inst:                               \
-        printf(name " $%d\n", read<ByteCode::Register>(ip).fIndex); \
-        break;
-
-#define DISASSEMBLE_UNARY(inst, name)                             \
-    case ByteCode::Instruction::inst: {                           \
-        ByteCode::Register target = read<ByteCode::Register>(ip); \
-        ByteCode::Register src = read<ByteCode::Register>(ip);    \
-        printf(name " $%d -> $%d\n", src.fIndex, target.fIndex);  \
-        break;                                                    \
-    }
-
-#define DISASSEMBLE_VECTOR_UNARY(inst, name)                              \
-    case ByteCode::Instruction::inst: {                                   \
-        ByteCode::Register target = read<ByteCode::Register>(ip);         \
-        ByteCode::Register src = read<ByteCode::Register>(ip);            \
-        printf(name " $%d -> $%d\n", src.fIndex, target.fIndex);          \
-        break;                                                            \
-    }                                                                     \
-    case ByteCode::Instruction::inst ## N: {                              \
-        uint8_t count = read<uint8_t>(ip);                                \
-        ByteCode::Register target = read<ByteCode::Register>(ip);         \
-        ByteCode::Register src = read<ByteCode::Register>(ip);            \
-        printf(name "%d $%d -> $%d\n", count, src.fIndex, target.fIndex); \
-        break;                                                            \
-    }
-
-#define DISASSEMBLE_BINARY(inst, name)                                              \
-    case ByteCode::Instruction::inst: {                                             \
-        ByteCode::Register target = read<ByteCode::Register>(ip);                   \
-        ByteCode::Register src1 = read<ByteCode::Register>(ip);                     \
-        ByteCode::Register src2 = read<ByteCode::Register>(ip);                     \
-        printf(name " $%d, $%d -> $%d\n", src1.fIndex, src2.fIndex, target.fIndex); \
-        break;                                                                      \
-    }
-
-#define DISASSEMBLE_VECTOR_BINARY(inst, name)                                                \
-    case ByteCode::Instruction::inst: {                                                      \
-        ByteCode::Register target = read<ByteCode::Register>(ip);                            \
-        ByteCode::Register src1 = read<ByteCode::Register>(ip);                              \
-        ByteCode::Register src2 = read<ByteCode::Register>(ip);                              \
-        printf(name " $%d, $%d -> $%d\n", src1.fIndex, src2.fIndex, target.fIndex);          \
-        break;                                                                               \
-    }                                                                                        \
-    case ByteCode::Instruction::inst ## N: {                                                 \
-        uint8_t count = read<uint8_t>(ip);                                                   \
-        ByteCode::Register target = read<ByteCode::Register>(ip);                            \
-        ByteCode::Register src1 = read<ByteCode::Register>(ip);                              \
-        ByteCode::Register src2 = read<ByteCode::Register>(ip);                              \
-        printf(name "%d $%d, $%d -> $%d\n", count, src1.fIndex, src2.fIndex, target.fIndex); \
-        break;                                                                               \
-    }
-
-/**
- * Operates on vectors of the specified width, so creating an Interpreter<16> means that all inputs,
- * outputs, and internal calculations will be 16-wide vectors.
- */
-template<int width>
-class Interpreter {
-public:
-    using Vector = ByteCode::Vector<width>;
-    using VectorI = skvx::Vec<width, int32_t>;
-    using VectorF = skvx::Vec<width, float>;
-
-    Interpreter(std::unique_ptr<ByteCode> code)
-        : fCode(std::move(code)) {
-        // C++ doesn't guarantee proper alignment of naively-allocated vectors, so we can't have the
-        // registers and memory directly as fields of this object without jumping through some hoops
-        // during Interpreter allocation and deallocation. We simplify this by having the backing
-        // store be a separate allocation, jumping through the hoops ourselves rather than require
-        // Interpreter's clients to be aware of alignment.
-        // Ideally, we could use std::aligned_alloc here, but as of this writing it is not available
-        // on some compilers despite claiming to support C++17.
-        fBackingStore = calloc(sizeof(Vector), MEMORY_SIZE + REGISTER_COUNT + 1);
-        fMemory = (Vector*) GrAlignTo((size_t) fBackingStore, alignof(Vector));
-        fRegisters = fMemory + MEMORY_SIZE;
-    }
-
-    ~Interpreter() {
-        free(fBackingStore);
-    }
-
-    void setUniforms(const float uniforms[]) {
-        for (int i = 0; i < fCode->getUniformSlotCount(); ++i) {
-            fMemory[fCode->getGlobalSlotCount() + i].fFloat = VectorF(uniforms[i]);
-        }
-    }
-
-    /**
-     * Returns true on success and stores a pointer to the first slot of the result into outResult.
-     * This pointer is only guaranteed to be valid until the next run() call.
-     */
-     bool run(const ByteCodeFunction* f, Vector args[], Vector** outResult) {
-        SkASSERT(f);
-        VectorI condStack[MASK_STACK_SIZE];
-        memset(&condStack[0], 255, sizeof(condStack[0]));
-        VectorI maskStack[MASK_STACK_SIZE];
-        memset(&maskStack[0], 255, sizeof(maskStack[0]));
-        VectorI loopStack[LOOP_STACK_SIZE];
-        memset(&loopStack[0], 255, sizeof(loopStack[0]));
-        VectorI continueStack[LOOP_STACK_SIZE];
-        memset(&continueStack[0], 0, sizeof(continueStack[0]));
-        Vector* stack = fMemory + MEMORY_SIZE;
-        int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
-        stack -= stackCount;
-        if (f->fParameterSlotCount) {
-            memcpy(stack, args, f->fParameterSlotCount * sizeof(Vector));
-        }
-        Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
-        if (this->innerRun(f, context, 0, outResult)) {
-            int slot = 0;
-            for (const auto& p : f->fParameters) {
-                if (p.fIsOutParameter) {
-                    memcpy(&args[slot], &stack[slot], p.fSlotCount * sizeof(Vector));
-                }
-                slot += p.fSlotCount;
-            }
-            return true;
-        }
-        return false;
-    }
-
-    /**
-     * Invokes the specified function with the given arguments, 'count' times. 'args' and
-     * 'outResult' are accepted and returned in structure-of-arrays form:
-     *   args[0] points to an array of N values, the first argument for each invocation
-     *   ...
-     *   args[argCount - 1] points to an array of N values, the last argument for each invocation
-     *
-     * All values in 'args', 'outResult', and 'uniforms' are 32-bit values (typically floats,
-     * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
-     * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
-     */
-    bool runStriped(const ByteCodeFunction* f, int count, float* args[],
-                    float* outResult[] = nullptr) {
-        SkASSERT(f);
-        Vector* stack = fMemory + MEMORY_SIZE;
-        int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
-        stack -= stackCount;
-        VectorI condStack[MASK_STACK_SIZE];
-        VectorI maskStack[MASK_STACK_SIZE];
-        VectorI loopStack[LOOP_STACK_SIZE];
-        VectorI continueStack[LOOP_STACK_SIZE];
-        Vector* innerResult = nullptr;
-        Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
-        for (int i = 0; i < count; i += width) {
-            int lanes = std::min(width, count - i);
-            size_t size = lanes * sizeof(float);
-            memset(&maskStack[0], 255, sizeof(maskStack[0]));
-            memset(&loopStack[0], 255, sizeof(loopStack[0]));
-            for (int j = lanes; j < width; ++j) {
-                maskStack[0][j] = 0;
-                loopStack[0][j] = 0;
-            }
-            memset(&continueStack[0], 0, sizeof(continueStack[0]));
-            for (int j = 0; j < f->fParameterSlotCount; ++j) {
-                memcpy(stack + j, &args[j][i], size);
-            }
-            if (!this->innerRun(f, context, i, &innerResult)) {
-                return false;
-            }
-            int slot = 0;
-            for (const auto& p : f->fParameters) {
-                if (p.fIsOutParameter) {
-                    for (int j = 0; j < p.fSlotCount; ++j) {
-                        memcpy(&args[slot + j][i], stack + slot + j, size);
-                    }
-                }
-                slot += p.fSlotCount;
-            }
-            if (outResult) {
-                for (int j = 0; j < f->fReturnSlotCount; ++j) {
-                    memcpy(&outResult[j][i], &innerResult[j], size);
-                }
-            }
-        }
-        return true;
-    }
-
-    const ByteCode& getCode() {
-        return *fCode;
-    }
-
-private:
-    static constexpr size_t REGISTER_COUNT = 1024;
-
-    static constexpr size_t MEMORY_SIZE = 1024;
-
-    static constexpr size_t MASK_STACK_SIZE = 64;
-
-    static constexpr size_t LOOP_STACK_SIZE = 16;
-
-    struct StackFrame {
-        StackFrame(const ByteCodeFunction* function, const uint8_t* ip, const int stackSlotCount,
-                   Vector* parameters, Vector* returnValue)
-            : fFunction(function)
-            , fIP(ip)
-            , fStackSlotCount(stackSlotCount)
-            , fParameters(parameters)
-            , fReturnValue(returnValue) {}
-
-        const ByteCodeFunction* fFunction;
-        const uint8_t* fIP;
-        const int fStackSlotCount;
-        Vector* fParameters;
-        Vector* fReturnValue;
-    };
-
-    struct Context {
-        Context(Vector* memory, Vector* stack, VectorI* condStack, VectorI* maskStack,
-                VectorI* loopStack,VectorI* continueStack)
-            : fMemory(memory)
-            , fStack(stack)
-            , fCondStack(condStack)
-            , fMaskStack(maskStack)
-            , fLoopStack(loopStack)
-            , fContinueStack(continueStack) {}
-
-        Vector* fMemory;
-        Vector* fStack;
-        VectorI* fCondStack;
-        VectorI* fMaskStack;
-        VectorI* fLoopStack;
-        VectorI* fContinueStack;
-        std::stack<StackFrame> fCallStack;
-    };
-
-    // $x = register
-    // @x = memory cell
-    // &x = parameter
-    void disassemble(const uint8_t** ip) {
-        ByteCode::Instruction inst = read<ByteCode::Instruction>(ip);
-        switch (inst) {
-            DISASSEMBLE_VECTOR_BINARY(kAddF, "addF")
-            DISASSEMBLE_VECTOR_BINARY(kAddI, "addI")
-            DISASSEMBLE_BINARY(kAnd, "and")
-            DISASSEMBLE_BINARY(kCompareEQF, "compare eqF")
-            DISASSEMBLE_BINARY(kCompareEQI, "compare eqI")
-            DISASSEMBLE_BINARY(kCompareNEQF, "compare neqF")
-            DISASSEMBLE_BINARY(kCompareNEQI, "compare neqI")
-            DISASSEMBLE_BINARY(kCompareGTF, "compare gtF")
-            DISASSEMBLE_BINARY(kCompareGTS, "compare gtS")
-            DISASSEMBLE_BINARY(kCompareGTU, "compare gtU")
-            DISASSEMBLE_BINARY(kCompareGTEQF, "compare gteqF")
-            DISASSEMBLE_BINARY(kCompareGTEQS, "compare gteqS")
-            DISASSEMBLE_BINARY(kCompareGTEQU, "compare gteqU")
-            DISASSEMBLE_BINARY(kCompareLTF, "compare ltF")
-            DISASSEMBLE_BINARY(kCompareLTS, "compare ltS")
-            DISASSEMBLE_BINARY(kCompareLTU, "compare ltU")
-            DISASSEMBLE_BINARY(kCompareLTEQF, "compare lteqF")
-            DISASSEMBLE_BINARY(kCompareLTEQS, "compare lteqS")
-            DISASSEMBLE_BINARY(kCompareLTEQU, "compare lteqU")
-            DISASSEMBLE_VECTOR_BINARY(kSubtractF, "subF")
-            DISASSEMBLE_VECTOR_BINARY(kSubtractI, "subI")
-            DISASSEMBLE_VECTOR_BINARY(kDivideF, "divF")
-            DISASSEMBLE_VECTOR_BINARY(kDivideS, "divS")
-            DISASSEMBLE_VECTOR_BINARY(kDivideU, "divU")
-            DISASSEMBLE_VECTOR_BINARY(kRemainderS, "remS")
-            DISASSEMBLE_VECTOR_BINARY(kRemainderU, "remU")
-            DISASSEMBLE_VECTOR_BINARY(kRemainderF, "remF")
-            DISASSEMBLE_VECTOR_BINARY(kMultiplyF, "mulF")
-            DISASSEMBLE_VECTOR_BINARY(kMultiplyI, "mulI")
-            DISASSEMBLE_BINARY(kOr, "or")
-            DISASSEMBLE_BINARY(kXor, "xor")
-            DISASSEMBLE_0(kNop, "nop")
-            case ByteCode::Instruction::kBoundsCheck: {
-                ByteCode::Register r = read<ByteCode::Register>(ip);
-                int length = read<int>(ip);
-                printf("boundsCheck 0 <= $%d < %d\n", r.fIndex, length);
-                break;
-            }
-            case ByteCode::Instruction::kBranch:
-                printf("branch %d\n", read<ByteCode::Pointer>(ip).fAddress);
-                break;
-            case ByteCode::Instruction::kBranchIfAllFalse:
-                printf("branchIfAllFalse %d\n", read<ByteCode::Pointer>(ip).fAddress);
-                break;
-            DISASSEMBLE_0(kBreak, "break")
-            case ByteCode::Instruction::kCall: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                uint8_t idx = read<uint8_t>(ip);
-                ByteCode::Register args = read<ByteCode::Register>(ip);
-                ByteCodeFunction* f = fCode->fFunctions[idx].get();
-                printf("call %s($%d...) -> $%d", f->fName.c_str(), args.fIndex, target.fIndex);
-                printf("\n");
-                break;
-            }
-            case ByteCode::Instruction::kCallExternal: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                uint8_t idx = read<uint8_t>(ip);
-                uint8_t targetCount = read<uint8_t>(ip);
-                ByteCode::Register args = read<ByteCode::Register>(ip);
-                uint8_t argCount = read<uint8_t>(ip);
-                ExternalValue* ev = fCode->fExternalValues[idx];
-                printf("callExternal %s($%d(%d)...) -> $%d(%d)", String(ev->fName).c_str(),
-                        args.fIndex, argCount, target.fIndex, targetCount);
-                printf("\n");
-                break;
-            }
-            DISASSEMBLE_0(kContinue, "continue")
-            DISASSEMBLE_UNARY(kCopy, "copy")
-            DISASSEMBLE_UNARY(kCos, "cos")
-            DISASSEMBLE_UNARY(kFloatToSigned, "FtoS")
-            DISASSEMBLE_UNARY(kFloatToUnsigned, "FtoU")
-            case ByteCode::Instruction::kImmediate: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Immediate src = read<ByteCode::Immediate>(ip);
-                printf("immediate (%d | %f) -> $%d\n", src.fInt, src.fFloat, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_UNARY(kInverse2x2, "inverse2x2")
-            DISASSEMBLE_UNARY(kInverse3x3, "inverse3x3")
-            DISASSEMBLE_UNARY(kInverse4x4, "inverse4x4")
-            DISASSEMBLE_VECTOR_UNARY(kLoad, "load")
-            case ByteCode::Instruction::kLoadDirect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadDirect @%d -> $%d\n", src.fAddress, target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kLoadDirectN: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadDirect%d @%d -> $%d\n", count, src.fAddress, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_VECTOR_UNARY(kLoadParameter, "loadParameter")
-            case ByteCode::Instruction::kLoadParameterDirect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadParameterDirect &%d -> $%d\n", src.fAddress, target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kLoadParameterDirectN: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadParameterDirect%d &%d -> $%d\n", count, src.fAddress, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_VECTOR_UNARY(kLoadStack, "loadStack")
-            case ByteCode::Instruction::kLoadStackDirect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadStackDirect @%d -> $%d\n", src.fAddress, target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kLoadStackDirectN: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
-                printf("loadStackDirect%d @%d -> $%d\n", count, src.fAddress, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_0(kLoopBegin, "loopBegin")
-            DISASSEMBLE_0(kLoopEnd, "loopEnd")
-            DISASSEMBLE_1(kLoopMask, "loopMask")
-            DISASSEMBLE_0(kLoopNext, "loopNext")
-            DISASSEMBLE_0(kMaskNegate, "maskNegate")
-            DISASSEMBLE_0(kMaskPop, "maskPop")
-            DISASSEMBLE_1(kMaskPush, "maskPush")
-            case ByteCode::Instruction::kMatrixMultiply: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register left = read<ByteCode::Register>(ip);
-                ByteCode::Register right = read<ByteCode::Register>(ip);
-                uint8_t leftColsAndRightRows = read<uint8_t>(ip);
-                uint8_t leftRows = read<uint8_t>(ip);
-                uint8_t rightColumns = read<uint8_t>(ip);
-                printf("matrixMultiply $%d, $%d, %d, %d, %d -> $%d\n", left.fIndex, right.fIndex,
-                       leftColsAndRightRows, leftRows, rightColumns, target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kMatrixToMatrix: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                uint8_t srcColumns = read<uint8_t>(ip);
-                uint8_t srcRows = read<uint8_t>(ip);
-                uint8_t dstColumns = read<uint8_t>(ip);
-                uint8_t dstRows = read<uint8_t>(ip);
-                printf("matrixToMatrix $%d, %dx%d to %dx%d -> $%d\n", src.fIndex, srcColumns,
-                       srcRows, dstColumns, dstRows, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_UNARY(kNegateF, "negateF")
-            DISASSEMBLE_UNARY(kNegateS, "negateS")
-            DISASSEMBLE_UNARY(kNot, "not")
-            case ByteCode::Instruction::kReadExternal: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                uint8_t count = read<uint8_t>(ip);
-                uint8_t index = read<uint8_t>(ip);
-                printf("readExternal %d, %d -> $%d\n", count, index, target.fIndex);
-                break;
-            }
-            DISASSEMBLE_1(kPrint, "print")
-            DISASSEMBLE_0(kReturn, "return")
-            DISASSEMBLE_1(kReturnValue, "returnValue")
-            case ByteCode::Instruction::kScalarToMatrix: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                uint8_t columns = read<uint8_t>(ip);
-                uint8_t rows = read<uint8_t>(ip);
-                printf("scalarToMatrix $%d, %dx%d -> $%d\n", src.fIndex, columns, rows,
-                       target.fIndex);
-                break;
-            }
-            case ByteCode::Instruction::kSelect: {
-                ByteCode::Register target = read<ByteCode::Register>(ip);
-                ByteCode::Register test = read<ByteCode::Register>(ip);
-                ByteCode::Register src1 = read<ByteCode::Register>(ip);
-                ByteCode::Register src2 = read<ByteCode::Register>(ip);
-                printf("select $%d, $%d, $%d -> %d\n", test.fIndex, src1.fIndex, src2.fIndex,
-                       target.fIndex);
-                break;
-            }
-            DISASSEMBLE_BINARY(kShiftLeft, "shiftLeft")
-            DISASSEMBLE_BINARY(kShiftRightS, "shiftRightS")
-            DISASSEMBLE_BINARY(kShiftRightU, "shiftRightU")
-            DISASSEMBLE_UNARY(kSignedToFloat, "signedToFloat")
-            DISASSEMBLE_UNARY(kSin, "sin")
-            case ByteCode::Instruction::kSplat: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("splat%d $%d -> @%d\n", count, src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_UNARY(kSqrt, "sqrt")
-            DISASSEMBLE_VECTOR_UNARY(kStore, "store")
-            case ByteCode::Instruction::kStoreDirect: {
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("store $%d -> @%d\n", src.fIndex, target.fAddress);
-                break;
-            }
-            case ByteCode::Instruction::kStoreDirectN: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("store%d $%d -> @%d\n", count, src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_VECTOR_UNARY(kStoreParameter, "storeParameter")
-            case ByteCode::Instruction::kStoreParameterDirect: {
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("storeParameterDirect $%d -> &%d\n", src.fIndex, target.fAddress);
-                break;
-            }
-            case ByteCode::Instruction::kStoreParameterDirectN: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("storeParameterDirect%d $%d -> &%d\n", count, src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_VECTOR_UNARY(kStoreStack, "storeStack")
-            case ByteCode::Instruction::kStoreStackDirect: {
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("storeStackDirect $%d -> @%d\n", src.fIndex, target.fAddress);
-                break;
-            }
-            case ByteCode::Instruction::kStoreStackDirectN: {
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("storeStackDirect%d $%d -> @%d\n", count, src.fIndex, target.fAddress);
-                break;
-            }
-            DISASSEMBLE_UNARY(kTan, "tan")
-            DISASSEMBLE_UNARY(kUnsignedToFloat, "unsignedToFloat")
-            case ByteCode::Instruction::kWriteExternal: {
-                uint8_t index = read<uint8_t>(ip);
-                uint8_t count = read<uint8_t>(ip);
-                ByteCode::Register src = read<ByteCode::Register>(ip);
-                printf("writeExternal $%d, %d -> %d\n", src.fIndex, count, index);
-                break;
-            }
-            default:
-                printf("unsupported: %d\n", (int) inst);
-                SkASSERT(false);
-        }
-    }
-
-    static Vector VecMod(Vector x, Vector y) {
-        return Vector(x.fFloat - skvx::trunc(x.fFloat / y.fFloat) * y.fFloat);
-    }
-
-    #define CHECK_STACK_BOUNDS(address)                              \
-        SkASSERT(context.fStack + address >= fMemory &&              \
-                 context.fStack + address <= fMemory + MEMORY_SIZE)
-
-    static void Inverse2x2(Vector* in, Vector* out) {
-        VectorF a = in[0].fFloat,
-                b = in[1].fFloat,
-                c = in[2].fFloat,
-                d = in[3].fFloat;
-        VectorF idet = VectorF(1) / (a*d - b*c);
-        out[0].fFloat = d * idet;
-        out[1].fFloat = -b * idet;
-        out[2].fFloat = -c * idet;
-        out[3].fFloat = a * idet;
-    }
-
-    static void Inverse3x3(Vector* in, Vector* out) {
-        VectorF a11 = in[0].fFloat, a12 = in[3].fFloat, a13 = in[6].fFloat,
-                a21 = in[1].fFloat, a22 = in[4].fFloat, a23 = in[7].fFloat,
-                a31 = in[2].fFloat, a32 = in[5].fFloat, a33 = in[8].fFloat;
-        VectorF idet = VectorF(1) / (a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
-                                     a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31);
-        out[0].fFloat = (a22 * a33 - a23 * a32) * idet;
-        out[1].fFloat = (a23 * a31 - a21 * a33) * idet;
-        out[2].fFloat = (a21 * a32 - a22 * a31) * idet;
-        out[3].fFloat = (a13 * a32 - a12 * a33) * idet;
-        out[4].fFloat = (a11 * a33 - a13 * a31) * idet;
-        out[5].fFloat = (a12 * a31 - a11 * a32) * idet;
-        out[6].fFloat = (a12 * a23 - a13 * a22) * idet;
-        out[7].fFloat = (a13 * a21 - a11 * a23) * idet;
-        out[8].fFloat = (a11 * a22 - a12 * a21) * idet;
-    }
-
-
-    static void Inverse4x4(Vector* in, Vector* out) {
-        #define inf(index)  in[index].fFloat
-        #define outf(index) out[index].fFloat
-        VectorF a00 = inf(0), a10 = inf(4), a20 = inf( 8), a30 = inf(12),
-                a01 = inf(1), a11 = inf(5), a21 = inf( 9), a31 = inf(13),
-                a02 = inf(2), a12 = inf(6), a22 = inf(10), a32 = inf(14),
-                a03 = inf(3), a13 = inf(7), a23 = inf(11), a33 = inf(15);
-
-        VectorF b00 = a00 * a11 - a01 * a10,
-                b01 = a00 * a12 - a02 * a10,
-                b02 = a00 * a13 - a03 * a10,
-                b03 = a01 * a12 - a02 * a11,
-                b04 = a01 * a13 - a03 * a11,
-                b05 = a02 * a13 - a03 * a12,
-                b06 = a20 * a31 - a21 * a30,
-                b07 = a20 * a32 - a22 * a30,
-                b08 = a20 * a33 - a23 * a30,
-                b09 = a21 * a32 - a22 * a31,
-                b10 = a21 * a33 - a23 * a31,
-                b11 = a22 * a33 - a23 * a32;
-
-        VectorF idet = VectorF(1) /
-                            (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
-
-        b00 *= idet;
-        b01 *= idet;
-        b02 *= idet;
-        b03 *= idet;
-        b04 *= idet;
-        b05 *= idet;
-        b06 *= idet;
-        b07 *= idet;
-        b08 *= idet;
-        b09 *= idet;
-        b10 *= idet;
-        b11 *= idet;
-
-        outf( 0) = a11 * b11 - a12 * b10 + a13 * b09;
-        outf( 1) = a02 * b10 - a01 * b11 - a03 * b09;
-        outf( 2) = a31 * b05 - a32 * b04 + a33 * b03;
-        outf( 3) = a22 * b04 - a21 * b05 - a23 * b03;
-        outf( 4) = a12 * b08 - a10 * b11 - a13 * b07;
-        outf( 5) = a00 * b11 - a02 * b08 + a03 * b07;
-        outf( 6) = a32 * b02 - a30 * b05 - a33 * b01;
-        outf( 7) = a20 * b05 - a22 * b02 + a23 * b01;
-        outf( 8) = a10 * b10 - a11 * b08 + a13 * b06;
-        outf( 9) = a01 * b08 - a00 * b10 - a03 * b06;
-        outf(10) = a30 * b04 - a31 * b02 + a33 * b00;
-        outf(11) = a21 * b02 - a20 * b04 - a23 * b00;
-        outf(12) = a11 * b07 - a10 * b09 - a12 * b06;
-        outf(13) = a00 * b09 - a01 * b07 + a02 * b06;
-        outf(14) = a31 * b01 - a30 * b03 - a32 * b00;
-        outf(15) = a20 * b03 - a21 * b01 + a22 * b00;
-        #undef inf
-        #undef outf
-    }
-
-    bool innerRun(const ByteCodeFunction* f, Context context, int baseIndex, Vector** outResult) {
-#ifdef SKSL_THREADED_CODE
-        static const void* labels[] = {
-            // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as
-            // values" extension. If you add anything to this array, be sure to add the
-            // corresponding CHECK_LABEL() assert below.
-            &&kNop,
-            &&kAbort,
-            &&kAddF,
-            &&kAddFN,
-            &&kAddI,
-            &&kAddIN,
-            &&kAnd,
-            &&kBoundsCheck,
-            &&kBranch,
-            &&kBranchIfAllFalse,
-            &&kBreak,
-            &&kCall,
-            &&kCallExternal,
-            &&kCompareEQF,
-            &&kCompareEQI,
-            &&kCompareNEQF,
-            &&kCompareNEQI,
-            &&kCompareGTF,
-            &&kCompareGTS,
-            &&kCompareGTU,
-            &&kCompareGTEQF,
-            &&kCompareGTEQS,
-            &&kCompareGTEQU,
-            &&kCompareLTF,
-            &&kCompareLTS,
-            &&kCompareLTU,
-            &&kCompareLTEQF,
-            &&kCompareLTEQS,
-            &&kCompareLTEQU,
-            &&kContinue,
-            &&kCopy,
-            &&kCos,
-            &&kDivideF,
-            &&kDivideFN,
-            &&kDivideS,
-            &&kDivideSN,
-            &&kDivideU,
-            &&kDivideUN,
-            &&kFloatToSigned,
-            &&kFloatToUnsigned,
-            &&kImmediate,
-            &&kInverse2x2,
-            &&kInverse3x3,
-            &&kInverse4x4,
-            &&kLoad,
-            &&kLoadN,
-            &&kLoadDirect,
-            &&kLoadDirectN,
-            &&kLoadParameter,
-            &&kLoadParameterN,
-            &&kLoadParameterDirect,
-            &&kLoadParameterDirectN,
-            &&kLoadStack,
-            &&kLoadStackN,
-            &&kLoadStackDirect,
-            &&kLoadStackDirectN,
-            &&kLoopBegin,
-            &&kLoopEnd,
-            &&kLoopMask,
-            &&kLoopNext,
-            &&kMaskNegate,
-            &&kMaskPop,
-            &&kMaskPush,
-            &&kMatrixMultiply,
-            &&kMatrixToMatrix,
-            &&kMultiplyF,
-            &&kMultiplyFN,
-            &&kMultiplyI,
-            &&kMultiplyIN,
-            &&kNegateF,
-            &&kNegateS,
-            &&kNot,
-            &&kOr,
-            &&kPrint,
-            &&kReadExternal,
-            &&kRemainderF,
-            &&kRemainderFN,
-            &&kRemainderS,
-            &&kRemainderSN,
-            &&kRemainderU,
-            &&kRemainderUN,
-            &&kReturn,
-            &&kReturnValue,
-            &&kScalarToMatrix,
-            &&kSelect,
-            &&kShiftLeft,
-            &&kShiftRightS,
-            &&kShiftRightU,
-            &&kSignedToFloat,
-            &&kSin,
-            &&kSplat,
-            &&kSqrt,
-            &&kStore,
-            &&kStoreN,
-            &&kStoreDirect,
-            &&kStoreDirectN,
-            &&kStoreParameter,
-            &&kStoreParameterN,
-            &&kStoreParameterDirect,
-            &&kStoreParameterDirectN,
-            &&kStoreStack,
-            &&kStoreStackN,
-            &&kStoreStackDirect,
-            &&kStoreStackDirectN,
-            &&kSubtractF,
-            &&kSubtractFN,
-            &&kSubtractI,
-            &&kSubtractIN,
-            &&kTan,
-            &&kUnsignedToFloat,
-            &&kWriteExternal,
-            &&kXor
-        };
-        CHECK_LABEL(kNop);
-        CHECK_LABEL(kAbort);
-        CHECK_LABEL(kAddF);
-        CHECK_LABEL(kAddI);
-        CHECK_LABEL(kAnd);
-        CHECK_LABEL(kBoundsCheck);
-        CHECK_LABEL(kBranch);
-        CHECK_LABEL(kBranchIfAllFalse);
-        CHECK_LABEL(kBreak);
-        CHECK_LABEL(kCall);
-        CHECK_LABEL(kCallExternal);
-        CHECK_LABEL(kCompareEQF);
-        CHECK_LABEL(kCompareEQI);
-        CHECK_LABEL(kCompareNEQF);
-        CHECK_LABEL(kCompareNEQI);
-        CHECK_LABEL(kCompareGTF);
-        CHECK_LABEL(kCompareGTS);
-        CHECK_LABEL(kCompareGTU);
-        CHECK_LABEL(kCompareGTEQF);
-        CHECK_LABEL(kCompareGTEQS);
-        CHECK_LABEL(kCompareGTEQU);
-        CHECK_LABEL(kCompareLTF);
-        CHECK_LABEL(kCompareLTS);
-        CHECK_LABEL(kCompareLTU);
-        CHECK_LABEL(kCompareLTEQF);
-        CHECK_LABEL(kCompareLTEQS);
-        CHECK_LABEL(kCompareLTEQU);
-        CHECK_LABEL(kContinue);
-        CHECK_LABEL(kCopy);
-        CHECK_LABEL(kCos);
-        CHECK_LABEL(kDivideF);
-        CHECK_LABEL(kDivideFN);
-        CHECK_LABEL(kDivideS);
-        CHECK_LABEL(kDivideSN);
-        CHECK_LABEL(kDivideU);
-        CHECK_LABEL(kDivideUN);
-        CHECK_LABEL(kFloatToSigned);
-        CHECK_LABEL(kFloatToUnsigned);
-        CHECK_LABEL(kImmediate);
-        CHECK_LABEL(kInverse2x2);
-        CHECK_LABEL(kInverse3x3);
-        CHECK_LABEL(kInverse4x4);
-        CHECK_LABEL(kLoad);
-        CHECK_LABEL(kLoadN);
-        CHECK_LABEL(kLoadDirect);
-        CHECK_LABEL(kLoadDirectN);
-        CHECK_LABEL(kLoadParameter);
-        CHECK_LABEL(kLoadParameterN);
-        CHECK_LABEL(kLoadParameterDirect);
-        CHECK_LABEL(kLoadParameterDirectN);
-        CHECK_LABEL(kLoadStack);
-        CHECK_LABEL(kLoadStackN);
-        CHECK_LABEL(kLoadStackDirect);
-        CHECK_LABEL(kLoadStackDirectN);
-        CHECK_LABEL(kLoopBegin);
-        CHECK_LABEL(kLoopEnd);
-        CHECK_LABEL(kLoopMask);
-        CHECK_LABEL(kLoopNext);
-        CHECK_LABEL(kMaskNegate);
-        CHECK_LABEL(kMaskPop);
-        CHECK_LABEL(kMaskPush);
-        CHECK_LABEL(kMatrixMultiply);
-        CHECK_LABEL(kMatrixToMatrix);
-        CHECK_LABEL(kMultiplyF);
-        CHECK_LABEL(kMultiplyFN);
-        CHECK_LABEL(kMultiplyI);
-        CHECK_LABEL(kMultiplyIN);
-        CHECK_LABEL(kNegateF);
-        CHECK_LABEL(kNegateS);
-        CHECK_LABEL(kNot);
-        CHECK_LABEL(kOr);
-        CHECK_LABEL(kPrint);
-        CHECK_LABEL(kReadExternal);
-        CHECK_LABEL(kRemainderF);
-        CHECK_LABEL(kRemainderFN);
-        CHECK_LABEL(kRemainderS);
-        CHECK_LABEL(kRemainderSN);
-        CHECK_LABEL(kRemainderU);
-        CHECK_LABEL(kRemainderUN);
-        CHECK_LABEL(kReturn);
-        CHECK_LABEL(kReturnValue);
-        CHECK_LABEL(kScalarToMatrix);
-        CHECK_LABEL(kSelect);
-        CHECK_LABEL(kShiftLeft);
-        CHECK_LABEL(kShiftRightS);
-        CHECK_LABEL(kShiftRightU);
-        CHECK_LABEL(kSignedToFloat);
-        CHECK_LABEL(kSin);
-        CHECK_LABEL(kSplat);
-        CHECK_LABEL(kSqrt);
-        CHECK_LABEL(kStore);
-        CHECK_LABEL(kStoreN);
-        CHECK_LABEL(kStoreDirect);
-        CHECK_LABEL(kStoreDirectN);
-        CHECK_LABEL(kStoreParameter);
-        CHECK_LABEL(kStoreParameterN);
-        CHECK_LABEL(kStoreParameterDirect);
-        CHECK_LABEL(kStoreParameterDirectN);
-        CHECK_LABEL(kStoreStack);
-        CHECK_LABEL(kStoreStackN);
-        CHECK_LABEL(kStoreStackDirect);
-        CHECK_LABEL(kStoreStackDirectN);
-        CHECK_LABEL(kSubtractF);
-        CHECK_LABEL(kSubtractFN);
-        CHECK_LABEL(kSubtractI);
-        CHECK_LABEL(kSubtractIN);
-        CHECK_LABEL(kTan);
-        CHECK_LABEL(kUnsignedToFloat);
-        CHECK_LABEL(kWriteExternal);
-        CHECK_LABEL(kXor);
-#endif
-        auto mask = [&]() { return *context.fMaskStack & *context.fLoopStack; };
-        auto parameterBase = [&]() {
-            return context.fCallStack.empty() ? context.fStack
-                                              : context.fCallStack.top().fParameters;
-        };
-        const uint8_t* code = f->fCode.data();
-        const uint8_t* ip = code;
-#ifdef SKSL_THREADED_CODE
-        #ifdef TRACE
-            const uint8_t* trace_ip = ip;
-            printf("0: ");
-            disassemble(&trace_ip);
-        #endif
-        goto *labels[(int) read<ByteCode::Instruction>(&ip)];
-#else
-        for (;;) {
-            #ifdef TRACE
-                const uint8_t* trace_ip = ip;
-                disassemble(&trace_ip);
-            #endif
-            ByteCode::Instruction inst = read<ByteCode::Instruction>(&ip);
-            switch (inst) {
-#endif
-                VECTOR_BINARY_OP(kAddF, fFloat, fFloat, +)
-                VECTOR_BINARY_OP(kAddI, fInt, fInt, +)
-                BINARY_OP(kAnd, fInt, fInt, &)
-                BINARY_OP(kCompareEQF, fFloat, fInt, ==)
-                BINARY_OP(kCompareEQI, fInt, fInt, ==)
-                BINARY_OP(kCompareNEQF, fFloat, fInt, !=)
-                BINARY_OP(kCompareNEQI, fInt, fInt, !=)
-                BINARY_OP(kCompareGTF, fFloat, fInt, >)
-                BINARY_OP(kCompareGTS, fInt, fInt, >)
-                BINARY_OP(kCompareGTU, fUInt, fUInt, >)
-                BINARY_OP(kCompareGTEQF, fFloat, fInt, >=)
-                BINARY_OP(kCompareGTEQS, fInt, fInt, >=)
-                BINARY_OP(kCompareGTEQU, fUInt, fUInt, >=)
-                BINARY_OP(kCompareLTF, fFloat, fInt, <)
-                BINARY_OP(kCompareLTS, fInt, fInt, <)
-                BINARY_OP(kCompareLTU, fUInt, fUInt, <)
-                BINARY_OP(kCompareLTEQF, fFloat, fInt, <=)
-                BINARY_OP(kCompareLTEQS, fInt, fInt, <=)
-                BINARY_OP(kCompareLTEQU, fUInt, fUInt, <=)
-                VECTOR_BINARY_OP(kSubtractF, fFloat, fFloat, -)
-                VECTOR_BINARY_OP(kSubtractI, fInt, fInt, -)
-                VECTOR_BINARY_OP(kDivideF, fFloat, fFloat, /)
-                MASKED_VECTOR_BINARY_OP(kDivideS, fInt, fInt, /)
-                MASKED_VECTOR_BINARY_OP(kDivideU, fUInt, fUInt, /)
-                MASKED_VECTOR_BINARY_OP(kRemainderS, fInt, fInt, %)
-                MASKED_VECTOR_BINARY_OP(kRemainderU, fUInt, fUInt, %)
-                VECTOR_BINARY_OP(kMultiplyF, fFloat, fFloat, *)
-                VECTOR_BINARY_OP(kMultiplyI, fInt, fInt, *)
-                BINARY_OP(kOr, fInt, fInt, |)
-                BINARY_OP(kXor, fInt, fInt, ^)
-                LABEL(kAbort)
-                    SkASSERT(false);
-                    return false;
-                LABEL(kBoundsCheck) {
-                    ByteCode::Register r = read<ByteCode::Register>(&ip);
-                    int length = read<int>(&ip);
-                    if (skvx::any(mask() & ((fRegisters[r.fIndex].fInt < 0) |
-                                            (fRegisters[r.fIndex].fInt >= length)))) {
-                        return false;
-                    }
-                    NEXT();
-                }
-                LABEL(kBranch) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ip = code + target.fAddress;
-                    NEXT();
-                }
-                LABEL(kBranchIfAllFalse) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    if (!skvx::any(mask())) {
-                        ip = code + target.fAddress;
-                    }
-                    NEXT();
-                }
-                LABEL(kBreak)
-                    *context.fLoopStack &= ~mask();
-                    NEXT();
-                LABEL(kCall) {
-                    ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
-                    uint8_t idx = read<uint8_t>(&ip);
-                    ByteCode::Register args = read<ByteCode::Register>(&ip);
-                    const ByteCodeFunction* target = fCode->fFunctions[idx].get();
-                    int stackSlotCount = target->fStackSlotCount + target->fParameterSlotCount;
-                    context.fCallStack.push(StackFrame(f, ip, stackSlotCount,
-                                                       &fRegisters[args.fIndex],
-                                                       &fRegisters[returnValue.fIndex]));
-                    f = target;
-                    code = f->fCode.data();
-                    ip = code;
-                    context.fStack -= stackSlotCount;
-                    memcpy(context.fStack, &fRegisters[args.fIndex],
-                           f->fParameterSlotCount * sizeof(Vector));
-                    NEXT();
-                }
-                LABEL(kCallExternal) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    uint8_t index = read<uint8_t>(&ip);
-                    uint8_t targetSize = read<uint8_t>(&ip);
-                    ByteCode::Register arguments = read<ByteCode::Register>(&ip);
-                    uint8_t argumentSize = read<uint8_t>(&ip);
-                    ExternalValue* v = fCode->fExternalValues[index];
-                    float tmpReturn[64];
-                    SkASSERT(targetSize < 64);
-                    float tmpArgs[64];
-                    SkASSERT(argumentSize < 64);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < argumentSize; j++) {
-                                tmpArgs[j] = fRegisters[arguments.fIndex + j].fFloat[i];
-                            }
-                            v->call(baseIndex + i, tmpArgs, tmpReturn);
-                            for (int j = 0; j < targetSize; j++) {
-                                fRegisters[target.fIndex + j].fFloat[i] = tmpReturn[j];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kContinue) {
-                    VectorI m = mask();
-                    *context.fContinueStack |= m;
-                    *context.fLoopStack &= ~m;
-                    NEXT();
-                }
-                LABEL(kCopy) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt;
-                    NEXT();
-                }
-                VECTOR_UNARY_FN(kCos, cosf)
-                LABEL(kFloatToSigned) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<int32_t>(
-                                                       fRegisters[src.fIndex].fFloat));
-                    NEXT();
-                }
-                LABEL(kFloatToUnsigned) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<uint32_t>(
-                                                       fRegisters[src.fIndex].fFloat));
-                    NEXT();
-                }
-                LABEL(kImmediate) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Immediate src = read<ByteCode::Immediate>(&ip);
-                    fRegisters[target.fIndex].fInt = src.fInt;
-                    NEXT();
-                }
-                LABEL(kInverse2x2) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Inverse2x2(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
-                    NEXT();
-                }
-                LABEL(kInverse3x3) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Inverse3x3(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
-                    NEXT();
-                }
-                LABEL(kInverse4x4) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Inverse4x4(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
-                    NEXT();
-                }
-                LABEL(kLoad) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fRegisters[target.fIndex].fInt[i] =
-                                                    fMemory[fRegisters[src.fIndex].fInt[i]].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                fRegisters[target.fIndex + j].fInt[i] =
-                                                fMemory[fRegisters[src.fIndex].fInt[i] + j].fInt[i];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadDirect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    fRegisters[target.fIndex].fInt = fMemory[src.fAddress].fInt;
-                    NEXT();
-                }
-                LABEL(kLoadDirectN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    for (int i = 0; i < count; ++i) {
-                        fRegisters[target.fIndex + i].fInt = fMemory[src.fAddress + i].fInt;
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadParameter) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fRegisters[target.fIndex].fInt[i] =
-                                                       base[fRegisters[src.fIndex].fInt[i]].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadParameterN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                fRegisters[target.fIndex + j].fInt[i] =
-                                                   base[fRegisters[src.fIndex].fInt[i] + j].fInt[i];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadParameterDirect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    Vector* base = parameterBase();
-                    fRegisters[target.fIndex].fInt = base[src.fAddress].fInt;
-                    NEXT();
-                }
-                LABEL(kLoadParameterDirectN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    Vector* base = parameterBase();
-                    for (int i = 0; i < count; ++i) {
-                        fRegisters[target.fIndex + i].fInt = base[src.fAddress + i].fInt;
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadStack) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fRegisters[target.fIndex].fInt[i] =
-                                             context.fStack[fRegisters[src.fIndex].fInt[i]].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadStackN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                fRegisters[target.fIndex + j].fInt[i] =
-                                         context.fStack[fRegisters[src.fIndex].fInt[i] + j].fInt[i];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kLoadStackDirect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    CHECK_STACK_BOUNDS(src.fAddress);
-                    fRegisters[target.fIndex].fInt = context.fStack[src.fAddress].fInt;
-                    NEXT();
-                }
-                LABEL(kLoadStackDirectN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
-                    CHECK_STACK_BOUNDS(src.fAddress);
-                    for (int i = 0; i < count; ++i) {
-                        fRegisters[target.fIndex + i].fInt = context.fStack[src.fAddress + i].fInt;
-                    }
-                    NEXT();
-                }
-                LABEL(kLoopBegin) {
-                    context.fLoopStack[1] = context.fLoopStack[0];
-                    ++context.fLoopStack;
-                    context.fContinueStack[1] = 0;
-                    ++context.fContinueStack;
-                    NEXT();
-                }
-                LABEL(kLoopEnd) {
-                    --context.fLoopStack;
-                    --context.fContinueStack;
-                    NEXT();
-                }
-                LABEL(kLoopMask) {
-                    ByteCode::Register value = read<ByteCode::Register>(&ip);
-                    *context.fLoopStack &= fRegisters[value.fIndex].fInt;
-                    NEXT();
-                }
-                LABEL(kLoopNext) {
-                    *context.fLoopStack |= *context.fContinueStack;
-                    *context.fContinueStack = 0;
-                    NEXT();
-                }
-                LABEL(kMaskNegate) {
-                    *context.fMaskStack = context.fMaskStack[-1] & ~context.fCondStack[0];
-                    NEXT();
-                }
-                LABEL(kMaskPop) {
-                    --context.fMaskStack;
-                    --context.fCondStack;
-                    NEXT();
-                }
-                LABEL(kMaskPush) {
-                    ByteCode::Register value = read<ByteCode::Register>(&ip);
-                    context.fCondStack[1] = fRegisters[value.fIndex].fInt;
-                    context.fMaskStack[1] = context.fMaskStack[0] & context.fCondStack[1];
-                    ++context.fCondStack;
-                    ++context.fMaskStack;
-                    NEXT();
-                }
-                LABEL(kMatrixMultiply) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register left = read<ByteCode::Register>(&ip);
-                    ByteCode::Register right = read<ByteCode::Register>(&ip);
-                    uint8_t lCols = read<uint8_t>(&ip);
-                    uint8_t lRows = read<uint8_t>(&ip);
-                    uint8_t rCols = read<uint8_t>(&ip);
-                    uint8_t rRows = lCols;
-                    memset(&fRegisters[target.fIndex], 0, sizeof(Vector) * rCols * lRows);
-                    for (int c = 0; c < rCols; ++c) {
-                        for (int r = 0; r < lRows; ++r) {
-                            for (int j = 0; j < lCols; ++j) {
-                                fRegisters[target.fIndex + c * lRows + r].fFloat +=
-                                        fRegisters[left.fIndex + j * lRows + r].fFloat *
-                                        fRegisters[right.fIndex + c * rRows + j].fFloat;
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kMatrixToMatrix) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t srcColumns = read<uint8_t>(&ip);
-                    uint8_t srcRows = read<uint8_t>(&ip);
-                    uint8_t dstColumns = read<uint8_t>(&ip);
-                    uint8_t dstRows = read<uint8_t>(&ip);
-                    int offset = 0;
-                    for (int i = 0; i < dstColumns; ++i) {
-                        for (int j = 0; j < dstRows; ++j) {
-                            if (i < srcColumns && j < srcRows) {
-                                fRegisters[target.fIndex + offset] =
-                                                         fRegisters[src.fIndex + (srcRows * i) + j];
-                            } else {
-                                if (i == j) {
-                                    fRegisters[target.fIndex + offset].fFloat = 1;
-                                } else {
-                                    fRegisters[target.fIndex + offset].fFloat = 0;
-                                }
-                            }
-                            ++offset;
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kNegateF) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fFloat = -fRegisters[src.fIndex].fFloat;
-                    NEXT();
-                }
-                LABEL(kNegateS) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fInt = -fRegisters[src.fIndex].fInt;
-                    NEXT();
-                }
-                LABEL(kNop)
-                    NEXT();
-                LABEL(kNot) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fInt = ~fRegisters[src.fIndex].fInt;
-                    NEXT();
-                }
-                LABEL(kPrint) {
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    if (skvx::any(mask())) {
-                        printf("[");
-                        const char* separator = "";
-                        for (int i = 0; i < width; ++i) {
-                            if (mask()[i]) {
-                                printf("%s%f", separator, fRegisters[src.fIndex].fFloat[i]);
-                            }
-                            else {
-                                printf("%s-", separator);
-                            }
-                            separator = ", ";
-                        }
-                        printf("]\n");
-                    }
-                    NEXT();
-                }
-                LABEL(kReadExternal) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    uint8_t index = read<uint8_t>(&ip);
-                    SkASSERT(count <= 4);
-                    SkASSERT(fCode->fExternalValues.size() > index);
-                    float tmp[4];
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fCode->fExternalValues[index]->read(baseIndex + i, tmp);
-                            for (int j = 0; j < count; ++j) {
-                                fRegisters[target.fIndex + j].fFloat[i] = tmp[j];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kRemainderF) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = VecMod(fRegisters[src1.fIndex],
-                                                       fRegisters[src2.fIndex]);
-                    NEXT();
-                }
-                LABEL(kRemainderFN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
-                    for (int i = 0; i < count; ++i) {
-                        fRegisters[target.fIndex + i] = VecMod(fRegisters[src1.fIndex + i],
-                                                               fRegisters[src2.fIndex + i]);
-                    }
-                    NEXT();
-                }
-                LABEL(kReturn) {
-                    if (context.fCallStack.empty()) {
-                        return true;
-                    }
-                    StackFrame frame = context.fCallStack.top();
-                    f = frame.fFunction;
-                    code = f->fCode.data();
-                    ip = frame.fIP;
-                    context.fStack += frame.fStackSlotCount;
-                    context.fCallStack.pop();
-                    NEXT();
-                }
-                LABEL(kReturnValue) {
-                    ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
-                    if (context.fCallStack.empty()) {
-                        if (outResult) {
-                            *outResult = &fRegisters[returnValue.fIndex];
-                        }
-                        return true;
-                    }
-                    StackFrame frame = context.fCallStack.top();
-                    ip = frame.fIP;
-                    context.fStack += frame.fStackSlotCount;
-                    memcpy(frame.fReturnValue, &fRegisters[returnValue.fIndex],
-                           sizeof(Vector) * f->fReturnSlotCount);
-                    f = frame.fFunction;
-                    code = f->fCode.data();
-                    context.fCallStack.pop();
-                    NEXT();
-                }
-                LABEL(kScalarToMatrix) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t columns = read<uint8_t>(&ip);
-                    uint8_t rows = read<uint8_t>(&ip);
-                    int offset = 0;
-                    for (int i = 0; i < columns; ++i) {
-                        for (int j = 0; j < rows; ++j) {
-                            if (i == j) {
-                                fRegisters[target.fIndex + offset] = fRegisters[src.fIndex];
-                            } else {
-                                fRegisters[target.fIndex + offset].fFloat = 0;
-                            }
-                            ++offset;
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kSelect) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register test = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src1 = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src2 = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = skvx::if_then_else(fRegisters[test.fIndex].fInt,
-                                                                   fRegisters[src1.fIndex].fFloat,
-                                                                   fRegisters[src2.fIndex].fFloat);
-                    NEXT();
-                }
-                LABEL(kShiftLeft) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt << count;
-                    NEXT();
-                }
-                LABEL(kShiftRightS) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    int8_t count = read<int8_t>(&ip);
-                    fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt >> count;
-                    NEXT();
-                }
-                LABEL(kShiftRightU) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    fRegisters[target.fIndex].fUInt = fRegisters[src.fIndex].fUInt >> count;
-                    NEXT();
-                }
-                LABEL(kSignedToFloat) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<float>(
-                                                                      fRegisters[src.fIndex].fInt));
-                    NEXT();
-                }
-                VECTOR_UNARY_FN(kSin, sinf)
-                LABEL(kSplat) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    for (int i = 0; i < count; ++i) {
-                        fRegisters[target.fIndex + i] = fRegisters[src.fIndex];
-                    }
-                    NEXT();
-                }
-                LABEL(kSqrt) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex].fFloat = skvx::sqrt(fRegisters[src.fIndex].fFloat);
-                    NEXT();
-                }
-                LABEL(kStore) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            fMemory[fRegisters[target.fIndex].fInt[i]].fInt[i] =
-                                                                     fRegisters[src.fIndex].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                fMemory[fRegisters[target.fIndex].fInt[i] + j].fInt[i] =
-                                                                 fRegisters[src.fIndex + j].fInt[i];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreDirect) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fMemory[target.fAddress] = skvx::if_then_else(mask(),
-                                                                  fRegisters[src.fIndex].fFloat,
-                                                                  fMemory[target.fAddress].fFloat);
-                    NEXT();
-                }
-                LABEL(kStoreDirectN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    for (int i = 0; i < count; ++i) {
-                        fMemory[target.fAddress + i] = skvx::if_then_else(
-                                                               mask(),
-                                                               fRegisters[src.fIndex + i].fFloat,
-                                                               fMemory[target.fAddress + i].fFloat);
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreParameter) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            base[fRegisters[target.fIndex].fInt[i]].fInt[i] =
-                                                                     fRegisters[src.fIndex].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreParameterN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                base[fRegisters[target.fIndex].fInt[i] + j].fInt[i] =
-                                                                 fRegisters[src.fIndex + j].fInt[i];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreParameterDirect) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    base[target.fAddress].fFloat = skvx::if_then_else(mask(),
-                                                                      fRegisters[src.fIndex].fFloat,
-                                                                      base[target.fAddress].fFloat);
-                    NEXT();
-                }
-                LABEL(kStoreParameterDirectN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    Vector* base = parameterBase();
-                    for (int i = 0; i < count; ++i) {
-                        base[target.fAddress + i].fFloat = skvx::if_then_else(
-                                                                  mask(),
-                                                                  fRegisters[src.fIndex + i].fFloat,
-                                                                  base[target.fAddress + i].fFloat);
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreStack) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            context.fStack[fRegisters[target.fIndex].fInt[i]].fInt[i] =
-                                                                     fRegisters[src.fIndex].fInt[i];
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreStackN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                context.fStack[fRegisters[target.fIndex].fInt[i] + j].fInt[i] =
-                                                                 fRegisters[src.fIndex + j].fInt[i];
-                            }
-                        }
-                    }
-                    NEXT();
-                }
-                LABEL(kStoreStackDirect) {
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    CHECK_STACK_BOUNDS(target.fAddress);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    context.fStack[target.fAddress] = skvx::if_then_else(
-                                                            mask(),
-                                                            fRegisters[src.fIndex].fFloat,
-                                                            context.fStack[target.fAddress].fFloat);
-                    NEXT();
-                }
-                LABEL(kStoreStackDirectN) {
-                    uint8_t count = read<uint8_t>(&ip);
-                    ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
-                    CHECK_STACK_BOUNDS(target.fAddress);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    for (int i = 0; i < count; ++i) {
-                        context.fStack[target.fAddress + i] = skvx::if_then_else(
-                                                        mask(),
-                                                        fRegisters[src.fIndex + i].fFloat,
-                                                        context.fStack[target.fAddress + i].fFloat);
-                    }
-                    NEXT();
-                }
-                VECTOR_UNARY_FN(kTan, tanf)
-                LABEL(kUnsignedToFloat) {
-                    ByteCode::Register target = read<ByteCode::Register>(&ip);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    fRegisters[target.fIndex] = Vector(skvx::cast<float>(
-                                                                     fRegisters[src.fIndex].fUInt));
-                    NEXT();
-                }
-                LABEL(kWriteExternal) {
-                    uint8_t index = read<uint8_t>(&ip);
-                    uint8_t count = read<uint8_t>(&ip);
-                    SkASSERT(count <= 4);
-                    SkASSERT(fCode->fExternalValues.size() > index);
-                    ByteCode::Register src = read<ByteCode::Register>(&ip);
-                    float tmp[4];
-                    VectorI m = mask();
-                    for (int i = 0; i < width; ++i) {
-                        if (m[i]) {
-                            for (int j = 0; j < count; ++j) {
-                                tmp[j] = fRegisters[src.fIndex + j].fFloat[i];
-                            }
-                            fCode->fExternalValues[index]->write(baseIndex + i, tmp);
-                        }
-                    }
-                    NEXT();
-                }
-#ifndef SKSL_THREADED_CODE
-            }
-        }
-#endif
-    }
-
-    const std::unique_ptr<ByteCode> fCode;
-
-    void* fBackingStore;
-
-    Vector* fRegisters;
-
-    Vector* fMemory;
-
-    friend class ByteCode;
-
-    friend class ByteCodeGenerator;
-};
-
-#undef BINARY_OP
-#undef CHECK_STACK_BOUNDS
-
-} // namespace
-
-#endif
diff --git a/src/sksl/SkSLUtil.cpp b/src/sksl/SkSLUtil.cpp
index 686ece3..b2c5162 100644
--- a/src/sksl/SkSLUtil.cpp
+++ b/src/sksl/SkSLUtil.cpp
@@ -9,10 +9,6 @@
 
 #include "src/sksl/SkSLStringStream.h"
 
-#if !defined(SKSL_STANDALONE) & SK_SUPPORT_GPU
-#include "include/gpu/GrContextOptions.h"
-#endif
-
 #ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
 #endif
@@ -77,177 +73,4 @@
     }
 }
 
-#if !defined(SKSL_STANDALONE) & SK_SUPPORT_GPU
-sk_sp<GrShaderCaps> ShaderCapsFactory::Default() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fShaderDerivativeSupport = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::Version450Core() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 450 core";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::Version110() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 110";
-    result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::UsesPrecisionModifiers() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fUsesPrecisionModifiers = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::CannotUseMinAndAbsTogether() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fCanUseMinAndAbsTogether = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::CannotUseFractForNegativeValues() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fCanUseFractForNegativeValues = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::MustForceNegatedAtanParamToFloat() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fMustForceNegatedAtanParamToFloat = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::ShaderDerivativeExtensionString() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fShaderDerivativeSupport = true;
-    result->fShaderDerivativeExtensionString = "GL_OES_standard_derivatives";
-    result->fUsesPrecisionModifiers = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::FragCoordsOld() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 110";
-    result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
-    result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::FragCoordsNew() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::GeometryShaderSupport() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fGeometryShaderSupport = true;
-    result->fGSInvocationsSupport = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::NoGSInvocationsSupport() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fGeometryShaderSupport = true;
-    result->fGSInvocationsSupport = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::GeometryShaderExtensionString() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 310es";
-    result->fGeometryShaderSupport = true;
-    result->fGeometryShaderExtensionString = "GL_EXT_geometry_shader";
-    result->fGSInvocationsSupport = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::GSInvocationsExtensionString() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fGeometryShaderSupport = true;
-    result->fGSInvocationsSupport = true;
-    result->fGSInvocationsExtensionString = "GL_ARB_gpu_shader5";
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::VariousCaps() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fExternalTextureSupport = true;
-    result->fFBFetchSupport = false;
-    result->fCanUseAnyFunctionInShader = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::CannotUseFragCoord() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fCanUseFragCoord = false;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::IncompleteShortIntPrecision() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 310es";
-    result->fUsesPrecisionModifiers = true;
-    result->fIncompleteShortIntPrecision = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::AddAndTrueToLoopCondition() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fAddAndTrueToLoopCondition = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::UnfoldShortCircuitAsTernary() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fUnfoldShortCircuitAsTernary = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::EmulateAbsIntFunction() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fEmulateAbsIntFunction = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::RewriteDoWhileLoops() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fRewriteDoWhileLoops = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::RemovePowWithConstantExponent() {
-    sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
-    result->fVersionDeclString = "#version 400";
-    result->fRemovePowWithConstantExponent = true;
-    return result;
-}
-
-sk_sp<GrShaderCaps> ShaderCapsFactory::SampleMaskSupport() {
-    sk_sp<GrShaderCaps> result = Default();
-    result->fSampleMaskSupport = true;
-    return result;
-}
-#endif
-
 } // namespace
diff --git a/src/sksl/SkSLUtil.h b/src/sksl/SkSLUtil.h
index 3a2b42b..08f2842 100644
--- a/src/sksl/SkSLUtil.h
+++ b/src/sksl/SkSLUtil.h
@@ -18,11 +18,13 @@
 #ifndef SKSL_STANDALONE
 #include "include/core/SkTypes.h"
 #if SK_SUPPORT_GPU
-#include "include/core/SkRefCnt.h"
+#include "include/gpu/GrContextOptions.h"
 #include "src/gpu/GrShaderCaps.h"
 #endif // SK_SUPPORT_GPU
 #endif // SKSL_STANDALONE
 
+class GrShaderCaps;
+
 namespace SkSL {
 
 class OutputStream;
@@ -219,51 +221,176 @@
 // Various sets of caps for use in tests
 class ShaderCapsFactory {
 public:
-    static sk_sp<GrShaderCaps> Default();
+    static sk_sp<GrShaderCaps> Default() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fShaderDerivativeSupport = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> Version450Core();
+    static sk_sp<GrShaderCaps> Version450Core() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 450 core";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> Version110();
+    static sk_sp<GrShaderCaps> Version110() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 110";
+        result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> UsesPrecisionModifiers();
+    static sk_sp<GrShaderCaps> UsesPrecisionModifiers() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fUsesPrecisionModifiers = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> CannotUseMinAndAbsTogether();
+    static sk_sp<GrShaderCaps> CannotUseMinAndAbsTogether() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fCanUseMinAndAbsTogether = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> CannotUseFractForNegativeValues();
+    static sk_sp<GrShaderCaps> CannotUseFractForNegativeValues() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fCanUseFractForNegativeValues = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> MustForceNegatedAtanParamToFloat();
+    static sk_sp<GrShaderCaps> MustForceNegatedAtanParamToFloat() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fMustForceNegatedAtanParamToFloat = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> ShaderDerivativeExtensionString();
+    static sk_sp<GrShaderCaps> ShaderDerivativeExtensionString() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fShaderDerivativeSupport = true;
+        result->fShaderDerivativeExtensionString = "GL_OES_standard_derivatives";
+        result->fUsesPrecisionModifiers = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> FragCoordsOld();
+    static sk_sp<GrShaderCaps> FragCoordsOld() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 110";
+        result->fGLSLGeneration = GrGLSLGeneration::k110_GrGLSLGeneration;
+        result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> FragCoordsNew();
+    static sk_sp<GrShaderCaps> FragCoordsNew() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fFragCoordConventionsExtensionString = "GL_ARB_fragment_coord_conventions";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> GeometryShaderSupport();
+    static sk_sp<GrShaderCaps> GeometryShaderSupport() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fGeometryShaderSupport = true;
+        result->fGSInvocationsSupport = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> NoGSInvocationsSupport();
+    static sk_sp<GrShaderCaps> NoGSInvocationsSupport() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fGeometryShaderSupport = true;
+        result->fGSInvocationsSupport = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> GeometryShaderExtensionString();
+    static sk_sp<GrShaderCaps> GeometryShaderExtensionString() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 310es";
+        result->fGeometryShaderSupport = true;
+        result->fGeometryShaderExtensionString = "GL_EXT_geometry_shader";
+        result->fGSInvocationsSupport = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> GSInvocationsExtensionString();
+    static sk_sp<GrShaderCaps> GSInvocationsExtensionString() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fGeometryShaderSupport = true;
+        result->fGSInvocationsSupport = true;
+        result->fGSInvocationsExtensionString = "GL_ARB_gpu_shader5";
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> VariousCaps();
+    static sk_sp<GrShaderCaps> VariousCaps() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fExternalTextureSupport = true;
+        result->fFBFetchSupport = false;
+        result->fCanUseAnyFunctionInShader = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> CannotUseFragCoord();
+    static sk_sp<GrShaderCaps> CannotUseFragCoord() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fCanUseFragCoord = false;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> IncompleteShortIntPrecision();
+    static sk_sp<GrShaderCaps> IncompleteShortIntPrecision() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 310es";
+        result->fUsesPrecisionModifiers = true;
+        result->fIncompleteShortIntPrecision = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> AddAndTrueToLoopCondition();
+    static sk_sp<GrShaderCaps> AddAndTrueToLoopCondition() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fAddAndTrueToLoopCondition = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> UnfoldShortCircuitAsTernary();
+    static sk_sp<GrShaderCaps> UnfoldShortCircuitAsTernary() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fUnfoldShortCircuitAsTernary = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> EmulateAbsIntFunction();
+    static sk_sp<GrShaderCaps> EmulateAbsIntFunction() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fEmulateAbsIntFunction = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> RewriteDoWhileLoops();
+    static sk_sp<GrShaderCaps> RewriteDoWhileLoops() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fRewriteDoWhileLoops = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> RemovePowWithConstantExponent();
+    static sk_sp<GrShaderCaps> RemovePowWithConstantExponent() {
+        sk_sp<GrShaderCaps> result = sk_make_sp<GrShaderCaps>(GrContextOptions());
+        result->fVersionDeclString = "#version 400";
+        result->fRemovePowWithConstantExponent = true;
+        return result;
+    }
 
-    static sk_sp<GrShaderCaps> SampleMaskSupport();
+    static sk_sp<GrShaderCaps> SampleMaskSupport() {
+        sk_sp<GrShaderCaps> result = Default();
+        result->fSampleMaskSupport = true;
+        return result;
+    }
 };
 #endif
 
diff --git a/src/sksl/ir/SkSLFunctionDeclaration.h b/src/sksl/ir/SkSLFunctionDeclaration.h
index f7ce904..11b04a5 100644
--- a/src/sksl/ir/SkSLFunctionDeclaration.h
+++ b/src/sksl/ir/SkSLFunctionDeclaration.h
@@ -36,7 +36,7 @@
         for (auto p : fParameters) {
             result += separator;
             separator = ", ";
-            result += p->fType.displayName();
+            result += p->fName;
         }
         result += ")";
         return result;
diff --git a/src/sksl/ir/SkSLSymbolTable.cpp b/src/sksl/ir/SkSLSymbolTable.cpp
index bbf001d..ed2cb4d 100644
--- a/src/sksl/ir/SkSLSymbolTable.cpp
+++ b/src/sksl/ir/SkSLSymbolTable.cpp
@@ -114,7 +114,9 @@
                 break;
             case Symbol::kUnresolvedFunction_Kind:
                 for (auto& f : ((UnresolvedFunction&) *pair.second).fFunctions) {
-                    ((FunctionDeclaration*)f)->fBuiltin = true;
+                    if (!((FunctionDeclaration*)f)->fDefined) {
+                        ((FunctionDeclaration*)f)->fBuiltin = true;
+                    }
                 }
                 break;
             default:
diff --git a/src/sksl/sksl_interp.inc b/src/sksl/sksl_interp.inc
index e576f9f..f43f05f 100644
--- a/src/sksl/sksl_interp.inc
+++ b/src/sksl/sksl_interp.inc
@@ -1,7 +1,5 @@
 STRINGIFY(
 
-sk_has_side_effects void print(float f);
-
 $genType cos($genType y);
 $genHType cos($genHType y);
 float dot($genType x, $genType y);
diff --git a/tests/SkSLInterpreterTest.cpp b/tests/SkSLInterpreterTest.cpp
index f4bf75c..18c1fbf 100644
--- a/tests/SkSLInterpreterTest.cpp
+++ b/tests/SkSLInterpreterTest.cpp
@@ -9,13 +9,21 @@
 #include "src/sksl/SkSLByteCode.h"
 #include "src/sksl/SkSLCompiler.h"
 #include "src/sksl/SkSLExternalValue.h"
-#include "src/sksl/SkSLInterpreter.h"
 #include "src/utils/SkJSON.h"
 
 #include "tests/Test.h"
 
+static bool nearly_equal(const float a[], const float b[], int count) {
+    for (int i = 0; i < count; ++i) {
+        if (!SkScalarNearlyEqual(a[i], b[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
 void test(skiatest::Reporter* r, const char* src, float* in, float* expected,
-          bool exactCompare = false) {
+          bool exactCompare = true) {
     SkSL::Compiler compiler;
     SkSL::Program::Settings settings;
     std::unique_ptr<SkSL::Program> program = compiler.convertProgram(
@@ -31,17 +39,30 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        SkSL::ByteCode::Vector<1>* result;
-        bool success = interpreter.run(main, (SkSL::ByteCode::Vector<1>*) in, &result);
-        REPORTER_ASSERT(r, success);
-        for (int i = 0; i < main->getReturnSlotCount(); ++i) {
-            if (exactCompare) {
-                REPORTER_ASSERT(r, result[i].fInt[0] == ((int32_t*) expected)[i]);
-            } else {
-                REPORTER_ASSERT(r, SkScalarNearlyZero(result[i].fFloat[0] - expected[i]));
+        int returnCount = main->getReturnCount();
+        std::unique_ptr<float[]> out = std::unique_ptr<float[]>(new float[returnCount]);
+        SkAssertResult(byteCode->run(main, in, main->getParameterCount(), out.get(), returnCount,
+                                     nullptr, 0));
+        bool valid = exactCompare ? !memcmp(out.get(), expected, sizeof(float) * returnCount)
+                                  : nearly_equal(out.get(), expected, returnCount);
+        if (!valid) {
+            printf("for program: %s\n", src);
+            printf("    expected (");
+            const char* separator = "";
+            for (int i = 0; i < returnCount; ++i) {
+                printf("%s%f", separator, expected[i]);
+                separator = ", ";
             }
+            printf("), but received (");
+            separator = "";
+            for (int i = 0; i < returnCount; ++i) {
+                printf("%s%f", separator, out.get()[i]);
+                separator = ", ";
+            }
+            printf(")\n");
+            main->disassemble();
         }
+        REPORTER_ASSERT(r, valid);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -62,8 +83,7 @@
         return;
     }
 
-    const SkSL::ByteCodeFunction* main1 = byteCode->getFunction("main");
-    SkSL::Interpreter<1> interpreter1(std::move(byteCode));
+    const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
 
     // Test on four different vectors (with varying orderings to get divergent control flow)
     const float input[16] = { 1, 2, 3, 4,
@@ -77,16 +97,9 @@
 
     // First run in scalar mode to determine the expected output
     for (int i = 0; i < 4; ++i) {
-        SkAssertResult(interpreter1.run(main1, (SkSL::ByteCode::Vector<1>*) (out_s + i * 4),
-                       nullptr));
+        SkAssertResult(byteCode->run(main, out_s + i * 4, 4, nullptr, 0, nullptr, 0));
     }
 
-    byteCode = compiler.toByteCode(*program);
-    SkASSERT(compiler.errorCount() == 0);
-
-    const SkSL::ByteCodeFunction* main4 = byteCode->getFunction("main");
-    SkSL::Interpreter<4> interpreter4(std::move(byteCode));
-
     // Need to transpose input vectors for striped execution
     auto transpose = [](float* v) {
         for (int r = 0; r < 4; ++r)
@@ -99,7 +112,7 @@
     float* args[] = { out_v, out_v + 4, out_v + 8, out_v + 12 };
 
     // Now run in parallel and compare results
-    SkAssertResult(interpreter4.runStriped(main4, 4, (float**) args));
+    SkAssertResult(byteCode->runStriped(main, 4, args, 4, nullptr, 0, nullptr, 0));
 
     // Transpose striped outputs back
     transpose(out_v);
@@ -112,7 +125,7 @@
                     out_v[4*i + 0], out_v[4*i + 1], out_v[4*i + 2], out_v[4*i + 3],
                     out_s[4*i + 0], out_s[4*i + 1], out_s[4*i + 2], out_s[4*i + 3]);
         }
-        main4->disassemble();
+        main->disassemble();
         REPORT_FAILURE(r, "VecInterpreter mismatch", SkString());
     }
 }
@@ -134,26 +147,20 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::ByteCode::Vector<1> inoutColor[4];
-        inoutColor[0].fFloat[0] = inR;
-        inoutColor[1].fFloat[0] = inG;
-        inoutColor[2].fFloat[0] = inB;
-        inoutColor[3].fFloat[0] = inA;
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        bool success = interpreter.run(main, inoutColor, nullptr);
-        REPORTER_ASSERT(r, success);
-        if (inoutColor[0].fFloat[0] != expectedR || inoutColor[1].fFloat[0] != expectedG ||
-            inoutColor[2].fFloat[0] != expectedB || inoutColor[3].fFloat[0] != expectedA) {
+        float inoutColor[4] = { inR, inG, inB, inA };
+        SkAssertResult(byteCode->run(main, inoutColor, 4, nullptr, 0, nullptr, 0));
+        if (inoutColor[0] != expectedR || inoutColor[1] != expectedG ||
+            inoutColor[2] != expectedB || inoutColor[3] != expectedA) {
             printf("for program: %s\n", src);
             printf("    expected (%f, %f, %f, %f), but received (%f, %f, %f, %f)\n", expectedR,
-                   expectedG, expectedB, expectedA, inoutColor[0].fFloat[0],
-                   inoutColor[1].fFloat[0], inoutColor[2].fFloat[0], inoutColor[3].fFloat[0]);
+                   expectedG, expectedB, expectedA, inoutColor[0], inoutColor[1], inoutColor[2],
+                   inoutColor[3]);
             main->disassemble();
         }
-        REPORTER_ASSERT(r, inoutColor[0].fFloat[0] == expectedR);
-        REPORTER_ASSERT(r, inoutColor[1].fFloat[0] == expectedG);
-        REPORTER_ASSERT(r, inoutColor[2].fFloat[0] == expectedB);
-        REPORTER_ASSERT(r, inoutColor[3].fFloat[0] == expectedA);
+        REPORTER_ASSERT(r, inoutColor[0] == expectedR);
+        REPORTER_ASSERT(r, inoutColor[1] == expectedG);
+        REPORTER_ASSERT(r, inoutColor[2] == expectedB);
+        REPORTER_ASSERT(r, inoutColor[3] == expectedA);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -170,10 +177,6 @@
          0.5, 1, 1.5, 2);
     test(r, "void main(inout half4 color) { color.r = int(color.r) + int(color.g); }", 1, 3, 0, 0,
          4, 3, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r + color.gb; }", 1, 2, 3, 4,
-         3, 4, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg + color.b; }", 1, 2, 3, 4,
-         4, 5, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterSubtract, r) {
@@ -186,10 +189,6 @@
     test(r, "void main(inout half4 color) { color = -color; }", 4, 3, 2, 1, -4, -3, -2, -1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) - int(color.g); }", 3, 1, 0, 0,
          2, 1, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r - color.gb; }", 1, 2, 3, 4,
-         -1, -2, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg - color.b; }", 1, 2, 3, 4,
-         -2, -1, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterMultiply, r) {
@@ -201,10 +200,6 @@
          16, 9, 4, 1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) * int(color.g); }", 3, -2, 0, 0,
          -6, -2, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r * color.gb; }", 5, 2, 3, 4,
-         10, 15, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg * color.b; }", 1, 2, 3, 4,
-         3, 6, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterDivide, r) {
@@ -216,10 +211,6 @@
          1, 1, 1, 1);
     test(r, "void main(inout half4 color) { color.r = int(color.r) / int(color.g); }", 8, -2, 0, 0,
          -4, -2, 0, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r / color.gb; }", 12, 2, 3, 4,
-         6, 4, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg / color.b; }", 6, 3, 3, 4,
-         2, 1, 3, 4);
 }
 
 DEF_TEST(SkSLInterpreterRemainder, r) {
@@ -231,14 +222,6 @@
          2, 3, 0, 0);
     test(r, "void main(inout half4 color) { color.rg = half2(int2(int(color.r), int(color.g)) % "
                 "int(color.b)); }", 8, 10, 6, 0, 2, 4, 6, 0);
-    test(r, "void main(inout half4 color) { color.rg = color.r + color.gb; }", 1, 2, 3, 4,
-         3, 4, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg + color.b; }", 1, 2, 3, 4,
-         4, 5, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.r % color.gb; }", 10, 2, 3, 4,
-         0, 1, 3, 4);
-    test(r, "void main(inout half4 color) { color.rg = color.rg % color.b; }", 6, 3, 4, 4,
-         2, 3, 4, 4);
 }
 
 DEF_TEST(SkSLInterpreterAnd, r) {
@@ -295,7 +278,7 @@
     unsigned out;
 
     out = 0x00000088;
-    test(r, "int main(int x) { return x << 3; }", (float*)&in, (float*)&out);
+    test(r, "int  main(int  x) { return x << 3; }", (float*)&in, (float*)&out);
 
     out = 0xF0000002;
     test(r, "int main(int x) { return x >> 3; }", (float*)&in, (float*)&out);
@@ -435,8 +418,8 @@
     input[1].f = -5.0f;
     expected[0].s = 3;
     expected[1].s = -5;
-    test(r, "int  main(float  x) { return int (x); }", (float*)input, (float*)expected, true);
-    test(r, "int2 main(float2 x) { return int2(x); }", (float*)input, (float*)expected, true);
+    test(r, "int  main(float  x) { return int (x); }", (float*)input, (float*)expected);
+    test(r, "int2 main(float2 x) { return int2(x); }", (float*)input, (float*)expected);
 
     input[0].s = 3;
     expected[0].f = 3.0f;
@@ -491,15 +474,11 @@
     test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
          1, 2, 1, 2, 1, 2, 1, 1);
     test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
-         1, 2, 1, 3, 1, 2, 1, 3);
-    test(r, "void main(inout half4 color) { if (color.rg == color.ba) color.a = 1; }",
          1, 2, 3, 2, 1, 2, 3, 2);
     test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
          1, 2, 1, 2, 1, 2, 1, 2);
     test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
          1, 2, 3, 2, 1, 2, 3, 1);
-    test(r, "void main(inout half4 color) { if (color.rg != color.ba) color.a = 1; }",
-         1, 2, 1, 3, 1, 2, 1, 1);
 }
 
 DEF_TEST(SkSLInterpreterWhile, r) {
@@ -662,67 +641,51 @@
     SkIRect gRects[4] = { { 1,2,3,4 }, { 5,6,7,8 }, { 9,10,11,12 }, { 13,14,15,16 } };
     const float* fRects = (const float*)gRects;
 
-    SkSL::Interpreter<1> interpreter(std::move(byteCode));
-    auto geti = [](SkSL::Interpreter<1>::Vector* v) { return v->fInt[0]; };
-    auto getf = [](SkSL::Interpreter<1>::Vector* v) { return v->fFloat[0]; };
-
     {
         SkIRect in = SkIRect::MakeXYWH(10, 10, 20, 30);
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(rect_height, (SkSL::Interpreter<1>::Vector*) &in, &out);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, geti(out) == 30);
+        int out = 0;
+        SkAssertResult(byteCode->run(rect_height, (float*)&in, 4, (float*)&out, 1, fRects, 16));
+        REPORTER_ASSERT(r, out == 30);
     }
 
     {
         int in[2] = { 15, 25 };
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(make_blue_rect, (SkSL::Interpreter<1>::Vector*) in, &out);
-        REPORTER_ASSERT(r, success);
-        RectAndColor result{ { geti(out), geti(out + 1), geti(out + 2), geti(out + 3) },
-                             { getf(out + 4), getf(out + 5), getf(out + 6), getf(out + 7) } };
-        REPORTER_ASSERT(r, result.fRect.width() == 15);
-        REPORTER_ASSERT(r, result.fRect.height() == 25);
+        RectAndColor out;
+        SkAssertResult(byteCode->run(make_blue_rect, (float*)in, 2, (float*)&out, 8, fRects, 16));
+        REPORTER_ASSERT(r, out.fRect.width() == 15);
+        REPORTER_ASSERT(r, out.fRect.height() == 25);
         SkColor4f blue = { 0.0f, 1.0f, 0.0f, 1.0f };
-        REPORTER_ASSERT(r, result.fColor == blue);
+        REPORTER_ASSERT(r, out.fColor == blue);
     }
 
     {
         int in[15] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(median, (SkSL::Interpreter<1>::Vector*) in, &out);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, geti(out) == 8);
+        int out = 0;
+        SkAssertResult(byteCode->run(median, (float*)in, 15, (float*)&out, 1, fRects, 16));
+        REPORTER_ASSERT(r, out == 8);
     }
 
     {
         float in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(sums, (SkSL::Interpreter<1>::Vector*) in, &out);
-        REPORTER_ASSERT(r, success);
+        float out[8] = { 0 };
+        SkAssertResult(byteCode->run(sums, in, 8, out, 8, fRects, 16));
         for (int i = 0; i < 8; ++i) {
-            REPORTER_ASSERT(r, getf(out + i) == static_cast<float>((i + 1) * (i + 2) / 2));
+            REPORTER_ASSERT(r, out[i] == static_cast<float>((i + 1) * (i + 2) / 2));
         }
     }
 
     {
         int in = 2;
-        interpreter.setUniforms(fRects);
-        SkSL::Interpreter<1>::Vector* out;
-        bool success = interpreter.run(get_rect, (SkSL::Interpreter<1>::Vector*) &in, &out);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, geti(out) == gRects[2].fLeft);
-        REPORTER_ASSERT(r, geti(out + 1) == gRects[2].fTop);
-        REPORTER_ASSERT(r, geti(out + 2) == gRects[2].fRight);
-        REPORTER_ASSERT(r, geti(out + 3) == gRects[2].fBottom);
+        SkIRect out = SkIRect::MakeEmpty();
+        SkAssertResult(byteCode->run(get_rect, (float*)&in, 1, (float*)&out, 4, fRects, 16));
+        REPORTER_ASSERT(r, out == gRects[2]);
     }
 
     {
         ManyRects in;
         memset(&in, 0, sizeof(in));
         in.fNumRects = 2;
-        bool success = interpreter.run(fill_rects, (SkSL::Interpreter<1>::Vector*) &in, nullptr);
-        REPORTER_ASSERT(r, success);
+        SkAssertResult(byteCode->run(fill_rects, (float*)&in, 33, nullptr, 0, fRects, 16));
         ManyRects expected;
         memset(&expected, 0, sizeof(expected));
         expected.fNumRects = 2;
@@ -755,11 +718,9 @@
     auto byteCode = compiler.toByteCode(*program);
     REPORTER_ASSERT(r, byteCode);
 
-    auto main = byteCode->getFunction("main");
-    SkSL::Interpreter<1> interpreter(std::move(byteCode));
-    SkSL::ByteCode::Vector<1>* result;
-    bool success = interpreter.run(main, (SkSL::ByteCode::Vector<1>*) in, &result);
-    REPORTER_ASSERT(r, !success);
+    auto fun = byteCode->getFunction("main");
+    bool result = byteCode->run(fun, in, fun->getParameterCount(), nullptr, 0, nullptr, 0);
+    REPORTER_ASSERT(r, !result);
 }
 
 DEF_TEST(SkSLInterpreterRestrictFunctionCalls, r) {
@@ -825,72 +786,16 @@
     REPORTER_ASSERT(r, dot3);
     REPORTER_ASSERT(r, dot2);
 
-    SkSL::Interpreter<1> interpreter(std::move(byteCode));
+    float out = 0.0f;
     float in = 3.0f;
+    SkAssertResult(byteCode->run(main, &in, 1, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == 6.0f);
 
-    SkSL::Interpreter<1>::Vector* out;
-    bool success = interpreter.run(main, (SkSL::Interpreter<1>::Vector*) &in, &out);
-    REPORTER_ASSERT(r, success);
-    REPORTER_ASSERT(r, out->fFloat[0] = 6.0f);
+    SkAssertResult(byteCode->run(dot3, &in, 1, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == 9.0f);
 
-    success = interpreter.run(dot3, (SkSL::Interpreter<1>::Vector*) &in, &out);
-    REPORTER_ASSERT(r, success);
-    REPORTER_ASSERT(r, out->fFloat[0] = 9.0f);
-
-    success = interpreter.run(dot2, (SkSL::Interpreter<1>::Vector*) &in, &out);
-    REPORTER_ASSERT(r, success);
-    REPORTER_ASSERT(r, out->fFloat[0] = -1.0f);
-}
-
-DEF_TEST(SkSLInterpreterRunStripedReturn, r) {
-    const char* src =
-        "float  prod(float2 v) { return v.x * v.y; }\n"
-        "float2 swap(float2 v) { return v.yx; }\n";
-
-    SkSL::Compiler compiler;
-    SkSL::Program::Settings settings;
-    auto program =
-            compiler.convertProgram(SkSL::Program::kGeneric_Kind, SkSL::String(src), settings);
-    REPORTER_ASSERT(r, program);
-
-    auto byteCode = compiler.toByteCode(*program);
-    REPORTER_ASSERT(r, !compiler.errorCount());
-
-    auto prod = byteCode->getFunction("prod");
-    auto swap = byteCode->getFunction("swap");
-
-    REPORTER_ASSERT(r, prod);
-    REPORTER_ASSERT(r, swap);
-
-    SkSL::Interpreter<4> interpreter(std::move(byteCode));
-    float inX[4] = { 1, 2, 3, 4 };
-    float inY[4] = { 5, 6, 7, 8 };
-    float outX[4], outY[4];
-
-    float* in[] = { inX, inY };
-    float* out[] = { outX, outY };
-
-    for (int count : { 2, 4 }) {
-        memset(outX, 0, sizeof(outX));
-        memset(outY, 0, sizeof(outY));
-
-        bool success = interpreter.runStriped(prod, count, in, out);
-        REPORTER_ASSERT(r, success);
-        for (int i = 0; i < 4; ++i) {
-            REPORTER_ASSERT(r, outX[i] == (i < count ? inX[i] * inY[i] : 0.0f));
-            REPORTER_ASSERT(r, outY[i] == 0.0f);
-        }
-
-        memset(outX, 0, sizeof(outX));
-        memset(outY, 0, sizeof(outY));
-
-        success = interpreter.runStriped(swap, count, in, out);
-        REPORTER_ASSERT(r, success);
-        for (int i = 0; i < 4; ++i) {
-            REPORTER_ASSERT(r, outX[i] == (i < count ? inY[i] : 0.0f));
-            REPORTER_ASSERT(r, outY[i] == (i < count ? inX[i] : 0.0f));
-        }
-    }
+    SkAssertResult(byteCode->run(dot2, &in, 1, &out, 1, nullptr, 0));
+    REPORTER_ASSERT(r, out == -1.0f);
 }
 
 DEF_TEST(SkSLInterpreterOutParams, r) {
@@ -899,30 +804,23 @@
          "void main(inout half4 color) { oneAlpha(color); }",
          0, 0, 0, 0, 0, 0, 0, 1);
     test(r,
-         "half2 tricky(half x, half y, inout half2 color, half z, out half w) {"
+         "half2 tricky(half x, half y, inout half2 color, half z) {"
          "    color.xy = color.yx;"
-         "    w = 47;"
          "    return half2(x + y, z);"
          "}"
          "void main(inout half4 color) {"
-         "    half w;"
-         "    half2 t = tricky(1, 2, color.rb, 5, w);"
-         "    color.r += w;"
+         "    half2 t = tricky(1, 2, color.rb, 5);"
          "    color.ga = t;"
          "}",
-         1, 2, 3, 4, 50, 3, 1, 5);
+         1, 2, 3, 4, 3, 3, 1, 5);
 }
 
 DEF_TEST(SkSLInterpreterMathFunctions, r) {
     float value[4], expected[4];
 
-    value[0] = 0.0f; value[1] = SK_FloatPI / 2;
-    expected[0] = 0.0f; expected[1] = 1.0f;
-    test(r, "float2 main(float2 x) { return sin(x); }", value, expected);
-
-    value[0] = 0.0f; value[1] = SK_FloatPI / 4;
-    expected[0] = 0.0f; expected[1] = 1.0f;
-    test(r, "float2 main(float2 x) { return tan(x); }", value, expected);
+    value[0] = 0.0f; expected[0] = 0.0f;
+    test(r, "float main(float x) { return sin(x); }", value, expected);
+    test(r, "float main(float x) { return tan(x); }", value, expected);
 
     value[0] = 0.0f; expected[0] = 1.0f;
     test(r, "float main(float x) { return cos(x); }", value, expected);
@@ -1131,11 +1029,9 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        SkSL::ByteCode::Vector<1>* result;
-        bool success = interpreter.run(main, nullptr, &result);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, result->fFloat[0] == 66.0);
+        float out = 0.0f;
+        SkAssertResult(byteCode->run(main, nullptr, 0, &out, 1, nullptr, 0));
+        REPORTER_ASSERT(r, out == 66.0);
         REPORTER_ASSERT(r, outValue == 152);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
@@ -1166,9 +1062,7 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        bool success = interpreter.run(main, nullptr, nullptr);
-        REPORTER_ASSERT(r, success);
+        SkAssertResult(byteCode->run(main, nullptr, 0, nullptr, 0, nullptr, 0));
         REPORTER_ASSERT(r, value[0] == 2);
         REPORTER_ASSERT(r, value[1] == 4);
         REPORTER_ASSERT(r, value[2] == 6);
@@ -1233,11 +1127,9 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        SkSL::ByteCode::Vector<1>* result;
-        bool success = interpreter.run(main, nullptr, &result);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, result->fFloat[0] == 5.0);
+        float out = 0.0f;
+        SkAssertResult(byteCode->run(main, nullptr, 0, &out, 1, nullptr, 0));
+        REPORTER_ASSERT(r, out == 5.0);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }
@@ -1250,23 +1142,32 @@
         : INHERITED(name, *compiler.context().fFloat4_Type)
         , fCompiler(compiler)
         , fFunction(function) {}
+
     bool canCall() const override {
         return true;
     }
+
     int callParameterCount() const override {
         return 1;
     }
+
     void getCallParameterTypes(const SkSL::Type** outTypes) const override {
         outTypes[0] = fCompiler.context().fFloat4_Type.get();
     }
+
     void call(int /*unusedIndex*/, float* arguments, float* outReturn) override {
         fFunction(arguments, outReturn);
     }
+
 private:
     SkSL::Compiler& fCompiler;
+
     void (*fFunction)(float[4], float[4]);
+
     typedef SkSL::ExternalValue INHERITED;
 };
+
+
 DEF_TEST(SkSLInterpreterExternalValuesVectorCall, r) {
     SkSL::Compiler compiler;
     SkSL::Program::Settings settings;
@@ -1294,14 +1195,12 @@
             return;
         }
         const SkSL::ByteCodeFunction* main = byteCode->getFunction("main");
-        SkSL::Interpreter<1> interpreter(std::move(byteCode));
-        SkSL::ByteCode::Vector<1>* result;
-        bool success = interpreter.run(main, nullptr, &result);
-        REPORTER_ASSERT(r, success);
-        REPORTER_ASSERT(r, result[0].fFloat[0] == 1.0);
-        REPORTER_ASSERT(r, result[1].fFloat[0] == 2.0);
-        REPORTER_ASSERT(r, result[2].fFloat[0] == 3.0);
-        REPORTER_ASSERT(r, result[3].fFloat[0] == 4.0);
+        float out[4] = { 0 };
+        SkAssertResult(byteCode->run(main, nullptr, 0, out, 4, nullptr, 0));
+        REPORTER_ASSERT(r, out[0] == 1.0);
+        REPORTER_ASSERT(r, out[1] == 2.0);
+        REPORTER_ASSERT(r, out[2] == 3.0);
+        REPORTER_ASSERT(r, out[3] == 4.0);
     } else {
         printf("%s\n%s", src, compiler.errorText().c_str());
     }