add SkVM_Overhead bench, simple improvements

This new bench lets us measure the overhead of program building,
optimization, and JITting.  Surprisingly, at head the optimization in
Builder::done() takes longer than the JIT.
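
For context, here's a minimal sketch (not part of this CL) of how that
split can be seen, assuming the bench's existing includes plus <chrono>;
time_phases is just a stand-in name:

    #include <chrono>

    static void time_phases() {
        using clock = std::chrono::steady_clock;

        auto t0 = clock::now();
        SrcoverBuilder_F32 builder;              // building: the Builder calls
        auto t1 = clock::now();
        skvm::Program program = builder.done();  // optimization
        auto t2 = clock::now();
        float dummy;
        program.eval(0, &dummy, &dummy);         // the first eval() JITs
        auto t3 = clock::now();
        // Compare t1-t0 (build), t2-t1 (optimize), t3-t2 (JIT + trivial eval).
    }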

The new bench clocks in around 40µs on my laptop at head,
  then 32µs after switching val_to_reg to be an std::vector (sketched below),
  then 27µs after switching deaths to be an std::vector too,
  then 22µs after switching fIndex to be an SkTHashMap,
  then 20µs after calling program.reserve(fProgram.size()),
  then 19µs after switching JIT data maps to SkTHashMap too.
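
The val_to_reg and deaths wins share one trick: value IDs are dense,
running 0 through fProgram.size()-1, so a flat std::vector indexed by ID
replaces the hash map entirely.  A minimal sketch of the pattern, with
the real register-reuse logic (the avail pool) elided and ID standing in
as a plain int:

    #include <vector>

    using ID = int;

    static std::vector<ID> assign_registers(size_t n) {
        // One allocation up front instead of a node per entry, and no hashing.
        std::vector<ID> val_to_reg(n);     // was std::unordered_map<ID, ID>
        ID next_reg = 0;
        for (ID val = 0; val < (ID)n; val++) {
            val_to_reg[val] = next_reg++;  // the real code first checks avail
        }
        return val_to_reg;
    }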

I tried swapping some std::vectors for SkTDArray, to no benefit and
actually a little detriment.  So I think this is roughly all the
low-hanging fruit, with time now split roughly equally between
Builder::done(), JITting in Program::eval(), and the original calls to
Builder themselves.

Also disable perf dumps on Mac.  No real value there until I can dump a
dylib, and it's just one more thing I have to remember to disable before
running this sort of benchmark.
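
For anyone reading the diff: the SkTHashMap swaps are mechanical.  The
only API difference that matters is that SkTHashMap::find() returns a
pointer to the value (or nullptr) rather than an iterator, and set()
inserts or overwrites.  A minimal sketch:

    #include "include/core/SkTypes.h"
    #include "include/private/SkTHash.h"

    static void sketch() {
        SkTHashMap<int, int> map;
        map.set(42, 1);                     // insert or overwrite
        if (int* val = map.find(42)) {      // pointer to value, not an iterator
            *val += 1;
        }
        SkASSERT(map.find(99) == nullptr);  // missing keys return nullptr
    }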

Change-Id: I1c6e58ed00ac94ad622c7d740712634f60787102
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/222984
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/bench/SkVMBench.cpp b/bench/SkVMBench.cpp
index 12e0577..91bb277 100644
--- a/bench/SkVMBench.cpp
+++ b/bench/SkVMBench.cpp
@@ -126,3 +126,23 @@
 DEF_BENCH(return (new SkVMBench{ 256, I32_SWAR});)
 DEF_BENCH(return (new SkVMBench{1024, I32_SWAR});)
 DEF_BENCH(return (new SkVMBench{4096, I32_SWAR});)
+
+class SkVM_Overhead : public Benchmark {
+public:
+    SkVM_Overhead() {}
+
+private:
+    const char* onGetName() override { return "SkVM_Overhead"; }
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+
+    void onDraw(int loops, SkCanvas*) override {
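+        // Each trip through the loop builds the program (SrcoverBuilder_F32's
+        // constructor), optimizes it in done(), and JITs it in the first eval().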
+        while (loops --> 0) {
+            float dummy;
+            skvm::Program program = SrcoverBuilder_F32{}.done();
+            program.eval(0, &dummy, &dummy);
+        }
+    }
+};
+DEF_BENCH(return new SkVM_Overhead;)
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 4193cdc..dc0b677 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -81,7 +81,8 @@
         }
 
         // We'll need to map each live value to a register.
-        std::unordered_map<ID, ID> val_to_reg;
+        // TODO: this could be another field on Instruction / fProgram to avoid this side alloc?
+        std::vector<ID> val_to_reg(fProgram.size());
 
         // Count the registers we've used so far.
         ID next_reg = 0;
@@ -101,7 +102,7 @@
         // values have finite lifetimes, so we track pre-owned registers that have become available
         // and a schedule of which registers become available as we reach a given instruction.
         std::vector<ID>                         avail;
-        std::unordered_map<ID, std::vector<ID>> deaths;
+        std::vector<std::vector<ID>>            deaths(fProgram.size());
 
         for (ID val = 0; val < (ID)fProgram.size(); val++) {
             Instruction& inst = fProgram[val];
@@ -144,6 +145,7 @@
         // The loop begins at the loop'th Instruction.
         int loop = 0;
         std::vector<Program::Instruction> program;
+        program.reserve(fProgram.size());
 
         auto push_instruction = [&](ID id, const Builder::Instruction& inst) {
             Program::Instruction pinst{
@@ -187,14 +189,14 @@
 
         // Basic common subexpression elimination:
         // if we've already seen this exact Instruction, use it instead of creating a new one.
-        auto lookup = fIndex.find(inst);
-        if (lookup != fIndex.end()) {
-            return lookup->second;
+
+        if (ID* lookup = fIndex.find(inst)) {
+            return *lookup;
         }
 
         ID id = static_cast<ID>(fProgram.size());
         fProgram.push_back(inst);
-        fIndex[inst] = id;
+        fIndex.set(inst, id);
         return id;
     }
 
@@ -735,9 +737,9 @@
         // the instructions that use them... no relocations.
 
         // Map from our bytes() control y.imm to 32-byte mask for vpshufb.
-        std::unordered_map<int, A::Label> vpshufb_masks;
+        SkTHashMap<int, A::Label> vpshufb_masks;
         for (const Program::Instruction& inst : instructions) {
-            if (inst.op == Op::bytes && vpshufb_masks.end() == vpshufb_masks.find(inst.y.imm)) {
+            if (inst.op == Op::bytes && vpshufb_masks.find(inst.y.imm) == nullptr) {
                 // Translate bytes()'s control nibbles to vpshufb's control bytes.
                 auto nibble_to_vpshufb = [](unsigned n) -> uint8_t {
                     return n == 0 ? 0xff  // Fill with zero.
@@ -774,14 +776,14 @@
                 A::Label label = a.here();
                 a.byte(p, sizeof(p));
                 a.byte(p, sizeof(p));
-                vpshufb_masks[inst.y.imm] = label;
+                vpshufb_masks.set(inst.y.imm, label);
             }
 
         }
 
         // Map from splat bit pattern to 4-byte aligned data location holding that pattern.
         // (If we were really brave we could just point at the copy we already have in Program...)
-        std::unordered_map<int, A::Label> splats;
+        SkTHashMap<int, A::Label> splats;
         for (const Program::Instruction& inst : instructions) {
             if (inst.op == Op::splat) {
                 // Splats are deduplicated at an earlier layer, so we shouldn't find any duplicates.
@@ -790,12 +792,12 @@
                 //
                 // TODO: in an AVX-512 world, it makes less sense to assign splats to registers at
                 // all.  Perhaps we should move the deduping / register coloring for splats here?
-                SkASSERT(splats.end() == splats.find(inst.y.imm));
+                SkASSERT(splats.find(inst.y.imm) == nullptr);
 
                 SkASSERT(a.size() % 4 == 0);
                 A::Label label = a.here();
                 a.byte(&inst.y.imm, 4);
-                splats[inst.y.imm] = label;
+                splats.set(inst.y.imm, label);
             }
         }
 
@@ -833,7 +835,7 @@
                 case Op::load8:  a.vpmovzxbd(r(d), arg[y.imm]); break;
                 case Op::load32: a.vmovups  (r(d), arg[y.imm]); break;
 
-                case Op::splat: a.vbroadcastss(r(d), splats[y.imm]); break;
+                case Op::splat: a.vbroadcastss(r(d), *splats.find(y.imm)); break;
 
                 case Op::add_f32: a.vaddps(r(d), r(x), r(y.id)); break;
                 case Op::sub_f32: a.vsubps(r(d), r(x), r(y.id)); break;
@@ -878,7 +880,7 @@
                 case Op::to_f32: a.vcvtdq2ps (r(d), r(x)); break;
                 case Op::to_i32: a.vcvttps2dq(r(d), r(x)); break;
 
-                case Op::bytes: a.vpshufb(r(d), r(x), vpshufb_masks[y.imm]); break;
+                case Op::bytes: a.vpshufb(r(d), r(x), *vpshufb_masks.find(y.imm)); break;
             }
         }
 
@@ -938,7 +940,7 @@
                 fJIT.entry = entry;
                 fJIT.mask  = mask;
 
-            #if 1   // Debug dumps for profiler.
+            #if 1 && defined(SK_BUILD_FOR_UNIX)   // Debug dumps for perf.
                 // We're doing some really stateful things below so one thread at a time please...
                 static SkSpinlock dump_lock;
                 SkAutoSpinlock lock(dump_lock);
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index dce9ba8..16542cf 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -10,8 +10,8 @@
 
 #include "include/core/SkStream.h"
 #include "include/core/SkTypes.h"
 #include "include/private/SkSpinlock.h"
-#include <unordered_map>
+#include "include/private/SkTHash.h"
 #include <vector>
 
 namespace skvm {
@@ -291,7 +291,7 @@
         ID push(Op, ID x, ID y=NA, ID z=NA, int immy=0, int immz=0);
         bool isZero(ID) const;
 
-        std::unordered_map<Instruction, ID, InstructionHash> fIndex;
+        SkTHashMap<Instruction, ID, InstructionHash>         fIndex;
         std::vector<Instruction>                             fProgram;
     };