add SkVM_Overhead bench, simple improvements
This new bench lets us measure the overhead of program building,
optimization, and JITting. Surprisingly, at head the optimization in
Builder::done() takes longer than the JIT.
The new bench clocks in around 40µs on my laptop at head,
then 32µs after switching val_to_reg to be an std::vector,
then 27µs after switching deaths to be an std::vector too,
then 22µs after switching fIndex to be an SkTHashMap,
then 20µs after calling program.reserve(fProgram.size()),
then 19µs after switching JIT data maps to SkTHashMap too.
I tried swapping some std::vector for SkTDArray to no benefit, actually
a little detriment. So I think this is roughly all the low-hanging
fruit, with time split now roughly equally between Builder::Done(),
JITting in Program::eval(), and the original calls to Builder
themselves.
Also disable perf dumps on Mac. No real value there until I can dump a
dylib, and it's just one more thing I have to remember to disable before
running this sort of benchmark.
Change-Id: I1c6e58ed00ac94ad622c7d740712634f60787102
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/222984
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/bench/SkVMBench.cpp b/bench/SkVMBench.cpp
index 12e0577..91bb277 100644
--- a/bench/SkVMBench.cpp
+++ b/bench/SkVMBench.cpp
@@ -126,3 +126,21 @@
DEF_BENCH(return (new SkVMBench{ 256, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{1024, I32_SWAR});)
DEF_BENCH(return (new SkVMBench{4096, I32_SWAR});)
+
+class SkVM_Overhead : public Benchmark {
+public:
+ SkVM_Overhead() {}
+
+private:
+ const char* onGetName() override { return "SkVM_Overhead"; }
+ bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+
+ void onDraw(int loops, SkCanvas*) override {
+ while (loops --> 0) {
+ float dummy;
+ skvm::Program program = SrcoverBuilder_F32{}.done();
+ program.eval(0, &dummy, &dummy);
+ }
+ }
+};
+DEF_BENCH(return new SkVM_Overhead;)
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 4193cdc..dc0b677 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -81,7 +81,8 @@
}
// We'll need to map each live value to a register.
- std::unordered_map<ID, ID> val_to_reg;
+ // TODO: this could be another field on Instruction / fProgram to avoid this side alloc?
+ std::vector<ID> val_to_reg(fProgram.size());
// Count the registers we've used so far.
ID next_reg = 0;
@@ -101,7 +102,7 @@
// values have finite liftimes, so we track pre-owned registers that have become available
// and a schedule of which registers become available as we reach a given instruction.
std::vector<ID> avail;
- std::unordered_map<ID, std::vector<ID>> deaths;
+ std::vector<std::vector<ID>> deaths(fProgram.size());
for (ID val = 0; val < (ID)fProgram.size(); val++) {
Instruction& inst = fProgram[val];
@@ -144,6 +145,7 @@
// The loop begins at the loop'th Instruction.
int loop = 0;
std::vector<Program::Instruction> program;
+ program.reserve(fProgram.size());
auto push_instruction = [&](ID id, const Builder::Instruction& inst) {
Program::Instruction pinst{
@@ -187,14 +189,14 @@
// Basic common subexpression elimination:
// if we've already seen this exact Instruction, use it instead of creating a new one.
- auto lookup = fIndex.find(inst);
- if (lookup != fIndex.end()) {
- return lookup->second;
+
+ if (ID* lookup = fIndex.find(inst)) {
+ return *lookup;
}
ID id = static_cast<ID>(fProgram.size());
fProgram.push_back(inst);
- fIndex[inst] = id;
+ fIndex.set(inst, id);
return id;
}
@@ -735,9 +737,9 @@
// the instructions that use them... no relocations.
// Map from our bytes() control y.imm to 32-byte mask for vpshufb.
- std::unordered_map<int, A::Label> vpshufb_masks;
+ SkTHashMap<int, A::Label> vpshufb_masks;
for (const Program::Instruction& inst : instructions) {
- if (inst.op == Op::bytes && vpshufb_masks.end() == vpshufb_masks.find(inst.y.imm)) {
+ if (inst.op == Op::bytes && vpshufb_masks.find(inst.y.imm) == nullptr) {
// Translate bytes()'s control nibbles to vpshufb's control bytes.
auto nibble_to_vpshufb = [](unsigned n) -> uint8_t {
return n == 0 ? 0xff // Fill with zero.
@@ -774,14 +776,14 @@
A::Label label = a.here();
a.byte(p, sizeof(p));
a.byte(p, sizeof(p));
- vpshufb_masks[inst.y.imm] = label;
+ vpshufb_masks.set(inst.y.imm, label);
}
}
// Map from splat bit pattern to 4-byte aligned data location holding that pattern.
// (If we were really brave we could just point at the copy we already have in Program...)
- std::unordered_map<int, A::Label> splats;
+ SkTHashMap<int, A::Label> splats;
for (const Program::Instruction& inst : instructions) {
if (inst.op == Op::splat) {
// Splats are deduplicated at an earlier layer, so we shouldn't find any duplicates.
@@ -790,12 +792,12 @@
//
// TODO: in an AVX-512 world, it makes less sense to assign splats to registers at
// all. Perhaps we should move the deduping / register coloring for splats here?
- SkASSERT(splats.end() == splats.find(inst.y.imm));
+ SkASSERT(splats.find(inst.y.imm) == nullptr);
SkASSERT(a.size() % 4 == 0);
A::Label label = a.here();
a.byte(&inst.y.imm, 4);
- splats[inst.y.imm] = label;
+ splats.set(inst.y.imm, label);
}
}
@@ -833,7 +835,7 @@
case Op::load8: a.vpmovzxbd(r(d), arg[y.imm]); break;
case Op::load32: a.vmovups (r(d), arg[y.imm]); break;
- case Op::splat: a.vbroadcastss(r(d), splats[y.imm]); break;
+ case Op::splat: a.vbroadcastss(r(d), *splats.find(y.imm)); break;
case Op::add_f32: a.vaddps(r(d), r(x), r(y.id)); break;
case Op::sub_f32: a.vsubps(r(d), r(x), r(y.id)); break;
@@ -878,7 +880,7 @@
case Op::to_f32: a.vcvtdq2ps (r(d), r(x)); break;
case Op::to_i32: a.vcvttps2dq(r(d), r(x)); break;
- case Op::bytes: a.vpshufb(r(d), r(x), vpshufb_masks[y.imm]); break;
+ case Op::bytes: a.vpshufb(r(d), r(x), *vpshufb_masks.find(y.imm)); break;
}
}
@@ -938,7 +940,7 @@
fJIT.entry = entry;
fJIT.mask = mask;
- #if 1 // Debug dumps for profiler.
+ #if 1 && defined(SK_BUILD_FOR_UNIX) // Debug dumps for perf.
// We're doing some really stateful things below so one thread at a time please...
static SkSpinlock dump_lock;
SkAutoSpinlock lock(dump_lock);
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index dce9ba8..16542cf 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -10,8 +10,8 @@
#include "include/core/SkStream.h"
#include "include/core/SkTypes.h"
+#include "include/private/SkTHash.h"
#include "include/private/SkSpinlock.h"
-#include <unordered_map>
#include <vector>
namespace skvm {
@@ -291,7 +291,7 @@
ID push(Op, ID x, ID y=NA, ID z=NA, int immy=0, int immz=0);
bool isZero(ID) const;
- std::unordered_map<Instruction, ID, InstructionHash> fIndex;
+ SkTHashMap<Instruction, ID, InstructionHash> fIndex;
std::vector<Instruction> fProgram;
};