thread-local caches?

Here's another idea: these SkVMBlitter program caches are probably
best made thread-local.  If there are a bunch of threads doing the same
work with the same program, we don't need them fighting over that one
slot in the cache, and if there are a bunch of threads doing _different_
work, they'll get the best cache behavior if they don't fight over slots
in the LRU with different programs.  Either way, seems like a win?

(I've kept the try-acquire/release pattern just to make the focus of
this change clearer: the acquire now always succeeds and the release is
a no-op.  We can fold that simplification through the rest of the code
if we like it.)

Change-Id: Ib1ee270069c48446845ce27225652896661c5dfe
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/233060
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/core/SkVMBlitter.cpp b/src/core/SkVMBlitter.cpp
index 493ce16..597512f 100644
--- a/src/core/SkVMBlitter.cpp
+++ b/src/core/SkVMBlitter.cpp
@@ -6,7 +6,6 @@
  */
 
 #include "include/private/SkMacros.h"
-#include "include/private/SkSpinlock.h"
 #include "src/core/SkArenaAlloc.h"
 #include "src/core/SkColorSpacePriv.h"
 #include "src/core/SkColorSpaceXformSteps.h"
@@ -44,20 +43,12 @@
             && x.colorFilter == y.colorFilter;
     }
 
-    static SkSpinlock gProgramCacheLock;
-
-    SK_TRY_ACQUIRE(true, gProgramCacheLock)
     static SkLRUCache<Key, skvm::Program>* try_acquire_program_cache() {
-        if (gProgramCacheLock.tryAcquire()) {
-            static auto cache SK_GUARDED_BY(gProgramCacheLock)
-                = new SkLRUCache<Key, skvm::Program>{8};
-            return cache;
-        }
-        return nullptr;
+        thread_local static auto* cache = new SkLRUCache<Key, skvm::Program>{8};
+        return cache;
     }
 
-    SK_RELEASE_CAPABILITY(gProgramCacheLock)
-    static void release_program_cache() { gProgramCacheLock.release(); }
+    static void release_program_cache() { }
 
 
     struct Uniforms {