Fix worker race condition in gemmlowp.

Bug: 29993722

  - Merge upstream change 67949d5f0f0e00d44709570d87f5ffa43b81e4f1
  - Fix race condition by adding explicit memory barriers

Change-Id: Ie3e72ac3307e6b205d9f473aac3eb19ea43866f8
(cherry picked from commit d2f96f0c9363004d5bd8d042dbc70c1519875d33)
diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h
index be33d5f..0aacddb 100644
--- a/internal/multi_thread_gemm.h
+++ b/internal/multi_thread_gemm.h
@@ -66,6 +66,34 @@
 
 #endif  // not GEMMLOWP_ALLOW_INLINE_ASM
 
+inline void WriteBarrier() {  // Emits an architecture-specific barrier; intended to order earlier stores before later ones.
+#ifdef GEMMLOWP_ARM_32
+  MemoryBarrier();  // Helper defined outside this hunk; presumably a full barrier -- TODO confirm.
+#elif defined(GEMMLOWP_ARM_64)
+  asm volatile("dmb ishst" ::: "memory");  // DMB ISHST: store-store barrier, inner shareable domain; "memory" clobber also blocks compiler reordering.
+#elif defined(GEMMLOWP_X86)
+  asm volatile("sfence" ::: "memory");  // SFENCE: x86 store fence; "memory" clobber also blocks compiler reordering.
+#elif defined(__mips__)
+  MemoryBarrier();  // Same out-of-hunk helper as the GEMMLOWP_ARM_32 branch.
+#else  // No generic fallback: an unrecognized target fails the build here.
+#error "Unsupported architecture for WriteBarrier."
+#endif
+}
+
+inline void ReadBarrier() {  // Emits an architecture-specific barrier; intended to order earlier loads before later accesses.
+#ifdef GEMMLOWP_ARM_32
+  MemoryBarrier();  // Helper defined outside this hunk; presumably a full barrier -- TODO confirm.
+#elif defined(GEMMLOWP_ARM_64)
+  asm volatile("dmb ishld" ::: "memory");  // DMB ISHLD: orders prior loads before later loads and stores, inner shareable domain; "memory" clobber also blocks compiler reordering.
+#elif defined(GEMMLOWP_X86)
+  asm volatile("lfence" ::: "memory");  // LFENCE: x86 load fence; "memory" clobber also blocks compiler reordering.
+#elif defined(__mips__)
+  MemoryBarrier();  // Same out-of-hunk helper as the GEMMLOWP_ARM_32 branch.
+#else  // No generic fallback: an unrecognized target fails the build here.
+#error "Unsupported architecture for ReadBarrier."
+#endif
+}
+
 // Waits until *var != initial_value.
 //
 // Returns the new value of *var. The guarantee here is that
@@ -255,6 +283,7 @@
       switch (state_to_act_upon) {
         case State::HasWork:
           // Got work to do! So do it, and then revert to 'Ready' state.
+          ReadBarrier();
           assert(task_);
           task_->Run();
           delete task_;
@@ -280,6 +309,7 @@
     assert(!task_);
     task->local_allocator = &local_allocator_;
     task_ = task;
+    WriteBarrier();
     assert(state_ == State::Ready);
     ChangeState(State::HasWork);
   }