Fix worker race condition in gemmlowp. am: d2f96f0c93 am: 9b33f095ef
am: adde4e0d4d

Change-Id: Idc1dab9970b45c89888b0a3d208d9d6fcdd4498c
diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h
index be33d5f..9223a42 100644
--- a/internal/multi_thread_gemm.h
+++ b/internal/multi_thread_gemm.h
@@ -66,6 +66,30 @@
 
 #endif  // not GEMMLOWP_ALLOW_INLINE_ASM
 
+inline void WriteBarrier() {  // Store-side barrier; exactly one branch is compiled per target.
+#ifdef GEMMLOWP_ARM_32
+  MemoryBarrier();  // Helper defined outside this hunk; presumably a full barrier (confirm).
+#elif defined(GEMMLOWP_ARM_64)
+  asm volatile("dmb ishst" ::: "memory");  // DMB, inner-shareable, store variant.
+#elif defined(GEMMLOWP_X86)
+  asm volatile("sfence" ::: "memory");  // x86 store fence; "memory" clobber = compiler barrier.
+#else
+#error "Unsupported architecture for WriteBarrier."
+#endif
+}
+
+inline void ReadBarrier() {  // Load-side barrier; exactly one branch is compiled per target.
+#ifdef GEMMLOWP_ARM_32
+  MemoryBarrier();  // Helper defined outside this hunk; presumably a full barrier (confirm).
+#elif defined(GEMMLOWP_ARM_64)
+  asm volatile("dmb ishld" ::: "memory");  // DMB, inner-shareable, load variant.
+#elif defined(GEMMLOWP_X86)
+  asm volatile("lfence" ::: "memory");  // x86 load fence; "memory" clobber = compiler barrier.
+#else
+#error "Unsupported architecture for ReadBarrier."
+#endif
+}
+
 // Waits until *var != initial_value.
 //
 // Returns the new value of *var. The guarantee here is that
@@ -255,6 +279,7 @@
       switch (state_to_act_upon) {
         case State::HasWork:
           // Got work to do! So do it, and then revert to 'Ready' state.
+          ReadBarrier();
           assert(task_);
           task_->Run();
           delete task_;
@@ -280,6 +305,7 @@
     assert(!task_);
     task->local_allocator = &local_allocator_;
     task_ = task;
+    WriteBarrier();
     assert(state_ == State::Ready);
     ChangeState(State::HasWork);
   }