| // Lock implementation |
| |
| #include "Python.h" |
| |
| #include "pycore_lock.h" |
| #include "pycore_parking_lot.h" |
| #include "pycore_semaphore.h" |
| #include "pycore_time.h" // _PyTime_Add() |
| |
| #ifdef MS_WINDOWS |
| # define WIN32_LEAN_AND_MEAN |
| # include <windows.h> // SwitchToThread() |
| #elif defined(HAVE_SCHED_H) |
| # include <sched.h> // sched_yield() |
| #endif |
| |
| // If a thread waits on a lock for longer than TIME_TO_BE_FAIR_NS (1 ms), then |
| // the unlocking thread directly hands off ownership of the lock. This avoids |
| // starvation. |
| static const PyTime_t TIME_TO_BE_FAIR_NS = 1000*1000; |
| |
| // Spin for a bit before parking the thread. This is only enabled for |
| // `--disable-gil` builds because it is unlikely to be helpful if the GIL is |
| // enabled. |
| #if Py_GIL_DISABLED |
| static const int MAX_SPIN_COUNT = 40; |
| #else |
| static const int MAX_SPIN_COUNT = 0; |
| #endif |
| |
| struct mutex_entry { |
| // The time after which the unlocking thread should hand off lock ownership |
| // directly to the waiting thread. Written by the waiting thread. |
| PyTime_t time_to_be_fair; |
| |
| // Set to 1 if the lock was handed off. Written by the unlocking thread. |
| int handed_off; |
| }; |
| |
| static void |
| _Py_yield(void) |
| { |
| #ifdef MS_WINDOWS |
| SwitchToThread(); |
| #elif defined(HAVE_SCHED_H) |
| sched_yield(); |
| #endif |
| } |
| |
| void |
| _PyMutex_LockSlow(PyMutex *m) |
| { |
| _PyMutex_LockTimed(m, -1, _PY_LOCK_DETACH); |
| } |
| |
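// Lock the mutex, blocking for at most `timeout` nanoseconds (a negative
// timeout means "wait forever", zero means "do not block"). Returns
// PY_LOCK_ACQUIRED on success, PY_LOCK_FAILURE on timeout, and PY_LOCK_INTR
// if _PY_LOCK_HANDLE_SIGNALS is set and a pending call raised an exception
// while waiting.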
| PyLockStatus |
| _PyMutex_LockTimed(PyMutex *m, PyTime_t timeout, _PyLockFlags flags) |
| { |
| uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); |
| if ((v & _Py_LOCKED) == 0) { |
| if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { |
| return PY_LOCK_ACQUIRED; |
| } |
| } |
| else if (timeout == 0) { |
| return PY_LOCK_FAILURE; |
| } |
| |
| PyTime_t now; |
| // silently ignore error: cannot report error to the caller |
| (void)PyTime_MonotonicRaw(&now); |
| PyTime_t endtime = 0; |
| if (timeout > 0) { |
| endtime = _PyTime_Add(now, timeout); |
| } |
| |
| struct mutex_entry entry = { |
| .time_to_be_fair = now + TIME_TO_BE_FAIR_NS, |
| .handed_off = 0, |
| }; |
| |
| Py_ssize_t spin_count = 0; |
| for (;;) { |
| if ((v & _Py_LOCKED) == 0) { |
| // The lock is unlocked. Try to grab it. |
| if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { |
| return PY_LOCK_ACQUIRED; |
| } |
| continue; |
| } |
| |
| if (!(v & _Py_HAS_PARKED) && spin_count < MAX_SPIN_COUNT) { |
| // Spin for a bit. |
| _Py_yield(); |
| spin_count++; |
| continue; |
| } |
| |
| if (timeout == 0) { |
| return PY_LOCK_FAILURE; |
| } |
| |
| uint8_t newv = v; |
| if (!(v & _Py_HAS_PARKED)) { |
| // We are the first waiter. Set the _Py_HAS_PARKED flag. |
| newv = v | _Py_HAS_PARKED; |
| if (!_Py_atomic_compare_exchange_uint8(&m->v, &v, newv)) { |
| continue; |
| } |
| } |
| |
| int ret = _PyParkingLot_Park(&m->v, &newv, sizeof(newv), timeout, |
| &entry, (flags & _PY_LOCK_DETACH) != 0); |
| if (ret == Py_PARK_OK) { |
| if (entry.handed_off) { |
| // We own the lock now. |
| assert(_Py_atomic_load_uint8_relaxed(&m->v) & _Py_LOCKED); |
| return PY_LOCK_ACQUIRED; |
| } |
| } |
| else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_HANDLE_SIGNALS)) { |
| if (Py_MakePendingCalls() < 0) { |
| return PY_LOCK_INTR; |
| } |
| } |
| else if (ret == Py_PARK_TIMEOUT) { |
| assert(timeout >= 0); |
| return PY_LOCK_FAILURE; |
| } |
| |
| if (timeout > 0) { |
| timeout = _PyDeadline_Get(endtime); |
| if (timeout <= 0) { |
| // Avoid negative values because those mean block forever. |
| timeout = 0; |
| } |
| } |
| |
| v = _Py_atomic_load_uint8_relaxed(&m->v); |
| } |
| } |
| |
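// Callback passed to _PyParkingLot_Unpark() by _PyMutex_TryUnlock(). It
// computes the new value of `m->v`: if the woken thread has waited past its
// fairness deadline, ownership is handed off directly (the _Py_LOCKED bit
// stays set and `entry->handed_off` tells the waiter that it now owns the
// lock); otherwise the lock is left unlocked and the waiter must compete
// for it again.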
| static void |
| mutex_unpark(PyMutex *m, struct mutex_entry *entry, int has_more_waiters) |
| { |
| uint8_t v = 0; |
| if (entry) { |
| PyTime_t now; |
| // silently ignore error: cannot report error to the caller |
| (void)PyTime_MonotonicRaw(&now); |
| int should_be_fair = now > entry->time_to_be_fair; |
| |
| entry->handed_off = should_be_fair; |
| if (should_be_fair) { |
| v |= _Py_LOCKED; |
| } |
| if (has_more_waiters) { |
| v |= _Py_HAS_PARKED; |
| } |
| } |
| _Py_atomic_store_uint8(&m->v, v); |
| } |
| |
| int |
| _PyMutex_TryUnlock(PyMutex *m) |
| { |
| uint8_t v = _Py_atomic_load_uint8(&m->v); |
| for (;;) { |
| if ((v & _Py_LOCKED) == 0) { |
| // error: the mutex is not locked |
| return -1; |
| } |
| else if ((v & _Py_HAS_PARKED)) { |
| // wake up a single thread |
| _PyParkingLot_Unpark(&m->v, (_Py_unpark_fn_t *)mutex_unpark, m); |
| return 0; |
| } |
| else if (_Py_atomic_compare_exchange_uint8(&m->v, &v, _Py_UNLOCKED)) { |
| // fast-path: no waiters |
| return 0; |
| } |
| } |
| } |
| |
| void |
| _PyMutex_UnlockSlow(PyMutex *m) |
| { |
| if (_PyMutex_TryUnlock(m) < 0) { |
| Py_FatalError("unlocking mutex that is not locked"); |
| } |
| } |
| |
| // _PyRawMutex stores a linked list of `struct raw_mutex_entry`, one for each |
| // thread waiting on the mutex, directly in the mutex itself. |
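// Unlike PyMutex, waiting never detaches the thread state (note the
// /*detach=*/0 below) and no parking lot is involved; each waiter blocks on
// its own semaphore.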
| struct raw_mutex_entry { |
| struct raw_mutex_entry *next; |
| _PySemaphore sema; |
| }; |
| |
| void |
| _PyRawMutex_LockSlow(_PyRawMutex *m) |
| { |
| struct raw_mutex_entry waiter; |
| _PySemaphore_Init(&waiter.sema); |
| |
| uintptr_t v = _Py_atomic_load_uintptr(&m->v); |
| for (;;) { |
| if ((v & _Py_LOCKED) == 0) { |
| // Unlocked: try to grab it (even if it has a waiter). |
| if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, v|_Py_LOCKED)) { |
| break; |
| } |
| continue; |
| } |
| |
| // Locked: try to add ourselves as a waiter. |
| waiter.next = (struct raw_mutex_entry *)(v & ~1); |
| uintptr_t desired = ((uintptr_t)&waiter)|_Py_LOCKED; |
| if (!_Py_atomic_compare_exchange_uintptr(&m->v, &v, desired)) { |
| continue; |
| } |
| |
| // Wait for us to be woken up. Note that we still have to lock the |
| // mutex ourselves: it is NOT handed off to us. |
| _PySemaphore_Wait(&waiter.sema, -1, /*detach=*/0); |
| } |
| |
| _PySemaphore_Destroy(&waiter.sema); |
| } |
| |
| void |
| _PyRawMutex_UnlockSlow(_PyRawMutex *m) |
| { |
| uintptr_t v = _Py_atomic_load_uintptr(&m->v); |
| for (;;) { |
| if ((v & _Py_LOCKED) == 0) { |
| Py_FatalError("unlocking mutex that is not locked"); |
| } |
| |
| struct raw_mutex_entry *waiter = (struct raw_mutex_entry *)(v & ~1); |
| if (waiter) { |
| uintptr_t next_waiter = (uintptr_t)waiter->next; |
| if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, next_waiter)) { |
| _PySemaphore_Wakeup(&waiter->sema); |
| return; |
| } |
| } |
| else { |
| if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, _Py_UNLOCKED)) { |
| return; |
| } |
| } |
| } |
| } |
| |
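// A PyEvent is a one-shot event: _PyEvent_Notify() sets it permanently and
// there is no way to clear it again. `v` is _Py_UNLOCKED while unset,
// _Py_HAS_PARKED while unset with waiters, and _Py_LOCKED once set.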
| int |
| _PyEvent_IsSet(PyEvent *evt) |
| { |
| uint8_t v = _Py_atomic_load_uint8(&evt->v); |
| return v == _Py_LOCKED; |
| } |
| |
| void |
| _PyEvent_Notify(PyEvent *evt) |
| { |
    uint8_t v = _Py_atomic_exchange_uint8(&evt->v, _Py_LOCKED);
| if (v == _Py_UNLOCKED) { |
| // no waiters |
| return; |
| } |
| else if (v == _Py_LOCKED) { |
| // event already set |
| return; |
| } |
| else { |
| assert(v == _Py_HAS_PARKED); |
| _PyParkingLot_UnparkAll(&evt->v); |
| } |
| } |
| |
| void |
| PyEvent_Wait(PyEvent *evt) |
| { |
| while (!PyEvent_WaitTimed(evt, -1, /*detach=*/1)) |
| ; |
| } |
| |
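// Wait for the event to be set, for at most `timeout_ns` nanoseconds (a
// negative timeout waits forever). Returns 1 if the event is set and 0 on
// timeout. If `detach` is nonzero, the thread state is detached while the
// thread is parked.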
| int |
| PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach) |
| { |
| for (;;) { |
| uint8_t v = _Py_atomic_load_uint8(&evt->v); |
| if (v == _Py_LOCKED) { |
| // event already set |
| return 1; |
| } |
| if (v == _Py_UNLOCKED) { |
| if (!_Py_atomic_compare_exchange_uint8(&evt->v, &v, _Py_HAS_PARKED)) { |
| continue; |
| } |
| } |
| |
| uint8_t expected = _Py_HAS_PARKED; |
| (void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v), |
| timeout_ns, NULL, detach); |
| |
| return _Py_atomic_load_uint8(&evt->v) == _Py_LOCKED; |
| } |
| } |
| |
| static int |
| unlock_once(_PyOnceFlag *o, int res) |
| { |
| // On success (res=0), we set the state to _Py_ONCE_INITIALIZED. |
| // On failure (res=-1), we reset the state to _Py_UNLOCKED. |
| uint8_t new_value; |
| switch (res) { |
| case -1: new_value = _Py_UNLOCKED; break; |
| case 0: new_value = _Py_ONCE_INITIALIZED; break; |
| default: { |
| Py_FatalError("invalid result from _PyOnceFlag_CallOnce"); |
| Py_UNREACHABLE(); |
| break; |
| } |
| } |
| |
| uint8_t old_value = _Py_atomic_exchange_uint8(&o->v, new_value); |
| if ((old_value & _Py_HAS_PARKED) != 0) { |
| // wake up anyone waiting on the once flag |
| _PyParkingLot_UnparkAll(&o->v); |
| } |
| return res; |
| } |
| |
| int |
| _PyOnceFlag_CallOnceSlow(_PyOnceFlag *flag, _Py_once_fn_t *fn, void *arg) |
| { |
| uint8_t v = _Py_atomic_load_uint8(&flag->v); |
| for (;;) { |
| if (v == _Py_UNLOCKED) { |
| if (!_Py_atomic_compare_exchange_uint8(&flag->v, &v, _Py_LOCKED)) { |
| continue; |
| } |
| int res = fn(arg); |
| return unlock_once(flag, res); |
| } |
| |
| if (v == _Py_ONCE_INITIALIZED) { |
| return 0; |
| } |
| |
| // The once flag is initializing (locked). |
| assert((v & _Py_LOCKED)); |
| if (!(v & _Py_HAS_PARKED)) { |
| // We are the first waiter. Set the _Py_HAS_PARKED flag. |
| uint8_t new_value = v | _Py_HAS_PARKED; |
| if (!_Py_atomic_compare_exchange_uint8(&flag->v, &v, new_value)) { |
| continue; |
| } |
| v = new_value; |
| } |
| |
| // Wait for initialization to finish. |
| _PyParkingLot_Park(&flag->v, &v, sizeof(v), -1, NULL, 1); |
| v = _Py_atomic_load_uint8(&flag->v); |
| } |
| } |
| |
| static int |
| recursive_mutex_is_owned_by(_PyRecursiveMutex *m, PyThread_ident_t tid) |
| { |
| return _Py_atomic_load_ullong_relaxed(&m->thread) == tid; |
| } |
| |
| int |
| _PyRecursiveMutex_IsLockedByCurrentThread(_PyRecursiveMutex *m) |
| { |
| return recursive_mutex_is_owned_by(m, PyThread_get_thread_ident_ex()); |
| } |
| |
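// Acquire the recursive mutex. If the calling thread already owns it, just
// bump the recursion level; otherwise block on the underlying PyMutex and
// record the calling thread as the owner.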
| void |
| _PyRecursiveMutex_Lock(_PyRecursiveMutex *m) |
| { |
| PyThread_ident_t thread = PyThread_get_thread_ident_ex(); |
| if (recursive_mutex_is_owned_by(m, thread)) { |
| m->level++; |
| return; |
| } |
| PyMutex_Lock(&m->mutex); |
| _Py_atomic_store_ullong_relaxed(&m->thread, thread); |
| assert(m->level == 0); |
| } |
| |
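// Release one level of the recursive mutex. The underlying PyMutex is only
// released when the outermost acquisition is undone (level drops to zero).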
| void |
| _PyRecursiveMutex_Unlock(_PyRecursiveMutex *m) |
| { |
| PyThread_ident_t thread = PyThread_get_thread_ident_ex(); |
| if (!recursive_mutex_is_owned_by(m, thread)) { |
| Py_FatalError("unlocking a recursive mutex that is not owned by the" |
| " current thread"); |
| } |
| if (m->level > 0) { |
| m->level--; |
| return; |
| } |
| assert(m->level == 0); |
| _Py_atomic_store_ullong_relaxed(&m->thread, 0); |
| PyMutex_Unlock(&m->mutex); |
| } |
| |
| #define _Py_WRITE_LOCKED 1 |
| #define _PyRWMutex_READER_SHIFT 2 |
| #define _Py_RWMUTEX_MAX_READERS (UINTPTR_MAX >> _PyRWMutex_READER_SHIFT) |
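
// Bit layout of _PyRWMutex.bits:
//   bit 0 (_Py_WRITE_LOCKED): a writer holds the lock
//   bit 1 (_Py_HAS_PARKED):   at least one thread is parked on the lock
//   bits 2 and up:            the number of readers holding the lock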
| |
| static uintptr_t |
| rwmutex_set_parked_and_wait(_PyRWMutex *rwmutex, uintptr_t bits) |
| { |
| // Set _Py_HAS_PARKED and wait until we are woken up. |
| if ((bits & _Py_HAS_PARKED) == 0) { |
| uintptr_t newval = bits | _Py_HAS_PARKED; |
| if (!_Py_atomic_compare_exchange_uintptr(&rwmutex->bits, |
| &bits, newval)) { |
| return bits; |
| } |
| bits = newval; |
| } |
| |
| _PyParkingLot_Park(&rwmutex->bits, &bits, sizeof(bits), -1, NULL, 1); |
| return _Py_atomic_load_uintptr_relaxed(&rwmutex->bits); |
| } |
| |
| // The number of readers holding the lock |
| static uintptr_t |
| rwmutex_reader_count(uintptr_t bits) |
| { |
| return bits >> _PyRWMutex_READER_SHIFT; |
| } |
| |
| void |
| _PyRWMutex_RLock(_PyRWMutex *rwmutex) |
| { |
| uintptr_t bits = _Py_atomic_load_uintptr_relaxed(&rwmutex->bits); |
| for (;;) { |
| if ((bits & _Py_WRITE_LOCKED)) { |
| // A writer already holds the lock. |
| bits = rwmutex_set_parked_and_wait(rwmutex, bits); |
| continue; |
| } |
| else if ((bits & _Py_HAS_PARKED)) { |
| // Reader(s) hold the lock (or just gave up the lock), but there is |
| // at least one waiting writer. We can't grab the lock because we |
| // don't want to starve the writer. Instead, we park ourselves and |
| // wait for the writer to eventually wake us up. |
| bits = rwmutex_set_parked_and_wait(rwmutex, bits); |
| continue; |
| } |
| else { |
| // The lock is unlocked or read-locked. Try to grab it. |
| assert(rwmutex_reader_count(bits) < _Py_RWMUTEX_MAX_READERS); |
| uintptr_t newval = bits + (1 << _PyRWMutex_READER_SHIFT); |
| if (!_Py_atomic_compare_exchange_uintptr(&rwmutex->bits, |
| &bits, newval)) { |
| continue; |
| } |
| return; |
| } |
| } |
| } |
| |
| void |
| _PyRWMutex_RUnlock(_PyRWMutex *rwmutex) |
| { |
| uintptr_t bits = _Py_atomic_add_uintptr(&rwmutex->bits, -(1 << _PyRWMutex_READER_SHIFT)); |
| assert(rwmutex_reader_count(bits) > 0 && "lock was not read-locked"); |
| bits -= (1 << _PyRWMutex_READER_SHIFT); |
| |
| if (rwmutex_reader_count(bits) == 0 && (bits & _Py_HAS_PARKED)) { |
| _PyParkingLot_UnparkAll(&rwmutex->bits); |
| return; |
| } |
| } |
| |
| void |
| _PyRWMutex_Lock(_PyRWMutex *rwmutex) |
| { |
| uintptr_t bits = _Py_atomic_load_uintptr_relaxed(&rwmutex->bits); |
| for (;;) { |
| // If there are no active readers and it's not already write-locked, |
| // then we can grab the lock. |
| if ((bits & ~_Py_HAS_PARKED) == 0) { |
| if (!_Py_atomic_compare_exchange_uintptr(&rwmutex->bits, |
| &bits, |
| bits | _Py_WRITE_LOCKED)) { |
| continue; |
| } |
| return; |
| } |
| |
| // Otherwise, we have to wait. |
| bits = rwmutex_set_parked_and_wait(rwmutex, bits); |
| } |
| } |
| |
| void |
| _PyRWMutex_Unlock(_PyRWMutex *rwmutex) |
| { |
| uintptr_t old_bits = _Py_atomic_exchange_uintptr(&rwmutex->bits, 0); |
| |
| assert((old_bits & _Py_WRITE_LOCKED) && "lock was not write-locked"); |
| assert(rwmutex_reader_count(old_bits) == 0 && "lock was read-locked"); |
| |
| if ((old_bits & _Py_HAS_PARKED) != 0) { |
| _PyParkingLot_UnparkAll(&rwmutex->bits); |
| } |
| } |
| |
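// Sequence lock: the sequence number is even when no writer is active and
// odd while a write is in progress. A reader records the sequence before
// reading and checks it again afterwards; if it changed (or was odd to begin
// with), the read overlapped a write and must be retried.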
#define SEQLOCK_IS_UPDATING(sequence) ((sequence) & 0x01)
| |
| void _PySeqLock_LockWrite(_PySeqLock *seqlock) |
| { |
| // lock by moving to an odd sequence number |
| uint32_t prev = _Py_atomic_load_uint32_relaxed(&seqlock->sequence); |
| while (1) { |
| if (SEQLOCK_IS_UPDATING(prev)) { |
            // A writer is active (odd sequence number); wait for it to finish.
| _Py_yield(); |
| prev = _Py_atomic_load_uint32_relaxed(&seqlock->sequence); |
| } |
| else if (_Py_atomic_compare_exchange_uint32(&seqlock->sequence, &prev, prev + 1)) { |
            // The sequence is now odd: we hold the write side.
| break; |
| } |
| else { |
| _Py_yield(); |
| } |
| } |
| } |
| |
| void _PySeqLock_AbandonWrite(_PySeqLock *seqlock) |
| { |
| uint32_t new_seq = _Py_atomic_load_uint32_relaxed(&seqlock->sequence) - 1; |
| assert(!SEQLOCK_IS_UPDATING(new_seq)); |
| _Py_atomic_store_uint32(&seqlock->sequence, new_seq); |
| } |
| |
| void _PySeqLock_UnlockWrite(_PySeqLock *seqlock) |
| { |
| uint32_t new_seq = _Py_atomic_load_uint32_relaxed(&seqlock->sequence) + 1; |
| assert(!SEQLOCK_IS_UPDATING(new_seq)); |
| _Py_atomic_store_uint32(&seqlock->sequence, new_seq); |
| } |
| |
| uint32_t _PySeqLock_BeginRead(_PySeqLock *seqlock) |
| { |
| uint32_t sequence = _Py_atomic_load_uint32_acquire(&seqlock->sequence); |
| while (SEQLOCK_IS_UPDATING(sequence)) { |
| _Py_yield(); |
| sequence = _Py_atomic_load_uint32_acquire(&seqlock->sequence); |
| } |
| |
| return sequence; |
| } |
| |
| uint32_t _PySeqLock_EndRead(_PySeqLock *seqlock, uint32_t previous) |
| { |
    // Re-check the sequence number: if it has not changed since the read
    // began, the values read were consistent. Returns 1 on success, 0 if a
    // writer intervened and the caller must retry.
| if (_Py_atomic_load_uint32_acquire(&seqlock->sequence) == previous) { |
| return 1; |
| } |
| |
| _Py_yield(); |
| return 0; |
| } |
| |
| uint32_t _PySeqLock_AfterFork(_PySeqLock *seqlock) |
| { |
    // If a writer held the seqlock at the time of the fork, that writer no
    // longer exists in the child process, so reset the lock to an unlocked
    // state. Returns 1 if the lock was reset, 0 otherwise.
| if (SEQLOCK_IS_UPDATING(seqlock->sequence)) { |
| seqlock->sequence = 0; |
| return 1; |
| } |
| |
| return 0; |
| } |