Fix pthread_join.

Let the kernel keep pthread_internal_t::tid updated, including
across forks and for the main thread. This then lets us fix
pthread_join to only return after the thread has really exited.

Also fix the thread attributes of the main thread so we don't
unmap the main thread's stack (which is really owned by the
dynamic linker and contains things like environment variables),
which fixes crashes when joining with an exited main thread
and also fixes problems reported publicly with accessing environment
variables after the main thread exits (for which I've added a new
unit test).

In passing I also fixed a bug where if the clone(2) inside
pthread_create(3) fails, we'd unmap the child's stack and TLS (which
contains the mutex) and then try to unlock the mutex. Boom! It wasn't
until after I'd uploaded the fix for this that I came across a new
public bug reporting this exact failure.

Bug: 8206355
Bug: 11693195
Bug: https://code.google.com/p/android/issues/detail?id=57421
Bug: https://code.google.com/p/android/issues/detail?id=62392
Change-Id: I2af9cf6e8ae510a67256ad93cad891794ed0580b
diff --git a/libc/SYSCALLS.TXT b/libc/SYSCALLS.TXT
index 533116e..a027024 100644
--- a/libc/SYSCALLS.TXT
+++ b/libc/SYSCALLS.TXT
@@ -276,6 +276,7 @@
 long    perf_event_open(struct perf_event_attr* attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags) all
 
 pid_t __clone:clone(int, void*, int*, void*, int*)  all
+int __set_tid_address:set_tid_address(int*) all
 
 int epoll_create1(int)  all
 int epoll_ctl(int, int op, int, struct epoll_event*)  all
diff --git a/libc/arch-aarch64/syscalls.mk b/libc/arch-aarch64/syscalls.mk
index 995d44a..890a6d8 100644
--- a/libc/arch-aarch64/syscalls.mk
+++ b/libc/arch-aarch64/syscalls.mk
@@ -19,6 +19,7 @@
 syscall_src += arch-aarch64/syscalls/__rt_sigsuspend.S
 syscall_src += arch-aarch64/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-aarch64/syscalls/__sched_getaffinity.S
+syscall_src += arch-aarch64/syscalls/__set_tid_address.S
 syscall_src += arch-aarch64/syscalls/__syslog.S
 syscall_src += arch-aarch64/syscalls/__timer_create.S
 syscall_src += arch-aarch64/syscalls/__timer_delete.S
diff --git a/libc/arch-aarch64/syscalls/__set_tid_address.S b/libc/arch-aarch64/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..b7541fc
--- /dev/null
+++ b/libc/arch-aarch64/syscalls/__set_tid_address.S
@@ -0,0 +1,22 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    stp     x29, x30, [sp, #-16]!
+    mov     x29,  sp
+    str     x8,       [sp, #-16]!
+
+    mov     x8, __NR_set_tid_address
+    svc     #0
+
+    ldr     x8,       [sp], #16
+    ldp     x29, x30, [sp], #16
+
+    cmn     x0, #(MAX_ERRNO + 1)
+    cneg    x0, x0, hi
+    b.hi    __set_errno
+
+    ret
+END(__set_tid_address)
+.hidden _C_LABEL(__set_tid_address)
diff --git a/libc/arch-arm/syscalls.mk b/libc/arch-arm/syscalls.mk
index 75b6133..f8bb15a 100644
--- a/libc/arch-arm/syscalls.mk
+++ b/libc/arch-arm/syscalls.mk
@@ -23,6 +23,7 @@
 syscall_src += arch-arm/syscalls/__rt_sigsuspend.S
 syscall_src += arch-arm/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-arm/syscalls/__sched_getaffinity.S
+syscall_src += arch-arm/syscalls/__set_tid_address.S
 syscall_src += arch-arm/syscalls/__set_tls.S
 syscall_src += arch-arm/syscalls/__sigaction.S
 syscall_src += arch-arm/syscalls/__statfs64.S
diff --git a/libc/arch-arm/syscalls/__set_tid_address.S b/libc/arch-arm/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..b4b42e7
--- /dev/null
+++ b/libc/arch-arm/syscalls/__set_tid_address.S
@@ -0,0 +1,14 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    mov     ip, r7
+    ldr     r7, =__NR_set_tid_address
+    swi     #0
+    mov     r7, ip
+    cmn     r0, #(MAX_ERRNO + 1)
+    bxls    lr
+    neg     r0, r0
+    b       __set_errno
+END(__set_tid_address)
diff --git a/libc/arch-mips/syscalls.mk b/libc/arch-mips/syscalls.mk
index e141a65..6b72e70 100644
--- a/libc/arch-mips/syscalls.mk
+++ b/libc/arch-mips/syscalls.mk
@@ -24,6 +24,7 @@
 syscall_src += arch-mips/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-mips/syscalls/__sched_getaffinity.S
 syscall_src += arch-mips/syscalls/__set_thread_area.S
+syscall_src += arch-mips/syscalls/__set_tid_address.S
 syscall_src += arch-mips/syscalls/__sigaction.S
 syscall_src += arch-mips/syscalls/__statfs64.S
 syscall_src += arch-mips/syscalls/__syslog.S
diff --git a/libc/arch-mips/syscalls/__set_tid_address.S b/libc/arch-mips/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..4fcc82a
--- /dev/null
+++ b/libc/arch-mips/syscalls/__set_tid_address.S
@@ -0,0 +1,23 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <asm/unistd.h>
+    .text
+    .globl __set_tid_address
+    .align 4
+    .ent __set_tid_address
+
+__set_tid_address:
+    .set noreorder
+    .cpload $t9
+    li $v0, __NR_set_tid_address
+    syscall
+    bnez $a3, 1f
+    move $a0, $v0
+    j $ra
+    nop
+1:
+    la $t9,__set_errno
+    j $t9
+    nop
+    .set reorder
+    .end __set_tid_address
diff --git a/libc/arch-x86/syscalls.mk b/libc/arch-x86/syscalls.mk
index 70e6e39..b6a1e38 100644
--- a/libc/arch-x86/syscalls.mk
+++ b/libc/arch-x86/syscalls.mk
@@ -24,6 +24,7 @@
 syscall_src += arch-x86/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-x86/syscalls/__sched_getaffinity.S
 syscall_src += arch-x86/syscalls/__set_thread_area.S
+syscall_src += arch-x86/syscalls/__set_tid_address.S
 syscall_src += arch-x86/syscalls/__sigaction.S
 syscall_src += arch-x86/syscalls/__statfs64.S
 syscall_src += arch-x86/syscalls/__syslog.S
diff --git a/libc/arch-x86/syscalls/__set_tid_address.S b/libc/arch-x86/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..0c66d47
--- /dev/null
+++ b/libc/arch-x86/syscalls/__set_tid_address.S
@@ -0,0 +1,20 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    pushl   %ebx
+    mov     8(%esp), %ebx
+    movl    $__NR_set_tid_address, %eax
+    int     $0x80
+    cmpl    $-MAX_ERRNO, %eax
+    jb      1f
+    negl    %eax
+    pushl   %eax
+    call    __set_errno
+    addl    $4, %esp
+    orl     $-1, %eax
+1:
+    popl    %ebx
+    ret
+END(__set_tid_address)
diff --git a/libc/arch-x86_64/syscalls.mk b/libc/arch-x86_64/syscalls.mk
index c874b61..50d9ab3 100644
--- a/libc/arch-x86_64/syscalls.mk
+++ b/libc/arch-x86_64/syscalls.mk
@@ -20,6 +20,7 @@
 syscall_src += arch-x86_64/syscalls/__rt_sigsuspend.S
 syscall_src += arch-x86_64/syscalls/__rt_sigtimedwait.S
 syscall_src += arch-x86_64/syscalls/__sched_getaffinity.S
+syscall_src += arch-x86_64/syscalls/__set_tid_address.S
 syscall_src += arch-x86_64/syscalls/__syslog.S
 syscall_src += arch-x86_64/syscalls/__timer_create.S
 syscall_src += arch-x86_64/syscalls/__timer_delete.S
diff --git a/libc/arch-x86_64/syscalls/__set_tid_address.S b/libc/arch-x86_64/syscalls/__set_tid_address.S
new file mode 100644
index 0000000..fe7260f
--- /dev/null
+++ b/libc/arch-x86_64/syscalls/__set_tid_address.S
@@ -0,0 +1,17 @@
+/* Generated by gensyscalls.py. Do not edit. */
+
+#include <private/bionic_asm.h>
+
+ENTRY(__set_tid_address)
+    movl    $__NR_set_tid_address, %eax
+    syscall
+    cmpq    $-MAX_ERRNO, %rax
+    jb      1f
+    negl    %eax
+    movl    %eax, %edi
+    call    __set_errno
+    orq     $-1, %rax
+1:
+    ret
+END(__set_tid_address)
+.hidden _C_LABEL(__set_tid_address)
diff --git a/libc/bionic/fork.cpp b/libc/bionic/fork.cpp
index 339a0e8..f7d1c11 100644
--- a/libc/bionic/fork.cpp
+++ b/libc/bionic/fork.cpp
@@ -41,7 +41,12 @@
   __timer_table_start_stop(1);
   __bionic_atfork_run_prepare();
 
-  int result = __clone(SIGCHLD, NULL, NULL, NULL, NULL);
+  pthread_internal_t* self = __get_thread();
+#if defined(__x86_64__)
+  int result = __clone(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, NULL, NULL, &(self->tid), NULL);
+#else
+  int result = __clone(CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, NULL, NULL, NULL, &(self->tid));
+#endif
   if (result != 0) {  // Not a child process.
     __timer_table_start_stop(0);
     __bionic_atfork_run_parent();
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index 3e092ae..130c287 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -50,6 +50,7 @@
 extern "C" uintptr_t __get_sp(void);
 extern "C" int __system_properties_init(void);
 extern "C" int __set_tls(void* ptr);
+extern "C" int __set_tid_address(int* tid_address);
 
 // Not public, but well-known in the BSDs.
 const char* __progname;
@@ -90,17 +91,24 @@
   uintptr_t stack_bottom = stack_top - stack_size;
 
   static void* tls[BIONIC_TLS_SLOTS];
-  static pthread_internal_t thread;
-  thread.tid = gettid();
-  thread.tls = tls;
-  pthread_attr_init(&thread.attr);
-  pthread_attr_setstack(&thread.attr, (void*) stack_bottom, stack_size);
-  _init_thread(&thread, false);
-  __init_tls(&thread);
-  __set_tls(thread.tls);
+  static pthread_internal_t main_thread;
+  main_thread.tls = tls;
+
+  // Tell the kernel to clear our tid field when we exit, so we're like any other pthread.
+  main_thread.tid = __set_tid_address(&main_thread.tid);
+
+  // We already have a stack, and we don't want to free it up on exit (because things like
+  // environment variables with global scope live on it).
+  pthread_attr_init(&main_thread.attr);
+  pthread_attr_setstack(&main_thread.attr, (void*) stack_bottom, stack_size);
+  main_thread.attr.flags = PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK;
+
+  _init_thread(&main_thread, false);
+  __init_tls(&main_thread);
+  __set_tls(main_thread.tls);
   tls[TLS_SLOT_BIONIC_PREINIT] = &args;
 
-  __init_alternate_signal_stack(&thread);
+  __init_alternate_signal_stack(&main_thread);
 }
 
 void __libc_init_common(KernelArgumentBlock& args) {
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 6ed01ff..dde5ed7 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -97,7 +97,6 @@
     }
   }
 
-  pthread_cond_init(&thread->join_cond, NULL);
   thread->cleanup_stack = NULL;
 
   if (add_to_thread_list) {
@@ -215,17 +214,22 @@
   // the new thread.
   pthread_mutex_t* start_mutex = (pthread_mutex_t*) &thread->tls[TLS_SLOT_START_MUTEX];
   pthread_mutex_init(start_mutex, NULL);
-  ScopedPthreadMutexLocker start_locker(start_mutex);
+  pthread_mutex_lock(start_mutex);
 
   thread->tls[TLS_SLOT_THREAD_ID] = thread;
 
   thread->start_routine = start_routine;
   thread->start_routine_arg = arg;
 
-  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS;
-  int tid = __bionic_clone(flags, child_stack, NULL, thread->tls, NULL, __pthread_start, thread);
-  if (tid < 0) {
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+      CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
+  int rc = __bionic_clone(flags, child_stack, &(thread->tid), thread->tls, &(thread->tid), __pthread_start, thread);
+  if (rc == -1) {
     int clone_errno = errno;
+    // We don't have to unlock the mutex at all because clone(2) failed so there's no child waiting to
+    // be unblocked, but we're about to unmap the memory the mutex is stored in, so this serves as a
+    // reminder that you can't rewrite this function to use a ScopedPthreadMutexLocker.
+    pthread_mutex_unlock(start_mutex);
     if ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) == 0) {
       munmap(thread->attr.stack_base, thread->attr.stack_size);
     }
@@ -234,12 +238,10 @@
     return clone_errno;
   }
 
-  thread->tid = tid;
-
   int init_errno = _init_thread(thread, true);
   if (init_errno != 0) {
-    // Mark the thread detached and let its __pthread_start run to
-    // completion. (It'll just exit immediately, cleaning up its resources.)
+    // Mark the thread detached and let its __pthread_start run to completion.
+    // It'll check this flag and exit immediately, cleaning up its resources.
     thread->internal_flags |= PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED;
     thread->attr.flags |= PTHREAD_ATTR_FLAG_DETACHED;
     return init_errno;
@@ -251,8 +253,9 @@
     _thread_created_hook(thread->tid);
   }
 
-  // Publish the pthread_t and let the thread run.
-  *thread_out = (pthread_t) thread;
+  // Publish the pthread_t and unlock the mutex to let the new thread start running.
+  *thread_out = reinterpret_cast<pthread_t>(thread);
+  pthread_mutex_unlock(start_mutex);
 
   return 0;
 }
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index cc86271..22c2c3c 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -57,8 +57,9 @@
   }
 }
 
-void pthread_exit(void* retval) {
+void pthread_exit(void* return_value) {
   pthread_internal_t* thread = __get_thread();
+  thread->return_value = return_value;
 
   // Call the cleanup handlers first.
   while (thread->cleanup_stack) {
@@ -90,10 +91,9 @@
   size_t stack_size = thread->attr.stack_size;
   bool user_allocated_stack = ((thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) != 0);
 
-  // If the thread is detached, destroy the pthread_internal_t,
-  // otherwise keep it in memory and signal any joiners.
   pthread_mutex_lock(&gThreadListLock);
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
+  if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
+    // The thread is detached, so we can destroy the pthread_internal_t.
     _pthread_internal_remove_locked(thread);
   } else {
     // Make sure that the pthread_internal_t doesn't have stale pointers to a stack that
@@ -103,15 +103,8 @@
       thread->attr.stack_size = 0;
       thread->tls = NULL;
     }
-
-    // Indicate that the thread has exited for joining threads.
-    thread->attr.flags |= PTHREAD_ATTR_FLAG_ZOMBIE;
-    thread->return_value = retval;
-
-    // Signal the joining thread if present.
-    if (thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) {
-      pthread_cond_signal(&thread->join_cond);
-    }
+    // pthread_join is responsible for destroying the pthread_internal_t for non-detached threads.
+    // The kernel will futex_wake on the pthread_internal_t::tid field to wake pthread_join.
   }
   pthread_mutex_unlock(&gThreadListLock);
 
@@ -131,6 +124,6 @@
     _exit_with_stack_teardown(stack_base, stack_size, 0);
   }
 
-  /* NOTREACHED, but we told the compiler this function is noreturn, and it doesn't believe us. */
+  // NOTREACHED, but we told the compiler this function is noreturn, and it doesn't believe us.
   abort();
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index d8ad544..de1ef26 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -31,28 +31,31 @@
 #include <pthread.h>
 
 struct pthread_internal_t {
-    struct pthread_internal_t*  next;
-    struct pthread_internal_t*  prev;
-    pthread_attr_t              attr;
-    pid_t                       tid;
-    bool                        allocated_on_heap;
-    pthread_cond_t              join_cond;
-    void*                       return_value;
-    int                         internal_flags;
-    __pthread_cleanup_t*        cleanup_stack;
-    void**                      tls;         /* thread-local storage area */
+  struct pthread_internal_t* next;
+  struct pthread_internal_t* prev;
 
-    void* (*start_routine)(void*);
-    void* start_routine_arg;
+  pid_t tid;
 
-    void* alternate_signal_stack;
+  void** tls;
 
-    /*
-     * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
-     * per-thread buffer by simply using malloc(3) and free(3).
-     */
+  pthread_attr_t attr;
+  bool allocated_on_heap; /* TODO: move this into attr.flags? */
+  int internal_flags; /* TODO: move this into attr.flags? */
+
+  __pthread_cleanup_t* cleanup_stack;
+
+  void* (*start_routine)(void*);
+  void* start_routine_arg;
+  void* return_value;
+
+  void* alternate_signal_stack;
+
+  /*
+   * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
+   * per-thread buffer by simply using malloc(3) and free(3).
+   */
 #define __BIONIC_DLERROR_BUFFER_SIZE 512
-    char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
+  char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
 };
 
 __LIBC_HIDDEN__ int _init_thread(pthread_internal_t* thread, bool add_to_thread_list);
@@ -73,9 +76,6 @@
 /* Has the thread been joined by another thread? */
 #define PTHREAD_ATTR_FLAG_JOINED 0x00000004
 
-/* Has the thread already exited but not been joined? */
-#define PTHREAD_ATTR_FLAG_ZOMBIE 0x00000008
-
 #define PTHREAD_INTERNAL_FLAG_THREAD_INIT_FAILED 1
 
 /*
diff --git a/libc/bionic/pthread_join.cpp b/libc/bionic/pthread_join.cpp
index 7e022c2..0cbed62 100644
--- a/libc/bionic/pthread_join.cpp
+++ b/libc/bionic/pthread_join.cpp
@@ -28,33 +28,50 @@
 
 #include <errno.h>
 
+#include "private/bionic_futex.h"
 #include "pthread_accessor.h"
 
-int pthread_join(pthread_t t, void** ret_val) {
+int pthread_join(pthread_t t, void** return_value) {
   if (t == pthread_self()) {
     return EDEADLK;
   }
 
-  pthread_accessor thread(t);
-  if (thread.get() == NULL) {
+  pid_t tid;
+  volatile int* tid_ptr;
+  {
+    pthread_accessor thread(t);
+    if (thread.get() == NULL) {
       return ESRCH;
+    }
+
+    if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
+      return EINVAL;
+    }
+
+    if ((thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) != 0) {
+      return EINVAL;
+    }
+
+    // Okay, looks like we can signal our intention to join.
+    thread->attr.flags |= PTHREAD_ATTR_FLAG_JOINED;
+    tid = thread->tid;
+    tid_ptr = &thread->tid;
   }
 
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) {
-    return EINVAL;
+  // We set the PTHREAD_ATTR_FLAG_JOINED flag with the lock held,
+  // so no one is going to remove this thread except us.
+
+  // Wait for the thread to actually exit, if it hasn't already.
+  while (*tid_ptr != 0) {
+    __futex_wait(tid_ptr, tid, NULL);
   }
 
-  if (thread->attr.flags & PTHREAD_ATTR_FLAG_JOINED) {
-    return EINVAL;
-  }
+  // Take the lock again so we can pull the thread's return value
+  // and remove the thread from the list.
+  pthread_accessor thread(t);
 
-  // Signal our intention to join, and wait for the thread to exit.
-  thread->attr.flags |= PTHREAD_ATTR_FLAG_JOINED;
-  while ((thread->attr.flags & PTHREAD_ATTR_FLAG_ZOMBIE) == 0) {
-    pthread_cond_wait(&thread->join_cond, &gThreadListLock);
-  }
-  if (ret_val) {
-    *ret_val = thread->return_value;
+  if (return_value) {
+    *return_value = thread->return_value;
   }
 
   _pthread_internal_remove_locked(thread.get());
diff --git a/libc/bionic/pthread_key.cpp b/libc/bionic/pthread_key.cpp
index f2f4d20..6cc68af 100644
--- a/libc/bionic/pthread_key.cpp
+++ b/libc/bionic/pthread_key.cpp
@@ -218,7 +218,7 @@
     // startup trampoline (__pthread_start) hasn't been run yet by the
     // scheduler. t->tls will also be NULL after a thread's stack has been
     // unmapped but before the ongoing pthread_join() is finished.
-    if ((t->attr.flags & PTHREAD_ATTR_FLAG_ZOMBIE) || t->tls == NULL) {
+    if (t->tid == 0 || t->tls == NULL) {
       continue;
     }
 
diff --git a/libc/private/bionic_futex.h b/libc/private/bionic_futex.h
index bfc3520..5602af7 100644
--- a/libc/private/bionic_futex.h
+++ b/libc/private/bionic_futex.h
@@ -28,6 +28,7 @@
 #ifndef _BIONIC_FUTEX_H
 #define _BIONIC_FUTEX_H
 
+#include <linux/compiler.h> /* needed for __user in non-uapi futex.h */
 #include <linux/futex.h>
 #include <sys/cdefs.h>
 
diff --git a/libc/private/bionic_tls.h b/libc/private/bionic_tls.h
index 094c71c..ff13fdb 100644
--- a/libc/private/bionic_tls.h
+++ b/libc/private/bionic_tls.h
@@ -51,7 +51,9 @@
   TLS_SLOT_THREAD_ID,
   TLS_SLOT_ERRNO,
 
-  /* This slot is used when starting a new thread, before any code that needs errno runs. */
+  /* This slot in the child's TLS is used to synchronize the parent and child
+   * during thread initialization. The child finishes with this mutex before
+   * running any code that can set errno, so we can reuse the errno slot. */
   TLS_SLOT_START_MUTEX = TLS_SLOT_ERRNO,
 
   /* These two aren't used by bionic itself, but allow the graphics code to
diff --git a/tests/pthread_test.cpp b/tests/pthread_test.cpp
index f6f61b1..480e455 100644
--- a/tests/pthread_test.cpp
+++ b/tests/pthread_test.cpp
@@ -150,22 +150,48 @@
   ASSERT_EQ(EDEADLK, pthread_join(pthread_self(), &result));
 }
 
-#if __BIONIC__ // For some reason, gtest on bionic can cope with this but gtest on glibc can't.
+struct TestBug37410 {
+  pthread_t main_thread;
+  pthread_mutex_t mutex;
 
-static void TestBug37410() {
-  pthread_t t1;
-  ASSERT_EQ(0, pthread_create(&t1, NULL, JoinFn, reinterpret_cast<void*>(pthread_self())));
-  pthread_exit(NULL);
-}
+  static void main() {
+    TestBug37410 data;
+    data.main_thread = pthread_self();
+    ASSERT_EQ(0, pthread_mutex_init(&data.mutex, NULL));
+    ASSERT_EQ(0, pthread_mutex_lock(&data.mutex));
+
+    pthread_t t;
+    ASSERT_EQ(0, pthread_create(&t, NULL, TestBug37410::thread_fn, reinterpret_cast<void*>(&data)));
+
+    // Wait for the thread to be running...
+    ASSERT_EQ(0, pthread_mutex_lock(&data.mutex));
+    ASSERT_EQ(0, pthread_mutex_unlock(&data.mutex));
+
+    // ...and exit.
+    pthread_exit(NULL);
+  }
+
+ private:
+  static void* thread_fn(void* arg) {
+    TestBug37410* data = reinterpret_cast<TestBug37410*>(arg);
+
+    // Let the main thread know we're running.
+    pthread_mutex_unlock(&data->mutex);
+
+    // And wait for the main thread to exit.
+    pthread_join(data->main_thread, NULL);
+
+    return NULL;
+  }
+};
 
 // Even though this isn't really a death test, we have to say "DeathTest" here so gtest knows to
 // run this test (which exits normally) in its own process.
 TEST(pthread_DeathTest, pthread_bug_37410) {
   // http://code.google.com/p/android/issues/detail?id=37410
   ::testing::FLAGS_gtest_death_test_style = "threadsafe";
-  ASSERT_EXIT(TestBug37410(), ::testing::ExitedWithCode(0), "");
+  ASSERT_EXIT(TestBug37410::main(), ::testing::ExitedWithCode(0), "");
 }
-#endif
 
 static void* SignalHandlerFn(void* arg) {
   sigset_t wait_set;
diff --git a/tests/stdlib_test.cpp b/tests/stdlib_test.cpp
index e5d7812..fa59c41 100644
--- a/tests/stdlib_test.cpp
+++ b/tests/stdlib_test.cpp
@@ -19,6 +19,7 @@
 #include <errno.h>
 #include <libgen.h>
 #include <limits.h>
+#include <pthread.h>
 #include <stdint.h>
 #include <stdlib.h>
 
@@ -132,3 +133,27 @@
   ASSERT_STREQ("bravo", entries[1].name);
   ASSERT_STREQ("charlie", entries[2].name);
 }
+
+static void* TestBug57421_child(void* arg) {
+  pthread_t main_thread = reinterpret_cast<pthread_t>(arg);
+  pthread_join(main_thread, NULL);
+  char* value = getenv("ENVIRONMENT_VARIABLE");
+  if (value == NULL) {
+    setenv("ENVIRONMENT_VARIABLE", "value", 1);
+  }
+  return NULL;
+}
+
+static void TestBug57421_main() {
+  pthread_t t;
+  ASSERT_EQ(0, pthread_create(&t, NULL, TestBug57421_child, reinterpret_cast<void*>(pthread_self())));
+  pthread_exit(NULL);
+}
+
+// Even though this isn't really a death test, we have to say "DeathTest" here so gtest knows to
+// run this test (which exits normally) in its own process.
+TEST(stdlib_DeathTest, getenv_after_main_thread_exits) {
+  // https://code.google.com/p/android/issues/detail?id=57421
+  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+  ASSERT_EXIT(TestBug57421_main(), ::testing::ExitedWithCode(0), "");
+}