Use mmap to create the pthread_internal_t

Add names to mmapped regions.
Add pthread benchmark code.
Allocate pthread_internal_t at the top of the regular thread stack.

Bug: 16847284
Change-Id: Id60835163bb0d68092241f1a118015b5a8f85069
diff --git a/benchmarks/pthread_benchmark.cpp b/benchmarks/pthread_benchmark.cpp
index 92e5998..42023e0 100644
--- a/benchmarks/pthread_benchmark.cpp
+++ b/benchmarks/pthread_benchmark.cpp
@@ -47,6 +47,21 @@
 }
 BENCHMARK(BM_pthread_getspecific);
 
+static void BM_pthread_setspecific(int iters) {
+  StopBenchmarkTiming();
+  pthread_key_t key;
+  pthread_key_create(&key, NULL);
+  StartBenchmarkTiming();
+
+  for (int i = 0; i < iters; ++i) {
+    pthread_setspecific(key, NULL);
+  }
+
+  StopBenchmarkTiming();
+  pthread_key_delete(key);
+}
+BENCHMARK(BM_pthread_setspecific);
+
 static void DummyPthreadOnceInitFunction() {
 }
 
@@ -137,3 +152,80 @@
   pthread_rwlock_destroy(&lock);
 }
 BENCHMARK(BM_pthread_rw_lock_write);
+
+static void* IdleThread(void*) {
+  return NULL;
+}
+
+static void BM_pthread_create(int iters) {
+  StopBenchmarkTiming();
+  pthread_t thread;
+
+  for (int i = 0; i < iters; ++i) {
+    StartBenchmarkTiming();
+    pthread_create(&thread, NULL, IdleThread, NULL);
+    StopBenchmarkTiming();
+    pthread_join(thread, NULL);
+  }
+}
+BENCHMARK(BM_pthread_create);
+
+static void* RunThread(void*) {
+  StopBenchmarkTiming();
+  return NULL;
+}
+
+static void BM_pthread_create_and_run(int iters) {
+  StopBenchmarkTiming();
+  pthread_t thread;
+
+  for (int i = 0; i < iters; ++i) {
+    StartBenchmarkTiming();
+    pthread_create(&thread, NULL, RunThread, NULL);
+    pthread_join(thread, NULL);
+  }
+}
+BENCHMARK(BM_pthread_create_and_run);
+
+static void* ExitThread(void*) {
+  StartBenchmarkTiming();
+  pthread_exit(NULL);
+}
+
+static void BM_pthread_exit_and_join(int iters) {
+  StopBenchmarkTiming();
+  pthread_t thread;
+
+  for (int i = 0; i < iters; ++i) {
+    pthread_create(&thread, NULL, ExitThread, NULL);
+    pthread_join(thread, NULL);
+    StopBenchmarkTiming();
+  }
+}
+BENCHMARK(BM_pthread_exit_and_join);
+
+static void BM_pthread_key_create(int iters) {
+  StopBenchmarkTiming();
+  pthread_key_t key;
+
+  for (int i = 0; i < iters; ++i) {
+    StartBenchmarkTiming();
+    pthread_key_create(&key, NULL);
+    StopBenchmarkTiming();
+    pthread_key_delete(key);
+  }
+}
+BENCHMARK(BM_pthread_key_create);
+
+static void BM_pthread_key_delete(int iters) {
+  StopBenchmarkTiming();
+  pthread_key_t key;
+
+  for (int i = 0; i < iters; ++i) {
+    pthread_key_create(&key, NULL);
+    StartBenchmarkTiming();
+    pthread_key_delete(key);
+    StopBenchmarkTiming();
+  }
+}
+BENCHMARK(BM_pthread_key_delete);
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index 2a6a03b..15b3fd5 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -74,9 +74,7 @@
 void __libc_init_tls(KernelArgumentBlock& args) {
   __libc_auxv = args.auxv;
 
-  static void* tls[BIONIC_TLS_SLOTS];
   static pthread_internal_t main_thread;
-  main_thread.tls = tls;
 
   // Tell the kernel to clear our tid field when we exit, so we're like any other pthread.
   // As a side-effect, this tells us our pid (which is the same as the main thread's tid).
@@ -96,7 +94,7 @@
   __init_thread(&main_thread, false);
   __init_tls(&main_thread);
   __set_tls(main_thread.tls);
-  tls[TLS_SLOT_BIONIC_PREINIT] = &args;
+  main_thread.tls[TLS_SLOT_BIONIC_PREINIT] = &args;
 
   __init_alternate_signal_stack(&main_thread);
 }
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index c99e69c..c47b750 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -35,6 +35,7 @@
 #include "pthread_internal.h"
 
 #include "private/bionic_macros.h"
+#include "private/bionic_prctl.h"
 #include "private/bionic_ssp.h"
 #include "private/bionic_tls.h"
 #include "private/libc_logging.h"
@@ -72,6 +73,10 @@
     ss.ss_flags = 0;
     sigaltstack(&ss, NULL);
     thread->alternate_signal_stack = ss.ss_sp;
+
+    // The mapped region's name must be a const, statically allocated string, because the
+    // Android kernel uses the string pointer directly when dumping /proc/pid/maps.
+    prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ss.ss_sp, ss.ss_size, "thread signal stack");
   }
 }
 
@@ -101,31 +106,64 @@
   return error;
 }
 
-static void* __create_thread_stack(pthread_internal_t* thread) {
+static void* __create_thread_stack(const pthread_attr_t& attr) {
   // Create a new private anonymous map.
   int prot = PROT_READ | PROT_WRITE;
   int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
-  void* stack = mmap(NULL, thread->attr.stack_size, prot, flags, -1, 0);
+  void* stack = mmap(NULL, attr.stack_size, prot, flags, -1, 0);
   if (stack == MAP_FAILED) {
     __libc_format_log(ANDROID_LOG_WARN,
                       "libc",
                       "pthread_create failed: couldn't allocate %zd-byte stack: %s",
-                      thread->attr.stack_size, strerror(errno));
+                      attr.stack_size, strerror(errno));
     return NULL;
   }
 
   // Set the guard region at the end of the stack to PROT_NONE.
-  if (mprotect(stack, thread->attr.guard_size, PROT_NONE) == -1) {
+  if (mprotect(stack, attr.guard_size, PROT_NONE) == -1) {
     __libc_format_log(ANDROID_LOG_WARN, "libc",
                       "pthread_create failed: couldn't mprotect PROT_NONE %zd-byte stack guard region: %s",
-                      thread->attr.guard_size, strerror(errno));
-    munmap(stack, thread->attr.stack_size);
+                      attr.guard_size, strerror(errno));
+    munmap(stack, attr.stack_size);
     return NULL;
   }
 
   return stack;
 }
 
+static int __allocate_thread(pthread_attr_t* attr, pthread_internal_t** threadp, void** child_stack) {
+  if (attr->stack_base == NULL) {
+    // The caller didn't provide a stack, so allocate one.
+    // Make sure the stack size and guard size are multiples of PAGE_SIZE.
+    attr->stack_size = BIONIC_ALIGN(attr->stack_size, PAGE_SIZE);
+    attr->guard_size = BIONIC_ALIGN(attr->guard_size, PAGE_SIZE);
+    attr->stack_base = __create_thread_stack(*attr);
+    if (attr->stack_base == NULL) {
+      return EAGAIN;
+    }
+  } else {
+    // The caller did provide a stack, so remember we're not supposed to free it.
+    attr->flags |= PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK;
+  }
+
+  // The thread stack holds two sections, listed from the top down:
+  //   pthread_internal_t.
+  //   regular stack, which grows downward from just below it.
+  uint8_t* stack_top = reinterpret_cast<uint8_t*>(attr->stack_base) + attr->stack_size;
+  stack_top -= sizeof(pthread_internal_t);
+  pthread_internal_t* thread = reinterpret_cast<pthread_internal_t*>(stack_top);
+
+  // No need to check stack_top alignment: sizeof(pthread_internal_t) is a multiple of 16 bytes,
+  // and the alignment of a user-allocated stack is guaranteed by pthread_attr_setstack.
+
+  thread->attr = *attr;
+  __init_tls(thread);
+
+  *threadp = thread;
+  *child_stack = stack_top;
+  return 0;
+}
+
 static int __pthread_start(void* arg) {
   pthread_internal_t* thread = reinterpret_cast<pthread_internal_t*>(arg);
 
@@ -158,43 +196,21 @@
   // Inform the rest of the C library that at least one thread was created.
   __isthreaded = 1;
 
-  pthread_internal_t* thread = __create_thread_struct();
-  if (thread == NULL) {
-    return EAGAIN;
-  }
-
+  pthread_attr_t thread_attr;
   if (attr == NULL) {
-    pthread_attr_init(&thread->attr);
+    pthread_attr_init(&thread_attr);
   } else {
-    thread->attr = *attr;
+    thread_attr = *attr;
     attr = NULL; // Prevent misuse below.
   }
 
-  // Make sure the stack size and guard size are multiples of PAGE_SIZE.
-  thread->attr.stack_size = BIONIC_ALIGN(thread->attr.stack_size, PAGE_SIZE);
-  thread->attr.guard_size = BIONIC_ALIGN(thread->attr.guard_size, PAGE_SIZE);
-
-  if (thread->attr.stack_base == NULL) {
-    // The caller didn't provide a stack, so allocate one.
-    thread->attr.stack_base = __create_thread_stack(thread);
-    if (thread->attr.stack_base == NULL) {
-      __free_thread_struct(thread);
-      return EAGAIN;
-    }
-  } else {
-    // The caller did provide a stack, so remember we're not supposed to free it.
-    thread->attr.flags |= PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK;
+  pthread_internal_t* thread = NULL;
+  void* child_stack = NULL;
+  int result = __allocate_thread(&thread_attr, &thread, &child_stack);
+  if (result != 0) {
+    return result;
   }
 
-  // Make room for the TLS area.
-  // The child stack is the same address, just growing in the opposite direction.
-  // At offsets >= 0, we have the TLS slots.
-  // At offsets < 0, we have the child stack.
-  thread->tls = reinterpret_cast<void**>(reinterpret_cast<uint8_t*>(thread->attr.stack_base) +
-                  thread->attr.stack_size - BIONIC_ALIGN(BIONIC_TLS_SLOTS * sizeof(void*), 16));
-  void* child_stack = thread->tls;
-  __init_tls(thread);
-
   // Create a mutex for the thread in TLS to wait on once it starts so we can keep
   // it from doing anything until after we notify the debugger about it
   //
@@ -211,7 +227,7 @@
 
   int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
       CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
-  void* tls = thread->tls;
+  void* tls = reinterpret_cast<void*>(thread->tls);
 #if defined(__i386__)
   // On x86 (but not x86-64), CLONE_SETTLS takes a pointer to a struct user_desc rather than
   // a pointer to the TLS itself.
@@ -229,7 +245,6 @@
     if (!thread->user_allocated_stack()) {
       munmap(thread->attr.stack_base, thread->attr.stack_size);
     }
-    __free_thread_struct(thread);
     __libc_format_log(ANDROID_LOG_WARN, "libc", "pthread_create failed: clone failed: %s", strerror(errno));
     return clone_errno;
   }
diff --git a/libc/bionic/pthread_detach.cpp b/libc/bionic/pthread_detach.cpp
index a8608e3..715acf1 100644
--- a/libc/bionic/pthread_detach.cpp
+++ b/libc/bionic/pthread_detach.cpp
@@ -46,7 +46,7 @@
 
   if (thread->tid == 0) {
     // Already exited; clean up.
-    _pthread_internal_remove_locked(thread.get());
+    _pthread_internal_remove_locked(thread.get(), true);
     return 0;
   }
 
diff --git a/libc/bionic/pthread_exit.cpp b/libc/bionic/pthread_exit.cpp
index a6bb363..e04cf8e 100644
--- a/libc/bionic/pthread_exit.cpp
+++ b/libc/bionic/pthread_exit.cpp
@@ -90,7 +90,7 @@
   // Keep track of what we need to know about the stack before we lose the pthread_internal_t.
   void* stack_base = thread->attr.stack_base;
   size_t stack_size = thread->attr.stack_size;
-  bool user_allocated_stack = thread->user_allocated_stack();
+  bool free_stack = false;
 
   pthread_mutex_lock(&g_thread_list_lock);
   if ((thread->attr.flags & PTHREAD_ATTR_FLAG_DETACHED) != 0) {
@@ -98,24 +98,18 @@
     // First make sure that the kernel does not try to clear the tid field
     // because we'll have freed the memory before the thread actually exits.
     __set_tid_address(NULL);
-    _pthread_internal_remove_locked(thread);
-  } else {
-    // Make sure that the pthread_internal_t doesn't have stale pointers to a stack that
-    // will be unmapped after the exit call below.
-    if (!user_allocated_stack) {
-      thread->attr.stack_base = NULL;
-      thread->attr.stack_size = 0;
-      thread->tls = NULL;
+
+    // The pthread_internal_t is freed below together with the stack, not here.
+    _pthread_internal_remove_locked(thread, false);
+    if (!thread->user_allocated_stack()) {
+      free_stack = true;
     }
-    // pthread_join is responsible for destroying the pthread_internal_t for non-detached threads.
-    // The kernel will futex_wake on the pthread_internal_t::tid field to wake pthread_join.
   }
   pthread_mutex_unlock(&g_thread_list_lock);
 
-  if (user_allocated_stack) {
-    // Cleaning up this thread's stack is the creator's responsibility, not ours.
-    __exit(0);
-  } else {
+  // Detached threads exit with stack teardown, so everything is deallocated here.
+  // Joinable threads exit but leave their stacks for the pthread_join caller to clean up.
+  if (free_stack) {
     // We need to munmap the stack we're running on before calling exit.
     // That's not something we can do in C.
 
@@ -126,5 +120,7 @@
     sigprocmask(SIG_SETMASK, &mask, NULL);
 
     _exit_with_stack_teardown(stack_base, stack_size);
+  } else {
+    __exit(0);
   }
 }
diff --git a/libc/bionic/pthread_internal.h b/libc/bionic/pthread_internal.h
index c5136c9..95097b7 100644
--- a/libc/bionic/pthread_internal.h
+++ b/libc/bionic/pthread_internal.h
@@ -30,6 +30,8 @@
 
 #include <pthread.h>
 
+#include "private/bionic_tls.h"
+
 /* Has the thread been detached by a pthread_join or pthread_detach call? */
 #define PTHREAD_ATTR_FLAG_DETACHED 0x00000001
 
@@ -72,8 +74,6 @@
     return (attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK) != 0;
   }
 
-  void** tls;
-
   pthread_attr_t attr;
 
   __pthread_cleanup_t* cleanup_stack;
@@ -86,16 +86,16 @@
 
   pthread_mutex_t startup_handshake_mutex;
 
+  void* tls[BIONIC_TLS_SLOTS];
+
   /*
    * The dynamic linker implements dlerror(3), which makes it hard for us to implement this
    * per-thread buffer by simply using malloc(3) and free(3).
    */
 #define __BIONIC_DLERROR_BUFFER_SIZE 512
   char dlerror_buffer[__BIONIC_DLERROR_BUFFER_SIZE];
-};
+} __attribute__((aligned(16))); // The thread stack top sits just below this, so keep it aligned.
 
-__LIBC_HIDDEN__ pthread_internal_t* __create_thread_struct();
-__LIBC_HIDDEN__ void __free_thread_struct(pthread_internal_t*);
 __LIBC_HIDDEN__ int __init_thread(pthread_internal_t* thread, bool add_to_thread_list);
 __LIBC_HIDDEN__ void __init_tls(pthread_internal_t* thread);
 __LIBC_HIDDEN__ void __init_alternate_signal_stack(pthread_internal_t*);
@@ -105,7 +105,7 @@
 extern "C" __LIBC64_HIDDEN__ pthread_internal_t* __get_thread(void);
 
 __LIBC_HIDDEN__ void pthread_key_clean_all(void);
-__LIBC_HIDDEN__ void _pthread_internal_remove_locked(pthread_internal_t* thread);
+__LIBC_HIDDEN__ void _pthread_internal_remove_locked(pthread_internal_t* thread, bool free_thread);
 
 /*
  * Traditionally we gave threads a 1MiB stack. When we started
diff --git a/libc/bionic/pthread_internals.cpp b/libc/bionic/pthread_internals.cpp
index 33cddd7..7c30e6e 100644
--- a/libc/bionic/pthread_internals.cpp
+++ b/libc/bionic/pthread_internals.cpp
@@ -41,26 +41,7 @@
 pthread_internal_t* g_thread_list = NULL;
 pthread_mutex_t g_thread_list_lock = PTHREAD_MUTEX_INITIALIZER;
 
-pthread_internal_t* __create_thread_struct() {
-  void* result = mmap(NULL, sizeof(pthread_internal_t), PROT_READ | PROT_WRITE,
-                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
-  if (result == MAP_FAILED) {
-    __libc_format_log(ANDROID_LOG_WARN, "libc",
-                      "__create_thread_struct() failed: %s", strerror(errno));
-    return NULL;
-  }
-  return reinterpret_cast<pthread_internal_t*>(result);
-}
-
-void __free_thread_struct(pthread_internal_t* thread) {
-  int result = munmap(thread, sizeof(pthread_internal_t));
-  if (result != 0) {
-    __libc_format_log(ANDROID_LOG_WARN, "libc",
-                      "__free_thread_struct() failed: %s", strerror(errno));
-  }
-}
-
-void _pthread_internal_remove_locked(pthread_internal_t* thread) {
+void _pthread_internal_remove_locked(pthread_internal_t* thread, bool free_thread) {
   if (thread->next != NULL) {
     thread->next->prev = thread->prev;
   }
@@ -70,10 +51,11 @@
     g_thread_list = thread->next;
   }
 
-  // The main thread is not heap-allocated. See __libc_init_tls for the declaration,
-  // and __libc_init_common for the point where it's added to the thread list.
-  if ((thread->attr.flags & PTHREAD_ATTR_FLAG_MAIN_THREAD) == 0) {
-    __free_thread_struct(thread);
+  // For threads using a user-allocated stack (including the main thread), the pthread_internal_t
+  // can't be freed here, since it lives in memory that we did not mmap.
+  if (free_thread && !(thread->attr.flags & PTHREAD_ATTR_FLAG_USER_ALLOCATED_STACK)) {
+    // Use one munmap to free the whole thread stack, including pthread_internal_t.
+    munmap(thread->attr.stack_base, thread->attr.stack_size);
   }
 }
 
diff --git a/libc/bionic/pthread_join.cpp b/libc/bionic/pthread_join.cpp
index 0cbed62..e3350ef 100644
--- a/libc/bionic/pthread_join.cpp
+++ b/libc/bionic/pthread_join.cpp
@@ -74,6 +74,6 @@
     *return_value = thread->return_value;
   }
 
-  _pthread_internal_remove_locked(thread.get());
+  _pthread_internal_remove_locked(thread.get(), true);
   return 0;
 }