Use ifuncs in the linker

Using ifuncs allows the linker to select faster versions of libc functions
like strcmp, making linking faster.

The linker continues to initialize TLS first and then call the ifunc
resolvers. There are small amounts of code in Bionic that run before the
resolvers and therefore need to avoid calling functions selected using
ifuncs (generally string.h APIs). I've compiled those pieces with
-ffreestanding. That may be unnecessary, but it could help avoid
compiler-inserted memset calls, and it may be useful later on.

The ifuncs are called in a special early pass using special
__rel[a]_iplt_start / __rel[a]_iplt_end symbols. The linker will encounter
the ifuncs again as R_*_IRELATIVE dynamic relocations, so they're skipped
on the second pass.

Break linker_main.cpp into its own liblinker_main library so it can be
compiled with -ffreestanding.

On walleye, this change fixes a recent 2.3% linker64 start-up time
regression (156.6ms -> 160.2ms), and it also improves the 32-bit start-up
time by about 1.9% on the same benchmark. I'm measuring the run-time using
a synthetic benchmark based on loading libandroid_servers.so.

Test: bionic unit tests, manual benchmarking
Bug: none
Exempt-From-Owner-Approval: cherry pick, fix automerger conflict
Merged-In: Ieb9446c2df13a66fc0d377596756becad0af6995
Change-Id: Ieb9446c2df13a66fc0d377596756becad0af6995
(cherry picked from commit 772bcbb0c2f7a87b18021849528240ef0c617d94)
diff --git a/libc/Android.bp b/libc/Android.bp
index 53a26a6..c5ea4c5 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -142,19 +142,21 @@
 }
 
 // ========================================================
-// libc_stack_protector.a - stack protector code
+// libc_bootstrap.a - -fno-stack-protector and -ffreestanding
 // ========================================================
 //
-// Code that implements the stack protector (or that runs
-// before TLS has been set up) needs to be compiled with
-// -fno-stack-protector, since it accesses the stack canary
-// TLS slot.
+// Code that implements the stack protector (or that runs before TLS has been set up) needs to be
+// compiled with -fno-stack-protector, since it accesses the stack canary TLS slot. In the linker,
+// some of this code runs before ifunc resolvers have made string.h functions work, so compile with
+// -ffreestanding.
 
 cc_library_static {
 
     srcs: [
         "bionic/__libc_init_main_thread.cpp",
         "bionic/__stack_chk_fail.cpp",
+        "bionic/bionic_call_ifunc_resolver.cpp",
+        "bionic/getauxval.cpp",
     ],
     arch: {
         arm64: {
@@ -172,20 +174,25 @@
     },
 
     defaults: ["libc_defaults"],
-    cflags: ["-fno-stack-protector"],
-    name: "libc_stack_protector",
+    cflags: ["-fno-stack-protector", "-ffreestanding"],
+    name: "libc_bootstrap",
 }
 
-// libc_init_static.cpp also needs to be built without stack protector,
-// because it's responsible for setting up TLS for static executables.
-// This isn't the case for dynamic executables because the dynamic linker
-// has already set up the main thread's TLS.
+// libc_init_static.cpp and libc_init_dynamic.cpp need to be built without stack protector.
+// libc_init_static.cpp sets up TLS for static executables, and libc_init_dynamic.cpp initializes
+// the stack protector global variable.
 
 cc_library_static {
     name: "libc_init_static",
     defaults: ["libc_defaults"],
     srcs: ["bionic/libc_init_static.cpp"],
-    cflags: ["-fno-stack-protector"],
+    cflags: [
+        "-fno-stack-protector",
+
+        // Compile libc_init_static.cpp with -ffreestanding, because some of its code is called
+        // from the linker before ifunc resolvers have made string.h functions available.
+        "-ffreestanding",
+    ],
 }
 
 cc_library_static {
@@ -784,12 +791,6 @@
 cc_library_static {
     defaults: ["libc_defaults"],
     srcs: [
-        // The data that backs getauxval is initialized in the libc init
-        // functions which are invoked by the linker. If this file is included
-        // in libc_ndk.a, only one of the copies of the global data will be
-        // initialized, resulting in nullptr dereferences.
-        "bionic/getauxval.cpp",
-
         // These require getauxval, which isn't available on older platforms.
         "bionic/sysconf.cpp",
         "bionic/vdso.cpp",
@@ -1084,7 +1085,6 @@
         "bionic/atof.cpp",
         "bionic/bionic_allocator.cpp",
         "bionic/bionic_arc4random.cpp",
-        "bionic/bionic_call_ifunc_resolver.cpp",
         "bionic/bionic_futex.cpp",
         "bionic/bionic_netlink.cpp",
         "bionic/bionic_systrace.cpp",
@@ -1427,6 +1427,7 @@
 
     whole_static_libs: [
         "libc_bionic_ndk",
+        "libc_bootstrap",
         "libc_fortify",
         "libc_freebsd",
         "libc_freebsd_large_stack",
@@ -1434,7 +1435,6 @@
         "libc_netbsd",
         "libc_openbsd_large_stack",
         "libc_openbsd_ndk",
-        "libc_stack_protector",
         "libc_syscalls",
         "libc_tzcode",
         "libm",
@@ -1458,6 +1458,7 @@
     whole_static_libs: [
         "libc_bionic",
         "libc_bionic_ndk",
+        "libc_bootstrap",
         "libc_dns",
         "libc_fortify",
         "libc_freebsd",
@@ -1467,7 +1468,6 @@
         "libc_openbsd",
         "libc_openbsd_large_stack",
         "libc_openbsd_ndk",
-        "libc_stack_protector",
         "libc_syscalls",
         "libc_tzcode",
         "libstdc++",
@@ -1495,11 +1495,11 @@
 }
 
 // ========================================================
-// libc_common_static.a For static binaries.
+// libc_static_dispatch.a
 // ========================================================
 cc_library_static {
     defaults: ["libc_defaults"],
-    name: "libc_common_static",
+    name: "libc_static_dispatch",
 
     arch: {
         x86: {
@@ -1512,18 +1512,14 @@
             srcs: ["arch-arm64/static_function_dispatch.S"],
         },
     },
-
-    whole_static_libs: [
-        "libc_common",
-    ],
 }
 
 // ========================================================
-// libc_common_shared.a For shared libraries.
+// libc_dynamic_dispatch.a
 // ========================================================
 cc_library_static {
     defaults: ["libc_defaults"],
-    name: "libc_common_shared",
+    name: "libc_dynamic_dispatch",
 
     cflags: [
         "-ffreestanding",
@@ -1541,9 +1537,31 @@
             srcs: ["arch-arm64/dynamic_function_dispatch.cpp"],
         },
     },
+}
+
+// ========================================================
+// libc_common_static.a For static binaries.
+// ========================================================
+cc_library_static {
+    defaults: ["libc_defaults"],
+    name: "libc_common_static",
 
     whole_static_libs: [
         "libc_common",
+        "libc_static_dispatch",
+    ],
+}
+
+// ========================================================
+// libc_common_shared.a For shared libraries.
+// ========================================================
+cc_library_static {
+    defaults: ["libc_defaults"],
+    name: "libc_common_shared",
+
+    whole_static_libs: [
+        "libc_common",
+        "libc_dynamic_dispatch",
     ],
 }
 
@@ -1567,19 +1585,16 @@
 // libc_nomalloc.a
 // ========================================================
 //
-// This is a version of the static C library that does not
-// include malloc. It's useful in situations when the user wants
-// to provide their own malloc implementation, or wants to
-// explicitly disallow the use of malloc, such as in the
-// dynamic linker.
+// This is a version of the static C library, used by the dynamic linker, that excludes malloc. It
+// also excludes functions selected using ifuncs (e.g. for string.h). Link in either
+// libc_static_dispatch or libc_dynamic_dispatch to provide those functions.
 
 cc_library_static {
     name: "libc_nomalloc",
     defaults: ["libc_defaults"],
-    cflags: ["-DLIBC_STATIC"],
 
     whole_static_libs: [
-        "libc_common_static",
+        "libc_common",
         "libc_init_static",
         "libc_unwind_static",
     ],
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index 6e1b0de..94cf1f8 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -57,7 +57,9 @@
 //
 // This is in a file by itself because it needs to be built with
 // -fno-stack-protector because it's responsible for setting up the main
-// thread's TLS (which stack protector relies on).
+// thread's TLS (which stack protector relies on). It's also built with
+// -ffreestanding because the early init function runs in the linker before
+// ifunc resolvers have run.
 
 // Do enough setup to:
 //  - Let the dynamic linker invoke system calls (and access errno)
@@ -65,7 +67,8 @@
 //  - Allow the stack protector to work (with a zero cookie)
 // Avoid doing much more because, when this code is called within the dynamic
 // linker, the linker binary hasn't been relocated yet, so certain kinds of code
-// are hazardous, such as accessing non-hidden global variables.
+// are hazardous, such as accessing non-hidden global variables or calling
+// string.h functions.
 __BIONIC_WEAK_FOR_NATIVE_BRIDGE
 extern "C" void __libc_init_main_thread_early(const KernelArgumentBlock& args,
                                               bionic_tcb* temp_tcb) {
@@ -80,6 +83,23 @@
   main_thread.set_cached_pid(main_thread.tid);
 }
 
+// This code is used both by each new pthread and the code that initializes the main thread.
+void __init_tcb(bionic_tcb* tcb, pthread_internal_t* thread) {
+#ifdef TLS_SLOT_SELF
+  // On x86, slot 0 must point to itself so code can read the thread pointer by
+  // loading %fs:0 or %gs:0.
+  tcb->tls_slot(TLS_SLOT_SELF) = &tcb->tls_slot(TLS_SLOT_SELF);
+#endif
+  tcb->tls_slot(TLS_SLOT_THREAD_ID) = thread;
+}
+
+void __init_tcb_dtv(bionic_tcb* tcb) {
+  // Initialize the DTV slot to a statically-allocated empty DTV. The first
+  // access to a dynamic TLS variable allocates a new DTV.
+  static const TlsDtv zero_dtv = {};
+  __set_tcb_dtv(tcb, const_cast<TlsDtv*>(&zero_dtv));
+}
+
 // Finish initializing the main thread.
 __BIONIC_WEAK_FOR_NATIVE_BRIDGE
 extern "C" void __libc_init_main_thread_late() {
diff --git a/libc/bionic/bionic_call_ifunc_resolver.cpp b/libc/bionic/bionic_call_ifunc_resolver.cpp
index 8522835..437de78 100644
--- a/libc/bionic/bionic_call_ifunc_resolver.cpp
+++ b/libc/bionic/bionic_call_ifunc_resolver.cpp
@@ -30,14 +30,32 @@
 #include <sys/auxv.h>
 #include <sys/ifunc.h>
 
+#include "private/bionic_auxv.h"
+
+// This code is called in the linker before it has been relocated, so minimize calls into other
+// parts of Bionic. In particular, we won't ever have two ifunc resolvers called concurrently, so
+// initializing the ifunc resolver argument doesn't need to be thread-safe.
+
 ElfW(Addr) __bionic_call_ifunc_resolver(ElfW(Addr) resolver_addr) {
 #if defined(__aarch64__)
   typedef ElfW(Addr) (*ifunc_resolver_t)(uint64_t, __ifunc_arg_t*);
-  static __ifunc_arg_t arg = { sizeof(__ifunc_arg_t), getauxval(AT_HWCAP), getauxval(AT_HWCAP2) };
+  static __ifunc_arg_t arg;
+  static bool initialized = false;
+  if (!initialized) {
+    initialized = true;
+    arg._size = sizeof(__ifunc_arg_t);
+    arg._hwcap = getauxval(AT_HWCAP);
+    arg._hwcap2 = getauxval(AT_HWCAP2);
+  }
   return reinterpret_cast<ifunc_resolver_t>(resolver_addr)(arg._hwcap | _IFUNC_ARG_HWCAP, &arg);
 #elif defined(__arm__)
   typedef ElfW(Addr) (*ifunc_resolver_t)(unsigned long);
-  static unsigned long hwcap = getauxval(AT_HWCAP);
+  static unsigned long hwcap;
+  static bool initialized = false;
+  if (!initialized) {
+    initialized = true;
+    hwcap = getauxval(AT_HWCAP);
+  }
   return reinterpret_cast<ifunc_resolver_t>(resolver_addr)(hwcap);
 #else
   typedef ElfW(Addr) (*ifunc_resolver_t)(void);
diff --git a/libc/bionic/getauxval.cpp b/libc/bionic/getauxval.cpp
index c8f867b..f865f97 100644
--- a/libc/bionic/getauxval.cpp
+++ b/libc/bionic/getauxval.cpp
@@ -36,7 +36,6 @@
 
 // This function needs to be safe to call before TLS is set up, so it can't
 // access errno or the stack protector.
-__attribute__((no_stack_protector))
 __LIBC_HIDDEN__ unsigned long __bionic_getauxval(unsigned long type, bool& exists) {
   for (ElfW(auxv_t)* v = __libc_shared_globals()->auxv; v->a_type != AT_NULL; ++v) {
     if (v->a_type == type) {
diff --git a/libc/bionic/libc_init_static.cpp b/libc/bionic/libc_init_static.cpp
index 0b74023..28c0b0c 100644
--- a/libc/bionic/libc_init_static.cpp
+++ b/libc/bionic/libc_init_static.cpp
@@ -231,6 +231,9 @@
   g_target_sdk_version = target;
 }
 
+// This function is called in the dynamic linker before ifunc resolvers have run, so this file is
+// compiled with -ffreestanding to avoid implicit string.h function calls. (It shouldn't strictly
+// be necessary, though.)
 __LIBC_HIDDEN__ libc_shared_globals* __libc_shared_globals() {
   static libc_shared_globals globals;
   return &globals;
diff --git a/libc/bionic/pthread_create.cpp b/libc/bionic/pthread_create.cpp
index 1dc1066..03af2d9 100644
--- a/libc/bionic/pthread_create.cpp
+++ b/libc/bionic/pthread_create.cpp
@@ -54,31 +54,12 @@
 void __init_user_desc(struct user_desc*, bool, void*);
 #endif
 
-// This code is used both by each new pthread and the code that initializes the main thread.
-__attribute__((no_stack_protector))
-void __init_tcb(bionic_tcb* tcb, pthread_internal_t* thread) {
-#ifdef TLS_SLOT_SELF
-  // On x86, slot 0 must point to itself so code can read the thread pointer by
-  // loading %fs:0 or %gs:0.
-  tcb->tls_slot(TLS_SLOT_SELF) = &tcb->tls_slot(TLS_SLOT_SELF);
-#endif
-  tcb->tls_slot(TLS_SLOT_THREAD_ID) = thread;
-}
-
 __attribute__((no_stack_protector))
 void __init_tcb_stack_guard(bionic_tcb* tcb) {
   // GCC looks in the TLS for the stack guard on x86, so copy it there from our global.
   tcb->tls_slot(TLS_SLOT_STACK_GUARD) = reinterpret_cast<void*>(__stack_chk_guard);
 }
 
-__attribute__((no_stack_protector))
-void __init_tcb_dtv(bionic_tcb* tcb) {
-  // Initialize the DTV slot to a statically-allocated empty DTV. The first
-  // access to a dynamic TLS variable allocates a new DTV.
-  static const TlsDtv zero_dtv = {};
-  __set_tcb_dtv(tcb, const_cast<TlsDtv*>(&zero_dtv));
-}
-
 void __init_bionic_tls_ptrs(bionic_tcb* tcb, bionic_tls* tls) {
   tcb->thread()->bionic_tls = tls;
   tcb->tls_slot(TLS_SLOT_BIONIC_TLS) = tls;
diff --git a/linker/Android.bp b/linker/Android.bp
index 8061f40..1800bdb 100644
--- a/linker/Android.bp
+++ b/linker/Android.bp
@@ -123,6 +123,15 @@
 }
 
 cc_library_static {
+    name: "liblinker_main",
+    defaults: ["linker_defaults", "linker_all_targets"],
+    srcs: ["linker_main.cpp"],
+
+    // Ensure that the compiler won't insert string function calls before ifuncs are resolved.
+    cflags: ["-ffreestanding"],
+}
+
+cc_library_static {
     name: "liblinker_malloc",
     defaults: ["linker_defaults", "linker_all_targets"],
     srcs: ["linker_memory.cpp"],
@@ -151,7 +160,6 @@
         "linker_globals.cpp",
         "linker_libc_support.c",
         "linker_libcxx_support.cpp",
-        "linker_main.cpp",
         "linker_namespaces.cpp",
         "linker_logger.cpp",
         "linker_mapped_file_fragment.cpp",
@@ -287,10 +295,12 @@
     },
 
     static_libs: [
+        "liblinker_main",
         "liblinker_malloc",
 
         "libc++_static",
         "libc_nomalloc",
+        "libc_dynamic_dispatch",
         "libm",
     ],
 
diff --git a/linker/linker.cpp b/linker/linker.cpp
index eedce70..1393eb5 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -3162,7 +3162,10 @@
         TRACE_TYPE(RELO, "RELO IRELATIVE %16p <- %16p\n",
                     reinterpret_cast<void*>(reloc),
                     reinterpret_cast<void*>(load_bias + addend));
-        {
+        // In the linker, ifuncs are called as soon as possible so that string functions work.
+        // We must not call them again (e.g. on arm32, resolving an ifunc changes the meaning of
+        // the addend from a resolver function to the implementation).
+        if (!is_linker()) {
 #if !defined(__LP64__)
           // When relocating dso with text_relocation .text segment is
           // not executable. We need to restore elf flags for this
diff --git a/linker/linker_main.cpp b/linker/linker_main.cpp
index 3b950a3..264923f 100644
--- a/linker/linker_main.cpp
+++ b/linker/linker_main.cpp
@@ -40,6 +40,8 @@
 #include "linker_tls.h"
 #include "linker_utils.h"
 
+#include "private/bionic_auxv.h"
+#include "private/bionic_call_ifunc_resolver.h"
 #include "private/bionic_globals.h"
 #include "private/bionic_tls.h"
 #include "private/KernelArgumentBlock.h"
@@ -565,6 +567,32 @@
   }
 }
 
+// TODO: There is a similar ifunc resolver calling loop in libc_init_static.cpp, but that version
+// uses weak symbols, which don't work in the linker prior to its relocation. This version also
+// supports a load bias. When we stop supporting the gold linker in the NDK, then maybe we can use
+// non-weak definitions and merge the two loops.
+#if defined(USE_RELA)
+extern __LIBC_HIDDEN__ ElfW(Rela) __rela_iplt_start[], __rela_iplt_end[];
+
+static void call_ifunc_resolvers(ElfW(Addr) load_bias) {
+  for (ElfW(Rela) *r = __rela_iplt_start; r != __rela_iplt_end; ++r) {
+    ElfW(Addr)* offset = reinterpret_cast<ElfW(Addr)*>(r->r_offset + load_bias);
+    ElfW(Addr) resolver = r->r_addend + load_bias;
+    *offset = __bionic_call_ifunc_resolver(resolver);
+  }
+}
+#else
+extern __LIBC_HIDDEN__ ElfW(Rel) __rel_iplt_start[], __rel_iplt_end[];
+
+static void call_ifunc_resolvers(ElfW(Addr) load_bias) {
+  for (ElfW(Rel) *r = __rel_iplt_start; r != __rel_iplt_end; ++r) {
+    ElfW(Addr)* offset = reinterpret_cast<ElfW(Addr)*>(r->r_offset + load_bias);
+    ElfW(Addr) resolver = *offset + load_bias;
+    *offset = __bionic_call_ifunc_resolver(resolver);
+  }
+}
+#endif
+
 // Detect an attempt to run the linker on itself. e.g.:
 //   /system/bin/linker64 /system/bin/linker64
 // Use priority-1 to run this constructor before other constructors.
@@ -616,11 +644,15 @@
   ElfW(Ehdr)* elf_hdr = reinterpret_cast<ElfW(Ehdr)*>(linker_addr);
   ElfW(Phdr)* phdr = reinterpret_cast<ElfW(Phdr)*>(linker_addr + elf_hdr->e_phoff);
 
+  // string.h functions must not be used prior to calling the linker's ifunc resolvers.
+  const ElfW(Addr) load_bias = get_elf_exec_load_bias(elf_hdr);
+  call_ifunc_resolvers(load_bias);
+
   soinfo tmp_linker_so(nullptr, nullptr, nullptr, 0, 0);
 
   tmp_linker_so.base = linker_addr;
   tmp_linker_so.size = phdr_table_get_load_size(phdr, elf_hdr->e_phnum);
-  tmp_linker_so.load_bias = get_elf_exec_load_bias(elf_hdr);
+  tmp_linker_so.load_bias = load_bias;
   tmp_linker_so.dynamic = nullptr;
   tmp_linker_so.phdr = phdr;
   tmp_linker_so.phnum = elf_hdr->e_phnum;