[ARM][AArch64] Increase TLS alignment to reserve space for Android's TCB

ARM and AArch64 use TLS variant 1, where the first two words after the
thread pointer are reserved for the TCB, followed by the executable's TLS
segment. Both the thread pointer and the TLS segment are aligned to at
least the TLS segment's alignment.

Android/Bionic historically has not supported ELF TLS, and it has
allocated memory after the thread pointer for several Bionic TLS slots
(currently 9 but soon only 8). At least one of these allocations
(TLS_SLOT_STACK_GUARD == 5) is widespread throughout Android/AArch64
binaries and can't be changed.

To reconcile this disagreement about TLS memory layout, set the minimum
alignment for executable TLS segments to 8 words on ARM/AArch64, which
reserves at least 8 words of memory after the TP (2 for the ABI-specified
TCB and 6 for alignment padding). For simplicity, and because lld doesn't
know when it's targeting Android, increase the alignment regardless of
operating system.

Differential Revision: https://reviews.llvm.org/D53906

Change-Id: If37322cbf6525e4b31ba200201308e4d0ff4c4e4
git-svn-id: https://llvm.org/svn/llvm-project/lld/trunk@350681 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/ELF/InputSection.cpp b/ELF/InputSection.cpp
index 30a9fc2..7de3507 100644
--- a/ELF/InputSection.cpp
+++ b/ELF/InputSection.cpp
@@ -575,6 +575,10 @@
     // Variant 1. The thread pointer points to a TCB with a fixed 2-word size,
     // followed by a variable amount of alignment padding, followed by the TLS
     // segment.
+    //
+    // NB: While the ARM/AArch64 ABI formally has a 2-word TCB size, lld
+    // effectively increases the TCB size to 8 words for Android compatibility.
+    // It accomplishes this by increasing the segment's alignment.
     return alignTo(Config->Wordsize * 2, Out::TlsPhdr->p_align);
   case EM_386:
   case EM_X86_64:
diff --git a/ELF/Writer.cpp b/ELF/Writer.cpp
index 37a53a1..2f6a0bf 100644
--- a/ELF/Writer.cpp
+++ b/ELF/Writer.cpp
@@ -2181,11 +2181,23 @@
       P->p_memsz = alignTo(P->p_memsz, Target->PageSize);
     }
 
-    // The TLS pointer goes after PT_TLS for variant 2 targets. At least glibc
-    // will align it, so round up the size to make sure the offsets are
-    // correct.
-    if (P->p_type == PT_TLS && P->p_memsz)
+    if (P->p_type == PT_TLS && P->p_memsz) {
+      if (!Config->Shared &&
+          (Config->EMachine == EM_ARM || Config->EMachine == EM_AARCH64)) {
+        // On ARM/AArch64, reserve at least 8 words between the thread
+        // pointer and an executable's TLS segment by overaligning the segment.
+        // This reservation is needed for backwards compatibility with Android's
+        // TCB, which allocates several slots after the thread pointer (e.g.
+        // TLS_SLOT_STACK_GUARD==5). For simplicity, this overalignment is also
+        // done on other operating systems.
+        P->p_align = std::max<uint64_t>(P->p_align, Config->Wordsize * 8);
+      }
+
+      // The TLS pointer goes after PT_TLS for variant 2 targets. At least glibc
+      // will align it, so round up the size to make sure the offsets are
+      // correct.
       P->p_memsz = alignTo(P->p_memsz, P->p_align);
+    }
   }
 }
 
diff --git a/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s b/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
index bff72d3..2db5c7e 100644
--- a/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
+++ b/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
@@ -26,9 +26,9 @@
 // CHECK: _start:
 // CHECK-NEXT:   210ff8:        41 d0 3b d5     mrs     x1, TPIDR_EL0
 // CHECK-NEXT:   210ffc:        00 00 a0 d2     movz    x0, #0, lsl #16
-// CHECK-NEXT:   211000:        01 02 80 f2     movk    x1, #16
+// CHECK-NEXT:   211000:        01 08 80 f2     movk    x1, #64
 // CHECK-NEXT:   211004:        00 00 a0 d2     movz    x0, #0, lsl #16
-// CHECK-NEXT:   211008:        01 02 80 f2     movk    x1, #16
+// CHECK-NEXT:   211008:        01 08 80 f2     movk    x1, #64
 // CHECK-NEXT:   21100c:        c0 03 5f d6     ret
 
  .type  v,@object
diff --git a/test/ELF/aarch64-tls-gdle.s b/test/ELF/aarch64-tls-gdle.s
index e91d397..882ec8c 100644
--- a/test/ELF/aarch64-tls-gdle.s
+++ b/test/ELF/aarch64-tls-gdle.s
@@ -5,15 +5,15 @@
 # RUN: llvm-objdump -d %tout | FileCheck %s
 # RUN: llvm-readobj -s -r %tout | FileCheck -check-prefix=RELOC %s
 
-#Local-Dynamic to Initial-Exec relax creates no
+#General-Dynamic to Local-Exec relaxation creates no relocations.
 #RELOC:      Relocations [
 #RELOC-NEXT: ]
 
-# TCB size = 0x16 and foo is first element from TLS register.
+# TCB size = 64 and foo is first element from TLS register.
 # CHECK: Disassembly of section .text:
 # CHECK: _start:
 # CHECK:  210000:	00 00 a0 d2	movz	x0, #0, lsl #16
-# CHECK:  210004:	00 02 80 f2 	movk	x0, #16
+# CHECK:  210004:	00 08 80 f2 	movk	x0, #64
 # CHECK:  210008:	1f 20 03 d5 	nop
 # CHECK:  21000c:	1f 20 03 d5 	nop
 
diff --git a/test/ELF/aarch64-tls-iele.s b/test/ELF/aarch64-tls-iele.s
index 9fec4ee..0229d66 100644
--- a/test/ELF/aarch64-tls-iele.s
+++ b/test/ELF/aarch64-tls-iele.s
@@ -9,13 +9,13 @@
 # RELOC:      Relocations [
 # RELOC-NEXT: ]
 
-# TCB size = 0x16 and foo is first element from TLS register.
+# TCB size = 64 and foo is first element from TLS register.
 # CHECK: Disassembly of section .text:
 # CHECK: _start:
 # CHECK-NEXT: 210000:  00 00 a0 d2   movz   x0, #0, lsl #16
-# CHECK-NEXT: 210004:  80 02 80 f2   movk   x0, #20
+# CHECK-NEXT: 210004:  80 08 80 f2   movk   x0, #68
 # CHECK-NEXT: 210008:  00 00 a0 d2   movz   x0, #0, lsl #16
-# CHECK-NEXT: 21000c:  00 02 80 f2   movk   x0, #16
+# CHECK-NEXT: 21000c:  00 08 80 f2   movk   x0, #64
 
 .section .tdata
 .align 2
diff --git a/test/ELF/aarch64-tls-le.s b/test/ELF/aarch64-tls-le.s
index 85cd3be..49c322f 100644
--- a/test/ELF/aarch64-tls-le.s
+++ b/test/ELF/aarch64-tls-le.s
@@ -4,7 +4,7 @@
 # RUN: llvm-objdump -d %tout | FileCheck %s
 # RUN: llvm-readobj -s -r %tout | FileCheck -check-prefix=RELOC %s
 
-#Local-Dynamic to Initial-Exec relax creates no
+#Local-Exec accesses are resolved at link time and create no relocations.
 #RELOC:      Relocations [
 #RELOC-NEXT: ]
 
@@ -17,12 +17,12 @@
  add x0, x0, :tprel_hi12:v2
  add x0, x0, :tprel_lo12_nc:v2
 
-# TCB size = 0x16 and foo is first element from TLS register.
+# TCB size = 64 and foo is first element from TLS register.
 #CHECK: Disassembly of section .text:
 #CHECK: _start:
 #CHECK:  210000: 40 d0 3b d5     mrs     x0, TPIDR_EL0
 #CHECK:  210004: 00 00 40 91     add     x0, x0, #0, lsl #12
-#CHECK:  210008: 00 40 00 91     add     x0, x0, #16
+#CHECK:  210008: 00 00 01 91     add     x0, x0, #64
 #CHECK:  21000c: 40 d0 3b d5     mrs     x0, TPIDR_EL0
 #CHECK:  210010: 00 fc 7f 91     add     x0, x0, #4095, lsl #12
 #CHECK:  210014: 00 e0 3f 91     add     x0, x0, #4088
@@ -36,9 +36,9 @@
 .word  0
 .size  v1, 4
 
-# The current offset from the thread pointer is 20. Raise it to just below the
+# The current offset from the thread pointer is 68. Raise it to just below the
 # 24-bit limit.
-.space (0xfffff8 - 20)
+.space (0xfffff8 - 68)
 
 .type   v2,@object
 .globl  v2
diff --git a/test/ELF/aarch64-tlsld-ldst.s b/test/ELF/aarch64-tlsld-ldst.s
index 3144ca5..8ebdc2f 100644
--- a/test/ELF/aarch64-tlsld-ldst.s
+++ b/test/ELF/aarch64-tlsld-ldst.s
@@ -26,27 +26,27 @@
 
 // CHECK: _start:
 // CHECK-NEXT:    210000:       48 d0 3b d5     mrs     x8, TPIDR_EL0
-// 0x0 + c10 = 0xc10       = tcb (16-bytes) + var0
-// CHECK-NEXT:    210004:       08 01 40 91     add     x8, x8, #0, lsl #12
-// CHECK-NEXT:    210008:       14 05 c3 3d     ldr     q20, [x8, #3088]
-// 0x1000 + 0x820 = 0x1820 = tcb + var1
-// CHECK-NEXT:    21000c:       08 05 40 91     add     x8, x8, #1, lsl #12
-// CHECK-NEXT:    210010:       00 11 44 f9     ldr     x0, [x8, #2080]
-// 0x2000 + 0x428 = 0x2428 = tcb + var2
-// CHECK-NEXT:    210014:       08 09 40 91     add     x8, x8, #2, lsl #12
-// CHECK-NEXT:    210018:       00 29 44 b9     ldr     w0, [x8, #1064]
-// 0x3000 + 0x2c  = 0x302c = tcb + var3
-// CHECK-NEXT:    21001c:       08 0d 40 91     add     x8, x8, #3, lsl #12
-// CHECK-NEXT:    210020:       00 59 40 79     ldrh    w0, [x8, #44]
-// 0x3000 + 0xc2e = 0x32ce = tcb + var4
-// CHECK-NEXT:    210024:       08 0d 40 91     add     x8, x8, #3, lsl #12
-// CHECK-NEXT:    210028:       00 b9 70 39     ldrb    w0, [x8, #3118]
+// 0x0 + 0xc40 = 0xc40     = tcb (64-bytes) + var0
+// CHECK-NEXT:    210004:       08 01 40 91     add x8, x8, #0, lsl #12
+// CHECK-NEXT:    210008:       14 11 c3 3d     ldr q20, [x8, #3136]
+// 0x1000 + 0x850 = 0x1850 = tcb + var1
+// CHECK-NEXT:    21000c:       08 05 40 91     add x8, x8, #1, lsl #12
+// CHECK-NEXT:    210010:       00 29 44 f9     ldr x0, [x8, #2128]
+// 0x2000 + 0x458 = 0x2458 = tcb + var2
+// CHECK-NEXT:    210014:       08 09 40 91     add x8, x8, #2, lsl #12
+// CHECK-NEXT:    210018:       00 59 44 b9     ldr w0, [x8, #1112]
+// 0x3000 + 0x5c  = 0x305c = tcb + var3
+// CHECK-NEXT:    21001c:       08 0d 40 91     add x8, x8, #3, lsl #12
+// CHECK-NEXT:    210020:       00 b9 40 79     ldrh  w0, [x8, #92]
+// 0x3000 + 0xc5e = 0x3c5e = tcb + var4
+// CHECK-NEXT:    210024:       08 0d 40 91     add x8, x8, #3, lsl #12
+// CHECK-NEXT:    210028:       00 79 71 39     ldrb  w0, [x8, #3166]
 
-// CHECK-SYMS:      0000000000000c00     0 TLS     GLOBAL DEFAULT    2 var0
-// CHECK-SYMS-NEXT: 0000000000001810     4 TLS     GLOBAL DEFAULT    2 var1
-// CHECK-SYMS-NEXT: 0000000000002418     2 TLS     GLOBAL DEFAULT    2 var2
-// CHECK-SYMS-NEXT: 000000000000301c     1 TLS     GLOBAL DEFAULT    2 var3
-// CHECK-SYMS-NEXT: 0000000000003c1e     0 TLS     GLOBAL DEFAULT    2 var4
+// CHECK-SYMS:      0000000000000c00    16 TLS     GLOBAL DEFAULT    2 var0
+// CHECK-SYMS-NEXT: 0000000000001810     8 TLS     GLOBAL DEFAULT    2 var1
+// CHECK-SYMS-NEXT: 0000000000002418     4 TLS     GLOBAL DEFAULT    2 var2
+// CHECK-SYMS-NEXT: 000000000000301c     2 TLS     GLOBAL DEFAULT    2 var3
+// CHECK-SYMS-NEXT: 0000000000003c1e     1 TLS     GLOBAL DEFAULT    2 var4
 
         .globl var0
         .globl var1
@@ -59,12 +59,12 @@
         .type var3,@object
 
 .section .tbss,"awT",@nobits
-        .balign 16
+        .balign 64
         .space 1024 * 3
 var0:
         .quad 0
         .quad 0
-        .size var1, 16
+        .size var0, 16
         .space 1024 * 3
 var1:
         .quad 0
@@ -72,14 +72,14 @@
         .space 1024 * 3
 var2:
         .word 0
-        .size var1, 4
+        .size var2, 4
 
         .space 1024 * 3
 var3:
         .hword 0
-        .size var2, 2
+        .size var3, 2
         .space 1024 * 3
 var4:
         .byte 0
-        .size var3, 1
+        .size var4, 1
         .space 1024 * 3
diff --git a/test/ELF/arm-tls-le32.s b/test/ELF/arm-tls-le32.s
index 7834ded..f9a5fa9 100644
--- a/test/ELF/arm-tls-le32.s
+++ b/test/ELF/arm-tls-le32.s
@@ -69,9 +69,9 @@
 
 // CHECK: Disassembly of section .text:
 // CHECK-NEXT: _start:
-// offset of x from Thread pointer = (TcbSize + 0x0 = 0x8)
-// CHECK-NEXT:   11000:         08 00 00 00
-// offset of z from Thread pointer = (TcbSize + 0x8 = 0x10)
-// CHECK-NEXT:   11004:         10 00 00 00
-// offset of y from Thread pointer = (TcbSize + 0x4 = 0xc)
-// CHECK-NEXT:   11008:         0c 00 00 00
+// offset of x from Thread pointer = (TcbSize + 0x0 = 0x20)
+// CHECK-NEXT:   11000:         20 00 00 00
+// offset of z from Thread pointer = (TcbSize + 0x8 = 0x28)
+// CHECK-NEXT:   11004:         28 00 00 00
+// offset of y from Thread pointer = (TcbSize + 0x4 = 0x24)
+// CHECK-NEXT:   11008:         24 00 00 00
diff --git a/test/ELF/arm-tls-norelax-ie-le.s b/test/ELF/arm-tls-norelax-ie-le.s
index be8af97..11c3e4f 100644
--- a/test/ELF/arm-tls-norelax-ie-le.s
+++ b/test/ELF/arm-tls-norelax-ie-le.s
@@ -37,5 +37,5 @@
  .type x2, %object
 
 // CHECK: Contents of section .got:
-// x1 at offset 8 from TP, x2 at offset c from TP. Offsets include TCB size of 8
-// CHECK-NEXT: 13064 08000000 0c000000
+// x1 at offset 0x20 from TP, x2 at offset 0x24 from TP. Offsets include TCB size of 0x20
+// CHECK-NEXT: 13064 20000000 24000000