Enforce the 16-byte stack alignment required by the x86_64 ABI

Change-Id: I43304803ac54c8688c61688bd96c7160614172d4
Signed-off-by: Pavel Chupin <pavel.v.chupin@intel.com>
diff --git a/libc/arch-common/bionic/crtbegin.c b/libc/arch-common/bionic/crtbegin.c
index bc67bfb..fa9f3f3 100644
--- a/libc/arch-common/bionic/crtbegin.c
+++ b/libc/arch-common/bionic/crtbegin.c
@@ -50,6 +50,10 @@
   array.fini_array = &__FINI_ARRAY__;
 
   void* raw_args = (void*) ((uintptr_t) __builtin_frame_address(0) + sizeof(void*));
+#ifdef __x86_64__
+  // The System V x86-64 ABI requires 16-byte stack alignment at call sites.
+  asm("andq  $~15, %rsp");
+#endif
   __libc_init(raw_args, NULL, &main, &array);
 }
 
diff --git a/libc/arch-x86_64/bionic/clone.S b/libc/arch-x86_64/bionic/clone.S
index 2ae0e85..7511e86 100644
--- a/libc/arch-x86_64/bionic/clone.S
+++ b/libc/arch-x86_64/bionic/clone.S
@@ -59,10 +59,11 @@
 
         # We're in the child now, so call __thread_entry
         # with the arguments from the child stack moved into
-        # the appropriate registers.
-        popq    %rdi  # fn
-        popq    %rsi  # arg
-        popq    %rdx  # tls
+        # the appropriate registers. We avoid pop here to keep
+        # the required 16-byte stack alignment.
+        movq    (%rsp), %rdi    # fn
+        movq    8(%rsp), %rsi   # arg
+        movq    16(%rsp), %rdx  # tls
         call    __thread_entry
         hlt
 2: