Add support for optimized (non-generic) atomic libcalls.

For integer types of sizes 1, 2, 4 and 8, libcompiler-rt (and libgcc)
provide atomic functions that pass parameters by value and return
results directly.
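
As a rough sketch of the difference (the signatures follow the patch's
own comments and the GCC atomics design document at
http://gcc.gnu.org/wiki/Atomic/GCCMM/LIbrary; the uint32_t spelling of
the value type is just one instance of "N = 4"):

  // Generic form: any size, value and result passed through memory.
  void __atomic_load(size_t size, void *mem, void *ret, int order);
  // Optimized form for 4-byte integers: by-value argument and result.
  uint32_t __atomic_load_4(uint32_t *mem, int order);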

For __atomic_fetch_*, libgcc and libcompiler-rt provide only the
optimized libcalls, since generic versions operating on non-integer
types would make little sense. This means that we can finally make
__atomic_fetch_* work on architectures for which we don't provide these
operations as builtins (e.g. ARM).
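
For example (a minimal sketch; the "counter" variable and the lowering
described are illustrative, not taken from the patch or its tests):

  unsigned int counter;
  unsigned int old = __atomic_fetch_add(&counter, 1, __ATOMIC_SEQ_CST);

On a target such as ARM that lacks the builtin, this now lowers to a
call to __atomic_fetch_add_4 instead of being rejected with the error
mentioned below.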

This should fix the dreaded "cannot compile this atomic library call
yet" error that would pop up every once in a while.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@183033 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/CGAtomic.cpp b/lib/CodeGen/CGAtomic.cpp
index 0b48a5c..ddacdd1 100644
--- a/lib/CodeGen/CGAtomic.cpp
+++ b/lib/CodeGen/CGAtomic.cpp
@@ -15,6 +15,7 @@
 #include "CGCall.h"
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Operator.h"
@@ -317,6 +318,22 @@
   return DeclPtr;
 }
 
+static void
+AddDirectArgument(CodeGenFunction &CGF, CallArgList &Args,
+                  bool UseOptimizedLibcall, llvm::Value *Val,
+                  QualType ValTy) {
+  if (UseOptimizedLibcall) {
+    // Load value and pass it to the function directly.
+    unsigned Align = CGF.getContext().getTypeAlignInChars(ValTy).getQuantity();
+    Val = CGF.EmitLoadOfScalar(Val, false, Align, ValTy);
+    Args.add(RValue::get(Val), ValTy);
+  } else {
+    // Non-optimized functions always take a reference.
+    Args.add(RValue::get(CGF.EmitCastToVoidPtr(Val)),
+             CGF.getContext().VoidPtrTy);
+  }
+}
+
 RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E, llvm::Value *Dest) {
   QualType AtomicTy = E->getPtr()->getType()->getPointeeType();
   QualType MemTy = AtomicTy;
@@ -424,67 +441,137 @@
 
   // Use a library call.  See: http://gcc.gnu.org/wiki/Atomic/GCCMM/LIbrary .
   if (UseLibcall) {
+    bool UseOptimizedLibcall = false;
+    switch (E->getOp()) {
+    case AtomicExpr::AO__c11_atomic_fetch_add:
+    case AtomicExpr::AO__atomic_fetch_add:
+    case AtomicExpr::AO__c11_atomic_fetch_and:
+    case AtomicExpr::AO__atomic_fetch_and:
+    case AtomicExpr::AO__c11_atomic_fetch_or:
+    case AtomicExpr::AO__atomic_fetch_or:
+    case AtomicExpr::AO__c11_atomic_fetch_sub:
+    case AtomicExpr::AO__atomic_fetch_sub:
+    case AtomicExpr::AO__c11_atomic_fetch_xor:
+    case AtomicExpr::AO__atomic_fetch_xor:
+      // For these, only library calls for certain sizes exist.
+      UseOptimizedLibcall = true;
+      break;
+    default:
+      // Only use optimized library calls for sizes for which they exist.
+      if (Size == 1 || Size == 2 || Size == 4 || Size == 8)
+        UseOptimizedLibcall = true;
+      break;
+    }
 
-    SmallVector<QualType, 5> Params;
     CallArgList Args;
-    // Size is always the first parameter
-    Args.add(RValue::get(llvm::ConstantInt::get(SizeTy, Size)),
-             getContext().getSizeType());
-    // Atomic address is always the second parameter
+    if (!UseOptimizedLibcall) {
+      // For non-optimized library calls, the size is the first parameter
+      Args.add(RValue::get(llvm::ConstantInt::get(SizeTy, Size)),
+               getContext().getSizeType());
+    }
+    // Atomic address is the first or second parameter
     Args.add(RValue::get(EmitCastToVoidPtr(Ptr)),
              getContext().VoidPtrTy);
 
-    const char* LibCallName;
-    QualType RetTy = getContext().VoidTy;
+    std::string LibCallName;
+    QualType RetTy;
+    bool HaveRetTy = false;
     switch (E->getOp()) {
     // There is only one libcall for compare and exchange, because there is no
     // optimisation benefit possible from a libcall version of a weak compare
     // and exchange.
-    // bool __atomic_compare_exchange(size_t size, void *obj, void *expected,
+    // bool __atomic_compare_exchange(size_t size, void *mem, void *expected,
     //                                void *desired, int success, int failure)
+    // bool __atomic_compare_exchange_N(T *mem, T *expected, T desired,
+    //                                  int success, int failure)
     case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
     case AtomicExpr::AO__c11_atomic_compare_exchange_strong:
     case AtomicExpr::AO__atomic_compare_exchange:
     case AtomicExpr::AO__atomic_compare_exchange_n:
       LibCallName = "__atomic_compare_exchange";
       RetTy = getContext().BoolTy;
+      HaveRetTy = true;
       Args.add(RValue::get(EmitCastToVoidPtr(Val1)),
                getContext().VoidPtrTy);
-      Args.add(RValue::get(EmitCastToVoidPtr(Val2)),
-               getContext().VoidPtrTy);
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val2, MemTy);
       Args.add(RValue::get(Order),
                getContext().IntTy);
       Order = OrderFail;
       break;
     // void __atomic_exchange(size_t size, void *mem, void *val, void *return,
     //                        int order)
+    // T __atomic_exchange_N(T *mem, T val, int order)
     case AtomicExpr::AO__c11_atomic_exchange:
     case AtomicExpr::AO__atomic_exchange_n:
     case AtomicExpr::AO__atomic_exchange:
       LibCallName = "__atomic_exchange";
-      Args.add(RValue::get(EmitCastToVoidPtr(Val1)),
-               getContext().VoidPtrTy);
-      Args.add(RValue::get(EmitCastToVoidPtr(Dest)),
-               getContext().VoidPtrTy);
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
       break;
     // void __atomic_store(size_t size, void *mem, void *val, int order)
+    // void __atomic_store_N(T *mem, T val, int order)
     case AtomicExpr::AO__c11_atomic_store:
     case AtomicExpr::AO__atomic_store:
     case AtomicExpr::AO__atomic_store_n:
       LibCallName = "__atomic_store";
-      Args.add(RValue::get(EmitCastToVoidPtr(Val1)),
-               getContext().VoidPtrTy);
+      RetTy = getContext().VoidTy;
+      HaveRetTy = true;
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
       break;
     // void __atomic_load(size_t size, void *mem, void *return, int order)
+    // T __atomic_load_N(T *mem, int order)
     case AtomicExpr::AO__c11_atomic_load:
     case AtomicExpr::AO__atomic_load:
     case AtomicExpr::AO__atomic_load_n:
       LibCallName = "__atomic_load";
-      Args.add(RValue::get(EmitCastToVoidPtr(Dest)),
-               getContext().VoidPtrTy);
+      break;
+    // T __atomic_fetch_add_N(T *mem, T val, int order)
+    case AtomicExpr::AO__c11_atomic_fetch_add:
+    case AtomicExpr::AO__atomic_fetch_add:
+      LibCallName = "__atomic_fetch_add";
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
+      break;
+    // T __atomic_fetch_and_N(T *mem, T val, int order)
+    case AtomicExpr::AO__c11_atomic_fetch_and:
+    case AtomicExpr::AO__atomic_fetch_and:
+      LibCallName = "__atomic_fetch_and";
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
+      break;
+    // T __atomic_fetch_or_N(T *mem, T val, int order)
+    case AtomicExpr::AO__c11_atomic_fetch_or:
+    case AtomicExpr::AO__atomic_fetch_or:
+      LibCallName = "__atomic_fetch_or";
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
+      break;
+    // T __atomic_fetch_sub_N(T *mem, T val, int order)
+    case AtomicExpr::AO__c11_atomic_fetch_sub:
+    case AtomicExpr::AO__atomic_fetch_sub:
+      LibCallName = "__atomic_fetch_sub";
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
+      break;
+    // T __atomic_fetch_xor_N(T *mem, T val, int order)
+    case AtomicExpr::AO__c11_atomic_fetch_xor:
+    case AtomicExpr::AO__atomic_fetch_xor:
+      LibCallName = "__atomic_fetch_xor";
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy);
       break;
     default: return EmitUnsupportedRValue(E, "atomic library call");
     }
+
+    // Optimized functions have the size in their name.
+    if (UseOptimizedLibcall)
+      LibCallName += "_" + llvm::utostr(Size);
+    // By default, assume we return a value of the atomic type.
+    if (!HaveRetTy) {
+      if (UseOptimizedLibcall) {
+        // Value is returned directly.
+        RetTy = MemTy;
+      } else {
+        // Value is returned through parameter before the order.
+        RetTy = getContext().VoidTy;
+        Args.add(RValue::get(EmitCastToVoidPtr(Dest)),
+                 getContext().VoidPtrTy);
+      }
+    }
     // order is always the last parameter
     Args.add(RValue::get(Order),
              getContext().IntTy);
@@ -495,7 +582,7 @@
     llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
     llvm::Constant *Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
     RValue Res = EmitCall(FuncInfo, Func, ReturnValueSlot(), Args);
-    if (E->isCmpXChg())
+    if (!RetTy->isVoidType())
       return Res;
     if (E->getType()->isVoidType())
       return RValue::get(0);
diff --git a/test/CodeGen/atomics-inlining.c b/test/CodeGen/atomics-inlining.c
index 3ef96e5..c69702f 100644
--- a/test/CodeGen/atomics-inlining.c
+++ b/test/CodeGen/atomics-inlining.c
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -triple arm-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=ARM
 // RUN: %clang_cc1 -triple powerpc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=PPC32
 // RUN: %clang_cc1 -triple powerpc64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=PPC64
 // RUN: %clang_cc1 -triple mipsel-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS32
@@ -7,6 +8,7 @@
 unsigned short s1, s2;
 unsigned int i1, i2;
 unsigned long long ll1, ll2;
+unsigned char a1[100], a2[100];
 
 enum memory_order {
   memory_order_relaxed,
@@ -19,31 +21,73 @@
 
 void test1(void) {
   (void)__atomic_load(&c1, &c2, memory_order_seq_cst);
+  (void)__atomic_store(&c1, &c2, memory_order_seq_cst);
   (void)__atomic_load(&s1, &s2, memory_order_seq_cst);
+  (void)__atomic_store(&s1, &s2, memory_order_seq_cst);
   (void)__atomic_load(&i1, &i2, memory_order_seq_cst);
+  (void)__atomic_store(&i1, &i2, memory_order_seq_cst);
   (void)__atomic_load(&ll1, &ll2, memory_order_seq_cst);
+  (void)__atomic_store(&ll1, &ll2, memory_order_seq_cst);
+  (void)__atomic_load(&a1, &a2, memory_order_seq_cst);
+  (void)__atomic_store(&a1, &a2, memory_order_seq_cst);
+
+// ARM: define arm_aapcscc void @test1
+// ARM: = call arm_aapcscc zeroext i8 @__atomic_load_1(i8* @c1
+// ARM: call arm_aapcscc void @__atomic_store_1(i8* @c1, i8 zeroext
+// ARM: = call arm_aapcscc zeroext i16 @__atomic_load_2(i8* bitcast (i16* @s1 to i8*)
+// ARM: call arm_aapcscc void @__atomic_store_2(i8* bitcast (i16* @s1 to i8*), i16 zeroext
+// ARM: = call arm_aapcscc i32 @__atomic_load_4(i8* bitcast (i32* @i1 to i8*)
+// ARM: call arm_aapcscc void @__atomic_store_4(i8* bitcast (i32* @i1 to i8*), i32
+// ARM: = call arm_aapcscc i64 @__atomic_load_8(i8* bitcast (i64* @ll1 to i8*)
+// ARM: call arm_aapcscc void @__atomic_store_8(i8* bitcast (i64* @ll1 to i8*), i64
+// ARM: call arm_aapcscc void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
+// ARM: call arm_aapcscc void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
 
 // PPC32: define void @test1
 // PPC32: = load atomic i8* @c1 seq_cst
+// PPC32: store atomic i8 {{.*}}, i8* @c1 seq_cst
 // PPC32: = load atomic i16* @s1 seq_cst
+// PPC32: store atomic i16 {{.*}}, i16* @s1 seq_cst
 // PPC32: = load atomic i32* @i1 seq_cst
-// PPC32: call void @__atomic_load(i32 8, i8* bitcast (i64* @ll1 to i8*)
+// PPC32: store atomic i32 {{.*}}, i32* @i1 seq_cst
+// PPC32: = call i64 @__atomic_load_8(i8* bitcast (i64* @ll1 to i8*)
+// PPC32: call void @__atomic_store_8(i8* bitcast (i64* @ll1 to i8*), i64
+// PPC32: call void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
+// PPC32: call void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
 
 // PPC64: define void @test1
 // PPC64: = load atomic i8* @c1 seq_cst
+// PPC64: store atomic i8 {{.*}}, i8* @c1 seq_cst
 // PPC64: = load atomic i16* @s1 seq_cst
+// PPC64: store atomic i16 {{.*}}, i16* @s1 seq_cst
 // PPC64: = load atomic i32* @i1 seq_cst
+// PPC64: store atomic i32 {{.*}}, i32* @i1 seq_cst
 // PPC64: = load atomic i64* @ll1 seq_cst
+// PPC64: store atomic i64 {{.*}}, i64* @ll1 seq_cst
+// PPC64: call void @__atomic_load(i64 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
+// PPC64: call void @__atomic_store(i64 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
 
 // MIPS32: define void @test1
 // MIPS32: = load atomic i8* @c1 seq_cst
+// MIPS32: store atomic i8 {{.*}}, i8* @c1 seq_cst
 // MIPS32: = load atomic i16* @s1 seq_cst
+// MIPS32: store atomic i16 {{.*}}, i16* @s1 seq_cst
 // MIPS32: = load atomic i32* @i1 seq_cst
-// MIPS32: call void @__atomic_load(i32 8, i8* bitcast (i64* @ll1 to i8*)
+// MIPS32: store atomic i32 {{.*}}, i32* @i1 seq_cst
+// MIPS32: call i64 @__atomic_load_8(i8* bitcast (i64* @ll1 to i8*)
+// MIPS32: call void @__atomic_store_8(i8* bitcast (i64* @ll1 to i8*), i64
+// MIPS32: call void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
+// MIPS32: call void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
 
 // MIPS64: define void @test1
 // MIPS64: = load atomic i8* @c1 seq_cst
+// MIPS64: store atomic i8 {{.*}}, i8* @c1 seq_cst
 // MIPS64: = load atomic i16* @s1 seq_cst
+// MIPS64: store atomic i16 {{.*}}, i16* @s1 seq_cst
 // MIPS64: = load atomic i32* @i1 seq_cst
+// MIPS64: store atomic i32 {{.*}}, i32* @i1 seq_cst
 // MIPS64: = load atomic i64* @ll1 seq_cst
+// MIPS64: store atomic i64 {{.*}}, i64* @ll1 seq_cst
+// MIPS64: call void @__atomic_load(i64 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0)
+// MIPS64: call void @__atomic_store(i64 100, i8* getelementptr inbounds ([100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8]* @a2, i32 0, i32 0)
 }