RSForEachExpand: Improve getelementptr instruction generation.

(1) The old pattern, when accessing fields of the driver info
structure, was:

 %base = gep %DriverInfo *%info, 0, <base array index>
 %addr = gep [N x i8*]* %base, 0, <index>

This change folds the above into:

 %addr = gep %DriverInfo *%info, 0, <base array index>, <index>

which simplifies both way that we generate the GEP instructions as
well as the resulting bitcode.

(2) In addition, GEPs of input pointers and instep sizes can be
hoisted out of the loop body.

(3) Finally, mark all GEPs into the DriverInfo structure as inbounds,
because we know they are.

Change-Id: Icfabbdb464af8865c2a4623d967bbb63ef711680
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSForEachExpand.cpp
index 3e70b1d..ce1fb58 100644
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ b/lib/Renderscript/RSForEachExpand.cpp
@@ -427,6 +427,31 @@
     }
   }
 
+  // GEPHelper() returns a SmallVector of values suitable for passing
+  // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
+  // the returned data type. It is sized so that the SmallVector
+  // returned by GEPHelper() never needs to do a heap allocation for
+  // any list of GEP indices it encounters in the code.
+  typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;
+
+  // Helper for turning a list of constant integer GEP indices into a
+  // SmallVector of llvm::Value*. The return value is suitable for
+  // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
+  //
+  // Inputs:
+  //   I32Args should be integers which represent the index arguments
+  //   to a GEP instruction.
+  //
+  // Returns:
+  //   Returns a SmallVector of ConstantInts.
+  SmallGEPIndices GEPHelper(std::initializer_list<int32_t> I32Args) {
+    SmallGEPIndices Out(I32Args.size());
+    llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
+    std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
+                   [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
+    return Out;
+  }
+
 public:
   RSForEachExpandPass(bool pEnableStepOpt = true)
       : ModulePass(ID), Module(nullptr), Context(nullptr),
@@ -448,6 +473,8 @@
   //            suitable for computing arguments for the ForEach-able function
   // CalleeArgs - contribution is accumulated here
   // Bump - invoked once for each contributed outgoing argument
+  // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
+  //                            this function can insert loop-invariant loads
   //
   // Return value is the (zero-based) position of the context (Arg_p)
   // argument in the CalleeArgs vector, or a negative value if the
@@ -457,7 +484,8 @@
                              llvm::Value *Arg_p,
                              llvm::IRBuilder<> &Builder,
                              llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
-                             std::function<void ()> Bump) {
+                             std::function<void ()> Bump,
+                             llvm::Instruction *LoopHeaderInsertionPoint) {
 
     bccAssert(CalleeArgs.empty());
 
@@ -475,23 +503,30 @@
 
     if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
         bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
+      bccAssert(LoopHeaderInsertionPoint);
 
-      llvm::Value *Current = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldCurrent);
+      // Y and Z are loop invariant, so they can be hoisted out of the
+      // loop. Set the IRBuilder insertion point to the loop header.
+      auto OldInsertionPoint = Builder.saveIP();
+      Builder.SetInsertPoint(LoopHeaderInsertionPoint);
 
       if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
-        llvm::Value *Y = Builder.CreateLoad(
-            Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldY), "Y");
-
-        CalleeArgs.push_back(Y);
+        SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
+          RsLaunchDimensionsFieldY}));
+        llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
+        CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
         Bump();
       }
 
       if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
-        llvm::Value *Z = Builder.CreateLoad(
-            Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldZ), "Z");
-        CalleeArgs.push_back(Z);
+        SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
+          RsLaunchDimensionsFieldZ}));
+        llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
+        CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
         Bump();
       }
+
+      Builder.restoreIP(OldInsertionPoint);
     }
 
     return Return;
@@ -545,23 +580,20 @@
     llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
 
     llvm::Type  *InTy      = nullptr;
-    llvm::Value *InBasePtr = nullptr;
+    llvm::Value *InBufPtr = nullptr;
     if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
-      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
-
-      llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
-
-      llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, 0);
-      llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
-                                                      "instep_addr");
+      SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
+      llvm::LoadInst *InStepArg  = Builder.CreateLoad(
+        Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");
 
       InTy = (FunctionArgIter++)->getType();
       InStep = getStepValue(&DL, InTy, InStepArg);
 
       InStep->setName("instep");
 
-      llvm::Value *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, 0);
-      InBasePtr = Builder.CreateLoad(InputAddr, "input_base");
+      SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
+      InBufPtr = Builder.CreateLoad(
+        Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
     }
 
     llvm::Type *OutTy = nullptr;
@@ -570,26 +602,26 @@
       OutTy = (FunctionArgIter++)->getType();
       OutStep = getStepValue(&DL, OutTy, Arg_outstep);
       OutStep->setName("outstep");
-      OutBasePtr = Builder.CreateLoad(
-                     Builder.CreateConstInBoundsGEP2_32(nullptr,
-                         Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
-                         0, 0));
+      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
+      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
     }
 
     llvm::Value *UsrData = nullptr;
     if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
       llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
-      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(
-          Builder.CreateStructGEP(nullptr, Arg_p,  RsExpandKernelDriverInfoPfxFieldUsr)), UsrDataTy);
+      llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
+      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
       UsrData->setName("UsrData");
     }
 
+    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
     llvm::PHINode *IV;
     createLoop(Builder, Arg_x1, Arg_x2, &IV);
 
     llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
     const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
-                                                            [&FunctionArgIter]() { FunctionArgIter++; });
+                                                            [&FunctionArgIter]() { FunctionArgIter++; },
+                                                            LoopHeader->getTerminator());
 
     bccAssert(FunctionArgIter == Function->arg_end());
 
@@ -610,14 +642,14 @@
     if (OutBasePtr) {
       llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
       OutOffset = Builder.CreateMul(OutOffset, OutStep);
-      OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
+      OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
       OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
     }
 
-    if (InBasePtr) {
+    if (InBufPtr) {
       llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
       InOffset = Builder.CreateMul(InOffset, InStep);
-      InPtr = Builder.CreateGEP(InBasePtr, InOffset);
+      InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
       InPtr = Builder.CreatePointerCast(InPtr, InTy);
     }
 
@@ -696,7 +728,7 @@
      *
      * Note that we load any loop-invariant arguments before entering the Loop.
      */
-    size_t NumInputs = Function->arg_size();
+    size_t NumRemainingInputs = Function->arg_size();
 
     // No usrData parameter on kernels.
     bccAssert(
@@ -720,7 +752,7 @@
         OutTy = ArgIter->getType();
 
         ArgIter++;
-        --NumInputs;
+        --NumRemainingInputs;
       } else {
         // We don't increment Args, since we are using the actual return type.
         OutTy = OutBaseTy->getPointerTo();
@@ -728,10 +760,8 @@
 
       OutStep = getStepValue(&DL, OutTy, Arg_outstep);
       OutStep->setName("outstep");
-      OutBasePtr = Builder.CreateLoad(
-                     Builder.CreateConstInBoundsGEP2_32(nullptr,
-                         Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
-                         0, 0));
+      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
+      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
 
       if (gEnableRsTbaa) {
         OutBasePtr->setMetadata("tbaa", TBAAPointer);
@@ -742,32 +772,41 @@
       CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
     }
 
+    llvm::SmallVector<llvm::Type*,  8> InTypes;
+    llvm::SmallVector<llvm::Value*, 8> InSteps;
+    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
+    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
+
+    bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);
+
+    // Create the loop structure.
+    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
     llvm::PHINode *IV;
     createLoop(Builder, Arg_x1, Arg_x2, &IV);
 
     llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
-    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
-                                                            [&NumInputs]() { --NumInputs; });
+    const int CalleeArgsContextIdx =
+      ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
+                             [&NumRemainingInputs]() { --NumRemainingInputs; },
+                             LoopHeader->getTerminator());
 
-    llvm::SmallVector<llvm::Type*,  8> InTypes;
-    llvm::SmallVector<llvm::Value*, 8> InSteps;
-    llvm::SmallVector<llvm::Value*, 8> InBasePtrs;
-    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
+    // After ExpandSpecialArguments() gets called, NumRemainingInputs
+    // counts the number of arguments to the kernel that correspond to
+    // an array entry from the InPtr field of the DriverInfo
+    // structure.
+    const size_t NumInPtrArguments = NumRemainingInputs;
 
-    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
+    if (NumInPtrArguments > 0) {
+      // Extract information about input slots and step sizes. The work done
+      // here is loop-invariant, so we can hoist the operations out of the loop.
+      auto OldInsertionPoint = Builder.saveIP();
+      Builder.SetInsertPoint(LoopHeader->getTerminator());
 
-    if (NumInputs > 0) {
-      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
-
-      llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
-
-      llvm::Instruction *AllocaInsertionPoint = &*ExpandedFunction->getEntryBlock().begin();
-      for (size_t InputIndex = 0; InputIndex < NumInputs;
-           ++InputIndex, ArgIter++) {
-
-        llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, InputIndex);
-        llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
-                                                          "instep_addr");
+      for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
+        SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride,
+          static_cast<int32_t>(InputIndex)}));
+        llvm::Value *InStepAddr = Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep");
+        llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr, "instep_addr");
 
         llvm::Type *InType = ArgIter->getType();
 
@@ -776,20 +815,15 @@
          * get passed by pointer instead of passed by value.  This, combined
          * with the fact that we don't allow kernels to operate on pointer
          * data means that if we see a kernel with a pointer parameter we know
-         * that it is struct input that has been promoted.  As such we don't
+         * that it is a struct input that has been promoted.  As such we don't
          * need to convert its type to a pointer.  Later we will need to know
          * to create a temporary copy on the stack, so we save this information
          * in InStructTempSlots.
          */
         if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
           llvm::Type *ElementType = PtrType->getElementType();
-          uint64_t Alignment = DL.getABITypeAlignment(ElementType);
-          llvm::Value *Slot = new llvm::AllocaInst(ElementType,
-                                                   nullptr,
-                                                   Alignment,
-                                                   "input_struct_slot",
-                                                   AllocaInsertionPoint);
-          InStructTempSlots.push_back(Slot);
+          InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
+                                                           "input_struct_slot"));
         } else {
           InType = InType->getPointerTo();
           InStructTempSlots.push_back(nullptr);
@@ -799,21 +833,23 @@
 
         InStep->setName("instep");
 
-        llvm::Value    *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, InputIndex);
-        llvm::LoadInst *InBasePtr = Builder.CreateLoad(InputAddr,
-                                                         "input_base");
-        llvm::Value    *CastInBasePtr = Builder.CreatePointerCast(InBasePtr,
-                                                                    InType, "casted_in");
+        SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
+          static_cast<int32_t>(InputIndex)}));
+        llvm::Value    *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
+        llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
+        llvm::Value    *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
         if (gEnableRsTbaa) {
-          InBasePtr->setMetadata("tbaa", TBAAPointer);
+          InBufPtr->setMetadata("tbaa", TBAAPointer);
         }
 
-        InBasePtr->setMetadata("alias.scope", AliasingScope);
+        InBufPtr->setMetadata("alias.scope", AliasingScope);
 
         InTypes.push_back(InType);
         InSteps.push_back(InStep);
-        InBasePtrs.push_back(CastInBasePtr);
+        InBufPtrs.push_back(CastInBufPtr);
       }
+
+      Builder.restoreIP(OldInsertionPoint);
     }
 
     // Populate the actual call to kernel().
@@ -836,7 +872,7 @@
     if (CastedOutBasePtr) {
       llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
 
-      OutPtr    = Builder.CreateGEP(CastedOutBasePtr, OutOffset);
+      OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
 
       if (PassOutByPointer) {
         RootArgs.push_back(OutPtr);
@@ -845,11 +881,11 @@
 
     // Inputs
 
-    if (NumInputs > 0) {
+    if (NumInPtrArguments > 0) {
       llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
 
-      for (size_t Index = 0; Index < NumInputs; ++Index) {
-        llvm::Value *InPtr    = Builder.CreateGEP(InBasePtrs[Index], Offset);
+      for (size_t Index = 0; Index < NumInPtrArguments; ++Index) {
+        llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
         llvm::Value *Input;
 
         if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
diff --git a/tests/README.lit b/tests/README.lit
new file mode 100644
index 0000000..16fa305
--- /dev/null
+++ b/tests/README.lit
@@ -0,0 +1,7 @@
+To run the libbcc lit tests:
+ * Ensure `llvm-rs-as` is built, either by doing a top-level `make
+   checkbuild` or by doing `mm` from frameworks/compile/slang.
+ * Ensure that LLVM and libbcc are built with
+   `FORCE_BUILD_LLVM_COMPONENTS=true`.
+ * Ensure `opt` is built from external/llvm, either by top-level `make
+   checkbuild` or by doing `mm` from external/llvm.
diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
new file mode 100644
index 0000000..6f3e175
--- /dev/null
+++ b/tests/libbcc/getelementptr.ll
@@ -0,0 +1,74 @@
+; This checks that RSForEachExpand generates getelementptr
+; instructions into the driver info structure as expected - namely,
+; that they index into the right positions of the structure and that
+; the instructions that are generated are in the loop header.
+
+; RUN: opt -load libbcc.so -foreachexp -S < %s | FileCheck %s
+
+; ModuleID = 'test_getelementptr.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; Old-style kernel
+define void @root(i32* nocapture %ain, i32* nocapture %out, i32 %x, i32 %y, i32 %z) {
+  ret void
+; CHECK: define void @root.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: Begin:
+; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
+; CHECK: load i32, i32* %instep_addr.gep
+; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
+; CHECK: load i8*, i8** %input_buf.gep
+; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
+; CHECK: load i8*, i8** %out_buf.gep
+; CHECK: %Y.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 1
+; CHECK: load i32, i32* %Y.gep
+; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
+; CHECK: load i32, i32* %Z.gep
+; CHECK: Loop:
+}
+
+; New style kernel with multiple inputs
+define i32 @foo(i32 %in0, i32 %in1, i32 %x, i32 %y, i32 %z) {
+  ret i32 0
+; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: Begin:
+; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
+; CHECK: load i8*, i8** %out_buf.gep
+; CHECK: %Y.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 1
+; CHECK: load i32, i32* %Y.gep
+; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
+; CHECK: load i32, i32* %Z.gep
+; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
+; CHECK: load i32, i32* %instep_addr.gep
+; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
+; CHECK: load i8*, i8** %input_buf.gep
+; CHECK: %instep_addr.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 1
+; CHECK: load i32, i32* %instep_addr.gep1
+; CHECK: %input_buf.gep3 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
+; CHECK: load i8*, i8** %input_buf.gep3
+; CHECK: Loop:
+}
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"foo"}
+!5 = !{!"91"}
+!6 = !{!"123"}
diff --git a/tests/libbcc/lit.cfg b/tests/libbcc/lit.cfg
index 5b3c749..109a9d7 100644
--- a/tests/libbcc/lit.cfg
+++ b/tests/libbcc/lit.cfg
@@ -24,17 +24,20 @@
 # test_source_root: The path where tests are located (default is the test suite
 # root).
 config.test_source_root = None
-config.test_exec_root = os.path.join(ANDROID_HOST_OUT, 'tests', 'bcinfo')
+config.test_exec_root = os.path.join(ANDROID_HOST_OUT, 'tests', 'libbcc')
 
-tools_dir = os.path.join(ANDROID_HOST_OUT, 'bin')
+tools_dir = os.pathsep.join([os.path.join(ANDROID_HOST_OUT, 'bin'),
+                             os.path.join(ANDROID_HOST_OUT, 'lib64')])
 
 # Based on LLVM's lit.cfg: "For each occurrence of an llvm tool name
 # as its own word, replace it with the full path to the build directory
 # holding that tool."
 for pattern in [r"\bFileCheck\b",
                 r"\bllvm-rs-as\b",
-                r"\bbcinfo\b"]:
-    tool_match = re.match(r"^(\\)?((\| )?)\W+b([0-9A-Za-z-_]+)\\b\W*$",
+                r"\bbcinfo\b",
+                r"\bopt\b",
+                r"\blibbcc.so\b"]:
+    tool_match = re.match(r"^(\\)?((\| )?)\W+b([\.0-9A-Za-z-_]+)\\b\W*$",
                           pattern)
     tool_pipe = tool_match.group(2)
     tool_name = tool_match.group(4)