Expand reduce-style kernels.

Bug: 22631253

Rename the RSForEachExpand pass / source file to RSKernelExpand. In
addition to already being responsible for expanding foreach-style
kernels, RSKernelExpand now expands reduce-style kernels from

  define foo @func(foo %a, foo %b)

into

  define void @func.expand(i8* %inBuf, i8* %outBuf, i32 len)

The expanded function reduces a sub-range of its input.

Change-Id: I518fa9367b57111a2f2a1b23903ff870c421beeb
diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 47c23c7..8a30c38 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h
@@ -81,7 +81,7 @@
   enum ErrorCode runPasses(Script &pScript, llvm::raw_pwrite_stream &pResult);
 
   bool addInternalizeSymbolsPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  void addExpandForEachPass(llvm::legacy::PassManager &pPM);
+  void addExpandKernelPass(llvm::legacy::PassManager &pPM);
   void addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
   void addInvariantPass(llvm::legacy::PassManager &pPM);
   void addInvokeHelperPass(llvm::legacy::PassManager &pPM);
diff --git a/include/bcc/Renderscript/RSTransforms.h b/include/bcc/Renderscript/RSTransforms.h
index d5830ca..6dcfedd 100644
--- a/include/bcc/Renderscript/RSTransforms.h
+++ b/include/bcc/Renderscript/RSTransforms.h
@@ -25,7 +25,7 @@
 namespace bcc {
 
 llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt);
 
 llvm::FunctionPass *
 createRSInvariantPass();
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index 2da44b1..5c769b4 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -157,7 +157,7 @@
 
   // Add some initial custom passes.
   addInvokeHelperPass(passes);
-  addExpandForEachPass(passes);
+  addExpandKernelPass(passes);
   addInvariantPass(passes);
   if (!addInternalizeSymbolsPass(pScript, passes))
     return kErrCustomPasses;
@@ -329,9 +329,11 @@
   size_t exportVarCount = me.getExportVarCount();
   size_t exportFuncCount = me.getExportFuncCount();
   size_t exportForEachCount = me.getExportForEachSignatureCount();
+  size_t exportReduceCount = me.getExportReduceCount();
   const char **exportVarNameList = me.getExportVarNameList();
   const char **exportFuncNameList = me.getExportFuncNameList();
   const char **exportForEachNameList = me.getExportForEachNameList();
+  const char **exportReduceNameList = me.getExportReduceNameList();
   size_t i;
 
   for (i = 0; i < exportVarCount; ++i) {
@@ -342,18 +344,22 @@
     export_symbols.push_back(exportFuncNameList[i]);
   }
 
-  // Expanded foreach functions should not be internalized, too.
-  // expanded_foreach_funcs keeps the .expand version of the kernel names
-  // around until createInternalizePass() is finished making its own
-  // copy of the visible symbols.
-  std::vector<std::string> expanded_foreach_funcs;
+  // Expanded foreach and reduce functions should not be
+  // internalized. expanded_funcs keeps the names of the expanded
+  // functions around until createInternalizePass() is finished making
+  // its own copy of the visible symbols.
+  std::vector<std::string> expanded_funcs;
+  expanded_funcs.reserve(exportForEachCount + exportReduceCount);
+
   for (i = 0; i < exportForEachCount; ++i) {
-    expanded_foreach_funcs.push_back(
-        std::string(exportForEachNameList[i]) + ".expand");
+    expanded_funcs.push_back(std::string(exportForEachNameList[i]) + ".expand");
+  }
+  for (i = 0; i < exportReduceCount; ++i) {
+    expanded_funcs.push_back(std::string(exportReduceNameList[i]) + ".expand");
   }
 
-  for (i = 0; i < exportForEachCount; i++) {
-      export_symbols.push_back(expanded_foreach_funcs[i].c_str());
+  for (auto &symbol_name : expanded_funcs) {
+    export_symbols.push_back(symbol_name.c_str());
   }
 
   pPM.add(llvm::createInternalizePass(export_symbols));
@@ -368,10 +374,10 @@
   }
 }
 
-void Compiler::addExpandForEachPass(llvm::legacy::PassManager &pPM) {
-  // Expand ForEach on CPU path to reduce launch overhead.
+void Compiler::addExpandKernelPass(llvm::legacy::PassManager &pPM) {
+  // Expand ForEach and reduce on CPU path to reduce launch overhead.
   bool pEnableStepOpt = true;
-  pPM.add(createRSForEachExpandPass(pEnableStepOpt));
+  pPM.add(createRSKernelExpandPass(pEnableStepOpt));
 }
 
 void Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 56cae16..4b18eda 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -24,7 +24,7 @@
 libbcc_renderscript_SRC_FILES := \
   RSCompilerDriver.cpp \
   RSEmbedInfo.cpp \
-  RSForEachExpand.cpp \
+  RSKernelExpand.cpp \
   RSGlobalInfoPass.cpp \
   RSInvariant.cpp \
   RSScript.cpp \
diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index dc1033c..b0c2767 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp
@@ -73,11 +73,13 @@
     size_t exportVarCount = me.getExportVarCount();
     size_t exportFuncCount = me.getExportFuncCount();
     size_t exportForEachCount = me.getExportForEachSignatureCount();
+    size_t exportReduceCount = me.getExportReduceCount();
     size_t objectSlotCount = me.getObjectSlotCount();
     size_t pragmaCount = me.getPragmaCount();
     const char **exportVarNameList = me.getExportVarNameList();
     const char **exportFuncNameList = me.getExportFuncNameList();
     const char **exportForEachNameList = me.getExportForEachNameList();
+    const char **exportReduceNameList = me.getExportReduceNameList();
     const uint32_t *exportForEachSignatureList =
         me.getExportForEachSignatureList();
     const uint32_t *objectSlotList = me.getObjectSlotList();
@@ -111,6 +113,11 @@
         << exportForEachNameList[i] << "\n";
     }
 
+    s << "exportReduceCount: " << exportReduceCount << "\n";
+    for (i = 0; i < exportReduceCount; ++i) {
+      s << exportReduceNameList[i] << "\n";
+    }
+
     s << "objectSlotCount: " << objectSlotCount << "\n";
     for (i = 0; i < objectSlotCount; ++i) {
       s << objectSlotList[i] << "\n";
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
similarity index 71%
rename from lib/Renderscript/RSForEachExpand.cpp
rename to lib/Renderscript/RSKernelExpand.cpp
index e91e4ed..63a42c8 100644
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -38,7 +38,14 @@
 
 #include "bcinfo/MetadataExtractor.h"
 
-#define NUM_EXPANDED_FUNCTION_PARAMS 4
+#ifndef __DISABLE_ASSERTS
+// Only used in bccAssert()
+const int kNumExpandedForeachParams = 4;
+const int kNumExpandedReduceParams = 3;
+#endif
+
+const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
+const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
 
 using namespace bcc;
 
@@ -46,15 +53,17 @@
 
 static const bool gEnableRsTbaa = true;
 
-/* RSForEachExpandPass - This pass operates on functions that are able to be
- * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
- * ForEach-able function to be invoked over the appropriate data cells of the
- * input/output allocations (adjusting other relevant parameters as we go). We
- * support doing this for any ForEach-able compute kernels. The new function
- * name is the original function name followed by ".expand". Note that we
- * still generate code for the original function.
+/* RSKernelExpandPass - This pass operates on functions that are able
+ * to be called via rsForEach(), "foreach_<NAME>", or
+ * "reduce_<NAME>". We create an inner loop for the function to be
+ * invoked over the appropriate data cells of the input/output
+ * allocations (adjusting other relevant parameters as we go). We
+ * support doing this for any forEach or reduce style compute
+ * kernels. The new function name is the original function name
+ * followed by ".expand". Note that we still generate code for the
+ * original function.
  */
-class RSForEachExpandPass : public llvm::ModulePass {
+class RSKernelExpandPass : public llvm::ModulePass {
 public:
   static char ID;
 
@@ -91,16 +100,19 @@
   llvm::LLVMContext *Context;
 
   /*
-   * Pointer to LLVM type information for the the function signature
-   * for expanded kernels.  This must be re-calculated for each
-   * module the pass is run on.
+   * Pointers to LLVM type information for the the function signatures
+   * for expanded functions. These must be re-calculated for each module
+   * the pass is run on.
    */
-  llvm::FunctionType *ExpandedFunctionType;
+  llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;
 
   uint32_t mExportForEachCount;
   const char **mExportForEachNameList;
   const uint32_t *mExportForEachSignatureList;
 
+  uint32_t mExportReduceCount;
+  const char **mExportReduceNameList;
+
   // Turns on optimization of allocation stride values.
   bool mEnableStepOpt;
 
@@ -286,41 +298,68 @@
         llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
 
     // Create the function type for expanded kernels.
+    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
 
     llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
+    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
+    ExpandedForEachType = llvm::FunctionType::get(VoidTy,
+        {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
 
-    llvm::SmallVector<llvm::Type*, 8> ParamTypes;
-    ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
-    ParamTypes.push_back(Int32Ty);                          // uint32_t x1
-    ParamTypes.push_back(Int32Ty);                          // uint32_t x2
-    ParamTypes.push_back(Int32Ty);                          // uint32_t outstep
-
-    ExpandedFunctionType =
-        llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
-                                false);
+    // void (void *inBuf, void *outBuf, uint32_t len)
+    ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
   }
 
-  /// @brief Create skeleton of the expanded function.
+  /// @brief Create skeleton of the expanded foreach kernel.
   ///
   /// This creates a function with the following signature:
   ///
   ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
   ///         uint32_t outstep)
   ///
-  llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
+  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
     llvm::Function *ExpandedFunction =
-      llvm::Function::Create(ExpandedFunctionType,
+      llvm::Function::Create(ExpandedForEachType,
                              llvm::GlobalValue::ExternalLinkage,
                              OldName + ".expand", Module);
-
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
     llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
-
     (AI++)->setName("p");
     (AI++)->setName("x1");
     (AI++)->setName("x2");
     (AI++)->setName("arg_outstep");
+    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
+                                                       ExpandedFunction);
+    llvm::IRBuilder<> Builder(Begin);
+    Builder.CreateRetVoid();
+    return ExpandedFunction;
+  }
+
+  // Create skeleton of the expanded reduce kernel.
+  //
+  // This creates a function with the following signature:
+  //
+  //   void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
+  //
+  llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
+    llvm::Function *ExpandedFunction =
+      llvm::Function::Create(ExpandedReduceType,
+                             llvm::GlobalValue::ExternalLinkage,
+                             OldName + ".expand", Module);
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);
+
+    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
+
+    using llvm::Attribute;
+
+    llvm::Argument *InBuf = &(*AI++);
+    InBuf->setName("inBuf");
+    InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, {Attribute::NoCapture}));
+
+    llvm::Argument *OutBuf = &(*AI++);
+    OutBuf->setName("outBuf");
+    OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, {Attribute::NoCapture}));
+
+    (AI++)->setName("len");
 
     llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                        ExpandedFunction);
@@ -444,7 +483,7 @@
   //
   // Returns:
   //   Returns a SmallVector of ConstantInts.
-  SmallGEPIndices GEPHelper(std::initializer_list<int32_t> I32Args) {
+  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
     SmallGEPIndices Out(I32Args.size());
     llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
     std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
@@ -453,7 +492,7 @@
   }
 
 public:
-  RSForEachExpandPass(bool pEnableStepOpt = true)
+  RSKernelExpandPass(bool pEnableStepOpt = true)
       : ModulePass(ID), Module(nullptr), Context(nullptr),
         mEnableStepOpt(pEnableStepOpt) {
 
@@ -536,7 +575,7 @@
    * Module will contain a new function of the name "<NAME>.expand" that
    * invokes <NAME>() in a loop with the appropriate parameters.
    */
-  bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
+  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
     ALOGV("Expanding ForEach-able Function %s",
           Function->getName().str().c_str());
 
@@ -552,14 +591,14 @@
     llvm::DataLayout DL(Module);
 
     llvm::Function *ExpandedFunction =
-      createEmptyExpandedFunction(Function->getName());
+      createEmptyExpandedForEachKernel(Function->getName());
 
     /*
      * Extract the expanded function's parameters.  It is guaranteed by
      * createEmptyExpandedFunction that there will be five parameters.
      */
 
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
 
     llvm::Function::arg_iterator ExpandedFunctionArgIter =
       ExpandedFunction->arg_begin();
@@ -672,24 +711,24 @@
     return true;
   }
 
-  /* Expand a pass-by-value kernel.
+  /* Expand a pass-by-value foreach kernel.
    */
-  bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
+  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
     bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
     ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
 
-    // TODO: Refactor this to share functionality with ExpandFunction.
+    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
     llvm::DataLayout DL(Module);
 
     llvm::Function *ExpandedFunction =
-      createEmptyExpandedFunction(Function->getName());
+      createEmptyExpandedForEachKernel(Function->getName());
 
     /*
      * Extract the expanded function's parameters.  It is guaranteed by
      * createEmptyExpandedFunction that there will be five parameters.
      */
 
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
 
     llvm::Function::arg_iterator ExpandedFunctionArgIter =
       ExpandedFunction->arg_begin();
@@ -708,8 +747,8 @@
     llvm::MDBuilder MDHelper(*Context);
 
     TBAARenderScriptDistinct =
-      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
-    TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
         TBAARenderScriptDistinct);
     TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                        TBAARenderScript);
@@ -854,7 +893,6 @@
     llvm::Value *OutPtr = nullptr;
     if (CastedOutBasePtr) {
       llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-
       OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
 
       if (PassOutByPointer) {
@@ -909,6 +947,269 @@
     return true;
   }
 
+  // Expand a reduce-style kernel function.
+  //
+  // The input is a kernel which represents a binary operation,
+  // of the form
+  //
+  //   define foo @func(foo %a, foo %b),
+  //
+  // (More generally, it can be of the forms
+  //
+  //   define void @func(foo* %ret, foo* %a, foo* %b)
+  //   define void @func(foo* %ret, foo1 %a, foo1 %b)
+  //   define foo1 @func(foo2 %a, foo2 %b)
+  //
+  // as a result of argument / return value conversions. Here, "foo1"
+  // and "foo2" refer to possibly coerced types, and the coerced
+  // argument type may be different from the coerced return type. See
+  // "Note on coercion" below.)
+  //
+  // Note also, we do not expect to encounter any case when the
+  // arguments are promoted to pointers but the return value is
+  // unpromoted to pointer, e.g.
+  //
+  //   define foo1 @func(foo* %a, foo* %b)
+  //
+  // and we will throw an assertion in this case.)
+  //
+  // The input kernel gets expanded into a kernel of the form
+  //
+  //   define void @func.expand(i8* %inBuf, i8* outBuf, i32 len)
+  //
+  // which performs a serial reduction of `len` elements from `inBuf`,
+  // and stores the result into `outBuf`. In pseudocode, @func.expand
+  // does:
+  //
+  //   inArr := (foo *)inBuf;
+  //   accum := inArr[0];
+  //   for (i := 1; i < len; ++i) {
+  //     accum := foo(accum, inArr[i]);
+  //   }
+  //   *(foo *)outBuf := accum;
+  //
+  // Note on coercion
+  //
+  // Both the return value and the argument types may undergo internal
+  // coercion in clang as part of call lowering. As a result, the
+  // return value type may differ from the argument type even if the
+  // types in the RenderScript signaure are the same. For instance, the
+  // kernel
+  //
+  //   int3 add(int3 a, int3 b) { return a + b; }
+  //
+  // gets lowered by clang as
+  //
+  //   define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
+  //
+  // under AArch64. The details of this process are found in clang,
+  // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
+  // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
+  // is passed by pointer, then the pointed-to type is not coerced.
+  //
+  // Since we lack the original type information, this code does loads
+  // and stores of allocation data by way of pointers to the coerced
+  // type.
+  bool ExpandReduce(llvm::Function *Function) {
+    bccAssert(Function);
+
+    ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
+
+    llvm::DataLayout DL(Module);
+
+    // TBAA Metadata
+    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
+    llvm::MDBuilder MDHelper(*Context);
+
+    TBAARenderScriptDistinct =
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+        TBAARenderScriptDistinct);
+    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+                                                       TBAARenderScript);
+    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+                                                      TBAAAllocation, 0);
+
+    llvm::Function *ExpandedFunction =
+      createEmptyExpandedReduceKernel(Function->getName());
+
+    // Extract the expanded kernel's parameters.  It is guaranteed by
+    // createEmptyExpandedFunction that there will be 3 parameters.
+    auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
+
+    llvm::Value *Arg_inBuf  = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_len    = &*(ExpandedFunctionArgIter++);
+
+    bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
+
+    // Check if, instead of returning a value, the original kernel has
+    // a pointer parameter which points to a temporary buffer into
+    // which the return value gets written.
+    const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
+    bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
+
+    // Check if, instead of being passed by value, the inputs to the
+    // original kernel are passed by pointer.
+    auto FirstArgIter = Function->arg_begin();
+    // The second argument is always an input to the original kernel.
+    auto SecondArgIter = std::next(FirstArgIter);
+    const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
+
+    // Get the output type (i.e. return type of the original kernel).
+    llvm::PointerType *OutPtrTy = nullptr;
+    llvm::Type *OutTy = nullptr;
+    if (ReturnValuePointerStyle) {
+      OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
+      bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
+      OutTy = OutPtrTy->getElementType();
+    } else {
+      OutTy = Function->getReturnType();
+      bccAssert(!OutTy->isVoidTy());
+      OutPtrTy = OutTy->getPointerTo();
+    }
+
+    // Get the input type (type of the arguments to the original
+    // kernel). Some input types are different from the output type,
+    // due to explicit coercion that the compiler performs when
+    // lowering the parameters. See "Note on coercion" above.
+    llvm::PointerType *InPtrTy;
+    llvm::Type *InTy;
+    if (InputsPointerStyle) {
+      InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
+      bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
+      bccAssert(ReturnValuePointerStyle);
+      bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
+                "Input type mismatch");
+      InTy = InPtrTy->getElementType();
+    } else {
+      InTy = SecondArgIter->getType();
+      InPtrTy = InTy->getPointerTo();
+      if (!ReturnValuePointerStyle) {
+        bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
+      } else {
+        bccAssert(InTy == std::next(SecondArgIter)->getType() &&
+                  "Input type mismatch");
+      }
+    }
+
+    // The input type should take up the same amount of space in
+    // memory as the output type.
+    bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
+
+    // Construct the actual function body.
+    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+    // Cast input and output buffers to appropriate types.
+    llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
+    llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
+
+    // Create a slot to pass temporary results back. This needs to be
+    // separate from the accumulator slot because the kernel may mark
+    // the return value slot as noalias.
+    llvm::Value *ReturnBuf = nullptr;
+    if (ReturnValuePointerStyle) {
+      ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
+    }
+
+    // Create a slot to hold the second input if the inputs are passed
+    // by pointer to the original kernel. We cannot directly pass a
+    // pointer to the input buffer, because the kernel may modify its
+    // inputs.
+    llvm::Value *SecondInputTempBuf = nullptr;
+    if (InputsPointerStyle) {
+      SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
+    }
+
+    // Create a slot to accumulate temporary results, and fill it with
+    // the first value.
+    llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
+    // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
+    llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
+      Builder.CreatePointerCast(InBuf, OutPtrTy));
+    if (gEnableRsTbaa) {
+      FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
+    }
+    // Memory operations with AccumBuf shouldn't be marked with
+    // RenderScript TBAA, since this might conflict with TBAA metadata
+    // in the kernel function when AccumBuf is passed by pointer.
+    Builder.CreateStore(FirstElementLoad, AccumBuf);
+
+    // Loop body
+
+    // Create the loop structure. Note that the first input in the input buffer
+    // has already been accumulated, so that we start at index 1.
+    llvm::PHINode *IndVar;
+    llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
+    llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
+
+    llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
+
+    // Set up arguments and call the original (unexpanded) kernel.
+    //
+    // The original kernel can have at most 3 arguments, which is
+    // achieved when the signature looks like:
+    //
+    //    define void @func(foo* %ret, bar %a, bar %b)
+    //
+    // (bar can be one of foo/foo.coerce/foo*).
+    llvm::SmallVector<llvm::Value *, 3> KernelArgs;
+
+    if (ReturnValuePointerStyle) {
+      KernelArgs.push_back(ReturnBuf);
+    }
+
+    if (InputsPointerStyle) {
+      bccAssert(ReturnValuePointerStyle);
+      // Because the return buffer is copied back into the
+      // accumulator, it's okay if the accumulator is overwritten.
+      KernelArgs.push_back(AccumBuf);
+
+      llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
+      if (gEnableRsTbaa) {
+        InputLoad->setMetadata("tbaa", TBAAAllocation);
+      }
+      Builder.CreateStore(InputLoad, SecondInputTempBuf);
+
+      KernelArgs.push_back(SecondInputTempBuf);
+    } else {
+      // InPtrTy may be different from OutPtrTy (the type of
+      // AccumBuf), so first cast the accumulator buffer to the
+      // pointer type corresponding to the input argument type.
+      KernelArgs.push_back(
+        Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
+
+      llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
+      if (gEnableRsTbaa) {
+        LoadedArg->setMetadata("tbaa", TBAAAllocation);
+      }
+      KernelArgs.push_back(LoadedArg);
+    }
+
+    llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
+
+    const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
+    const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
+
+    // Store the output in the accumulator.
+    if (ReturnValuePointerStyle) {
+      Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
+    } else {
+      Builder.CreateStore(RetVal, AccumBuf);
+    }
+
+    // Loop exit
+    Builder.SetInsertPoint(Exit, Exit->begin());
+
+    llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
+    llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
+    if (gEnableRsTbaa) {
+      OutputStore->setMetadata("tbaa", TBAAAllocation);
+    }
+
+    return true;
+  }
+
   /// @brief Checks if pointers to allocation internals are exposed
   ///
   /// This function verifies if through the parameters passed to the kernel
@@ -994,31 +1295,31 @@
   virtual bool runOnModule(llvm::Module &Module) {
     bool Changed  = false;
     this->Module  = &Module;
-    this->Context = &Module.getContext();
+    Context = &Module.getContext();
 
-    this->buildTypes();
+    buildTypes();
 
     bcinfo::MetadataExtractor me(&Module);
     if (!me.extract()) {
       ALOGE("Could not extract metadata from module!");
       return false;
     }
+
+    // Expand forEach_* style kernels.
     mExportForEachCount = me.getExportForEachSignatureCount();
     mExportForEachNameList = me.getExportForEachNameList();
     mExportForEachSignatureList = me.getExportForEachSignatureList();
 
-    bool AllocsExposed = allocPointersExposed(Module);
-
     for (size_t i = 0; i < mExportForEachCount; ++i) {
       const char *name = mExportForEachNameList[i];
       uint32_t signature = mExportForEachSignatureList[i];
       llvm::Function *kernel = Module.getFunction(name);
       if (kernel) {
         if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
-          Changed |= ExpandKernel(kernel, signature);
+          Changed |= ExpandForEach(kernel, signature);
           kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
         } else if (kernel->getReturnType()->isVoidTy()) {
-          Changed |= ExpandFunction(kernel, signature);
+          Changed |= ExpandOldStyleForEach(kernel, signature);
           kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
         } else {
           // There are some graphics root functions that are not
@@ -1028,7 +1329,18 @@
       }
     }
 
-    if (gEnableRsTbaa && !AllocsExposed) {
+    // Expand reduce_* style kernels.
+    mExportReduceCount = me.getExportReduceCount();
+    mExportReduceNameList = me.getExportReduceNameList();
+
+    for (size_t i = 0; i < mExportReduceCount; ++i) {
+      llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
+      if (kernel) {
+        Changed |= ExpandReduce(kernel);
+      }
+    }
+
+    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
       connectRenderScriptTBAAMetadata(Module);
     }
 
@@ -1036,21 +1348,21 @@
   }
 
   virtual const char *getPassName() const {
-    return "ForEach-able Function Expansion";
+    return "forEach_* and reduce_* function expansion";
   }
 
-}; // end RSForEachExpandPass
+}; // end RSKernelExpandPass
 
 } // end anonymous namespace
 
-char RSForEachExpandPass::ID = 0;
-static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
+char RSKernelExpandPass::ID = 0;
+static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
 
 namespace bcc {
 
 llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt){
-  return new RSForEachExpandPass(pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt) {
+  return new RSKernelExpandPass(pEnableStepOpt);
 }
 
 } // end namespace bcc
diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
index 6f3e175..fedd258 100644
--- a/tests/libbcc/getelementptr.ll
+++ b/tests/libbcc/getelementptr.ll
@@ -3,7 +3,7 @@
 ; that they index into the right positions of the structure and that
 ; the instructions that are generated are in the loop header.
 
-; RUN: opt -load libbcc.so -foreachexp -S < %s | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -S < %s | FileCheck %s
 
 ; ModuleID = 'test_getelementptr.bc'
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
diff --git a/tests/libbcc/tbaa-through-alloca.ll b/tests/libbcc/tbaa-through-alloca.ll
index 3e9e3ab..5b0a270 100644
--- a/tests/libbcc/tbaa-through-alloca.ll
+++ b/tests/libbcc/tbaa-through-alloca.ll
@@ -1,7 +1,7 @@
 ; This test checks that the code doesn't aggressively apply TBAA
 ; metadata to temporaries that are passed by pointer to kernels.
 
-; RUN: opt -load libbcc.so -foreachexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
 
 ; ModuleID = 'kernel.bc'
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
diff --git a/tests/libbcc/tbaa.ll b/tests/libbcc/tbaa.ll
index 81972f2..6d8cb48 100644
--- a/tests/libbcc/tbaa.ll
+++ b/tests/libbcc/tbaa.ll
@@ -1,7 +1,7 @@
 ; Basic test of TBAA that should report that pointer loads do not
 ; alias with stores to allocations.
 
-; RUN: opt -load libbcc.so -foreachexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
 
 ; ModuleID = 'kernel.bc'
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"