Merge "Modify findMinAndMax kernel to handle infinities in input."
diff --git a/cpp/Android.mk b/cpp/Android.mk
index f2934f6..3d3995d 100644
--- a/cpp/Android.mk
+++ b/cpp/Android.mk
@@ -82,7 +82,6 @@
 LOCAL_SRC_FILES += ../rsCompatibilityLib.cpp
 
 LOCAL_WHOLE_STATIC_LIBRARIES := \
-	libutils \
 	libRSDispatch
 
 LOCAL_MODULE:= libRScpp_static
diff --git a/cpp/Script.cpp b/cpp/Script.cpp
index acea0c8..52933f2 100644
--- a/cpp/Script.cpp
+++ b/cpp/Script.cpp
@@ -36,21 +36,6 @@
     tryDispatch(mRS, RS::dispatch->ScriptForEach(mRS->getContext(), getID(), slot, in_id, out_id, usr, usrLen, nullptr, 0));
 }
 
-void Script::reduce(uint32_t slot, sp<const Allocation> ain, sp<const Allocation> aout,
-                    const RsScriptCall *sc) const {
-    if (RS::dispatch->ScriptReduce == nullptr) {
-        mRS->throwError(RS_ERROR_RUNTIME_ERROR, "Reduce is not supported at the current API level");
-        return;
-    }
-    if (ain == nullptr || aout == nullptr) {
-        mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Both ain and aout are required to be non-null.");
-        return;
-    }
-    void *in_id = BaseObj::getObjID(ain);
-    void *out_id = BaseObj::getObjID(aout);
-    tryDispatch(mRS, RS::dispatch->ScriptReduce(mRS->getContext(), getID(), slot, in_id, out_id, sc, sc == nullptr ? 0 : sizeof(*sc)));
-}
-
 Script::Script(void *id, sp<RS> rs) : BaseObj(id, rs) {
 }
 
@@ -71,4 +56,3 @@
 void Script::FieldBase::init(sp<RS> rs, uint32_t dimx, uint32_t usages) {
     mAllocation = Allocation::createSized(rs, mElement, dimx, RS_ALLOCATION_USAGE_SCRIPT | usages);
 }
-
diff --git a/cpp/rsCppInternal.h b/cpp/rsCppInternal.h
index 69857d7..c51f7a0 100644
--- a/cpp/rsCppInternal.h
+++ b/cpp/rsCppInternal.h
@@ -24,12 +24,12 @@
 #include "rsDispatch.h"
 
 #define tryDispatch(rs, dispatch)               \
-    if (rs->getError() == RS_SUCCESS) {         \
+    if ((rs)->getError() == RS_SUCCESS) {       \
         dispatch;                               \
     }
 
 #define createDispatch(rs, dispatch) \
-    rs->getError() == RS_SUCCESS ? dispatch : nullptr
+    ((rs)->getError() == RS_SUCCESS ? (dispatch) : nullptr)
 
 #undef LOG_TAG
 #undef LOG_NDEBUG
diff --git a/cpp/rsCppStructs.h b/cpp/rsCppStructs.h
index 95d190b..a3e65ab 100644
--- a/cpp/rsCppStructs.h
+++ b/cpp/rsCppStructs.h
@@ -1412,7 +1412,7 @@
         bool mSkipPadding;
 
     public:
-        Builder(sp<RS> rs);
+        explicit Builder(sp<RS> rs);
         ~Builder();
         void add(sp<const Element> e, const char * name, uint32_t arraySize = 1);
         sp<const Element> create();
@@ -1428,7 +1428,7 @@
             uint32_t * arraySizes);
     Element(void *id, sp<RS> rs, RsDataType dt, RsDataKind dk, bool norm, uint32_t size);
     Element(void *id, sp<RS> rs);
-    Element(sp<RS> rs);
+    explicit Element(sp<RS> rs);
     virtual ~Element();
 
 private:
@@ -1458,7 +1458,7 @@
     size_t mLen;
 
 public:
-    FieldPacker(size_t len)
+    explicit FieldPacker(size_t len)
         : mPos(0), mLen(len) {
             mData = new unsigned char[len];
         }
@@ -1700,8 +1700,6 @@
     Script(void *id, sp<RS> rs);
     void forEach(uint32_t slot, sp<const Allocation> in, sp<const Allocation> out,
             const void *v, size_t) const;
-    void reduce(uint32_t slot, sp<const Allocation> in, sp<const Allocation> out,
-                const RsScriptCall *sc) const;
     void bindAllocation(sp<Allocation> va, uint32_t slot) const;
     void setVar(uint32_t index, const void *, size_t len) const;
     void setVar(uint32_t index, sp<const BaseObj> o) const;
diff --git a/cpp/rsDispatch.cpp b/cpp/rsDispatch.cpp
index f612145..5773903 100644
--- a/cpp/rsDispatch.cpp
+++ b/cpp/rsDispatch.cpp
@@ -21,8 +21,7 @@
 #include <limits.h>
 
 #define LOG_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "RS Dispatch", __VA_ARGS__);
-#define REDUCE_API_LEVEL INT_MAX
-#define REDUCE_NEW_API_LEVEL 24
+#define REDUCE_API_LEVEL 24
 
 bool loadSymbols(void* handle, dispatchTable& dispatchTab, int targetApiLevel) {
 #ifdef __LP64__
@@ -101,7 +100,6 @@
     dispatchTab.ScriptInvokeV = (ScriptInvokeVFnPtr)dlsym(handle, "rsScriptInvokeV");
     dispatchTab.ScriptKernelIDCreate = (ScriptKernelIDCreateFnPtr)dlsym(handle, "rsScriptKernelIDCreate");
     dispatchTab.ScriptReduce = (ScriptReduceFnPtr)dlsym(handle, "rsScriptReduce");
-    dispatchTab.ScriptReduceNew = (ScriptReduceNewFnPtr)dlsym(handle, "rsScriptReduceNew");
     dispatchTab.ScriptSetTimeZone = (ScriptSetTimeZoneFnPtr)dlsym(handle, "rsScriptSetTimeZone");
     dispatchTab.ScriptSetVarD = (ScriptSetVarDFnPtr)dlsym(handle, "rsScriptSetVarD");
     dispatchTab.ScriptSetVarF = (ScriptSetVarFFnPtr)dlsym(handle, "rsScriptSetVarF");
@@ -427,7 +425,7 @@
             return false;
         }
     }
-    // TODO: Update the API level when reduce is added.
+
     if (targetApiLevel >= REDUCE_API_LEVEL) {
         if (dispatchTab.ScriptReduce == nullptr) {
             LOG_ERR("Couldn't initialize dispatchTab.ScriptReduce");
@@ -435,13 +433,6 @@
         }
     }
 
-    if (targetApiLevel >= REDUCE_NEW_API_LEVEL) {
-        if (dispatchTab.ScriptReduceNew == nullptr) {
-            LOG_ERR("Couldn't initialize dispatchTab.ScriptReduceNew");
-            return false;
-        }
-    }
-
     return true;
 
 }
diff --git a/cpp/rsDispatch.h b/cpp/rsDispatch.h
index 8f2df70..df12f32 100644
--- a/cpp/rsDispatch.h
+++ b/cpp/rsDispatch.h
@@ -77,8 +77,7 @@
 typedef void (*ScriptInvokeVFnPtr) (RsContext, RsScript, uint32_t, const void*, size_t);
 typedef void (*ScriptForEachFnPtr) (RsContext, RsScript, uint32_t, RsAllocation, RsAllocation, const void*, size_t, const RsScriptCall*, size_t);
 typedef void (*ScriptForEachMultiFnPtr) (RsContext, RsScript, uint32_t, RsAllocation*, size_t, RsAllocation, const void*, size_t, const RsScriptCall*, size_t);
-typedef void (*ScriptReduceFnPtr) (RsContext, RsScript, uint32_t, RsAllocation, RsAllocation, const RsScriptCall*, size_t);
-typedef void (*ScriptReduceNewFnPtr) (RsContext, RsScript, uint32_t, RsAllocation*, size_t, RsAllocation, const RsScriptCall*, size_t);
+typedef void (*ScriptReduceFnPtr) (RsContext, RsScript, uint32_t, RsAllocation*, size_t, RsAllocation, const RsScriptCall*, size_t);
 typedef void (*ScriptSetVarIFnPtr) (RsContext, RsScript, uint32_t, int);
 typedef void (*ScriptSetVarObjFnPtr) (RsContext, RsScript, uint32_t, RsObjectBase);
 typedef void (*ScriptSetVarJFnPtr) (RsContext, RsScript, uint32_t, int64_t);
@@ -173,7 +172,6 @@
     ScriptInvokeVFnPtr ScriptInvokeV;
     ScriptKernelIDCreateFnPtr ScriptKernelIDCreate;
     ScriptReduceFnPtr ScriptReduce;
-    ScriptReduceNewFnPtr ScriptReduceNew;
     ScriptSetTimeZoneFnPtr ScriptSetTimeZone;
     ScriptSetVarDFnPtr ScriptSetVarD;
     ScriptSetVarFFnPtr ScriptSetVarF;
diff --git a/cpp/util/RefBase.h b/cpp/util/RefBase.h
index 01c0b5f..40bb7bc 100644
--- a/cpp/util/RefBase.h
+++ b/cpp/util/RefBase.h
@@ -202,12 +202,12 @@
 
     inline wp() : m_ptr(0) { }
 
-    wp(T* other);
+    explicit wp(T* other);
     wp(const wp<T>& other);
-    wp(const sp<T>& other);
-    template<typename U> wp(U* other);
-    template<typename U> wp(const sp<U>& other);
-    template<typename U> wp(const wp<U>& other);
+    explicit wp(const sp<T>& other);
+    template<typename U> explicit wp(U* other);
+    template<typename U> explicit wp(const sp<U>& other);
+    template<typename U> explicit wp(const wp<U>& other);
 
     ~wp();
 
diff --git a/cpp/util/StrongPointer.h b/cpp/util/StrongPointer.h
index 0f68615..a9995ba 100644
--- a/cpp/util/StrongPointer.h
+++ b/cpp/util/StrongPointer.h
@@ -65,10 +65,10 @@
 public:
     inline sp() : m_ptr(0) { }
 
-    sp(T* other);
+    sp(T* other);  // NOLINT, implicit
     sp(const sp<T>& other);
-    template<typename U> sp(U* other);
-    template<typename U> sp(const sp<U>& other);
+    template<typename U> sp(U* other);  // NOLINT, implicit
+    template<typename U> sp(const sp<U>& other);  // NOLINT, implicit
 
     ~sp();
 
diff --git a/cpp/util/TypeHelpers.h b/cpp/util/TypeHelpers.h
index 33a5201..e738cd3 100644
--- a/cpp/util/TypeHelpers.h
+++ b/cpp/util/TypeHelpers.h
@@ -233,7 +233,7 @@
     key_value_pair_t() { }
     key_value_pair_t(const key_value_pair_t& o) : key(o.key), value(o.value) { }
     key_value_pair_t(const KEY& k, const VALUE& v) : key(k), value(v)  { }
-    key_value_pair_t(const KEY& k) : key(k) { }
+    explicit key_value_pair_t(const KEY& k) : key(k) { }
     inline bool operator < (const key_value_pair_t& o) const {
         return strictly_order_type(key, o.key);
     }
diff --git a/cpu_ref/rsCpuBLASDispatch.h b/cpu_ref/rsCpuBLASDispatch.h
index 4602135..40da346 100644
--- a/cpu_ref/rsCpuBLASDispatch.h
+++ b/cpu_ref/rsCpuBLASDispatch.h
@@ -457,7 +457,7 @@
 // Macros to help load the function pointers.
 #define RS_APPLY_MACRO_TO(x) \
     x = (FnPtr_##x)dlsym(handle, #x); \
-    if (x == nullptr) { \
+    if ((x) == nullptr) { \
         ALOGE("Failed to load " #x " for RS BLAS implementation."); \
         return false; \
     }
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 7e47a03..23ce72f 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -45,7 +45,7 @@
 using namespace android;
 using namespace android::renderscript;
 
-#define REDUCE_NEW_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)
+#define REDUCE_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)
 
 static pthread_key_t gThreadTLSKey = 0;
 static uint32_t gThreadTLSKeyCount = 0;
@@ -357,7 +357,7 @@
 //   mtls - The MTLaunchStruct holding information about the kernel launch
 //   redp - The reduce parameters (driver info structure)
 //   x, y, z - The start offsets into each dimension
-static inline void RedpPtrSetup(const MTLaunchStructReduceNew *mtls, RsExpandKernelDriverInfo *redp,
+static inline void RedpPtrSetup(const MTLaunchStructReduce *mtls, RsExpandKernelDriverInfo *redp,
                                 uint32_t x, uint32_t y, uint32_t z) {
     for (uint32_t i = 0; i < redp->inLen; i++) {
         redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
@@ -511,8 +511,8 @@
   return *outBuf;
 }
 
-static void reduce_new_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduceNew *mtls,
-                                       const char *walkerName, uint32_t threadIdx) {
+static void reduce_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduce *mtls,
+                                   const char *walkerName, uint32_t threadIdx) {
   rsAssert(!accumPtr);
 
   uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
@@ -525,8 +525,8 @@
       accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
     }
   }
-  REDUCE_NEW_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
-                   walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
+  REDUCE_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
+               walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
   // initialize accumulator
   if (mtls->initFunc) {
     mtls->initFunc(accumPtr);
@@ -535,18 +535,18 @@
   }
 }
 
-static void walk_1d_reduce_new(void *usr, uint32_t idx) {
-  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+static void walk_1d_reduce(void *usr, uint32_t idx) {
+  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
   RsExpandKernelDriverInfo redp = mtls->redp;
 
   // find accumulator
   uint8_t *&accumPtr = mtls->accumPtr[idx];
   if (!accumPtr) {
-    reduce_new_get_accumulator(accumPtr, mtls, __func__, idx);
+    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   while (1) {
     uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
     uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
@@ -569,23 +569,23 @@
     } else {
       fmt[0] = 0;
     }
-    REDUCE_NEW_ALOGV(mtls, 2, "walk_1d_reduce_new(%p): idx = %u, x in [%u, %u)%s",
-                     mtls->accumFunc, idx, xStart, xEnd, fmt);
+    REDUCE_ALOGV(mtls, 2, "walk_1d_reduce(%p): idx = %u, x in [%u, %u)%s",
+                 mtls->accumFunc, idx, xStart, xEnd, fmt);
   }
 }
 
-static void walk_2d_reduce_new(void *usr, uint32_t idx) {
-  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+static void walk_2d_reduce(void *usr, uint32_t idx) {
+  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
   RsExpandKernelDriverInfo redp = mtls->redp;
 
   // find accumulator
   uint8_t *&accumPtr = mtls->accumPtr[idx];
   if (!accumPtr) {
-    reduce_new_get_accumulator(accumPtr, mtls, __func__, idx);
+    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   while (1) {
     uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
     uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
@@ -608,23 +608,23 @@
     } else {
       fmt[0] = 0;
     }
-    REDUCE_NEW_ALOGV(mtls, 2, "walk_2d_reduce_new(%p): idx = %u, y in [%u, %u)%s",
-                     mtls->accumFunc, idx, yStart, yEnd, fmt);
+    REDUCE_ALOGV(mtls, 2, "walk_2d_reduce(%p): idx = %u, y in [%u, %u)%s",
+                 mtls->accumFunc, idx, yStart, yEnd, fmt);
   }
 }
 
-static void walk_3d_reduce_new(void *usr, uint32_t idx) {
-  const MTLaunchStructReduceNew *mtls = (const MTLaunchStructReduceNew *)usr;
+static void walk_3d_reduce(void *usr, uint32_t idx) {
+  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
   RsExpandKernelDriverInfo redp = mtls->redp;
 
   // find accumulator
   uint8_t *&accumPtr = mtls->accumPtr[idx];
   if (!accumPtr) {
-    reduce_new_get_accumulator(accumPtr, mtls, __func__, idx);
+    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   while (1) {
     uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
 
@@ -643,44 +643,25 @@
     } else {
       fmt[0] = 0;
     }
-    REDUCE_NEW_ALOGV(mtls, 2, "walk_3d_reduce_new(%p): idx = %u, z = %u%s",
-                     mtls->accumFunc, idx, redp.current.z, fmt);
+    REDUCE_ALOGV(mtls, 2, "walk_3d_reduce(%p): idx = %u, z = %u%s",
+                 mtls->accumFunc, idx, redp.current.z, fmt);
   }
 }
 
-// Launch a simple reduce-style kernel.
-// Inputs:
-//  ain:  The allocation that contains the input
-//  aout: The allocation that will hold the output
-//  mtls: Holds launch parameters
-void RsdCpuReferenceImpl::launchReduce(const Allocation *ain,
-                                       Allocation *aout,
-                                       MTLaunchStructReduce *mtls) {
-    const uint32_t xStart = mtls->start.x;
-    const uint32_t xEnd = mtls->end.x;
-
-    if (xStart >= xEnd) {
-      return;
-    }
-
-    const uint32_t startOffset = ain->getType()->getElementSizeBytes() * xStart;
-    mtls->kernel(&mtls->inBuf[startOffset], mtls->outBuf, xEnd - xStart);
-}
-
 // Launch a general reduce-style kernel.
 // Inputs:
 //   ains[0..inLen-1]: Array of allocations that contain the inputs
 //   aout:             The allocation that will hold the output
 //   mtls:             Holds launch parameters
-void RsdCpuReferenceImpl::launchReduceNew(const Allocation ** ains,
-                                          uint32_t inLen,
-                                          Allocation * aout,
-                                          MTLaunchStructReduceNew *mtls) {
+void RsdCpuReferenceImpl::launchReduce(const Allocation ** ains,
+                                       uint32_t inLen,
+                                       Allocation * aout,
+                                       MTLaunchStructReduce *mtls) {
   mtls->logReduce = mRSC->props.mLogReduce;
   if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
-    launchReduceNewParallel(ains, inLen, aout, mtls);
+    launchReduceParallel(ains, inLen, aout, mtls);
   } else {
-    launchReduceNewSerial(ains, inLen, aout, mtls);
+    launchReduceSerial(ains, inLen, aout, mtls);
   }
 }
 
@@ -689,12 +670,12 @@
 //   ains[0..inLen-1]: Array of allocations that contain the inputs
 //   aout:             The allocation that will hold the output
 //   mtls:             Holds launch parameters
-void RsdCpuReferenceImpl::launchReduceNewSerial(const Allocation ** ains,
-                                                uint32_t inLen,
-                                                Allocation * aout,
-                                                MTLaunchStructReduceNew *mtls) {
-  REDUCE_NEW_ALOGV(mtls, 1, "launchReduceNewSerial(%p): %u x %u x %u", mtls->accumFunc,
-                   mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);
+void RsdCpuReferenceImpl::launchReduceSerial(const Allocation ** ains,
+                                             uint32_t inLen,
+                                             Allocation * aout,
+                                             MTLaunchStructReduce *mtls) {
+  REDUCE_ALOGV(mtls, 1, "launchReduceSerial(%p): %u x %u x %u", mtls->accumFunc,
+               mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);
 
   // In the presence of outconverter, we allocate temporary memory for
   // the accumulator.
@@ -713,7 +694,7 @@
   }
 
   // accumulate
-  const ReduceNewAccumulatorFunc_t fn = mtls->accumFunc;
+  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
   uint32_t slice = 0;
   while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
     for (mtls->redp.current.y = mtls->start.y;
@@ -736,13 +717,13 @@
 //   ains[0..inLen-1]: Array of allocations that contain the inputs
 //   aout:             The allocation that will hold the output
 //   mtls:             Holds launch parameters
-void RsdCpuReferenceImpl::launchReduceNewParallel(const Allocation ** ains,
-                                                  uint32_t inLen,
-                                                  Allocation * aout,
-                                                  MTLaunchStructReduceNew *mtls) {
+void RsdCpuReferenceImpl::launchReduceParallel(const Allocation ** ains,
+                                               uint32_t inLen,
+                                               Allocation * aout,
+                                               MTLaunchStructReduce *mtls) {
   // For now, we don't know how to go parallel in the absence of a combiner.
   if (!mtls->combFunc) {
-    launchReduceNewSerial(ains, inLen, aout, mtls);
+    launchReduceSerial(ains, inLen, aout, mtls);
     return;
   }
 
@@ -780,19 +761,19 @@
 
   rsAssert(!mInKernel);
   mInKernel = true;
-  REDUCE_NEW_ALOGV(mtls, 1, "launchReduceNewParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
-                   mtls->accumFunc,
-                   mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
-                   numThreads, mtls->accumAlloc);
+  REDUCE_ALOGV(mtls, 1, "launchReduceParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
+               mtls->accumFunc,
+               mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
+               numThreads, mtls->accumAlloc);
   if (mtls->redp.dim.z > 1) {
     mtls->mSliceSize = 1;
-    launchThreads(walk_3d_reduce_new, mtls);
+    launchThreads(walk_3d_reduce, mtls);
   } else if (mtls->redp.dim.y > 1) {
     mtls->mSliceSize = rsMax(1U, mtls->redp.dim.y / (numThreads * 4));
-    launchThreads(walk_2d_reduce_new, mtls);
+    launchThreads(walk_2d_reduce, mtls);
   } else {
     mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
-    launchThreads(walk_1d_reduce_new, mtls);
+    launchThreads(walk_1d_reduce, mtls);
   }
   mInKernel = false;
 
@@ -807,12 +788,12 @@
         if (mtls->combFunc) {
           if (mtls->logReduce >= 3) {
             FormatBuf fmt;
-            REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p): accumulating into%s",
-                             mtls->accumFunc,
-                             format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
-            REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p):    accumulator[%d]%s",
-                             mtls->accumFunc, idx,
-                             format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
+            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulating into%s",
+                         mtls->accumFunc,
+                         format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p):    accumulator[%d]%s",
+                         mtls->accumFunc, idx,
+                         format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
           }
           mtls->combFunc(finalAccumPtr, thisAccumPtr);
         } else {
@@ -826,8 +807,8 @@
   rsAssert(finalAccumPtr != nullptr);
   if (mtls->logReduce >= 3) {
     FormatBuf fmt;
-    REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p): final accumulator%s",
-                     mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
+    REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final accumulator%s",
+                 mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
   }
 
   // Outconvert
@@ -835,9 +816,9 @@
     mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
     if (mtls->logReduce >= 3) {
       FormatBuf fmt;
-      REDUCE_NEW_ALOGV(mtls, 3, "launchReduceNewParallel(%p): final outconverted result%s",
-                       mtls->accumFunc,
-                       format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
+      REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final outconverted result%s",
+                   mtls->accumFunc,
+                   format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
     }
   }
 
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index 62882aa..e33ea10 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -32,22 +32,21 @@
 extern bool gArchUseSIMD;
 
 // Function types found in RenderScript code
-typedef void (*ReduceFunc_t)(const uint8_t *inBuf, uint8_t *outBuf, uint32_t len);
-typedef void (*ReduceNewAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
-typedef void (*ReduceNewCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
-typedef void (*ReduceNewInitializerFunc_t)(uint8_t *accum);
-typedef void (*ReduceNewOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
+typedef void (*ReduceAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
+typedef void (*ReduceCombinerFunc_t)(uint8_t *accum, const uint8_t *other);
+typedef void (*ReduceInitializerFunc_t)(uint8_t *accum);
+typedef void (*ReduceOutConverterFunc_t)(uint8_t *out, const uint8_t *accum);
 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
 typedef void (*InvokeFunc_t)(void *params);
 typedef void (*InitOrDtorFunc_t)(void);
 typedef int  (*RootFunc_t)(void);
 
-struct ReduceNewDescription {
-    ReduceNewAccumulatorFunc_t  accumFunc;  // expanded accumulator function
-    ReduceNewInitializerFunc_t  initFunc;   // user initializer function
-    ReduceNewCombinerFunc_t     combFunc;   // user combiner function
-    ReduceNewOutConverterFunc_t outFunc;    // user outconverter function
-    size_t                      accumSize;  // accumulator datum size, in bytes
+struct ReduceDescription {
+    ReduceAccumulatorFunc_t  accumFunc;  // expanded accumulator function
+    ReduceInitializerFunc_t  initFunc;   // user initializer function
+    ReduceCombinerFunc_t     combFunc;   // user combiner function
+    ReduceOutConverterFunc_t outFunc;    // user outconverter function
+    size_t                   accumSize;  // accumulator datum size, in bytes
 };
 
 // Internal driver callback used to execute a kernel
@@ -75,8 +74,7 @@
     RsLaunchDimensions start;
     RsLaunchDimensions end;
     // Points to MTLaunchStructForEach::fep::dim or
-    // MTLaunchStructReduce::inputDim or
-    // MTLaunchStructReduceNew::redp::dim.
+    // MTLaunchStructReduce::redp::dim.
     RsLaunchDimensions *dimPtr;
 };
 
@@ -90,22 +88,15 @@
 };
 
 struct MTLaunchStructReduce : public MTLaunchStructCommon {
-    ReduceFunc_t kernel;
-    const uint8_t *inBuf;
-    uint8_t *outBuf;
-    RsLaunchDimensions inputDim;
-};
-
-struct MTLaunchStructReduceNew : public MTLaunchStructCommon {
     // Driver info structure
     RsExpandKernelDriverInfo redp;
 
     const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
 
-    ReduceNewAccumulatorFunc_t accumFunc;
-    ReduceNewInitializerFunc_t initFunc;
-    ReduceNewCombinerFunc_t combFunc;
-    ReduceNewOutConverterFunc_t outFunc;
+    ReduceAccumulatorFunc_t accumFunc;
+    ReduceInitializerFunc_t initFunc;
+    ReduceCombinerFunc_t combFunc;
+    ReduceOutConverterFunc_t outFunc;
 
     size_t accumSize;  // accumulator datum size in bytes
 
@@ -154,7 +145,7 @@
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
     ~RsdCpuReferenceImpl() override;
-    RsdCpuReferenceImpl(Context *);
+    explicit RsdCpuReferenceImpl(Context *);
 
     void lockMutex();
     void unlockMutex();
@@ -174,13 +165,9 @@
     void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
                        const RsScriptCall *sc, MTLaunchStructForEach *mtls);
 
-    // Launch a simple reduce kernel
-    void launchReduce(const Allocation *ain, Allocation *aout,
-                      MTLaunchStructReduce *mtls);
-
     // Launch a general reduce kernel
-    void launchReduceNew(const Allocation ** ains, uint32_t inLen, Allocation *aout,
-                         MTLaunchStructReduceNew *mtls);
+    void launchReduce(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                      MTLaunchStructReduce *mtls);
 
     CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
                              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
@@ -271,10 +258,10 @@
     long mPageSize;
 
     // Launch a general reduce kernel
-    void launchReduceNewSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
-                               MTLaunchStructReduceNew *mtls);
-    void launchReduceNewParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
-                                 MTLaunchStructReduceNew *mtls);
+    void launchReduceSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                            MTLaunchStructReduce *mtls);
+    void launchReduceParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
+                              MTLaunchStructReduce *mtls);
 };
 
 
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index ca9a4b6..3d5e635 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -272,7 +272,6 @@
 #define EXPORT_FUNC_STR "exportFuncCount: "
 #define EXPORT_FOREACH_STR "exportForEachCount: "
 #define EXPORT_REDUCE_STR "exportReduceCount: "
-#define EXPORT_REDUCE_NEW_STR "exportReduceNewCount: "
 #define OBJECT_SLOT_STR "objectSlotCount: "
 #define PRAGMA_STR "pragmaCount: "
 #define THREADABLE_STR "isThreadable: "
@@ -311,7 +310,6 @@
     size_t funcCount = 0;
     size_t forEachCount = 0;
     size_t reduceCount = 0;
-    size_t reduceNewCount = 0;
     size_t objectSlotCount = 0;
     size_t pragmaCount = 0;
     bool isThreadable = true;
@@ -322,8 +320,7 @@
     InvokeFunc_t* invokeFunctions = nullptr;
     ForEachFunc_t* forEachFunctions = nullptr;
     uint32_t* forEachSignatures = nullptr;
-    ReduceFunc_t* reduceFunctions = nullptr;
-    ReduceNewDescription* reduceNewDescriptions = nullptr;
+    ReduceDescription* reduceDescriptions = nullptr;
     const char ** pragmaKeys = nullptr;
     const char ** pragmaValues = nullptr;
     uint32_t checksum = 0;
@@ -455,56 +452,21 @@
         }
     }
 
-    // Read simple reduce kernels
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-    if (sscanf(line, EXPORT_REDUCE_STR "%zu", &reduceCount) != 1) {
-        ALOGE("Invalid export reduce count!: %s", line);
-        goto error;
-    }
-
-    reduceFunctions = new ReduceFunc_t[reduceCount];
-    if (reduceFunctions == nullptr) {
-        goto error;
-    }
-
-    for (size_t i = 0; i < reduceCount; ++i) {
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            goto error;
-        }
-        char *c = strrchr(line, '\n');
-        if (c) {
-            *c = '\0';
-        }
-
-        // Lookup the expanded reduce kernel.
-        strncat(line, ".expand", MAXLINESTR-strlen(line));
-
-        reduceFunctions[i] =
-            reinterpret_cast<ReduceFunc_t>(dlsym(sharedObj, line));
-        if (reduceFunctions[i] == nullptr) {
-            ALOGE("Failed to get function address for %s(): %s",
-                  line, dlerror());
-            goto error;
-        }
-    }
-
     // Read general reduce kernels
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
         goto error;
     }
-    if (sscanf(line, EXPORT_REDUCE_NEW_STR "%zu", &reduceNewCount) != 1) {
+    if (sscanf(line, EXPORT_REDUCE_STR "%zu", &reduceCount) != 1) {
         ALOGE("Invalid export reduce new count!: %s", line);
         goto error;
     }
 
-    reduceNewDescriptions = new ReduceNewDescription[reduceNewCount];
-    if (reduceNewDescriptions == nullptr) {
+    reduceDescriptions = new ReduceDescription[reduceCount];
+    if (reduceDescriptions == nullptr) {
         goto error;
     }
 
-    for (size_t i = 0; i < reduceNewCount; ++i) {
+    for (size_t i = 0; i < reduceCount; ++i) {
         static const char kNoName[] = ".";
 
         unsigned int tmpSig = 0;
@@ -545,25 +507,25 @@
         // The current implementation does not use the signature
         // or reduce name.
 
-        reduceNewDescriptions[i].accumSize = tmpSize;
+        reduceDescriptions[i].accumSize = tmpSize;
 
         // Process the (optional) initializer.
         if (strcmp(tmpNameInitializer, kNoName)) {
           // Lookup the original user-written initializer.
-          if (!(reduceNewDescriptions[i].initFunc =
-                (ReduceNewInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
+          if (!(reduceDescriptions[i].initFunc =
+                (ReduceInitializerFunc_t) dlsym(sharedObj, tmpNameInitializer))) {
             ALOGE("Failed to find initializer function address for %s(): %s",
                   tmpNameInitializer, dlerror());
             goto error;
           }
         } else {
-          reduceNewDescriptions[i].initFunc = nullptr;
+          reduceDescriptions[i].initFunc = nullptr;
         }
 
         // Lookup the expanded accumulator.
         strncat(tmpNameAccumulator, ".expand", MAXLINESTR-strlen(tmpNameAccumulator));
-        if (!(reduceNewDescriptions[i].accumFunc =
-              (ReduceNewAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
+        if (!(reduceDescriptions[i].accumFunc =
+              (ReduceAccumulatorFunc_t) dlsym(sharedObj, tmpNameAccumulator))) {
             ALOGE("Failed to find accumulator function address for %s(): %s",
                   tmpNameAccumulator, dlerror());
             goto error;
@@ -572,27 +534,27 @@
         // Process the (optional) combiner.
         if (strcmp(tmpNameCombiner, kNoName)) {
           // Lookup the original user-written combiner.
-          if (!(reduceNewDescriptions[i].combFunc =
-                (ReduceNewCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
+          if (!(reduceDescriptions[i].combFunc =
+                (ReduceCombinerFunc_t) dlsym(sharedObj, tmpNameCombiner))) {
             ALOGE("Failed to find combiner function address for %s(): %s",
                   tmpNameCombiner, dlerror());
             goto error;
           }
         } else {
-          reduceNewDescriptions[i].combFunc = nullptr;
+          reduceDescriptions[i].combFunc = nullptr;
         }
 
         // Process the (optional) outconverter.
         if (strcmp(tmpNameOutConverter, kNoName)) {
           // Lookup the original user-written outconverter.
-          if (!(reduceNewDescriptions[i].outFunc =
-                (ReduceNewOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
+          if (!(reduceDescriptions[i].outFunc =
+                (ReduceOutConverterFunc_t) dlsym(sharedObj, tmpNameOutConverter))) {
             ALOGE("Failed to find outconverter function address for %s(): %s",
                   tmpNameOutConverter, dlerror());
             goto error;
           }
         } else {
-          reduceNewDescriptions[i].outFunc = nullptr;
+          reduceDescriptions[i].outFunc = nullptr;
         }
     }
 
@@ -726,8 +688,7 @@
         fieldAddress, fieldIsObject, fieldName, varCount,
         invokeFunctions, funcCount,
         forEachFunctions, forEachSignatures, forEachCount,
-        reduceFunctions, reduceCount,
-        reduceNewDescriptions, reduceNewCount,
+        reduceDescriptions, reduceCount,
         pragmaKeys, pragmaValues, pragmaCount,
         rsGlobalNames, rsGlobalAddresses, rsGlobalSizes, rsGlobalProperties,
         numEntries, isThreadable, checksum);
@@ -745,8 +706,6 @@
     delete[] pragmaKeys;
 #endif  // RS_COMPATIBILITY_LIB
 
-    delete[] reduceFunctions;
-
     delete[] forEachSignatures;
     delete[] forEachFunctions;
 
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index 72c352c..90d3759 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -67,8 +67,7 @@
                      InvokeFunc_t* invokeFunctions, size_t funcCount,
                      ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
                      size_t forEachCount,
-                     ReduceFunc_t* reduceFunctions, size_t reduceCount,
-                     ReduceNewDescription *reduceNewDescriptions, size_t reduceNewCount,
+                     ReduceDescription *reduceDescriptions, size_t reduceCount,
                      const char** pragmaKeys, const char** pragmaValues,
                      size_t pragmaCount,
                      const char **globalNames, const void **globalAddresses,
@@ -80,8 +79,7 @@
         mInvokeFunctions(invokeFunctions), mFuncCount(funcCount),
         mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
         mForEachCount(forEachCount),
-        mReduceFunctions(reduceFunctions), mReduceCount(reduceCount),
-        mReduceNewDescriptions(reduceNewDescriptions), mReduceNewCount(reduceNewCount),
+        mReduceDescriptions(reduceDescriptions), mReduceCount(reduceCount),
         mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
         mPragmaCount(pragmaCount), mGlobalNames(globalNames),
         mGlobalAddresses(globalAddresses), mGlobalSizes(globalSizes),
@@ -107,9 +105,7 @@
         delete[] mPragmaValues;
         delete[] mPragmaKeys;
 
-        delete[] mReduceFunctions;
-
-        delete[] mReduceNewDescriptions;
+        delete[] mReduceDescriptions;
 
         delete[] mForEachSignatures;
         delete[] mForEachFunctions;
@@ -136,7 +132,6 @@
     size_t getExportedFunctionCount() const { return mFuncCount; }
     size_t getExportedForEachCount() const { return mForEachCount; }
     size_t getExportedReduceCount() const { return mReduceCount; }
-    size_t getExportedReduceNewCount() const { return mReduceNewCount; }
     size_t getPragmaCount() const { return mPragmaCount; }
 
     void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
@@ -149,10 +144,8 @@
     ForEachFunc_t getForEachFunction(int slot) const { return mForEachFunctions[slot]; }
     uint32_t getForEachSignature(int slot) const { return mForEachSignatures[slot]; }
 
-    ReduceFunc_t getReduceFunction(int slot) const { return mReduceFunctions[slot]; }
-
-    const ReduceNewDescription* getReduceNewDescription(int slot) const {
-        return &mReduceNewDescriptions[slot];
+    const ReduceDescription* getReduceDescription(int slot) const {
+        return &mReduceDescriptions[slot];
     }
 
     const char ** getPragmaKeys() const { return mPragmaKeys; }
@@ -207,12 +200,9 @@
     uint32_t* mForEachSignatures;
     size_t mForEachCount;
 
-    ReduceFunc_t* mReduceFunctions;
+    ReduceDescription* mReduceDescriptions;
     size_t mReduceCount;
 
-    ReduceNewDescription* mReduceNewDescriptions;
-    size_t mReduceNewCount;
-
     const char ** mPragmaKeys;
     const char ** mPragmaValues;
     size_t mPragmaCount;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 0400fab..d9f9412 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -382,7 +382,8 @@
     compileArguments.push_back("-build-checksum");
     std::stringstream ss;
     ss << std::hex << mBuildChecksum;
-    compileArguments.push_back(ss.str().c_str());
+    std::string checksumStr(ss.str());
+    compileArguments.push_back(checksumStr.c_str());
     compileArguments.push_back(nullptr);
 
     if (!is_force_recompile() && !useRSDebugContext) {
@@ -500,7 +501,6 @@
     // Copy info over to runtime
     script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
     script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
-    script->mHal.info.exportedReduceNewCount = mScriptExec->getExportedReduceNewCount();
     script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
     script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
     script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
@@ -555,52 +555,14 @@
     return true;
 }
 
-// Preliminary work to prepare a simple reduce-style kernel for launch.
-bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation *ain,
-                                       const Allocation *aout,
+// Preliminary work to prepare a general reduce-style kernel for launch.
+bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation ** ains,
+                                       uint32_t inLen,
+                                       const Allocation * aout,
                                        const RsScriptCall *sc,
                                        MTLaunchStructReduce *mtls) {
-    rsAssert(ain && aout);
-    memset(mtls, 0, sizeof(MTLaunchStructReduce));
-    mtls->dimPtr = &mtls->inputDim;
-
-    if (allocationLODIsNull(ain) || allocationLODIsNull(aout)) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-                                     "reduce called with a null allocation");
-        return false;
-    }
-
-    // Set up the dimensions of the input.
-    const Type *inType = ain->getType();
-    mtls->inputDim.x = inType->getDimX();
-    rsAssert(inType->getDimY() == 0);
-
-    if (!setUpMtlsDimensions(mtls, mtls->inputDim, sc)) {
-        return false;
-    }
-
-    mtls->rs = mCtx;
-    // Currently not threaded.
-    mtls->isThreadable = false;
-    mtls->mSliceNum = -1;
-
-    // Set up input and output.
-    mtls->inBuf = static_cast<uint8_t *>(ain->getPointerUnchecked(0, 0));
-    mtls->outBuf = static_cast<uint8_t *>(aout->getPointerUnchecked(0, 0));
-
-    rsAssert(mtls->inBuf && mtls->outBuf);
-
-    return true;
-}
-
-// Preliminary work to prepare a general reduce-style kernel for launch.
-bool RsdCpuScriptImpl::reduceNewMtlsSetup(const Allocation ** ains,
-                                          uint32_t inLen,
-                                          const Allocation * aout,
-                                          const RsScriptCall *sc,
-                                          MTLaunchStructReduceNew *mtls) {
     rsAssert(ains && (inLen >= 1) && aout);
-    memset(mtls, 0, sizeof(MTLaunchStructReduceNew));
+    memset(mtls, 0, sizeof(MTLaunchStructReduce));
     mtls->dimPtr = &mtls->redp.dim;
 
     for (int index = inLen; --index >= 0;) {
@@ -793,29 +755,15 @@
 }
 
 void RsdCpuScriptImpl::invokeReduce(uint32_t slot,
-                                    const Allocation *ain,
+                                    const Allocation ** ains, uint32_t inLen,
                                     Allocation *aout,
                                     const RsScriptCall *sc) {
-    MTLaunchStructReduce mtls;
+  MTLaunchStructReduce mtls;
 
-    if (reduceMtlsSetup(ain, aout, sc, &mtls)) {
-        reduceKernelSetup(slot, &mtls);
-        RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);
-        mCtx->launchReduce(ain, aout, &mtls);
-        mCtx->setTLS(oldTLS);
-    }
-}
-
-void RsdCpuScriptImpl::invokeReduceNew(uint32_t slot,
-                                       const Allocation ** ains, uint32_t inLen,
-                                       Allocation *aout,
-                                       const RsScriptCall *sc) {
-  MTLaunchStructReduceNew mtls;
-
-  if (reduceNewMtlsSetup(ains, inLen, aout, sc, &mtls)) {
-    reduceNewKernelSetup(slot, &mtls);
+  if (reduceMtlsSetup(ains, inLen, aout, sc, &mtls)) {
+    reduceKernelSetup(slot, &mtls);
     RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);
-    mCtx->launchReduceNew(ains, inLen, aout, &mtls);
+    mCtx->launchReduce(ains, inLen, aout, &mtls);
     mCtx->setTLS(oldTLS);
   }
 }
@@ -829,15 +777,9 @@
 
 void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
     mtls->script = this;
-    mtls->kernel = mScriptExec->getReduceFunction(slot);
-    rsAssert(mtls->kernel != nullptr);
-}
-
-void RsdCpuScriptImpl::reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls) {
-    mtls->script = this;
     mtls->redp.slot = slot;
 
-    const ReduceNewDescription *desc = mScriptExec->getReduceNewDescription(slot);
+    const ReduceDescription *desc = mScriptExec->getReduceDescription(slot);
     mtls->accumFunc = desc->accumFunc;
     mtls->initFunc  = desc->initFunc;   // might legally be nullptr
     mtls->combFunc  = desc->combFunc;   // might legally be nullptr
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 2909dab..94345bd 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -61,15 +61,10 @@
                        const RsScriptCall* sc) override;
 
     void invokeReduce(uint32_t slot,
-                      const Allocation* ain,
+                      const Allocation ** ains, uint32_t inLen,
                       Allocation* aout,
                       const RsScriptCall* sc) override;
 
-    void invokeReduceNew(uint32_t slot,
-                         const Allocation ** ains, uint32_t inLen,
-                         Allocation* aout,
-                         const RsScriptCall* sc) override;
-
     void invokeInit() override;
     void invokeFreeChildren() override;
 
@@ -94,17 +89,11 @@
 
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls);
 
-    // Build an MTLaunchStruct suitable for launching a simple reduce-style kernel.
-    bool reduceMtlsSetup(const Allocation *ain, const Allocation *aout,
-                         const RsScriptCall *sc, MTLaunchStructReduce *mtls);
-    // Finalize an MTLaunchStruct for launching a simple reduce-style kernel.
-    virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
-
     // Build an MTLaunchStruct suitable for launching a general reduce-style kernel.
-    bool reduceNewMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
-                            const RsScriptCall *sc, MTLaunchStructReduceNew *mtls);
+    bool reduceMtlsSetup(const Allocation ** ains, uint32_t inLen, const Allocation *aout,
+                         const RsScriptCall *sc, MTLaunchStructReduce *mtls);
     // Finalize an MTLaunchStruct for launching a general reduce-style kernel.
-    virtual void reduceNewKernelSetup(uint32_t slot, MTLaunchStructReduceNew *mtls);
+    virtual void reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls);
 
     const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
     static void * lookupRuntimeStub(void* pContext, char const* name);
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index c784ce3..2382cd6 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -181,7 +181,8 @@
             mBatches.push_back(batch);
             std::stringstream ss;
             ss << "Batch" << ++i;
-            batch = new Batch(this, ss.str().c_str());
+            std::string batchStr(ss.str());
+            batch = new Batch(this, batchStr.c_str());
         }
 
         batch->mClosures.push_back(cc);
@@ -416,7 +417,7 @@
 
     std::stringstream ss;
     ss << std::hex << checksum;
-    const char* checksumStr = ss.str().c_str();
+    std::string checksumStr(ss.str());
 
     //===--------------------------------------------------------------------===//
     // Try to load a shared lib from code cache matching filename and checksum
@@ -466,7 +467,7 @@
     //===--------------------------------------------------------------------===//
 
     arguments.push_back("-build-checksum");
-    arguments.push_back(checksumStr);
+    arguments.push_back(checksumStr.c_str());
     arguments.push_back(nullptr);
 
     bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index e226b93..a8d980e 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -59,15 +59,10 @@
                                    const RsScriptCall *sc) = 0;
 
         virtual void invokeReduce(uint32_t slot,
-                                  const Allocation *ain,
+                                  const Allocation ** ains, uint32_t inLen,
                                   Allocation *aout,
                                   const RsScriptCall *sc) = 0;
 
-        virtual void invokeReduceNew(uint32_t slot,
-                                     const Allocation ** ains, uint32_t inLen,
-                                     Allocation *aout,
-                                     const RsScriptCall *sc) = 0;
-
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;
 
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index af8d6ad..5aa1c94 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -126,20 +126,11 @@
 
 void rsdScriptInvokeReduce(const Context *dc, Script *s,
                            uint32_t slot,
-                           const Allocation *ain,
+                           const Allocation ** ains, size_t inLen,
                            Allocation *aout,
                            const RsScriptCall *sc) {
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeReduce(slot, ain, aout, sc);
-}
-
-void rsdScriptInvokeReduceNew(const Context *dc, Script *s,
-                              uint32_t slot,
-                              const Allocation ** ains, size_t inLen,
-                              Allocation *aout,
-                              const RsScriptCall *sc) {
-    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeReduceNew(slot, ains, inLen, aout, sc);
+    cs->invokeReduce(slot, ains, inLen, aout, sc);
 }
 
 void rsdScriptSetGlobalVar(const Context *dc, const Script *s,
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index a2bf8be..c79f445 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -46,18 +46,11 @@
 void rsdScriptInvokeReduce(const android::renderscript::Context *rsc,
                            android::renderscript::Script *s,
                            uint32_t slot,
-                           const android::renderscript::Allocation *ain,
+                           const android::renderscript::Allocation ** ains,
+                           size_t inLen,
                            android::renderscript::Allocation *aout,
                            const RsScriptCall *sc);
 
-void rsdScriptInvokeReduceNew(const android::renderscript::Context *rsc,
-                              android::renderscript::Script *s,
-                              uint32_t slot,
-                              const android::renderscript::Allocation ** ains,
-                              size_t inLen,
-                              android::renderscript::Allocation *aout,
-                              const RsScriptCall *sc);
-
 void rsdScriptInvokeForEachMulti(const android::renderscript::Context *rsc,
                                  android::renderscript::Script *s,
                                  uint32_t slot,
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index f0a7334..a5e942a 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -101,8 +101,6 @@
         fnPtr[0] = (void *)rsdScriptUpdateCachedObject; break;
     case RS_HAL_SCRIPT_INVOKE_REDUCE:
         fnPtr[0] = (void *)rsdScriptInvokeReduce; break;
-    case RS_HAL_SCRIPT_INVOKE_REDUCE_NEW:
-        fnPtr[0] = (void *)rsdScriptInvokeReduceNew; break;
 
     case RS_HAL_ALLOCATION_INIT:
         fnPtr[0] = (void *)rsdAllocationInit; break;
@@ -265,7 +263,7 @@
 
 
 extern "C" bool rsdHalQueryVersion(uint32_t *major, uint32_t *minor) {
-    *major = 23;
+    *major = RS_HAL_VERSION;
     *minor = 0;
     return true;
 }
diff --git a/driver/rsdShader.h b/driver/rsdShader.h
index caccc09..dc31c24 100644
--- a/driver/rsdShader.h
+++ b/driver/rsdShader.h
@@ -70,7 +70,7 @@
 
     class StateBasedKey {
     public:
-        StateBasedKey(uint32_t texCount) : mShaderID(0) {
+        explicit StateBasedKey(uint32_t texCount) : mShaderID(0) {
             mTextureTargets = new uint32_t[texCount];
         }
         ~StateBasedKey() {
diff --git a/driver/rsdShaderCache.h b/driver/rsdShaderCache.h
index 29f91bb..884b1c8 100644
--- a/driver/rsdShaderCache.h
+++ b/driver/rsdShaderCache.h
@@ -78,7 +78,7 @@
         int32_t writtenLength;
         int32_t arraySize;
         uint32_t type;
-        UniformQueryData(uint32_t maxName) {
+        explicit UniformQueryData(uint32_t maxName) {
             name = nullptr;
             nameLength = maxName;
             if (nameLength > 0 ) {
diff --git a/libRS.map b/libRS.map
index b6a2d82..ba7e3d4 100644
--- a/libRS.map
+++ b/libRS.map
@@ -84,7 +84,6 @@
     rsScriptForEachMulti;
     rsScriptGetVarV;
     rsScriptGroup2Create;
-    rsScriptGroup2Create;
     rsScriptGroupCreate;
     rsScriptGroupExecute;
     rsScriptGroupSetInput;
@@ -95,7 +94,6 @@
     rsScriptInvokeV;
     rsScriptKernelIDCreate;
     rsScriptReduce;
-    rsScriptReduceNew;
     rsScriptSetTimeZone;
     rsScriptSetVarD;
     rsScriptSetVarF;
diff --git a/rs.spec b/rs.spec
index 608f324..efae43e 100644
--- a/rs.spec
+++ b/rs.spec
@@ -414,14 +414,6 @@
 ScriptReduce {
     param RsScript s
     param uint32_t slot
-    param RsAllocation ain
-    param RsAllocation aout
-    param const RsScriptCall * sc
-}
-
-ScriptReduceNew {
-    param RsScript s
-    param uint32_t slot
     param RsAllocation * ains
     param RsAllocation aout
     param const RsScriptCall * sc
diff --git a/rsAllocation.h b/rsAllocation.h
index 4d09679..0f60150 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -223,7 +223,7 @@
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
     class NewBufferListener : public android::ConsumerBase::FrameAvailableListener {
     public:
-        NewBufferListener(uint32_t numAlloc);
+        explicit NewBufferListener(uint32_t numAlloc);
         virtual ~NewBufferListener();
         const android::renderscript::Context *rsc;
         const android::renderscript::Allocation **alloc;
diff --git a/rsAnimation.h b/rsAnimation.h
index de4957e..f77d3cb 100644
--- a/rsAnimation.h
+++ b/rsAnimation.h
@@ -41,7 +41,7 @@
     static Animation *createFromStream(Context *rsc, IStream *stream);
 
 protected:
-    Animation(Context *rsc);
+    explicit Animation(Context *rsc);
 
 
 
diff --git a/rsContext.h b/rsContext.h
index 4a8cd29..fe771ec 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -102,7 +102,7 @@
 
     class PushState {
     public:
-        PushState(Context *);
+        explicit PushState(Context *);
         ~PushState();
 
     private:
diff --git a/rsCppUtils.h b/rsCppUtils.h
index 606046e..3da0d03 100644
--- a/rsCppUtils.h
+++ b/rsCppUtils.h
@@ -65,7 +65,7 @@
     // server has no Vector or String8 classes; implement on top of STL
     class String8: public std::string {
     public:
-    String8(const char *ptr) : std::string(ptr) {
+    explicit String8(const char *ptr) : std::string(ptr) {
 
         }
     String8(const char *ptr, size_t len) : std::string(ptr, len) {
diff --git a/rsDriverLoader.cpp b/rsDriverLoader.cpp
index 426c519..16efa0d 100644
--- a/rsDriverLoader.cpp
+++ b/rsDriverLoader.cpp
@@ -71,7 +71,6 @@
     ret &= fn(RS_HAL_SCRIPT_INVOKE_ROOT, (void **)&rsc->mHal.funcs.script.invokeRoot);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_FOR_EACH, (void **)&rsc->mHal.funcs.script.invokeForEach);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE, (void **)&rsc->mHal.funcs.script.invokeReduce);
-    ret &= fn(RS_HAL_SCRIPT_INVOKE_REDUCE_NEW, (void **)&rsc->mHal.funcs.script.invokeReduceNew);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_INIT, (void **)&rsc->mHal.funcs.script.invokeInit);
     ret &= fn(RS_HAL_SCRIPT_INVOKE_FREE_CHILDREN, (void **)&rsc->mHal.funcs.script.invokeFreeChildren);
     ret &= fn(RS_HAL_SCRIPT_SET_GLOBAL_VAR, (void **)&rsc->mHal.funcs.script.setGlobalVar);
@@ -201,6 +200,12 @@
         goto error;
     }
 
+    if (version_major != RS_HAL_VERSION) {
+        ALOGE("Mismatched RS HAL versions: %s is version %u but version %u is expected",
+              filename, version_major, RS_HAL_VERSION);
+        goto error;
+    }
+
     if (!LoadHalTable(this, fnQueryHal, mIsGraphicsContext)) {
         ALOGE("Error loading RS HAL table, %s", filename);
         goto error;
diff --git a/rsElement.h b/rsElement.h
index 9374c64..d8f101b 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -152,7 +152,7 @@
 
 
     virtual ~Element();
-    Element(Context *);
+    explicit Element(Context *);
 
     Component mComponent;
     uint32_t mBitsUnpadded;
diff --git a/rsFileA3D.h b/rsFileA3D.h
index 8bf36b9..ae74455 100644
--- a/rsFileA3D.h
+++ b/rsFileA3D.h
@@ -32,7 +32,7 @@
 
 class FileA3D : public ObjectBase {
 public:
-    FileA3D(Context *rsc);
+    explicit FileA3D(Context *rsc);
     ~FileA3D();
 
     uint32_t mMajorVersion;
diff --git a/rsFont.h b/rsFont.h
index 0f17340..3be5c3c 100644
--- a/rsFont.h
+++ b/rsFont.h
@@ -116,7 +116,7 @@
     float mFontSize;
     uint32_t mDpi;
 
-    Font(Context *rsc);
+    explicit Font(Context *rsc);
     bool init(const char *name, float fontSize, uint32_t dpi, const void *data = nullptr, uint32_t dataLen = 0);
 
     virtual void preDestroy() const;
diff --git a/rsList.h b/rsList.h
index 24720a2..052ec77 100644
--- a/rsList.h
+++ b/rsList.h
@@ -92,7 +92,7 @@
         T* operator->() { return p; }
 
     protected:
-        iterator(const List* list_) : list(list_) {}
+        explicit iterator(const List* list_) : list(list_) {}
         iterator(const List* list_, LinkedBuffer* buffer_, T* p_) :
             p(p_), buffer(buffer_), list(list_) {}
 
diff --git a/rsMesh.h b/rsMesh.h
index c7ee088..6c60555 100644
--- a/rsMesh.h
+++ b/rsMesh.h
@@ -55,7 +55,7 @@
     };
     Hal mHal;
 
-    Mesh(Context *);
+    explicit Mesh(Context *);
     Mesh(Context *, uint32_t vertexBuffersCount, uint32_t primitivesCount);
     ~Mesh();
 
diff --git a/rsObjectBase.h b/rsObjectBase.h
index c51d85c..4f29e57 100644
--- a/rsObjectBase.h
+++ b/rsObjectBase.h
@@ -35,7 +35,7 @@
     static const bool gDebugLeaks = false;
     static const bool gDebugLifetime = false;
 
-    ObjectBase(Context *rsc);
+    ObjectBase(Context *rsc);  // NOLINT, implicit
 
     void incSysRef() const;
     bool decSysRef() const;
@@ -111,7 +111,7 @@
         }
     }
 
-    ObjectBaseRef(T *ref) {
+    ObjectBaseRef(T *ref) {  // NOLINT, implicit
         mRef = ref;
         if (mRef) {
             ref->incSysRef();
diff --git a/rsProgramBase.h b/rsProgramBase.h
index 80da453..aeee5c1 100644
--- a/rsProgramBase.h
+++ b/rsProgramBase.h
@@ -26,7 +26,7 @@
 
 class ProgramBase : public ObjectBase {
 public:
-    ProgramBase(Context *rsc) : ObjectBase(rsc) {
+    explicit ProgramBase(Context *rsc) : ObjectBase(rsc) {
         mDirty = true;
     }
 
diff --git a/rsSampler.h b/rsSampler.h
index 2fdf707..c63a4a8 100644
--- a/rsSampler.h
+++ b/rsSampler.h
@@ -74,7 +74,7 @@
     virtual ~Sampler();
 
 private:
-    Sampler(Context *);
+    explicit Sampler(Context *);
     Sampler(Context *,
             RsSamplerValue magFilter,
             RsSamplerValue minFilter,
diff --git a/rsScript.cpp b/rsScript.cpp
index bf28328..4c2f52f 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -225,23 +225,15 @@
 }
 
 void rsi_ScriptReduce(Context *rsc, RsScript vs, uint32_t slot,
-                      RsAllocation vain, RsAllocation vaout,
-                      const RsScriptCall *sc, size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
-    s->runReduce(rsc, slot, static_cast<const Allocation *>(vain),
-                            static_cast<Allocation *>(vaout), sc);
-}
-
-void rsi_ScriptReduceNew(Context *rsc, RsScript vs, uint32_t slot,
-                         RsAllocation *vains, size_t inLen,
-                         RsAllocation vaout, const RsScriptCall *sc,
-                         size_t scLen) {
+                      RsAllocation *vains, size_t inLen,
+                      RsAllocation vaout, const RsScriptCall *sc,
+                      size_t scLen) {
   Script *s = static_cast<Script *>(vs);
   Allocation **ains = (Allocation**)(vains);
 
-  s->runReduceNew(rsc, slot,
-                  const_cast<const Allocation **>(ains), inLen,
-                  static_cast<Allocation *>(vaout), sc);
+  s->runReduce(rsc, slot,
+               const_cast<const Allocation **>(ains), inLen,
+               static_cast<Allocation *>(vaout), sc);
 }
 
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
diff --git a/rsScript.h b/rsScript.h
index c3241ab..39620c1 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -86,7 +86,6 @@
             size_t exportedVariableCount;
             size_t exportedForEachCount;
             size_t exportedReduceCount;
-            size_t exportedReduceNewCount;
             size_t exportedFunctionCount;
             size_t exportedPragmaCount;
             char const **exportedPragmaKeyList;
@@ -99,7 +98,7 @@
     };
     Hal mHal;
 
-    Script(Context *);
+    explicit Script(Context *);
     virtual ~Script();
 
     struct Enviroment_t {
@@ -133,13 +132,10 @@
                             size_t usrBytes,
                             const RsScriptCall *sc = nullptr) = 0;
 
-    virtual void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+    virtual void runReduce(Context *rsc, uint32_t slot,
+                           const Allocation **ains, size_t inLen,
                            Allocation *aout, const RsScriptCall *sc) = 0;
 
-    virtual void runReduceNew(Context *rsc, uint32_t slot,
-                              const Allocation **ains, size_t inLen,
-                              Allocation *aout, const RsScriptCall *sc) = 0;
-
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) = 0;
     virtual void setupScript(Context *rsc) = 0;
     virtual uint32_t run(Context *) = 0;
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index eaef849..b4d364f 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -29,6 +29,7 @@
 
 #include <sys/stat.h>
 
+#include <sstream>
 #include <string>
 
 #ifdef USE_MINGW
@@ -192,15 +193,13 @@
     // Trace this function call.
     // To avoid overhead we only build the string if tracing is actually
     // enabled.
-    String8 *AString = NULL;
-    const char *String = "";
+    std::stringstream ss;
     if (ATRACE_ENABLED()) {
-        AString = new String8("runForEach_");
-        AString->append(mHal.info.exportedForeachFuncList[slot].first);
-        String = AString->string();
+        ss << "runForEach slot[" << slot << "]";
     }
-    ATRACE_NAME(String);
-    (void)String;
+    std::string msgStr(ss.str());
+    ATRACE_NAME(msgStr.c_str());
+
     if (mRSC->hadFatalError()) return;
 
     Context::PushState ps(rsc);
@@ -224,39 +223,15 @@
         rsc->setError(RS_ERROR_FATAL_DRIVER,
                       "Driver support for multi-input not present");
     }
-
-    if (AString) {
-        delete AString;
-    }
 }
 
-void ScriptC::runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+void ScriptC::runReduce(Context *rsc, uint32_t slot,
+                        const Allocation ** ains, size_t inLen,
                         Allocation *aout, const RsScriptCall *sc) {
-    // TODO: Record the name of the kernel in the tracing information.
-    ATRACE_CALL();
-
-    if (slot >= mHal.info.exportedReduceCount) {
-        rsc->setError(RS_ERROR_BAD_SCRIPT, "The simple reduce kernel index is out of bounds");
-        return;
-    }
-    if (mRSC->hadFatalError()) return;
-
-    setupScript(rsc);
-
-    if (rsc->props.mLogScripts) {
-        ALOGV("%p ScriptC::runReduce invoking slot %i, ptr %p", rsc, slot, this);
-    }
-
-    rsc->mHal.funcs.script.invokeReduce(rsc, this, slot, ain, aout, sc);
-}
-
-void ScriptC::runReduceNew(Context *rsc, uint32_t slot,
-                           const Allocation ** ains, size_t inLen,
-                           Allocation *aout, const RsScriptCall *sc) {
   // TODO: Record the name of the kernel in the tracing information.
   ATRACE_CALL();
 
-  if (slot >= mHal.info.exportedReduceNewCount) {
+  if (slot >= mHal.info.exportedReduceCount) {
       rsc->setError(RS_ERROR_BAD_SCRIPT, "The general reduce kernel index is out of bounds");
       return;
   }
@@ -265,10 +240,10 @@
   setupScript(rsc);
 
   if (rsc->props.mLogScripts) {
-      ALOGV("%p ScriptC::runReduceNew invoking slot %i, ptr %p", rsc, slot, this);
+      ALOGV("%p ScriptC::runReduce invoking slot %i, ptr %p", rsc, slot, this);
   }
 
-  rsc->mHal.funcs.script.invokeReduceNew(rsc, this, slot, ains, inLen, aout, sc);
+  rsc->mHal.funcs.script.invokeReduce(rsc, this, slot, ains, inLen, aout, sc);
 }
 
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
diff --git a/rsScriptC.h b/rsScriptC.h
index c8881a4..3c342d4 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -31,7 +31,7 @@
     typedef int (*RunScript_t)();
     typedef void (*VoidFunc_t)();
 
-    ScriptC(Context *);
+    explicit ScriptC(Context *);
     virtual ~ScriptC();
 
     void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) override;
@@ -47,13 +47,10 @@
                     size_t usrBytes,
                     const RsScriptCall *sc = nullptr) override;
 
-    void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+    void runReduce(Context *rsc, uint32_t slot,
+                   const Allocation ** ains, size_t inLen,
                    Allocation *aout, const RsScriptCall *sc) override;
 
-    void runReduceNew(Context *rsc, uint32_t slot,
-                      const Allocation ** ains, size_t inLen,
-                      Allocation *aout, const RsScriptCall *sc) override;
-
     virtual void serialize(Context *rsc, OStream *stream) const {    }
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
     static Type *createFromStream(Context *rsc, IStream *stream) { return nullptr; }
diff --git a/rsScriptGroup.h b/rsScriptGroup.h
index 68783f3..835d8c0 100644
--- a/rsScriptGroup.h
+++ b/rsScriptGroup.h
@@ -56,7 +56,7 @@
 
     class Node {
     public:
-        Node(Script *);
+        explicit Node(Script *);
 
         Vector<const ScriptKernelID *> mKernels;
         Vector<Link *> mOutputs;
@@ -68,7 +68,7 @@
 
     class IO {
     public:
-        IO(const ScriptKernelID *);
+        explicit IO(const ScriptKernelID *);
 
         const ScriptKernelID *mKernel;
         ObjectBaseRef<Allocation> mAlloc;
@@ -103,7 +103,7 @@
     // executes. Skips the exeuction if validation fails.
     bool validateInputAndOutput(Context *);
 
-    ScriptGroup(Context *);
+    explicit ScriptGroup(Context *);
 };
 
 
diff --git a/rsScriptGroupBase.h b/rsScriptGroupBase.h
index 00ae6c6..f79f08f 100644
--- a/rsScriptGroupBase.h
+++ b/rsScriptGroupBase.h
@@ -8,7 +8,7 @@
 
 class ScriptGroupBase : public ObjectBase {
  public:
-  ScriptGroupBase(Context* rsc) : ObjectBase(rsc) {}
+  explicit ScriptGroupBase(Context* rsc) : ObjectBase(rsc) {}
   virtual ~ScriptGroupBase() {}
 
   virtual void serialize(Context *rsc, OStream *stream) const {}
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 6e0f6ae..0122a71 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -68,15 +68,11 @@
                                               aout, usr, usrBytes, sc);
 }
 
-void ScriptIntrinsic::runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+void ScriptIntrinsic::runReduce(Context *rsc, uint32_t slot,
+                                const Allocation ** ains, size_t inLen,
                                 Allocation *aout, const RsScriptCall *sc) {
 }
 
-void ScriptIntrinsic::runReduceNew(Context *rsc, uint32_t slot,
-                                   const Allocation ** ains, size_t inLen,
-                                   Allocation *aout, const RsScriptCall *sc) {
-}
-
 void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
 }
 
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index e2b04b8..cd8253d 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -30,7 +30,7 @@
 
     ObjectBaseRef<const Element> mElement;
 
-    ScriptIntrinsic(Context *);
+    explicit ScriptIntrinsic(Context *);
     ~ScriptIntrinsic() override;
 
     bool init(Context *rsc, RsScriptIntrinsicID iid, Element *e);
@@ -49,13 +49,10 @@
                     size_t usrBytes,
                     const RsScriptCall* sc = nullptr) override;
 
-    void runReduce(Context *rsc, uint32_t slot, const Allocation *ain,
+    void runReduce(Context *rsc, uint32_t slot,
+                   const Allocation ** ains, size_t inLen,
                    Allocation *aout, const RsScriptCall *sc) override;
 
-    void runReduceNew(Context *rsc, uint32_t slot,
-                      const Allocation ** ains, size_t inLen,
-                      Allocation *aout, const RsScriptCall *sc) override;
-
     void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) override;
     void setupScript(Context *rsc) override;
     uint32_t run(Context *) override;
diff --git a/rsType.h b/rsType.h
index 6ae8446..6e260fa 100644
--- a/rsType.h
+++ b/rsType.h
@@ -150,7 +150,7 @@
     virtual ~Type();
 
 private:
-    Type(Context *);
+    explicit Type(Context *);
     Type(const Type &);
 };
 
diff --git a/rs_hal.h b/rs_hal.h
index 2f3aa1a..7e07ddd 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -19,13 +19,34 @@
 
 #include <rsInternalDefines.h>
 
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * !! Major version number of the driver.  This is used to ensure that
+ * !! the driver (e.g., libRSDriver) is compatible with the shell
+ * !! (i.e., libRS_internal) responsible for loading the driver.
+ * !! There is no notion of backwards compatibility -- the driver and
+ * !! the shell must agree on the major version number.
+ * !!
+ * !! The version number must change whenever there is a semantic change
+ * !! to the HAL such as adding or removing an entry point or changing
+ * !! the meaning of an entry point.  By convention it is monotonically
+ * !! increasing across all branches (e.g., aosp/master and all internal
+ * !! branches).
+ * !!
+ * !! Be very careful when merging or cherry picking between branches!
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+#define RS_HAL_VERSION 100
+
 /**
  * The interface for loading RenderScript drivers
  *
  * The startup sequence is
  *
  * 1: dlopen driver
- * 2: Query driver version with rsdHalQueryVersion()
+ * 2: Query driver version with rsdHalQueryVersion() and verify
+ *    that the driver (e.g., libRSDriver) is compatible with the shell
+ *    (i.e., libRS_internal) responsible for loading the driver
  * 3: Fill in HAL pointer table with calls to rsdHalQueryHAL()
  * 4: Initialize the context with rsdHalInit()
  *
@@ -151,14 +172,10 @@
                               size_t usrLen,
                               const RsScriptCall *sc);
         void (*invokeReduce)(const Context *rsc, Script *s,
-                             uint32_t slot, const Allocation *ain,
+                             uint32_t slot,
+                             const Allocation ** ains, size_t inLen,
                              Allocation *aout,
                              const RsScriptCall *sc);
-        void (*invokeReduceNew)(const Context *rsc, Script *s,
-                                uint32_t slot,
-                                const Allocation ** ains, size_t inLen,
-                                Allocation *aout,
-                                const RsScriptCall *sc);
         void (*invokeInit)(const Context *rsc, Script *s);
         void (*invokeFreeChildren)(const Context *rsc, Script *s);
 
@@ -391,7 +408,6 @@
     RS_HAL_SCRIPT_INVOKE_FOR_EACH_MULTI                     = 1013,
     RS_HAL_SCRIPT_UPDATE_CACHED_OBJECT                      = 1014,
     RS_HAL_SCRIPT_INVOKE_REDUCE                             = 1015,
-    RS_HAL_SCRIPT_INVOKE_REDUCE_NEW                         = 1016,
 
     RS_HAL_ALLOCATION_INIT                                  = 2000,
     RS_HAL_ALLOCATION_INIT_ADAPTER                          = 2001,
@@ -479,10 +495,16 @@
 
 /**
  * Get the major version number of the driver.  The major
- * version should be the API version number
+ * version should be the RS_HAL_VERSION against which the
+ * driver was built
  *
  * The Minor version number is vendor specific
  *
+ * The caller should ensure that *version_major is the same as
+ * RS_HAL_VERSION -- i.e., that the driver (e.g., libRSDriver)
+ * is compatible with the shell (i.e., libRS_internal) responsible
+ * for loading the driver
+ *
  * return: False will abort loading the driver, true indicates
  * success
  */
diff --git a/tests/Android.mk b/tests/Android.mk
new file mode 100644
index 0000000..95904a8
--- /dev/null
+++ b/tests/Android.mk
@@ -0,0 +1,4 @@
+LOCAL_PATH:=$(call my-dir)
+
+include $(call all-makefiles-under,$(LOCAL_PATH))
+
diff --git a/tests/cppreduce/Android.mk b/tests/cppreduce/Android.mk
deleted file mode 100644
index 5e795f6..0000000
--- a/tests/cppreduce/Android.mk
+++ /dev/null
@@ -1,32 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_SDK_VERSION := 9
-LOCAL_NDK_STL_VARIANT := stlport_static
-
-LOCAL_RENDERSCRIPT_FLAGS := -target-api 0
-
-LOCAL_SRC_FILES:= \
-	reduce.rs \
-	compute.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-LOCAL_CFLAGS := -std=c++11 -Werror
-LOCAL_LDFLAGS += -llog -ldl
-
-LOCAL_MODULE:= rstest-reduce
-
-LOCAL_MODULE_TAGS := tests
-
-intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
-
-LOCAL_C_INCLUDES += frameworks/rs/cpp
-LOCAL_C_INCLUDES += frameworks/rs
-LOCAL_C_INCLUDES += $(intermediates)
-
-LOCAL_CLANG := true
-
-include $(BUILD_EXECUTABLE)
-
diff --git a/tests/cppreduce/compute.cpp b/tests/cppreduce/compute.cpp
deleted file mode 100644
index 740df96..0000000
--- a/tests/cppreduce/compute.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "RenderScript.h"
-
-#include "ScriptC_reduce.h"
-
-using namespace android;
-using namespace RSC;
-
-bool checkForFailure(Int3 output, Int3 expected) {
-  static bool first = true;
-  if (first) {
-    printf("========================\n");
-    first = false;
-  }
-  printf("output  : %4d %4d %4d\n", output.x, output.y, output.z);
-  printf("expected: %4d %4d %4d\n", expected.x, expected.y, expected.z);
-  printf("========================\n");
-  return output.x != expected.x || output.y != expected.y || output.z != expected.z;
-}
-
-bool checkForFailure(int32_t output[3], Int3 expected) {
-  return checkForFailure(Int3(output[0], output[1], output[2]), expected);
-}
-
-// This tests the variants of the reduce kernel available to the C++ API.
-//
-// For this to work, the appropriate API level needs to be set so that ScriptReduce is
-// available to the dispatch table (frameworks/rs/cpp/rsDispatch.cpp).
-int test_reduce() {
-  bool failed = false;
-  {
-    sp<RS> rs = new RS();
-
-    // only legitimate because this is a standalone executable
-    bool r = rs->init("/system/bin");
-
-    // Input array
-    const int SIZE = 100;
-    int input[SIZE * 3];
-    for (int i = 0; i < SIZE * 3; ++i) {
-      input[i] = (i % 3);
-    }
-
-    sp<ScriptC_reduce> sc = new ScriptC_reduce(rs);
-
-    // Create input and output allocations.
-    sp<const Element> e = Element::I32_3(rs);
-    sp<Allocation> ain = Allocation::createSized(rs, e, SIZE);
-    // Set auto-padding so that we don't have to pad arrays in copyTo() / copyFrom().
-    ain->setAutoPadding(true);
-    ain->copy1DRangeFrom(0, SIZE, input);
-
-    sp<Allocation> aout = Allocation::createSized(rs, e, 1);
-    aout->setAutoPadding(true);
-
-    // Demo of all reduce variants.
-
-    // Reduce by passing input and output allocations.
-    sc->reduce_add(ain, aout);
-    int32_t output[3];
-    aout->copy1DTo(output);
-    failed |= checkForFailure(output, Int3(0, 100, 200));
-
-    // Reduce by passing input and output allocations along with a sub-range.
-    RsScriptCall bounds{};
-    bounds.xStart = 1;
-    bounds.xEnd = 10;
-    sc->reduce_add(ain, aout, &bounds);
-    aout->copy1DTo(output);
-    failed |= checkForFailure(output, Int3(0, 9, 18));
-
-    // Helper variant with explicit size
-    Int3 result = sc->reduce_add(input, sizeof(input) / sizeof(input[0]));
-    failed |= checkForFailure(result, Int3(0, 100, 200));
-
-    // Helper variant with inferred size
-    result = sc->reduce_add(input);
-    failed |= checkForFailure(result, Int3(0, 100, 200));
-
-    // Helper variant with sub-range
-    result = sc->reduce_add(input, 0, 1, sizeof(input) / sizeof(input[0]));
-    failed |= checkForFailure(result, Int3(0, 1, 2));
-
-    // Helper variant with sub-range and inferred size
-    result = sc->reduce_add(input, 2, 11);
-    failed |= checkForFailure(result, Int3(0, 9, 18));
-
-    failed |= rs->getError() != RS_SUCCESS;
-
-    // Helper variant with sub-range and out-of-bounds (should cause an error)
-    result = sc->reduce_add(input, 0, 101);
-    failed |= checkForFailure(result, Int3()) || (rs->getError() == RS_SUCCESS);
-  }
-
-  return failed;
-}
-
-int main() {
-  bool failed = test_reduce();
-
-  if (failed) {
-    printf("TEST FAILED!\n");
-  } else {
-    printf("TEST PASSED!\n");
-  }
-
-  return failed;
-}
diff --git a/tests/cppreduce/reduce.rs b/tests/cppreduce/reduce.rs
deleted file mode 100644
index 482cc7a..0000000
--- a/tests/cppreduce/reduce.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.reduce)
-
-int3 __attribute__((kernel("reduce"))) add(int3 a, int3 b) {
-    return a + b;
-}
diff --git a/update_rs_prebuilts.sh b/update_rs_prebuilts.sh
index 2224d73..fa0ab21 100755
--- a/update_rs_prebuilts.sh
+++ b/update_rs_prebuilts.sh
@@ -28,8 +28,6 @@
 
 fi
 
-echo "Using $NUM_CORES cores"
-
 # Turn off the build cache and make sure we build all of LLVM from scratch.
 export ANDROID_USE_BUILDCACHE=false
 export FORCE_BUILD_LLVM_COMPONENTS=true
@@ -58,10 +56,12 @@
 PREBUILTS_DIR=$MY_ANDROID_DIR/prebuilts/sdk/
 
 print_usage() {
-  echo "USAGE: $0 [-h|--help] [-n|--no-build] [-x]"
+  echo "USAGE: $0 [-h|--help] [-j <num>] [-n|--no-build] [--no-start] [-x]"
   echo "OPTIONS:"
+  echo "    -j <num>       : Specify parallelism for builds."
   echo "    -h, --help     : Display this help message."
   echo "    -n, --no-build : Skip the build step and just copy files."
+  echo "    --no-start     : Do not \"repo start\" a new branch for the copied files."
   echo "    -x             : Display commands before they are executed."
 }
 
@@ -84,15 +84,31 @@
 # Build everything by default
 build_rs=1
 
+# repo start by default
+repo_start=1
+
 while [ $# -gt 0 ]; do
   case "$1" in
     -h|--help)
       print_usage
       exit 0
       ;;
+    -j)
+      # The parallelism value must be a positive integer (leading zeros are
+      # tolerated); a missing value, a non-number, or 0 is a usage error.
+      if [[ $# -lt 2 || ! "$2" =~ ^0*[1-9][0-9]*$ ]]; then
+        echo "Expected positive numeric argument after $1"
+        print_usage
+        exit 99
+      fi
+      NUM_CORES="$2"; shift  # also consume the option's value
+      ;;
     -n|--no-build)
       build_rs=0
       ;;
+    --no-start)
+      repo_start=0
+      ;;
     -x)
       # set lets us enable bash -x mode.
       set -x
@@ -113,6 +129,8 @@
   echo !!! BUILDING RS PREBUILTS !!!
   echo !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
+  echo "Using $NUM_CORES cores"
+
   source build/envsetup.sh
 
   for t in ${TARGETS[@]}; do
@@ -127,10 +145,23 @@
 
 fi
 
-DATE=`date +%Y%m%d`
-
 cd $PREBUILTS_DIR || exit 3
-repo start pb_$DATE .
+
+# Verify that project is "clean".  A failing "git status" must abort too,
+# instead of being counted as zero modified files.
+if ! dirty=$(git status --short --untracked-files=no) || [ -n "$dirty" ]; then
+  echo "$PREBUILTS_DIR contains modified files (or git status failed) -- aborting."
+  git status --untracked-files=no
+  exit 1
+fi
+
+if [ "$repo_start" -eq 1 ]; then
+  DATE=$(date +%Y%m%d)
+  if ! repo start "pb_$DATE" .; then
+    echo repo start failed -- aborting.
+    exit 1
+  fi
+fi
 
 # Don't copy device prebuilts on Darwin. We don't need/use them.
 if [ $DARWIN -eq 0 ]; then