Snap for 8603585 from 0a3207f08f581da4c6f6520a7d69e7fda17eab2a to mainline-tzdata3-release

Change-Id: I21fa83c74f4ab9222eb7514fb8d51eb25fb8bab5
diff --git a/Android.bp b/Android.bp
index ab53382..a2eaf6f 100644
--- a/Android.bp
+++ b/Android.bp
@@ -370,4 +370,5 @@
     "cpu_ref",
     "script_api",
     "support",
+    "toolkit",
 ]
diff --git a/build_rs.py b/build_rs.py
index 0416109..c2faa5b 100755
--- a/build_rs.py
+++ b/build_rs.py
@@ -56,14 +56,17 @@
 
 
 def build(out_dir):
-    products = (
-        'aosp_arm',
-        'aosp_arm64',
-        # 'aosp_mips',
-        # 'aosp_mips64',
-        'aosp_x86',
-        'aosp_x86_64',
-    )
+    if sys.platform == 'darwin':
+        products = ('aosp_arm',)
+    else:
+        products = (
+            'aosp_arm',
+            'aosp_arm64',
+            # 'aosp_mips',
+            # 'aosp_mips64',
+            'aosp_x86',
+            'aosp_x86_64',
+        )
     for product in products:
         build_product(out_dir, product)
 
@@ -78,13 +81,19 @@
     env['TARGET_BUILD_VARIANT'] = 'userdebug'
     env['TARGET_PRODUCT'] = product
 
-    targets = [
-        # PHONY target specified in frameworks/rs/Android.mk.
-        'rs-prebuilts-full',
-        # We have to explicitly specify the jar for JACK to build.
-        android_path('out/target/common/obj/JAVA_LIBRARIES/' +
-            'android-support-v8-renderscript_intermediates/classes.jar')
-    ]
+    if sys.platform == 'darwin':
+        targets = [
+            'llvm-rs-cc',
+            'bcc_compat',
+        ]
+    else:
+        targets = [
+            # PHONY target specified in frameworks/rs/Android.mk.
+            'rs-prebuilts-full',
+            # We have to explicitly specify the jar for JACK to build.
+            android_path('out/target/common/obj/JAVA_LIBRARIES/' +
+                'android-support-v8-renderscript_intermediates/classes.jar')
+        ]
     subprocess.check_call(
         ['build/soong/soong_ui.bash', '--make-mode'] + targets, cwd=android_path(), env=env)
 
@@ -113,7 +122,8 @@
 def install_toolchain(build_dir, install_dir, host):
     install_built_host_files(build_dir, install_dir, host)
     install_clang_headers(build_dir, install_dir, host)
-    install_built_device_files(build_dir, install_dir, host)
+    if not host.startswith('darwin'):
+        install_built_device_files(build_dir, install_dir, host)
     install_license_files(install_dir)
     # We need to package libwinpthread-1.dll for Windows. This is explicitly
     # linked whenever pthreads is used, and the build system doesn't allow
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index ce30092..d102488 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -109,6 +109,15 @@
 namespace android {
 namespace renderscript {
 
+// Convert vector to uchar4, clipping each value to 255.
+template <typename TI>
+static inline uchar4 convertClipped(TI amount) {
+    return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
+                    static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
+                    static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
+                    static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
+}
+
 void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelDriverInfo *info,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t outstep) {
@@ -120,8 +129,11 @@
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
     if (gArchUseSIMD) {
-        if (rsdIntrinsicBlend_K(out, in, info->slot, x1, x2) >= 0)
+        if (rsdIntrinsicBlend_K(out, in, info->slot, 0, x2 - x1) >= 0) {
             return;
+        } else {
+            ALOGW("Intrinsic Blend failed to use SIMD for %d", info->slot);
+        }
     }
 #endif
     switch (info->slot) {
@@ -151,10 +163,10 @@
         }
     #endif
         for (;x1 < x2; x1++, out++, in++) {
-            short4 in_s = convert_short4(*in);
-            short4 out_s = convert_short4(*out);
-            in_s = in_s + ((out_s * (short4)(255 - in_s.w)) >> (short4)8);
-            *out = convert_uchar4(in_s);
+            ushort4 in_s = convert_ushort4(*in);
+            ushort4 out_s = convert_ushort4(*out);
+            in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
+            *out = convertClipped(in_s);
         }
         break;
     case BLEND_DST_OVER:
@@ -170,10 +182,10 @@
         }
      #endif
         for (;x1 < x2; x1++, out++, in++) {
-            short4 in_s = convert_short4(*in);
-            short4 out_s = convert_short4(*out);
-            in_s = out_s + ((in_s * (short4)(255 - out_s.w)) >> (short4)8);
-            *out = convert_uchar4(in_s);
+            ushort4 in_s = convert_ushort4(*in);
+            ushort4 out_s = convert_ushort4(*out);
+            in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
+            *out = convertClipped(in_s);
         }
         break;
     case BLEND_SRC_IN:
@@ -189,8 +201,8 @@
         }
     #endif
         for (;x1 < x2; x1++, out++, in++) {
-            short4 in_s = convert_short4(*in);
-            in_s = (in_s * out->w) >> (short4)8;
+            ushort4 in_s = convert_ushort4(*in);
+            in_s = (in_s * out->w) >> (ushort4)8;
             *out = convert_uchar4(in_s);
         }
         break;
@@ -261,11 +273,14 @@
         }
     #endif
         for (;x1 < x2; x1++, out++, in++) {
-            short4 in_s = convert_short4(*in);
-            short4 out_s = convert_short4(*out);
+            // The max value the operation could produce before the shift
+            // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
+            // That value does not fit in a ushort, so we use uint.
+            uint4 in_s = convert_uint4(*in);
+            uint4 out_s = convert_uint4(*out);
             out_s.xyz = ((in_s.xyz * out_s.w) +
-              (out_s.xyz * ((short3)255 - (short3)in_s.w))) >> (short3)8;
-            *out = convert_uchar4(out_s);
+              (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
+            *out = convertClipped(out_s);
         }
         break;
     case BLEND_DST_ATOP:
@@ -281,12 +296,12 @@
         }
      #endif
         for (;x1 < x2; x1++, out++, in++) {
-            short4 in_s = convert_short4(*in);
-            short4 out_s = convert_short4(*out);
+            uint4 in_s = convert_uint4(*in);
+            uint4 out_s = convert_uint4(*out);
             out_s.xyz = ((out_s.xyz * in_s.w) +
-              (in_s.xyz * ((short3)255 - (short3)out_s.w))) >> (short3)8;
+              (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
             out_s.w = in_s.w;
-            *out = convert_uchar4(out_s);
+            *out = convertClipped(out_s);
         }
         break;
     case BLEND_XOR:
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 8a3dd1a..8afa2ed 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -353,7 +353,7 @@
     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
 
-    uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
+    uchar4 *out = ((uchar4 *)info->outPtr[0]);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -430,7 +430,7 @@
     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
 
-    uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
+    uchar2 *out = ((uchar2 *)info->outPtr[0]);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -508,7 +508,7 @@
     const uchar *yp2 = pin + stride * ys2;
     const uchar *yp3 = pin + stride * ys3;
 
-    uchar *out = ((uchar *)info->outPtr[0]) + xstart;
+    uchar *out = ((uchar *)info->outPtr[0]);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -586,7 +586,7 @@
     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
 
-    float4 *out = ((float4 *)info->outPtr[0]) + xstart;
+    float4 *out = ((float4 *)info->outPtr[0]);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -638,7 +638,7 @@
     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
 
-    float2 *out = ((float2 *)info->outPtr[0]) + xstart;
+    float2 *out = ((float2 *)info->outPtr[0]);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -690,7 +690,7 @@
     const float *yp2 = (const float *)(pin + stride * ys2);
     const float *yp3 = (const float *)(pin + stride * ys3);
 
-    float *out = ((float *)info->outPtr[0]) + xstart;
+    float *out = ((float *)info->outPtr[0]);
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
index b4a2b7c..1473336 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blend.S
@@ -593,7 +593,7 @@
 ENTRY(rsdIntrinsicBlend_K)
     adrp    x5, blendtable
     add     x5, x5, :lo12:blendtable
-    cmp     w2, tablesize >> 1
+    cmp     w2, tablesize
     bhs     1f
     ldrsh   x6, [x5, w2, uxtw #1]
     add     x0, x0, w3, uxtw #2
@@ -615,4 +615,3 @@
 #define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
         BLEND_LIST(BLEND_X)
 #undef BLEND_X
-
diff --git a/script_api/Android.bp b/script_api/Android.bp
index 0f026bf..8046bd0 100644
--- a/script_api/Android.bp
+++ b/script_api/Android.bp
@@ -26,3 +26,9 @@
         never: true,
     },
 }
+
+filegroup {
+    name: "rs_script_api",
+    srcs: ["include/*.rsh"],
+    path: "include",
+}
diff --git a/tests/cpp_api/Android.bp b/tests/cpp_api/Android.bp
new file mode 100644
index 0000000..9c3632a
--- /dev/null
+++ b/tests/cpp_api/Android.bp
@@ -0,0 +1,31 @@
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+cc_defaults {
+    name: "frameworks_rs_tests_cpp-api-defaults",
+    shared_libs: ["liblog"],
+    cflags: [
+        "-Werror",
+        "-Wall",
+        "-Wextra",
+    ],
+    header_libs: ["rs-headers"],
+}
+
diff --git a/tests/cpp_api/Android.mk b/tests/cpp_api/Android.mk
deleted file mode 100644
index 6145a3d..0000000
--- a/tests/cpp_api/Android.mk
+++ /dev/null
@@ -1,3 +0,0 @@
-LOCAL_PATH:=$(call my-dir)
-
-include $(call all-makefiles-under,$(LOCAL_PATH))
diff --git a/tests/cpp_api/common.mk b/tests/cpp_api/common.mk
deleted file mode 100644
index 9bad790..0000000
--- a/tests/cpp_api/common.mk
+++ /dev/null
@@ -1,7 +0,0 @@
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_CFLAGS += -Werror -Wall -Wextra
-LOCAL_LDFLAGS +=  -llog
-
-intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
-LOCAL_C_INCLUDES += $(intermediates)
diff --git a/tests/cpp_api/cpp-globalguard/Android.bp b/tests/cpp_api/cpp-globalguard/Android.bp
new file mode 100644
index 0000000..c0ac0f9
--- /dev/null
+++ b/tests/cpp_api/cpp-globalguard/Android.bp
@@ -0,0 +1,34 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-cpp-globalguard",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    sdk_version: "21",
+    stl: "c++_static",
+    srcs: [
+        "multiply.rscript",
+        "compute.cpp",
+    ],
+    static_libs: ["libRScpp_static"],
+}
diff --git a/tests/cpp_api/cpp-globalguard/Android.mk b/tests/cpp_api/cpp-globalguard/Android.mk
deleted file mode 100644
index 88a10c9..0000000
--- a/tests/cpp_api/cpp-globalguard/Android.mk
+++ /dev/null
@@ -1,21 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-cpp-globalguard
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_SDK_VERSION := 21
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	multiply.rscript \
-	compute.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-LOCAL_LDFLAGS += -llog
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/cppallocation/Android.bp b/tests/cpp_api/cppallocation/Android.bp
new file mode 100644
index 0000000..210969b
--- /dev/null
+++ b/tests/cpp_api/cppallocation/Android.bp
@@ -0,0 +1,32 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+cc_binary {
+    name: "rstest-cppallocation",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    sdk_version: "21",
+    stl: "c++_static",
+    srcs: [
+        "multiply.rscript",
+        "compute.cpp",
+    ],
+    static_libs: ["libRScpp_static"],
+}
diff --git a/tests/cpp_api/cppallocation/Android.mk b/tests/cpp_api/cppallocation/Android.mk
deleted file mode 100644
index d2c7cbc..0000000
--- a/tests/cpp_api/cppallocation/Android.mk
+++ /dev/null
@@ -1,21 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-cppallocation
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_SDK_VERSION := 21
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	multiply.rscript \
-	compute.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-LOCAL_LDFLAGS += -llog
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/cppbasic-getpointer/Android.bp b/tests/cpp_api/cppbasic-getpointer/Android.bp
new file mode 100644
index 0000000..203a8c9
--- /dev/null
+++ b/tests/cpp_api/cppbasic-getpointer/Android.bp
@@ -0,0 +1,32 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-compute-getpointer",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    srcs: [
+        "mono.rscript",
+        "compute.cpp",
+    ],
+    shared_libs: ["libRScpp"],
+}
diff --git a/tests/cpp_api/cppbasic-getpointer/Android.mk b/tests/cpp_api/cppbasic-getpointer/Android.mk
deleted file mode 100644
index 963a3e4..0000000
--- a/tests/cpp_api/cppbasic-getpointer/Android.mk
+++ /dev/null
@@ -1,18 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-compute-getpointer
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
-
-LOCAL_SRC_FILES:= \
-	mono.rscript \
-	compute.cpp
-
-LOCAL_SHARED_LIBRARIES := \
-	libRScpp
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/cppbasic-shared/Android.bp b/tests/cpp_api/cppbasic-shared/Android.bp
new file mode 100644
index 0000000..0b49d86
--- /dev/null
+++ b/tests/cpp_api/cppbasic-shared/Android.bp
@@ -0,0 +1,32 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-compute-shared",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    srcs: [
+        "mono.rscript",
+        "compute.cpp",
+    ],
+    shared_libs: ["libRScpp"],
+}
diff --git a/tests/cpp_api/cppbasic-shared/Android.mk b/tests/cpp_api/cppbasic-shared/Android.mk
deleted file mode 100644
index ca91745..0000000
--- a/tests/cpp_api/cppbasic-shared/Android.mk
+++ /dev/null
@@ -1,18 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-compute-shared
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
-
-LOCAL_SRC_FILES:= \
-	mono.rscript \
-	compute.cpp
-
-LOCAL_SHARED_LIBRARIES := \
-	libRScpp
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/cppbasic/Android.bp b/tests/cpp_api/cppbasic/Android.bp
new file mode 100644
index 0000000..d6723ca
--- /dev/null
+++ b/tests/cpp_api/cppbasic/Android.bp
@@ -0,0 +1,34 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-compute",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    sdk_version: "21",
+    stl: "c++_static",
+    srcs: [
+        "mono.rscript",
+        "compute.cpp",
+    ],
+    static_libs: ["libRScpp_static"],
+}
diff --git a/tests/cpp_api/cppbasic/Android.mk b/tests/cpp_api/cppbasic/Android.mk
deleted file mode 100644
index a1f090f..0000000
--- a/tests/cpp_api/cppbasic/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-compute
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_SDK_VERSION := 21
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	mono.rscript \
-	compute.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/cppf16/Android.bp b/tests/cpp_api/cppf16/Android.bp
new file mode 100644
index 0000000..4aaca31
--- /dev/null
+++ b/tests/cpp_api/cppf16/Android.bp
@@ -0,0 +1,33 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-cppf16",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    stl: "c++_static",
+    srcs: ["compute.cpp"],
+    static_libs: ["libRScpp_static"],
+    shared_libs: [
+        "libdl",
+    ],
+}
diff --git a/tests/cpp_api/cppf16/Android.mk b/tests/cpp_api/cppf16/Android.mk
deleted file mode 100644
index eca91db..0000000
--- a/tests/cpp_api/cppf16/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-cppf16
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	compute.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-LOCAL_LDFLAGS += -llog -ldl
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/cppstrided/Android.bp b/tests/cpp_api/cppstrided/Android.bp
new file mode 100644
index 0000000..df3fc93
--- /dev/null
+++ b/tests/cpp_api/cppstrided/Android.bp
@@ -0,0 +1,34 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-cppstrided",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    sdk_version: "21",
+    stl: "c++_static",
+    srcs: [
+        "multiply.rscript",
+        "compute.cpp",
+    ],
+    static_libs: ["libRScpp_static"],
+}
diff --git a/tests/cpp_api/cppstrided/Android.mk b/tests/cpp_api/cppstrided/Android.mk
deleted file mode 100644
index e0e03b3..0000000
--- a/tests/cpp_api/cppstrided/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-cppstrided
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_SDK_VERSION := 21
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	multiply.rscript \
-	compute.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/latency/Android.bp b/tests/cpp_api/latency/Android.bp
new file mode 100644
index 0000000..3eaf1ee
--- /dev/null
+++ b/tests/cpp_api/latency/Android.bp
@@ -0,0 +1,34 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-latency",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    sdk_version: "21",
+    stl: "c++_static",
+    srcs: [
+        "latency.rscript",
+        "latency.cpp",
+    ],
+    static_libs: ["libRScpp_static"],
+}
diff --git a/tests/cpp_api/latency/Android.mk b/tests/cpp_api/latency/Android.mk
deleted file mode 100644
index 16557f4..0000000
--- a/tests/cpp_api/latency/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-latency
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_SDK_VERSION := 21
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	latency.rscript \
-	latency.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/cpp_api/typecheck/Android.bp b/tests/cpp_api/typecheck/Android.bp
new file mode 100644
index 0000000..793888c
--- /dev/null
+++ b/tests/cpp_api/typecheck/Android.bp
@@ -0,0 +1,34 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+cc_binary {
+    name: "rstest-typecheck",
+    defaults: ["frameworks_rs_tests_cpp-api-defaults"],
+    sdk_version: "21",
+    stl: "c++_static",
+    srcs: [
+        "kernels.rscript",
+        "typecheck.cpp",
+    ],
+    static_libs: ["libRScpp_static"],
+}
diff --git a/tests/cpp_api/typecheck/Android.mk b/tests/cpp_api/typecheck/Android.mk
deleted file mode 100644
index 82f92ea..0000000
--- a/tests/cpp_api/typecheck/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE:= rstest-typecheck
-LOCAL_LICENSE_KINDS:= SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS:= notice
-
-LOCAL_SDK_VERSION := 21
-LOCAL_NDK_STL_VARIANT := c++_static
-
-LOCAL_SRC_FILES:= \
-	kernels.rscript \
-	typecheck.cpp
-
-LOCAL_STATIC_LIBRARIES := \
-	libRScpp_static
-
-include frameworks/rs/tests/cpp_api/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/java_api/Balls/Android.bp b/tests/java_api/Balls/Android.bp
new file mode 100644
index 0000000..e931d3e
--- /dev/null
+++ b/tests/java_api/Balls/Android.bp
@@ -0,0 +1,63 @@
+//
+// Copyright (C) 2008 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
+local_sdk_version = "14"
+// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
+// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
+// For the Android.mk to Android.bp conversion, it is recommended to run the build before and after
+// the conversion, to make sure the value of the '-target-api' option stays the same.
+target_api_level = local_sdk_version
+
+android_test {
+    name: "RsBalls",
+    srcs: [
+        "src/**/*.java",
+        ":RsBalls-rscript{RsBalls.srcjar}",
+    ],
+    resource_zips: [
+        ":RsBalls-rscript{RsBalls.res.zip}",
+    ],
+    sdk_version: local_sdk_version,
+}
+
+genrule {
+    name: "RsBalls-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsBalls.srcjar",
+        "RsBalls.res.zip",
+    ],
+    cmd: "$(location llvm-rs-cc) -target-api " + target_api_level +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $(locations src/**/*.rscript) &&" +
+        "$(location soong_zip) -srcjar -o $(location RsBalls.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RsBalls.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/Balls/Android.mk b/tests/java_api/Balls/Android.mk
deleted file mode 100644
index 409c735..0000000
--- a/tests/java_api/Balls/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2008 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RsBalls
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := 14
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/CannyLive/Android.bp b/tests/java_api/CannyLive/Android.bp
new file mode 100644
index 0000000..24d8a98
--- /dev/null
+++ b/tests/java_api/CannyLive/Android.bp
@@ -0,0 +1,65 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // CannyLive test app: Java sources plus the outputs of the CannyLive-rscript genrule in this file.
+    name: "CannyLive",
+    static_libs: [
+        "androidx.legacy_legacy-support-v4",
+        "androidx.appcompat_appcompat",
+    ],
+    aaptflags: [
+        "--extra-packages", // flag and its value are separate list entries
+        "android.support.v7.appcompat",
+    ],
+    srcs: [
+        "src/**/*.java",
+        ":CannyLive-rscript{CannyLive.srcjar}", // the genrule's CannyLive.srcjar output
+    ],
+    resource_zips: [
+        ":CannyLive-rscript{CannyLive.res.zip}", // the genrule's CannyLive.res.zip output
+    ],
+    sdk_version: "current",
+}
+
+genrule { // Runs llvm-rs-cc once per .rscript under src/, then zips $(genDir)/src and $(genDir)/res.
+    name: "CannyLive-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // cmd passes the directory of its first file as an -I path
+        ":rs_clang_headers", // cmd passes the directory of its first file as an -I path
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "CannyLive.srcjar", // zip of $(genDir)/src (the -p directory)
+        "CannyLive.res.zip", // zip of $(genDir)/res (parent of the -o directory)
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api 22 " +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f} || exit 1; " +
+        "done && " + // a for loop reports only its last iteration's status, so each compile exits on failure
+        "$(location soong_zip) -srcjar -o $(location CannyLive.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location CannyLive.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/CannyLive/Android.mk b/tests/java_api/CannyLive/Android.mk
deleted file mode 100644
index 54aeb00..0000000
--- a/tests/java_api/CannyLive/Android.mk
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-LOCAL_USE_AAPT2 :=true
-LOCAL_STATIC_ANDROID_LIBRARIES += androidx.legacy_legacy-support-v4
-LOCAL_STATIC_ANDROID_LIBRARIES += androidx.appcompat_appcompat
-LOCAL_AAPT_FLAGS += --extra-packages android.support.v7.appcompat
-LOCAL_RENDERSCRIPT_TARGET_API := 22
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-LOCAL_SDK_VERSION := current
-
-LOCAL_PACKAGE_NAME := CannyLive
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/CannyLive/AndroidManifest.xml b/tests/java_api/CannyLive/AndroidManifest.xml
index e7e3299..4764b8f 100644
--- a/tests/java_api/CannyLive/AndroidManifest.xml
+++ b/tests/java_api/CannyLive/AndroidManifest.xml
@@ -16,7 +16,8 @@
         android:theme="@style/AppTheme" >

         <activity

             android:name="com.android.example.cannylive.MainActivity"

-            android:label="@string/app_name" >

+            android:label="@string/app_name"

+            android:exported="true" >

             <intent-filter>

                 <action android:name="android.intent.action.MAIN" />

                 <category android:name="android.intent.category.LAUNCHER" />

diff --git a/tests/java_api/ComputeBenchmark/Android.bp b/tests/java_api/ComputeBenchmark/Android.bp
new file mode 100644
index 0000000..6a4e96d
--- /dev/null
+++ b/tests/java_api/ComputeBenchmark/Android.bp
@@ -0,0 +1,57 @@
+//
+// Copyright (C) 2012 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // RsComputeBenchmark test app: Java sources plus the outputs of the RsComputeBenchmark-rscript genrule.
+    name: "RsComputeBenchmark",
+    srcs: [
+        "src/**/*.java",
+        ":RsComputeBenchmark-rscript{RsComputeBenchmark.srcjar}", // the genrule's srcjar output
+    ],
+    resource_zips: [
+        ":RsComputeBenchmark-rscript{RsComputeBenchmark.res.zip}", // the genrule's res.zip output
+    ],
+    sdk_version: "current",
+    min_sdk_version: "17", // same value as LOCAL_MIN_SDK_VERSION in the Android.mk this file replaces
+}
+
+genrule { // Runs llvm-rs-cc once per .rscript under src/, then zips $(genDir)/src and $(genDir)/res.
+    name: "RsComputeBenchmark-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // cmd passes the directory of its first file as an -I path
+        ":rs_clang_headers", // cmd passes the directory of its first file as an -I path
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsComputeBenchmark.srcjar", // zip of $(genDir)/src (the -p directory)
+        "RsComputeBenchmark.res.zip", // zip of $(genDir)/res (parent of the -o directory)
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f} || exit 1; " +
+        "done && " + // a for loop reports only its last iteration's status, so each compile exits on failure
+        "$(location soong_zip) -srcjar -o $(location RsComputeBenchmark.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RsComputeBenchmark.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/ComputeBenchmark/Android.mk b/tests/java_api/ComputeBenchmark/Android.mk
deleted file mode 100644
index a3ca785..0000000
--- a/tests/java_api/ComputeBenchmark/Android.mk
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Copyright (C) 2012 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RsComputeBenchmark
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 17
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/ComputePerf/Android.bp b/tests/java_api/ComputePerf/Android.bp
new file mode 100644
index 0000000..b60397d
--- /dev/null
+++ b/tests/java_api/ComputePerf/Android.bp
@@ -0,0 +1,57 @@
+//
+// Copyright (C) 2011 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // RsComputePerf test app: Java sources plus the outputs of the RsComputePerf-rscript genrule.
+    name: "RsComputePerf",
+    srcs: [
+        "src/**/*.java",
+        ":RsComputePerf-rscript{RsComputePerf.srcjar}", // the genrule's srcjar output
+    ],
+    resource_zips: [
+        ":RsComputePerf-rscript{RsComputePerf.res.zip}", // the genrule's res.zip output
+    ],
+    sdk_version: "current",
+    min_sdk_version: "19", // same value as LOCAL_MIN_SDK_VERSION in the Android.mk this file replaces
+}
+
+genrule { // Runs llvm-rs-cc once per .rscript under src/, then zips $(genDir)/src and $(genDir)/res.
+    name: "RsComputePerf-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // cmd passes the directory of its first file as an -I path
+        ":rs_clang_headers", // cmd passes the directory of its first file as an -I path
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsComputePerf.srcjar", // zip of $(genDir)/src (the -p directory)
+        "RsComputePerf.res.zip", // zip of $(genDir)/res (parent of the -o directory)
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc)  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f} || exit 1; " +
+        "done && " + // a for loop reports only its last iteration's status, so each compile exits on failure
+        "$(location soong_zip) -srcjar -o $(location RsComputePerf.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RsComputePerf.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/ComputePerf/Android.mk b/tests/java_api/ComputePerf/Android.mk
deleted file mode 100644
index 163e7dd..0000000
--- a/tests/java_api/ComputePerf/Android.mk
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Copyright (C) 2011 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RsComputePerf
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 19
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/GenImages/Android.bp b/tests/java_api/GenImages/Android.bp
new file mode 100644
index 0000000..520e698
--- /dev/null
+++ b/tests/java_api/GenImages/Android.bp
@@ -0,0 +1,29 @@
+//
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // RsGenImages test app; only Java sources are listed, no RenderScript genrule is referenced.
+    name: "RsGenImages",
+    srcs: [
+        "src/**/*.java",
+    ],
+    sdk_version: "current",
+    min_sdk_version: "14", // same value as LOCAL_MIN_SDK_VERSION in the Android.mk this file replaces
+}
diff --git a/tests/java_api/GenImages/Android.mk b/tests/java_api/GenImages/Android.mk
deleted file mode 100644
index d3e00ae..0000000
--- a/tests/java_api/GenImages/Android.mk
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Copyright (C) 2013 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RsGenImages
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 14
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/HealingBrush/Android.bp b/tests/java_api/HealingBrush/Android.bp
new file mode 100644
index 0000000..b8843d5
--- /dev/null
+++ b/tests/java_api/HealingBrush/Android.bp
@@ -0,0 +1,61 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test { // HealingBrush test app: Java sources plus the outputs of the HealingBrush-rscript genrule.
+    name: "HealingBrush",
+    srcs: [
+        "src/**/*.java",
+        ":HealingBrush-rscript{HealingBrush.srcjar}", // the genrule's srcjar output
+    ],
+    resource_zips: [
+        ":HealingBrush-rscript{HealingBrush.res.zip}", // the genrule's res.zip output
+    ],
+    static_libs: ["android-support-v8-renderscript"], // was LOCAL_STATIC_JAVA_LIBRARIES in the replaced Android.mk
+    sdk_version: "current",
+    jni_libs: ["librsjni"], // was LOCAL_JNI_SHARED_LIBRARIES in the replaced Android.mk
+}
+
+genrule { // Runs llvm-rs-cc once per .rscript under src/, then zips $(genDir)/src and $(genDir)/res.
+    name: "HealingBrush-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // cmd passes the directory of its first file as an -I path
+        ":rs_clang_headers", // cmd passes the directory of its first file as an -I path
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "HealingBrush.srcjar", // zip of $(genDir)/src (the -p directory)
+        "HealingBrush.res.zip", // zip of $(genDir)/res (parent of the -o directory)
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+            "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+            "  -target-api 21 -rs-package-name=androidx.renderscript " +
+            "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+            "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f} || exit 1; " +
+            "done && " + // a for loop reports only its last iteration's status, so each compile exits on failure
+            "$(location soong_zip) -srcjar -o $(location HealingBrush.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+            "$(location soong_zip) -o $(location HealingBrush.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/HealingBrush/Android.mk b/tests/java_api/HealingBrush/Android.mk
deleted file mode 100644
index 4ca1251..0000000
--- a/tests/java_api/HealingBrush/Android.mk
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
-
-LOCAL_PACKAGE_NAME := HealingBrush
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_RENDERSCRIPT_TARGET_API := 19
-LOCAL_RENDERSCRIPT_COMPATIBILITY := 18
-
-LOCAL_RENDERSCRIPT_CC := $(LLVM_RS_CC)
-LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := \
-    $(TOPDIR)external/clang/lib/Headers \
-    $(TOPDIR)frameworks/rs/script_api/include
-
-LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
-LOCAL_JNI_SHARED_LIBRARIES := librsjni
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/HelloComputeNDK/Android.bp b/tests/java_api/HelloComputeNDK/Android.bp
new file mode 100644
index 0000000..7dd25d6
--- /dev/null
+++ b/tests/java_api/HelloComputeNDK/Android.bp
@@ -0,0 +1,35 @@
+//
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test { // HelloComputeNDK test app: Java sources plus the libhellocomputendk JNI library.
+    name: "HelloComputeNDK",
+
+    srcs: [
+        "src/**/*.java",
+    ],
+
+    sdk_version: "current",
+
+    jni_libs: ["libhellocomputendk"], // module defined in libhellocomputendk/Android.bp
+
+}
diff --git a/tests/java_api/HelloComputeNDK/Android.mk b/tests/java_api/HelloComputeNDK/Android.mk
deleted file mode 100644
index 5fe2ffd..0000000
--- a/tests/java_api/HelloComputeNDK/Android.mk
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Copyright (C) 2013 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_CFLAGS := -Werror -Wall -Wextra
-
-LOCAL_PACKAGE_NAME := HelloComputeNDK
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libhellocomputendk
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/libhellocomputendk/Android.mk
diff --git a/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.bp b/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.bp
new file mode 100644
index 0000000..00679ff
--- /dev/null
+++ b/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.bp
@@ -0,0 +1,45 @@
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+//
+// This is the shared library included by the JNI test app.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+cc_test_library { // JNI library referenced by the HelloComputeNDK test app's jni_libs.
+    name: "libhellocomputendk",
+
+    srcs: [
+        "helloComputeNDK.cpp",
+        "mono.rscript", // RenderScript source listed next to the C++ source, as in the replaced Android.mk
+    ],
+    ldflags: ["-Wl,-Bsymbolic"],
+    header_libs: ["jni_headers"],
+    shared_libs: [
+        "libdl",
+        "liblog",
+        "libjnigraphics",
+    ],
+    static_libs: ["libRScpp_static"],
+
+    sdk_version: "current",
+
+    stl: "c++_static", // was LOCAL_NDK_STL_VARIANT in the replaced Android.mk
+
+}
diff --git a/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.mk b/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.mk
deleted file mode 100644
index d0a8d88..0000000
--- a/tests/java_api/HelloComputeNDK/libhellocomputendk/Android.mk
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (C) 2013 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#
-# This is the shared library included by the JNI test app.
-#
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
-
-LOCAL_MODULE := libhellocomputendk
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_MODULE_TAGS := tests
-LOCAL_SRC_FILES := helloComputeNDK.cpp mono.rscript
-
-LOCAL_CFLAGS := -Wall -Werror
-LOCAL_LDFLAGS := -Wl,-Bsymbolic
-LOCAL_HEADER_LIBRARIES := jni_headers
-LOCAL_SHARED_LIBRARIES := libdl liblog libjnigraphics
-LOCAL_STATIC_LIBRARIES := libRScpp_static
-
-LOCAL_SDK_VERSION := current
-
-LOCAL_NDK_STL_VARIANT := c++_static
-
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/java_api/ImageProcessing/Android.bp b/tests/java_api/ImageProcessing/Android.bp
new file mode 100644
index 0000000..6ac2167
--- /dev/null
+++ b/tests/java_api/ImageProcessing/Android.bp
@@ -0,0 +1,62 @@
+//
+// Copyright (C) 2009 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // ImageProcessing test app: Java sources plus the outputs of the ImageProcessing-rscript genrule.
+    name: "ImageProcessing",
+    libs: [
+        "android.test.runner.stubs",
+        "android.test.base.stubs",
+    ],
+    static_libs: ["junit"],
+    srcs: [
+        "src/**/*.java",
+        ":ImageProcessing-rscript{ImageProcessing.srcjar}", // the genrule's srcjar output
+    ],
+    resource_zips: [
+        ":ImageProcessing-rscript{ImageProcessing.res.zip}", // the genrule's res.zip output
+    ],
+    sdk_version: "current",
+    min_sdk_version: "23", // same value as LOCAL_MIN_SDK_VERSION in the Android.mk this file replaces
+}
+
+genrule { // Runs llvm-rs-cc once per .rscript under src/, then zips $(genDir)/src and $(genDir)/res.
+    name: "ImageProcessing-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // cmd passes the directory of its first file as an -I path
+        ":rs_clang_headers", // cmd passes the directory of its first file as an -I path
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "ImageProcessing.srcjar", // zip of $(genDir)/src (the -p directory)
+        "ImageProcessing.res.zip", // zip of $(genDir)/res (parent of the -o directory)
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f} || exit 1; " +
+        "done && " + // a for loop reports only its last iteration's status, so each compile exits on failure
+        "$(location soong_zip) -srcjar -o $(location ImageProcessing.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location ImageProcessing.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/ImageProcessing/Android.mk b/tests/java_api/ImageProcessing/Android.mk
deleted file mode 100644
index 5ac0d15..0000000
--- a/tests/java_api/ImageProcessing/Android.mk
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Copyright (C) 2009 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_JAVA_LIBRARIES := android.test.runner.stubs android.test.base.stubs
-
-LOCAL_STATIC_JAVA_LIBRARIES := junit
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := ImageProcessing
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 23
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/ImageProcessing2/Android.bp b/tests/java_api/ImageProcessing2/Android.bp
new file mode 100644
index 0000000..85c3a14
--- /dev/null
+++ b/tests/java_api/ImageProcessing2/Android.bp
@@ -0,0 +1,61 @@
+//
+// Copyright (C) 2009 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test { // ImageProcessing2 test app: Java sources plus the outputs of the ImageProcessing2-rscript genrule.
+    name: "ImageProcessing2",
+    srcs: [
+        "src/**/*.java",
+        ":ImageProcessing2-rscript{ImageProcessing2.srcjar}", // the genrule's srcjar output
+    ],
+    resource_zips: [
+        ":ImageProcessing2-rscript{ImageProcessing2.res.zip}", // the genrule's res.zip output
+    ],
+    static_libs: ["android-support-v8-renderscript"], // was LOCAL_STATIC_JAVA_LIBRARIES in the replaced Android.mk
+    sdk_version: "current",
+    jni_libs: ["librsjni"], // was LOCAL_JNI_SHARED_LIBRARIES in the replaced Android.mk
+}
+
+genrule { // Runs llvm-rs-cc once per .rscript under src/, then zips $(genDir)/src and $(genDir)/res.
+    name: "ImageProcessing2-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // cmd passes the directory of its first file as an -I path
+        ":rs_clang_headers", // cmd passes the directory of its first file as an -I path
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "ImageProcessing2.srcjar", // zip of $(genDir)/src (the -p directory)
+        "ImageProcessing2.res.zip", // zip of $(genDir)/res (parent of the -o directory)
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+            "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+            "  -target-api 21 -rs-package-name=androidx.renderscript " +
+            "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+            "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f} || exit 1; " +
+            "done && " + // a for loop reports only its last iteration's status, so each compile exits on failure
+            "$(location soong_zip) -srcjar -o $(location ImageProcessing2.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+            "$(location soong_zip) -o $(location ImageProcessing2.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/ImageProcessing2/Android.mk b/tests/java_api/ImageProcessing2/Android.mk
deleted file mode 100644
index 7165cc3..0000000
--- a/tests/java_api/ImageProcessing2/Android.mk
+++ /dev/null
@@ -1,39 +0,0 @@
-#
-# Copyright (C) 2009 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
-
-LOCAL_PACKAGE_NAME := ImageProcessing2
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_RENDERSCRIPT_TARGET_API := 18
-LOCAL_RENDERSCRIPT_COMPATIBILITY := 18
-LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := $(TOPDIR)external/clang/lib/Headers \
-                                        $(TOPDIR)frameworks/rs/script_api/include
-
-LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
-LOCAL_JNI_SHARED_LIBRARIES := librsjni
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/ImageProcessing_jb/Android.bp b/tests/java_api/ImageProcessing_jb/Android.bp
new file mode 100644
index 0000000..9ffa7ed
--- /dev/null
+++ b/tests/java_api/ImageProcessing_jb/Android.bp
@@ -0,0 +1,62 @@
+//
+// Copyright (C) 2009 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // Test APK: local Java sources plus the outputs of the ImageProcessingJB-rscript genrule
+    name: "ImageProcessingJB",
+    static_libs: ["androidx.test.rules"],
+    libs: [
+        "android.test.runner.stubs",
+        "android.test.base.stubs",
+    ],
+    test_suites: ["device-tests"],
+    srcs: [
+        "src/**/*.java",
+        ":ImageProcessingJB-rscript{ImageProcessingJB.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":ImageProcessingJB-rscript{ImageProcessingJB.res.zip}", // the genrule's res.zip output, merged into the APK resources
+    ],
+    sdk_version: "current",
+}
+
+genrule { // Runs llvm-rs-cc on every .rscript file and zips what it wrote under $(genDir)
+    name: "ImageProcessingJB-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "ImageProcessingJB.srcjar", // zip of $(genDir)/src
+        "ImageProcessingJB.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " + // one llvm-rs-cc invocation per script
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location ImageProcessingJB.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+        "$(location soong_zip) -o $(location ImageProcessingJB.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/ImageProcessing_jb/Android.mk b/tests/java_api/ImageProcessing_jb/Android.mk
deleted file mode 100644
index 71669e7..0000000
--- a/tests/java_api/ImageProcessing_jb/Android.mk
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Copyright (C) 2009 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
-LOCAL_JAVA_LIBRARIES := android.test.runner.stubs android.test.base.stubs
-
-LOCAL_MODULE_TAGS := tests
-LOCAL_COMPATIBILITY_SUITE += device-tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := ImageProcessingJB
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/LatencyBenchmark/Android.bp b/tests/java_api/LatencyBenchmark/Android.bp
new file mode 100644
index 0000000..0f05493
--- /dev/null
+++ b/tests/java_api/LatencyBenchmark/Android.bp
@@ -0,0 +1,58 @@
+//
+// Copyright (C) 2012 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // Test APK: local Java sources plus the outputs of the RsLatencyBenchmark-rscript genrule
+    name: "RsLatencyBenchmark",
+    srcs: [
+        "src/**/*.java",
+        ":RsLatencyBenchmark-rscript{RsLatencyBenchmark.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":RsLatencyBenchmark-rscript{RsLatencyBenchmark.res.zip}", // the genrule's res.zip output, merged into the APK resources
+    ],
+    sdk_version: "current",
+    min_sdk_version: "23", // same level the genrule passes to llvm-rs-cc via -target-api
+}
+
+genrule { // Runs llvm-rs-cc on every .rscript file and zips what it wrote under $(genDir)
+    name: "RsLatencyBenchmark-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsLatencyBenchmark.srcjar", // zip of $(genDir)/src
+        "RsLatencyBenchmark.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " + // one llvm-rs-cc invocation per script
+        "  $(location llvm-rs-cc) -target-api 23 " + // same value as min_sdk_version on the RsLatencyBenchmark test module
+        "  -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RsLatencyBenchmark.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+        "$(location soong_zip) -o $(location RsLatencyBenchmark.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/LatencyBenchmark/Android.mk b/tests/java_api/LatencyBenchmark/Android.mk
deleted file mode 100644
index 8df2d05..0000000
--- a/tests/java_api/LatencyBenchmark/Android.mk
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Copyright (C) 2012 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_RENDERSCRIPT_TARGET_API := 23
-
-LOCAL_PACKAGE_NAME := RsLatencyBenchmark
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 23
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/LivePreview/Android.bp b/tests/java_api/LivePreview/Android.bp
new file mode 100644
index 0000000..0b06972
--- /dev/null
+++ b/tests/java_api/LivePreview/Android.bp
@@ -0,0 +1,56 @@
+//
+// Copyright (C) 2012 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // Test APK: local Java sources plus the outputs of the PreviewRS-rscript genrule
+    name: "PreviewRS",
+    srcs: [
+        "src/**/*.java",
+        ":PreviewRS-rscript{PreviewRS.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":PreviewRS-rscript{PreviewRS.res.zip}", // the genrule's res.zip output, merged into the APK resources
+    ],
+    sdk_version: "current",
+}
+
+genrule { // Runs llvm-rs-cc on every .rscript file and zips what it wrote under $(genDir)
+    name: "PreviewRS-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "PreviewRS.srcjar", // zip of $(genDir)/src
+        "PreviewRS.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " + // one llvm-rs-cc invocation per script
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location PreviewRS.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+        "$(location soong_zip) -o $(location PreviewRS.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/LivePreview/Android.mk b/tests/java_api/LivePreview/Android.mk
deleted file mode 100644
index a1cbe5f..0000000
--- a/tests/java_api/LivePreview/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2012 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := PreviewRS
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/MathErr/Android.bp b/tests/java_api/MathErr/Android.bp
new file mode 100644
index 0000000..628c5c5
--- /dev/null
+++ b/tests/java_api/MathErr/Android.bp
@@ -0,0 +1,57 @@
+//
+// Copyright (C) 2013 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // Test APK: local Java sources plus the outputs of the RsMathErr-rscript genrule
+    name: "RsMathErr",
+    srcs: [
+        "src/**/*.java",
+        ":RsMathErr-rscript{RsMathErr.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":RsMathErr-rscript{RsMathErr.res.zip}", // the genrule's res.zip output, merged into the APK resources
+    ],
+    sdk_version: "current",
+    min_sdk_version: "19",
+}
+
+genrule { // Runs llvm-rs-cc on every .rscript file and zips what it wrote under $(genDir)
+    name: "RsMathErr-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsMathErr.srcjar", // zip of $(genDir)/src
+        "RsMathErr.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " + // one llvm-rs-cc invocation per script
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RsMathErr.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+        "$(location soong_zip) -o $(location RsMathErr.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/MathErr/Android.mk b/tests/java_api/MathErr/Android.mk
deleted file mode 100644
index 1fae614..0000000
--- a/tests/java_api/MathErr/Android.mk
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Copyright (C) 2013 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RsMathErr
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 19
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RSTestBackward/Android.bp b/tests/java_api/RSTestBackward/Android.bp
new file mode 100644
index 0000000..c962be7
--- /dev/null
+++ b/tests/java_api/RSTestBackward/Android.bp
@@ -0,0 +1,38 @@
+//
+// Copyright (C) 2017 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test { // Test APK: local Java sources, the shared RSUnitTests Java sources, and RenderScript outputs taken from another module
+    name: "RSTestBackward",
+    sdk_version: "current",
+    srcs: [
+        "src/**/*.java",
+        ":RSUnitTests_java_srcs", // filegroup declared in tests/java_api/RSUnitTests/Android.bp
+        ":RSTest-rscript{RSTest.srcjar}", // NOTE(review): RSTest-rscript is not declared in this directory; presumably the RSTest genrule - confirm it exists
+    ],
+    resource_zips: [
+        ":RSTest-rscript{RSTest.res.zip}", // res.zip output of that same RSTest-rscript module
+    ],
+    static_libs: ["androidx.test.rules"],
+    test_suites: ["device-tests"],
+    min_sdk_version: "21",
+}
diff --git a/tests/java_api/RSTestBackward/Android.mk b/tests/java_api/RSTestBackward/Android.mk
deleted file mode 100644
index b3111a6..0000000
--- a/tests/java_api/RSTestBackward/Android.mk
+++ /dev/null
@@ -1,38 +0,0 @@
-#
-# Copyright (C) 2017 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_PACKAGE_NAME := RSTestBackward
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := \
-    $(call all-java-files-under, src)\
-    $(call all-java-files-under, ../RSUnitTests/src)\
-    $(call all-renderscript-files-under, ../RSUnitTests/src)\
-
-LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
-LOCAL_COMPATIBILITY_SUITE := device-tests
-
-LOCAL_RENDERSCRIPT_TARGET_API := current
-LOCAL_MIN_SDK_VERSION := 21
-LOCAL_SDK_VERSION := current
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RSTest_CompatLib/Android.bp b/tests/java_api/RSTest_CompatLib/Android.bp
new file mode 100644
index 0000000..74dc966
--- /dev/null
+++ b/tests/java_api/RSTest_CompatLib/Android.bp
@@ -0,0 +1,66 @@
+//
+// Copyright (C) 2017 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test { // Test APK: local Java sources, shared RSUnitTests sources, and the outputs of the RSTest_Compat-rscript genrule
+    name: "RSTest_Compat",
+    sdk_version: "current",
+    srcs: [
+        "src/**/*.java",
+        ":RSUnitTests_java_gui", // filegroup declared in tests/java_api/RSUnitTests/Android.bp
+        ":RSUnitTests_java_supportlibsrc_gen", // filegroup declared in tests/java_api/RSUnitTests/Android.bp
+        ":RSTest_Compat-rscript{RSTest_Compat.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":RSTest_Compat-rscript{RSTest_Compat.res.zip}", // the genrule's res.zip output, merged into the APK resources
+    ],
+    static_libs: [
+        "androidx.test.rules",
+        "android-support-v8-renderscript",
+    ],
+    min_sdk_version: "8",
+}
+
+genrule { // Runs llvm-rs-cc on the shared supportlibsrc_gen .rscript files and zips what it wrote under $(genDir)
+    name: "RSTest_Compat-rscript",
+    srcs: [
+        ":RSUnitTests_rscript_supportlibsrc_gen", // the scripts to compile; filegroup from tests/java_api/RSUnitTests/Android.bp
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RSTest_Compat.srcjar", // zip of $(genDir)/src
+        "RSTest_Compat.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations :RSUnitTests_rscript_supportlibsrc_gen); do " + // one llvm-rs-cc invocation per script
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+        "  -rs-package-name=androidx.renderscript " + // same flag value the removed Android.mk set in LOCAL_RENDERSCRIPT_FLAGS
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RSTest_Compat.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+        "$(location soong_zip) -o $(location RSTest_Compat.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/RSTest_CompatLib/Android.mk b/tests/java_api/RSTest_CompatLib/Android.mk
deleted file mode 100644
index 2df8904..0000000
--- a/tests/java_api/RSTest_CompatLib/Android.mk
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# Copyright (C) 2017 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_PACKAGE_NAME := RSTest_Compat
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := \
-    $(call all-java-files-under,src) \
-    $(call all-java-files-under,../RSUnitTests/gui) \
-    $(call all-java-files-under,../RSUnitTests/supportlibsrc_gen) \
-    $(call all-renderscript-files-under,../RSUnitTests/supportlibsrc_gen) \
-
-LOCAL_STATIC_JAVA_LIBRARIES := \
-    androidx.test.rules \
-    android-support-v8-renderscript \
-
-LOCAL_RENDERSCRIPT_TARGET_API := current
-LOCAL_RENDERSCRIPT_COMPATIBILITY := true
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 8
-
-LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RSUnitTests/Android.bp b/tests/java_api/RSUnitTests/Android.bp
new file mode 100644
index 0000000..7d35d9f
--- /dev/null
+++ b/tests/java_api/RSUnitTests/Android.bp
@@ -0,0 +1,55 @@
+//
+// Copyright (C) 2008 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+filegroup { // Java sources under src/, referenced from other Android.bp files as ":RSUnitTests_java_srcs"
+    name: "RSUnitTests_java_srcs",
+    srcs: [
+        "src/**/*.java",
+    ],
+}
+
+filegroup { // RenderScript sources under src/
+    name: "RSUnitTests_rscript_srcs",
+    srcs: [
+        "src/**/*.rscript",
+    ],
+}
+
+filegroup { // Java sources under gui/
+    name: "RSUnitTests_java_gui",
+    srcs: [
+        "gui/**/*.java",
+    ],
+}
+
+filegroup { // Java sources under supportlibsrc_gen/
+    name: "RSUnitTests_java_supportlibsrc_gen",
+    srcs: [
+        "supportlibsrc_gen/**/*.java",
+    ],
+}
+
+filegroup { // RenderScript sources under supportlibsrc_gen/
+    name: "RSUnitTests_rscript_supportlibsrc_gen",
+    srcs: [
+        "supportlibsrc_gen/**/*.rscript",
+    ],
+}
diff --git a/tests/java_api/Refocus/Android.bp b/tests/java_api/Refocus/Android.bp
new file mode 100644
index 0000000..26af7aa
--- /dev/null
+++ b/tests/java_api/Refocus/Android.bp
@@ -0,0 +1,64 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test { // Test APK: local Java sources plus the outputs of the Refocus-rscript genrule
+    name: "Refocus",
+    static_libs: [
+        "android-support-v8-renderscript",
+        "xmp_toolkit",
+    ],
+    srcs: [
+        "src/**/*.java",
+        ":Refocus-rscript{Refocus.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":Refocus-rscript{Refocus.res.zip}", // the genrule's res.zip output, merged into the APK resources
+    ],
+    sdk_version: "current",
+    jni_libs: ["librsjni"], // replaces LOCAL_JNI_SHARED_LIBRARIES := librsjni from the removed Android.mk
+}
+
+genrule { // Runs llvm-rs-cc on every .rscript file and zips what it wrote under $(genDir)
+    name: "Refocus-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "Refocus.srcjar", // zip of $(genDir)/src
+        "Refocus.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " + // one llvm-rs-cc invocation per script
+            "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+            "  -target-api 21 -rs-package-name=androidx.renderscript " + // same target API and package flag the removed Android.mk used
+            "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+            "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+            "done && " +
+            "$(location soong_zip) -srcjar -o $(location Refocus.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+            "$(location soong_zip) -o $(location Refocus.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/Refocus/Android.mk b/tests/java_api/Refocus/Android.mk
deleted file mode 100644
index 9635005..0000000
--- a/tests/java_api/Refocus/Android.mk
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
-LOCAL_STATIC_JAVA_LIBRARIES += xmp_toolkit
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := Refocus
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SDK_VERSION := current
-LOCAL_RENDERSCRIPT_TARGET_API := 21
-LOCAL_RENDERSCRIPT_COMPATIBILITY := 21
-
-LOCAL_RENDERSCRIPT_CC := $(LLVM_RS_CC)
-LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := \
-    $(TOPDIR)external/clang/lib/Headers \
-    $(TOPDIR)frameworks/rs/script_api/include
-
-LOCAL_RENDERSCRIPT_FLAGS := -rs-package-name=androidx.renderscript
-LOCAL_JNI_SHARED_LIBRARIES := librsjni
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsCameraDemo/Android.bp b/tests/java_api/RsCameraDemo/Android.bp
new file mode 100644
index 0000000..9e03305
--- /dev/null
+++ b/tests/java_api/RsCameraDemo/Android.bp
@@ -0,0 +1,56 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // Test APK: local Java sources plus the outputs of the RsCameraDemo-rscript genrule
+    name: "RsCameraDemo",
+    srcs: [
+        "src/**/*.java",
+        ":RsCameraDemo-rscript{RsCameraDemo.srcjar}", // the genrule's srcjar output, compiled as Java sources
+    ],
+    resource_zips: [
+        ":RsCameraDemo-rscript{RsCameraDemo.res.zip}", // must be the res.zip output, not the srcjar, so the generated resources get packaged
+    ],
+    sdk_version: "current",
+}
+
+genrule { // Runs llvm-rs-cc on every .rscript file and zips what it wrote under $(genDir)
+    name: "RsCameraDemo-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api", // referenced in cmd only to derive an -I directory
+        ":rs_clang_headers", // referenced in cmd only to derive an -I directory
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsCameraDemo.srcjar", // zip of $(genDir)/src
+        "RsCameraDemo.res.zip", // zip of $(genDir)/res
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " + // one llvm-rs-cc invocation per script
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " + // outputs land in the res/raw and src trees zipped below
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " + // -I = directory of the first :rs_script_api file; $$ is a literal $ for the shell
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " + // -I = directory of the first :rs_clang_headers file
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RsCameraDemo.srcjar) -C $(genDir)/src -D $(genDir)/src &&" + // package $(genDir)/src as the srcjar
+        "$(location soong_zip) -o $(location RsCameraDemo.res.zip) -C $(genDir)/res -D $(genDir)/res", // package $(genDir)/res as the resource zip
+}
diff --git a/tests/java_api/RsCameraDemo/Android.mk b/tests/java_api/RsCameraDemo/Android.mk
deleted file mode 100644
index fd9daf9..0000000
--- a/tests/java_api/RsCameraDemo/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-LOCAL_SDK_VERSION := current
-
-LOCAL_PACKAGE_NAME := RsCameraDemo
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsMinimalTest/Android.bp b/tests/java_api/RsMinimalTest/Android.bp
new file mode 100644
index 0000000..96bc7bb
--- /dev/null
+++ b/tests/java_api/RsMinimalTest/Android.bp
@@ -0,0 +1,28 @@
+//
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test { // Test APK built from Java sources only; this module lists no RenderScript genrule outputs
+    name: "RsMinimalTest",
+    static_libs: ["androidx.test.rules"],
+    srcs: ["src/**/*.java"],
+    sdk_version: "30", // raised from LOCAL_SDK_VERSION := 26; AndroidManifest.xml targetSdkVersion is bumped to 30 in the same change
+    test_suites: ["device-tests"],
+}
diff --git a/tests/java_api/RsMinimalTest/Android.mk b/tests/java_api/RsMinimalTest/Android.mk
deleted file mode 100644
index 65cba1b..0000000
--- a/tests/java_api/RsMinimalTest/Android.mk
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Copyright (C) 2018 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
-
-LOCAL_SRC_FILES := $(call all-java-files-under,src)
-
-LOCAL_RENDERSCRIPT_TARGET_API := 21
-LOCAL_SDK_VERSION := 26
-
-LOCAL_PACKAGE_NAME := RsMinimalTest
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_COMPATIBILITY_SUITE := device-tests
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsMinimalTest/AndroidManifest.xml b/tests/java_api/RsMinimalTest/AndroidManifest.xml
index 006d320..40b5acf 100644
--- a/tests/java_api/RsMinimalTest/AndroidManifest.xml
+++ b/tests/java_api/RsMinimalTest/AndroidManifest.xml
@@ -17,7 +17,7 @@
     package="com.android.rs.minimaltest">
     <uses-sdk
         android:minSdkVersion="21"
-        android:targetSdkVersion="26" />
+        android:targetSdkVersion="30" />
 
     <application
         android:largeHeap="true"
diff --git a/tests/java_api/RsNbody/Android.bp b/tests/java_api/RsNbody/Android.bp
new file mode 100644
index 0000000..0118315
--- /dev/null
+++ b/tests/java_api/RsNbody/Android.bp
@@ -0,0 +1,58 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test {
+    name: "RsNbody",
+    srcs: [
+        "src/**/*.java",
+        ":RsNbody-rscript{RsNbody.srcjar}",
+    ],
+    resource_zips: [
+        ":RsNbody-rscript{RsNbody.res.zip}",
+    ],
+    static_libs: ["androidx.legacy_legacy-support-v4"],
+    sdk_version: "current",
+}
+
+genrule {
+    name: "RsNbody-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RsNbody.srcjar",
+        "RsNbody.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api 23 " +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RsNbody.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RsNbody.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/RsNbody/Android.mk b/tests/java_api/RsNbody/Android.mk
deleted file mode 100644
index 4c99c42..0000000
--- a/tests/java_api/RsNbody/Android.mk
+++ /dev/null
@@ -1,38 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-LOCAL_USE_AAPT2 := true
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_STATIC_ANDROID_LIBRARIES += androidx.legacy_legacy-support-v4
-
-LOCAL_PACKAGE_NAME := RsNbody
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_RENDERSCRIPT_TARGET_API := 23
-
-LOCAL_RENDERSCRIPT_CC := $(LLVM_RS_CC)
-LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := \
-    $(TOPDIR)external/clang/lib/Headers \
-    $(TOPDIR)frameworks/rs/script_api/include
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsTest/Android.bp b/tests/java_api/RsTest/Android.bp
new file mode 100644
index 0000000..3695234
--- /dev/null
+++ b/tests/java_api/RsTest/Android.bp
@@ -0,0 +1,62 @@
+//
+// Copyright (C) 2008 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: [
+        "Android-Apache-2.0",
+    ],
+}
+
+android_test {
+    name: "RSTest",
+    static_libs: ["androidx.test.rules"],
+    srcs: [
+        "src/**/*.java",
+        ":RSUnitTests_java_srcs",
+        ":RSUnitTests_java_gui",
+        ":RSTest-rscript{RSTest.srcjar}",
+    ],
+    resource_zips: [
+        ":RSTest-rscript{RSTest.res.zip}",
+    ],
+    sdk_version: "current",
+    test_suites: ["device-tests"],
+}
+
+genrule {
+    name: "RSTest-rscript",
+    srcs: [
+        ":RSUnitTests_rscript_srcs",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RSTest.srcjar",
+        "RSTest.res.zip",
+    ],
+    cmd: "for f in $(locations :RSUnitTests_rscript_srcs); do " +
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RSTest.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RSTest.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/RsTest/Android.mk b/tests/java_api/RsTest/Android.mk
deleted file mode 100644
index 4f231b5..0000000
--- a/tests/java_api/RsTest/Android.mk
+++ /dev/null
@@ -1,37 +0,0 @@
-#
-# Copyright (C) 2008 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-LOCAL_STATIC_JAVA_LIBRARIES := androidx.test.rules
-
-LOCAL_SRC_FILES := \
-    $(call all-java-files-under,src) \
-    $(call all-java-files-under,../RSUnitTests/gui) \
-    $(call all-java-files-under,../RSUnitTests/src) \
-    $(call all-renderscript-files-under,../RSUnitTests/src) \
-
-LOCAL_RENDERSCRIPT_TARGET_API := current
-
-LOCAL_PACKAGE_NAME := RSTest
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_COMPATIBILITY_SUITE := device-tests
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsTest_11/Android.bp b/tests/java_api/RsTest_11/Android.bp
new file mode 100644
index 0000000..1e176d2
--- /dev/null
+++ b/tests/java_api/RsTest_11/Android.bp
@@ -0,0 +1,65 @@
+//
+// Copyright (C) 2008 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
+local_sdk_version = "11"
+// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
+// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
+// For the Android.mk to Android.bp conversion, it is recommended to run the build before and after
+// the conversion, to make sure the value of the '-target-api' option stays the same.
+target_api_level = local_sdk_version
+
+android_test {
+    name: "RSTest_v11",
+    srcs: [
+        "src/**/*.java",
+        ":RSTest_v11-rscript{RSTest_v11.srcjar}",
+    ],
+    resource_zips: [
+        ":RSTest_v11-rscript{RSTest_v11.res.zip}"
+    ],
+    sdk_version: local_sdk_version,
+}
+
+genrule {
+    name: "RSTest_v11-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RSTest_v11.srcjar",
+        "RSTest_v11.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api " + target_api_level +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RSTest_v11.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RSTest_v11.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/RsTest_11/Android.mk b/tests/java_api/RsTest_11/Android.mk
deleted file mode 100644
index 8741441..0000000
--- a/tests/java_api/RsTest_11/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2008 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RSTest_v11
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := 11
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsTest_14/Android.bp b/tests/java_api/RsTest_14/Android.bp
new file mode 100644
index 0000000..0f52649
--- /dev/null
+++ b/tests/java_api/RsTest_14/Android.bp
@@ -0,0 +1,65 @@
+//
+// Copyright (C) 2008 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
+local_sdk_version = "14"
+// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
+// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
+// For the Android.mk to Android.bp conversion, it is recommended to run the build before and after
+// the conversion, to make sure the value of the '-target-api' option stays the same.
+target_api_level = local_sdk_version
+
+android_test {
+    name: "RSTest_v14",
+    srcs: [
+        "src/**/*.java",
+        ":RSTest_v14-rscript{RSTest_v14.srcjar}",
+    ],
+    resource_zips: [
+        ":RSTest_v14-rscript{RSTest_v14.res.zip}",
+    ],
+    sdk_version: local_sdk_version,
+}
+
+genrule {
+    name: "RSTest_v14-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RSTest_v14.srcjar",
+        "RSTest_v14.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api " + target_api_level +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RSTest_v14.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RSTest_v14.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/RsTest_14/Android.mk b/tests/java_api/RsTest_14/Android.mk
deleted file mode 100644
index 3008e2e..0000000
--- a/tests/java_api/RsTest_14/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2008 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RSTest_v14
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := 14
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/RsTest_16/Android.bp b/tests/java_api/RsTest_16/Android.bp
new file mode 100644
index 0000000..50fce59
--- /dev/null
+++ b/tests/java_api/RsTest_16/Android.bp
@@ -0,0 +1,65 @@
+//
+// Copyright (C) 2008 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+// This variable is used to match the 'LOCAL_SDK_VERSION' field in the former Android.mk file.
+local_sdk_version = "16"
+// This variable is used to set the value of the '-target-api' option for the 'llvm-rs-cc' command.
+// Note: it may NOT always be the same as the 'local_sdk_version', due to the existing logic in the Make build system.
+// For the Android.mk to Android.bp conversion, it is recommended to run the build before and after
+// the conversion, to make sure the value of the '-target-api' option stays the same.
+target_api_level = local_sdk_version
+
+android_test {
+    name: "RSTest_v16",
+    srcs: [
+        "src/**/*.java",
+        ":RSTest_v16-rscript{RSTest_v16.srcjar}",
+    ],
+    resource_zips: [
+        ":RSTest_v16-rscript{RSTest_v16.res.zip}",
+    ],
+    sdk_version: local_sdk_version,
+}
+
+genrule {
+    name: "RSTest_v16-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "RSTest_v16.srcjar",
+        "RSTest_v16.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api " + target_api_level +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location RSTest_v16.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location RSTest_v16.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/RsTest_16/Android.mk b/tests/java_api/RsTest_16/Android.mk
deleted file mode 100644
index 4bb59a2..0000000
--- a/tests/java_api/RsTest_16/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2008 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := RSTest_v16
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := 16
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/SSHealingBrush/Android.bp b/tests/java_api/SSHealingBrush/Android.bp
new file mode 100644
index 0000000..4e139e4
--- /dev/null
+++ b/tests/java_api/SSHealingBrush/Android.bp
@@ -0,0 +1,58 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test {
+    name: "SSHealingBrush",
+    srcs: [
+        "src/**/*.java",
+        ":SSHealingBrush-rscript{SSHealingBrush.srcjar}",
+    ],
+    resource_zips: [
+        ":SSHealingBrush-rscript{SSHealingBrush.res.zip}",
+    ],
+    static_libs: ["android-support-v8-renderscript"],
+    sdk_version: "current",
+}
+
+genrule {
+    name: "SSHealingBrush-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "SSHealingBrush.srcjar",
+        "SSHealingBrush.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api 0" +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location SSHealingBrush.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location SSHealingBrush.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/SSHealingBrush/Android.mk b/tests/java_api/SSHealingBrush/Android.mk
deleted file mode 100644
index 63154e5..0000000
--- a/tests/java_api/SSHealingBrush/Android.mk
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
-
-LOCAL_PACKAGE_NAME := SSHealingBrush
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_RENDERSCRIPT_FLAGS := -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/SampleTest/Android.bp b/tests/java_api/SampleTest/Android.bp
new file mode 100644
index 0000000..0399177
--- /dev/null
+++ b/tests/java_api/SampleTest/Android.bp
@@ -0,0 +1,57 @@
+//
+// Copyright (C) 2012 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test {
+    name: "SampleRS",
+    srcs: [
+        "src/**/*.java",
+        ":SampleRS-rscript{SampleRS.srcjar}",
+    ],
+    resource_zips: [
+        ":SampleRS-rscript{SampleRS.res.zip}",
+    ],
+    sdk_version: "current",
+    min_sdk_version: "17",
+}
+
+genrule {
+    name: "SampleRS-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "SampleRS.srcjar",
+        "SampleRS.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location SampleRS.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location SampleRS.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/SampleTest/Android.mk b/tests/java_api/SampleTest/Android.mk
deleted file mode 100644
index 8152181..0000000
--- a/tests/java_api/SampleTest/Android.mk
+++ /dev/null
@@ -1,30 +0,0 @@
-#
-# Copyright (C) 2012 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := SampleRS
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-LOCAL_MIN_SDK_VERSION := 17
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/ScriptGroupTest/Android.bp b/tests/java_api/ScriptGroupTest/Android.bp
new file mode 100644
index 0000000..2f45df1
--- /dev/null
+++ b/tests/java_api/ScriptGroupTest/Android.bp
@@ -0,0 +1,58 @@
+//
+// Copyright (C) 2009 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test {
+    name: "ScriptGroupTest",
+    libs: ["android.test.runner.stubs"],
+    srcs: [
+        "src/**/*.java",
+        ":ScriptGroupTest-rscript{ScriptGroupTest.srcjar}",
+    ],
+    resource_zips: [
+        ":ScriptGroupTest-rscript{ScriptGroupTest.res.zip}",
+    ],
+    sdk_version: "current",
+}
+
+genrule {
+    name: "ScriptGroupTest-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "ScriptGroupTest.srcjar",
+        "ScriptGroupTest.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -target-api 0" +
+        "  -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location ScriptGroupTest.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location ScriptGroupTest.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/ScriptGroupTest/Android.mk b/tests/java_api/ScriptGroupTest/Android.mk
deleted file mode 100644
index 6fed8b9..0000000
--- a/tests/java_api/ScriptGroupTest/Android.mk
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Copyright (C) 2009 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_JAVA_LIBRARIES := android.test.runner.stubs
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-#LOCAL_STATIC_JAVA_LIBRARIES := android.renderscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -target-api 0
-
-LOCAL_PACKAGE_NAME := ScriptGroupTest
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-include $(BUILD_PACKAGE)
diff --git a/tests/java_api/VrDemo/Android.bp b/tests/java_api/VrDemo/Android.bp
new file mode 100644
index 0000000..5072f2c
--- /dev/null
+++ b/tests/java_api/VrDemo/Android.bp
@@ -0,0 +1,56 @@
+//
+// Copyright (C) 2015 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    // See: http://go/android-license-faq
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_test {
+    name: "VrDemo",
+    srcs: [
+        "src/**/*.java",
+        ":VrDemo-rscript{VrDemo.srcjar}",
+    ],
+    resource_zips: [
+        ":VrDemo-rscript{VrDemo.res.zip}",
+    ],
+    sdk_version: "current",
+}
+
+genrule {
+    name: "VrDemo-rscript",
+    srcs: [
+        "src/**/*.rscript",
+        ":rs_script_api",
+        ":rs_clang_headers",
+    ],
+    tools: [
+        "llvm-rs-cc",
+        "soong_zip",
+    ],
+    out: [
+        "VrDemo.srcjar",
+        "VrDemo.res.zip",
+    ],
+    cmd: "for f in $(locations src/**/*.rscript); do " +
+        "  $(location llvm-rs-cc) -o $(genDir)/res/raw -p $(genDir)/src " +
+        "  -I $$(dirname $$(echo $(locations :rs_script_api) | awk '{ print $$1 }')) " +
+        "  -I $$(dirname $$(echo $(locations :rs_clang_headers) | awk '{ print $$1 }')) $${f}; " +
+        "done && " +
+        "$(location soong_zip) -srcjar -o $(location VrDemo.srcjar) -C $(genDir)/src -D $(genDir)/src &&" +
+        "$(location soong_zip) -o $(location VrDemo.res.zip) -C $(genDir)/res -D $(genDir)/res",
+}
diff --git a/tests/java_api/VrDemo/Android.mk b/tests/java_api/VrDemo/Android.mk
deleted file mode 100644
index e33fb77..0000000
--- a/tests/java_api/VrDemo/Android.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Copyright (C) 2015 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-LOCAL_SDK_VERSION := current
-
-LOCAL_PACKAGE_NAME := VrDemo
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/.gitignore b/tests/lldb/.gitignore
deleted file mode 100644
index b79973f..0000000
--- a/tests/lldb/.gitignore
+++ /dev/null
@@ -1,65 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-*.swp
-
-*~
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*,cover
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-
-results.xml
-LLDBTestsuiteLog.txt
diff --git a/tests/lldb/Android.mk b/tests/lldb/Android.mk
deleted file mode 100644
index 95904a8..0000000
--- a/tests/lldb/Android.mk
+++ /dev/null
@@ -1,4 +0,0 @@
-LOCAL_PATH:=$(call my-dir)
-
-include $(call all-makefiles-under,$(LOCAL_PATH))
-
diff --git a/tests/lldb/README.txt b/tests/lldb/README.txt
deleted file mode 100644
index 70e93bc..0000000
--- a/tests/lldb/README.txt
+++ /dev/null
@@ -1,307 +0,0 @@
-- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
--   LLDB for Renderscript Test Suite
--
--   16/03/2016
-- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
-
-Overview:
-
-    The LLDB for Renderscript test suite is written in python and relies on
-    LLDB's python bindings.  The test suite will push several test app's onto
-    a target device, and makes coordinated use of LLDB and ADB to run automated
-    debug sessions.
-
-Set up the test suite:
-
-    Check out the AOSP and build it for your target. Navigate to
-    /frameworks/rs/test/lldb and type mm.  This should successfully build the
-    binaries that the testsuite uses. They will be placed in
-    <path to out folder>/target/product/<product code name>/data/app, system/lib
-    and system/bin.
-
-Prerequisite:
-
-    An lldb-server executable must be present on your device/emulator.
-    LLDB must be compiled on your host machine along with its python interface.
-    lldb-server and lldb should be built from the same source revisions.
-
-Running the test suite:
-
-    The test suite can be run via the following command:
-
-        > python run_tests.py [-h]
-                              [--config path]
-                              [--device DEVICE]
-                              [--test path]
-                              [--install-only]
-                              [--no-install]
-                              [--no-uninstall]
-                              [--print-to-stdout]
-                              [--verbose]
-                              [--wimpy]
-                              [--run-emu]
-                              [--adb-path ADB_PATH]
-                              [--aosp-product-path AOSP_PRODUCT_PATH]
-                              [--blocklist BLOCKLIST [BLOCKLIST ...]]
-                              [--device-port DEVICE_PORT]
-                              [--emu-cmd EMU_CMD]
-                              [--host-port HOST_PORT]
-                              [--lldb-path LLDB_PATH]
-                              [--lldb-server-path-device LLDB_SERVER_PATH_DEVICE]
-                              [--lldb-server-path-host LLDB_SERVER_PATH_HOST]
-                              [--log-file-path LOG_FILE_PATH]
-                              [--results-file-path RESULTS_FILE_PATH]
-                              [--timeout TIMEOUT]
-
-        optional arguments:
-          -h, --help            show this help message and exit
-          --config path, -c path
-                                Path to a custom config file.
-          --device DEVICE, -d DEVICE
-                                Specify the device id of the device to test on.
-          --test path, -t path  Specify a specific test to run.
-          --install-only        It only runs the pre-run stage of the test suite. It
-                                installs the required APKs but does not execute the
-                                tests.
-          --no-install, -n      Stop the test suite installing apks to device.
-          --no-uninstall        Stop the test suite uninstalling apks after
-                                completion.
-          --print-to-stdout     Print all logging information to standard out.
-          --verbose, -v         Store extra info in the log.
-          --wimpy, -w           Test only a core subset of features.
-          --run-emu             Spawn an emulator and run the test suite on that.
-                                Specify the emulator command line in the config file
-                                or with -emu-cmd.
-          --adb-path ADB_PATH   Path to android debug bridge on the host.
-          --aosp-product-path AOSP_PRODUCT_PATH
-                                The path to the "out" folder of the AOSP repository.
-          --blocklist BLOCKLIST [BLOCKLIST ...]
-                                Provide a test blocklist for skipping specific tests.
-                                To specify the blocklist from the command line the
-                                following can be used: --blocklist test1.py test2.py
-                                ...
-          --device-port DEVICE_PORT
-                                Specify the port number that lldb-server (on the
-                                device) listens on. When lldb-server is spawned on the
-                                device it will listen on this port. Each successive
-                                test will increment onwards from this port.
-          --emu-cmd EMU_CMD     The command line for the emulator (if using -run-emu).
-          --host-port HOST_PORT
-                                Specify host port which lldb-server will be forwarded
-                                to. Specify the starting host port number that lldb-
-                                server (on the target) will be forwarded to on the
-                                host. Each successive test will increment onwards from
-                                this initial port.
-          --lldb-path LLDB_PATH
-                                The path to lldb executable on the host.
-          --lldb-server-path-device LLDB_SERVER_PATH_DEVICE
-                                Path to the lldb-server executable on the device.
-          --lldb-server-path-host LLDB_SERVER_PATH_HOST
-                                Path to the lldb-server executable on host (if using
-                                -run-emu).
-          --log-file-path LOG_FILE_PATH
-                                The path to the file where the log will be written.
-          --results-file-path RESULTS_FILE_PATH
-                                The path to the file where junit results.xml will be
-                                written.
-          --timeout TIMEOUT     Timeout period for a single command, expressed in
-                                seconds
-
-    An optional config file can be passed to the test suite which will provide
-    details of your specific environment. The user file should define a custom
-    class inheriting from Config in config.py. The class Config presents the
-    default set of options, available to be overridden.
-
-    All options in the config file can also be specified on the command line.
-
-    If your config and command line do not specify a path to the host lldb,
-    the PYTHONPATH environment variable must be set.  The appropriate value to
-    set this to can be obtained by running the following command:
-
-        > lldb -P
-
-    This will print out a path to the lldb python bindings on your local machine.
-
-Build Requirements:
-
-
-    The following revisions are from the llvm git mirror:
-
-    llvm : 5786b73
-    clang: b6d0b32
-    lldb : 98712eb
-
-    lldb has the following dependencies:
-
-      Python2.7.6
-      swig2.0
-      lldb-server
-
-Building LLDB python bindings:
-
-    Linux:
-
-        Build instructions for Linux lldb can be found on the official lldb web
-        page:
-
-          http://lldb.llvm.org/build.html
-
-        The following CMake variables should be enabled when generating:
-
-          LLDB_ENABLE_PYTHON_SCRIPTS_SWIG_API_GENERATION = True
-
-        As a post build step, swig will generate the python bindings for lldb.
-
-    Windows:
-
-        Prerequisites:
-
-            Visual Studio 2015
-            Custom x64 Python2.7.10 package:
-                http://p-nand-q.com/python/2015.08.07-Python2710-x64-vs2015.7z
-            Swig Version 3.0.5 (Added to the $PATH)
-            Cmake 3.4.0 (So we can generate Visual Studio 2015 solutions)
-
-        Build Python for Windows:
-
-            http://p-nand-q.com/python/building-python-27-with-vs2010.html
-
-            The important thing here is that the above python distribution
-            contains debug versions of the libraries and is built with the same
-            Visual Studio version we are using so the runtimes do not conflict.
-
-        Build LLDB on Windows:
-
-            Select the Cmake generator "Microsoft Visual Studio 2015 Win64".
-            The following CMake variables should be enabled when generating:
-
-                LLDB_DISABLE_PYTHON=False
-                LLDB_RELOCATABLE_PYTHON=False
-                PYTHON_HOME=<path to the above python release>
-
-            Using cmake-gui is a good idea and lets you make sure that swig has
-            been correctly detected.
-
-            In the CMake configure step, you should see something similar to the
-            following:
-            -- Found Python version 2.7.9
-            -- LLDB Found PythonExecutable: E:/Python27/python.exe and
-            E:/Python27/python_d.exe
-            -- LLDB Found PythonLibs: E:/Python27/libs/python27.lib and
-            E:/Python27/libs/python27_d.lib
-            -- LLDB Found PythonDLL: E:/Python27/python27.dll and
-            E:/Python27/python27_d.dll
-            -- LLDB Found PythonIncludeDirs: E:/Python27/Include
-            LLDB version: 3.8.0
-            Could NOT find Doxygen (missing:  DOXYGEN_EXECUTABLE)
-            Found PythonInterp:
-            $<$<CONFIG:Debug>:E:/Python27/python_d.exe>$<$<NOT:$<CONFIG:Debug>>:E:/Python27/python.exe>
-            (found version "1.4")
-
-            Build LLDB as a RelWithDebInfo build, because debug builds of lldb
-            produce an lldb_d.pyd file, but the __init__.py still refers to
-            lldb.pyd.
-
-            The LLDB python bindings should have built in
-            "llvm_build\RelWithDebInfo\lib\site-packages\lldb". Point the test
-            suite towards "llvm_build\RelWithDebInfo\bin\lldb.exe".
-            When running the test suite itself, make sure to use the python
-            executable from the custom package.
-
-A typical test transcript:
-
-    Located ADB
-    Located device ZX1G427S2S
-    Pushing all tests...
-    Pushed all tests
-    Pre run complete
-    Found 86 tests
-    Running test_allocation_dump_2_cpp.py: PASS
-    Running test_breakpoint_fileline_multiple_rs_files_cpp.py: PASS
-    Running test_read_local_cpp.py: PASS
-    Running test_breakpoint_kernel_multiple_rs_files_cpp.py: PASS
-    Running test_multiple_rs_files_cpp.py: PASS
-    Running test_breakpoint_kernel_all_multiple_rs_files_cpp.py: PASS
-    Running test_dwarf_lang_cpp.py: PASS
-    Running test_write_global_element_cpp.py: PASS
-    Running test_allocation_list_cpp.py: PASS
-    Running test_breakpoint_coordinate_cpp.py: PASS
-    Running test_rs_consts_cpp.py: PASS
-    Running test_allocation_file_cpp.py: PASS
-    Running test_allocation_dump_1_cpp.py: PASS
-    Running test_source_step_cpp.py: PASS
-    Running test_breakpoint_kernel_2_cpp.py: PASS
-    Running test_backtrace_cpp.py: PASS
-    Running test_language_subcmds_no_debug_cpp.py: PASS
-    Running test_breakpoint_kernel_1_cpp.py: PASS
-    Running test_language_subcmds_cpp.py: PASS
-    Running test_write_global_cpp.py: PASS
-    Running test_invoke_fun_cpp.py: PASS
-    Running test_breakpoint_fileline_cpp.py: PASS
-    Running test_write_local_cpp.py: PASS
-    Running test_breakpoint_kernel_all_cpp.py: PASS
-    Running test_write_local_element_cpp.py: PASS
-    Running test_call_api_funs_cpp.py: PASS
-    Running test_coordinates_cpp.py: PASS
-    Running test_read_global_cpp.py: PASS
-    Running test_language_subcmds.py: PASS
-    Running test_coordinates.py: PASS
-    Running test_language_subcmds_no_debug.py: PASS
-    Running test_read_local.py: PASS
-    Running test_call_api_funs.py: PASS
-    Running test_breakpoint_kernel_1.py: PASS
-    Running test_breakpoint_fileline.py: PASS
-    Running test_breakpoint_fileline_multiple_rs_files.py: PASS
-    Running test_rs_consts.py: PASS
-    Running test_invoke_fun.py: PASS
-    Running test_write_local_element.py: PASS
-    Running test_source_step.py: PASS
-    Running test_allocation_file.py: PASS
-    Running test_allocation_list.py: PASS
-    Running test_breakpoint_kernel_multiple_rs_files.py: PASS
-    Running test_allocation_dump_1.py: PASS
-    Running test_breakpoint_kernel_all.py: PASS
-    Running test_allocation_dump_2.py: PASS
-    Running test_allocation_dump_struct.py: PASS
-    Running test_read_global.py: PASS
-    Running test_language.py: PASS
-    Running test_dwarf_lang.py: PASS
-    Running test_breakpoint_coordinate.py: PASS
-    Running test_write_global.py: PASS
-    Running test_multiple_rs_files.py: PASS
-    Running test_write_global_element.py: PASS
-    Running test_breakpoint_kernel_all_multiple_rs_files.py: PASS
-    Running test_breakpoint_kernel_2.py: PASS
-    Running test_write_local.py: PASS
-    Running test_backtrace.py: PASS
-    Running test_call_api_funs_jni.py: PASS
-    Running test_invoke_fun_jni.py: PASS
-    Running test_allocation_dump_1_jni.py: PASS
-    Running test_breakpoint_fileline_multiple_rs_files_jni.py: PASS
-    Running test_allocation_file_jni.py: PASS
-    Running test_breakpoint_fileline_jni.py: PASS
-    Running test_source_step_jni.py: PASS
-    Running test_coordinates_jni.py: PASS
-    Running test_rs_consts_jni.py: PASS
-    Running test_breakpoint_kernel_all_multiple_rs_files_jni.py: PASS
-    Running test_multiple_rs_files_jni.py: PASS
-    Running test_allocation_dump_2_jni.py: PASS
-    Running test_allocation_list_jni.py: PASS
-    Running test_write_local_element_jni.py: PASS
-    Running test_breakpoint_kernel_all_jni.py: PASS
-    Running test_breakpoint_coordinate_jni.py: PASS
-    Running test_language_subcmds_no_debug_jni.py: PASS
-    Running test_read_local_jni.py: PASS
-    Running test_dwarf_lang_jni.py: PASS
-    Running test_breakpoint_kernel_2_jni.py: PASS
-    Running test_breakpoint_kernel_multiple_rs_files_jni.py: PASS
-    Running test_write_global_element_jni.py: PASS
-    Running test_breakpoint_kernel_1_jni.py: PASS
-    Running test_read_global_jni.py: PASS
-    Running test_language_subcmds_jni.py: PASS
-    Running test_write_global_jni.py: PASS
-    Running test_backtrace_jni.py: PASS
-    Running test_write_local_jni.py: PASS
-    Uninstalled/Deleted all tests
-    86 of 86 passed
-    100% rate
diff --git a/tests/lldb/config.py b/tests/lldb/config.py
deleted file mode 100644
index bc478d0..0000000
--- a/tests/lldb/config.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''LLDB-Renderscript test suite configuration file.
-
-This file contains the default test suite config which will be used in the
-case a developer did not supply a custom one.'''
-
-import os
-from tests.harness.decorators import deprecated
-
-
-class Config(object):
-    '''Test suite configuration object.
-
-    The Config class is used by the test suite to abstract the specifics of a
-    user's local setup.  This config can be overridden by specifying a custom
-    config on the command line.'''
-    # pylint: disable=no-self-use
-
-    @property
-    def adb_path(self):
-        '''Path to android debug bridge on the host.'''
-        return 'adb'
-
-    @property
-    def host_port(self):
-        '''Specify host port which lldb-server will be forwarded to.
-
-        Specify the starting host port number that lldb-server (on the target)
-        will be forwarded to on the host. Each successive test will increment
-        onwards from this initial port.'''
-        return 1234
-
-    @property
-    def device_port(self):
-        '''Specify the port number that lldb-server (on the device) listens on.
-
-        When lldb-server is spawned on the device it will listen on this port.
-        Each successive test will increment onwards from this port.'''
-        return 1234
-
-    @property
-    def lldb_server_path_device(self):
-        '''Path to the lldb-server executable on the device.'''
-        return '/data/lldb-server'
-
-    @property
-    def lldb_server_path_host(self):
-        '''Path to the lldb-server executable on host (if using -run-emu).'''
-        return 'lldb-server'
-
-    @property
-    def aosp_product_path(self):
-        '''The path to the "out" folder of the AOSP repository.'''
-        return os.getenv('ANDROID_PRODUCT_OUT')
-
-    @property
-    def log_file_path(self):
-        '''The path to the file where the log will be written.'''
-        return os.path.join(os.getcwd(), 'LLDBTestsuiteLog.txt')
-
-    @property
-    def results_file_path(self):
-        '''The path to the file where junit results.xml will be written.'''
-        return os.path.join(os.getcwd(), 'results.xml')
-
-    @property
-    def lldb_path(self):
-        '''The path to lldb executable on the host.'''
-        return 'lldb'
-
-    @property
-    def blocklist(self):
-        '''Provide a test blocklist for skipping specific tests.
-
-        To specify the blocklist from the command line the following can be
-        used: --blocklist test1.py test2.py ...'''
-        return []
-
-    @property
-    def verbose(self):
-        '''Flag to indicate whether to store extra output in the logs.'''
-        return False
-
-    @property
-    def device(self):
-        '''Specify the device id of the device to run on.
-
-        When multiple devices or emulators are present, a specific device to
-        use while testing can be indicated here.'''
-        return os.environ.get('ANDROID_SERIAL')
-
-    @property
-    def timeout(self):
-        '''Timeout period for a single command, expressed in seconds'''
-        return 60 * 15
-
-    @property
-    @deprecated()
-    def emu_cmd(self):
-        '''The command line for the emulator (if using -run-emu).'''
-        return os.path.join(os.path.dirname(__file__), '..', '..', '..', '..',
-                            'prebuilts', 'android-emulator', 'linux-x86_64',
-                            'emulator')
diff --git a/tests/lldb/cpp/Allocations/Allocations.cpp b/tests/lldb/cpp/Allocations/Allocations.cpp
deleted file mode 100644
index 4ad546d..0000000
--- a/tests/lldb/cpp/Allocations/Allocations.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <RenderScript.h>
-
-#include "ScriptC_allocs.h"
-
-sp<RS> mRS;
-sp<Allocation> mBoolAllocation;  // boolean
-
-sp<Allocation> mCharAllocation;  // char
-sp<Allocation> mChar2Allocation;   // char2
-sp<Allocation> mChar3Allocation;   // char3
-sp<Allocation> mChar4Allocation;   // char4
-
-sp<Allocation> mUCharAllocation;   // uchar
-sp<Allocation> mUChar2Allocation;  // uchar2
-sp<Allocation> mUChar3Allocation;  // uchar3
-sp<Allocation> mUChar4Allocation;  // uchar4
-
-sp<Allocation> mShortAllocation;   // short
-sp<Allocation> mShort2Allocation;  // short2
-sp<Allocation> mShort3Allocation;  // short3
-sp<Allocation> mShort4Allocation;  // short4
-
-sp<Allocation> mUShortAllocation;  // ushort
-sp<Allocation> mUShort2Allocation; // ushort2
-sp<Allocation> mUShort3Allocation; // ushort3
-sp<Allocation> mUShort4Allocation; // ushort4
-
-sp<Allocation> mIntAllocation;   // int
-sp<Allocation> mInt2Allocation;  // int2
-sp<Allocation> mInt3Allocation;  // int3
-sp<Allocation> mInt4Allocation;  // int4
-
-sp<Allocation> mUIntAllocation;  // uint
-sp<Allocation> mUInt2Allocation;   // uint2
-sp<Allocation> mUInt3Allocation;   // uint3
-sp<Allocation> mUInt4Allocation;   // uint4
-
-sp<Allocation> mLongAllocation;  // long
-sp<Allocation> mLong2Allocation;   // long2
-sp<Allocation> mLong3Allocation;   // long3
-sp<Allocation> mLong4Allocation;   // long4
-
-sp<Allocation> mULongAllocation;   // ulong
-sp<Allocation> mULong2Allocation;  // ulong2
-sp<Allocation> mULong3Allocation;  // ulong3
-sp<Allocation> mULong4Allocation;  // ulong4
-
-sp<Allocation> mHalfAllocation;  // half
-sp<Allocation> mHalf2Allocation;   // half2
-sp<Allocation> mHalf3Allocation;   // half3
-sp<Allocation> mHalf4Allocation;   // half4
-
-sp<Allocation> mFloatAllocation;   // float
-sp<Allocation> mFloat2Allocation;  // float2
-sp<Allocation> mFloat3Allocation;  // float3
-sp<Allocation> mFloat4Allocation;  // float4
-
-sp<Allocation> mDoubleAllocation;  // double
-sp<Allocation> mDouble2Allocation; // double2
-sp<Allocation> mDouble3Allocation; // double3
-sp<Allocation> mDouble4Allocation; // double4
-
-const int mAllocSize = 24; // Needs to be < CHAR_MAX and divisible by 4.
-const int mBitmapSize = 64;
-
-void createSignedAllocations() {
-    Type::Builder typeI8Builder(mRS, Element::I8(mRS));
-    typeI8Builder.setX(1); // One element here to test 16 byte memory alignment
-    typeI8Builder.setY(3);
-    typeI8Builder.setZ(8);
-
-    mCharAllocation = Allocation::createTyped(mRS, typeI8Builder.create());
-    mChar2Allocation = Allocation::createSized(mRS, Element::I8_2(mRS), mAllocSize / 2);
-    mChar3Allocation = Allocation::createSized(mRS, Element::I8_3(mRS), mAllocSize / 4);
-    mChar4Allocation = Allocation::createSized(mRS, Element::I8_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeI16_2Builder(mRS, Element::I16_2(mRS));
-    typeI16_2Builder.setX(6);
-    typeI16_2Builder.setY(1);
-    typeI16_2Builder.setZ(2);
-
-    mShortAllocation = Allocation::createSized(mRS, Element::I16(mRS), mAllocSize);
-    mShort2Allocation = Allocation::createTyped(mRS, typeI16_2Builder.create());
-    mShort3Allocation = Allocation::createSized(mRS, Element::I16_3(mRS), mAllocSize / 4);
-    mShort4Allocation = Allocation::createSized(mRS, Element::I16_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeI32_3Builder(mRS, Element::I32_3(mRS));
-    typeI32_3Builder.setX(3);
-    typeI32_3Builder.setY(2);
-
-    mIntAllocation = Allocation::createSized(mRS, Element::I32(mRS), mAllocSize);
-    mInt2Allocation = Allocation::createSized(mRS, Element::I32_2(mRS), mAllocSize / 2);
-    mInt3Allocation = Allocation::createTyped(mRS, typeI32_3Builder.create());
-    mInt4Allocation = Allocation::createSized(mRS, Element::I32_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeI64_4Builder(mRS, Element::I64_4(mRS));
-    typeI64_4Builder.setX(1);
-    typeI64_4Builder.setY(6);
-
-    mLongAllocation = Allocation::createSized(mRS, Element::I64(mRS), mAllocSize);
-    mLong2Allocation = Allocation::createSized(mRS, Element::I64_2(mRS), mAllocSize / 2);
-    mLong3Allocation = Allocation::createSized(mRS, Element::I64_3(mRS), mAllocSize / 4);
-    mLong4Allocation = Allocation::createTyped(mRS, typeI64_4Builder.create());
-
-    mBoolAllocation = Allocation::createSized(mRS, Element::BOOLEAN(mRS), mAllocSize);
-}
-
-void initSignedAllocations() {
-    char *buffer_char = new char[mAllocSize];
-    short *buffer_short = new short[mAllocSize];
-    int *buffer_int = new int[mAllocSize];
-    int64_t *buffer_long = new int64_t[mAllocSize];
-    char *buffer_bool = new char[mAllocSize];
-
-    for(int i = 0; i < mAllocSize; ++i) {
-        buffer_char[i] = (char) i;
-        buffer_short[i] = (short) i;
-        buffer_int[i] = (int) i;
-        buffer_long[i] = (int64_t) i;
-        buffer_bool[i] =  (char) (0x01 & i);
-    }
-
-    mCharAllocation->copy3DRangeFrom(0, 0, 0, 1, 3, 8, buffer_char);
-    mChar2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_char);
-    mChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-    mChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-
-    delete [] buffer_char;
-
-    mShortAllocation->copy1DRangeFrom(0, mAllocSize, buffer_short);
-    mShort2Allocation->copy3DRangeFrom(0, 0, 0, 6, 1, 2, buffer_short);
-    mShort3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
-    mShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
-
-    delete [] buffer_short;
-
-    mIntAllocation->copy1DRangeFrom(0, mAllocSize, buffer_int);
-    mInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_int);
-    mInt3Allocation->copy2DRangeFrom(0, 0, 3, 2, buffer_int);
-    mInt4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_int);
-
-    delete [] buffer_int;
-
-    mLongAllocation->copy1DRangeFrom(0, mAllocSize, buffer_long);
-    mLong2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_long);
-    mLong3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
-    mLong4Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_long);
-
-    delete [] buffer_long;
-
-    mBoolAllocation->copy1DRangeFrom(0, mAllocSize, buffer_bool);
-
-    delete [] buffer_bool;
-}
-
-void createUnsignedAllocations() {
-    Type::Builder typeU8_2Builder(mRS, Element::U8_2(mRS));
-    typeU8_2Builder.setX(2);
-    typeU8_2Builder.setY(6);
-
-    mUCharAllocation = Allocation::createSized(mRS, Element::U8(mRS), mAllocSize);
-    mUChar2Allocation = Allocation::createTyped(mRS, typeU8_2Builder.create());
-    mUChar3Allocation = Allocation::createSized(mRS, Element::U8_3(mRS), mAllocSize / 4);
-    mUChar4Allocation = Allocation::createSized(mRS, Element::U8_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeU16_3Builder(mRS, Element::U16_3(mRS));
-    typeU16_3Builder.setX(1);
-    typeU16_3Builder.setY(6);
-
-    mUShortAllocation = Allocation::createSized(mRS, Element::U16(mRS), mAllocSize);
-    mUShort2Allocation = Allocation::createSized(mRS, Element::U16_2(mRS), mAllocSize / 2);
-    mUShort3Allocation = Allocation::createTyped(mRS, typeU16_3Builder.create());
-    mUShort4Allocation = Allocation::createSized(mRS, Element::U16_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeU32_4Builder(mRS, Element::U32_4(mRS));
-    typeU32_4Builder.setX(1);
-    typeU32_4Builder.setY(1);
-    typeU32_4Builder.setZ(6);
-
-    mUIntAllocation = Allocation::createSized(mRS, Element::U32(mRS), mAllocSize);
-    mUInt2Allocation = Allocation::createSized(mRS, Element::U32_2(mRS), mAllocSize / 2);
-    mUInt3Allocation = Allocation::createSized(mRS, Element::U32_3(mRS), mAllocSize / 4);
-    mUInt4Allocation = Allocation::createTyped(mRS, typeU32_4Builder.create());
-
-    Type::Builder typeU64Builder(mRS, Element::U64(mRS));
-    typeU64Builder.setX(4);
-    typeU64Builder.setY(3);
-    typeU64Builder.setZ(2);
-
-    mULongAllocation = Allocation::createTyped(mRS, typeU64Builder.create());
-    mULong2Allocation = Allocation::createSized(mRS, Element::U64_2(mRS), mAllocSize / 2);
-    mULong3Allocation = Allocation::createSized(mRS, Element::U64_3(mRS), mAllocSize / 4);
-    mULong4Allocation = Allocation::createSized(mRS, Element::U64_4(mRS), mAllocSize / 4);
-}
-
-void initUnsignedAllocations() {
-    char *buffer_char = new char[mAllocSize];
-    short *buffer_short = new short[mAllocSize];
-    int *buffer_int = new int[mAllocSize];
-    uint64_t *buffer_long = new uint64_t[mAllocSize];
-
-    for(int i = 0; i < mAllocSize; ++i) {
-        buffer_char[i] = (char) i;
-        buffer_short[i] = (short) i;
-        buffer_int[i] = (int) i;
-        buffer_long[i] = (uint64_t) i;
-    }
-
-    mUCharAllocation->copy1DRangeFrom(0, mAllocSize, buffer_char);
-    mUChar2Allocation->copy2DRangeFrom(0, 0, 2, 6, buffer_char);
-    mUChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-    mUChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-
-    delete [] buffer_char;
-
-    mUShortAllocation->copy1DRangeFrom(0, mAllocSize, buffer_short);
-    mUShort2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_short);
-    mUShort3Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_short);
-    mUShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
-
-    delete [] buffer_short;
-
-    mUIntAllocation->copy1DRangeFrom(0, mAllocSize, buffer_int);
-    mUInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_int);
-    mUInt3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_int);
-    mUInt4Allocation->copy3DRangeFrom(0, 0, 0, 1, 1, 6, buffer_int);
-
-    delete [] buffer_int;
-
-    mULongAllocation->copy3DRangeFrom(0, 0, 0, 4, 3, 2, buffer_long);
-    mULong2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_long);
-    mULong3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
-    mULong4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
-
-    delete [] buffer_long;
-}
-
-void createFloatAllocations() {
-    Type::Builder typeF16_3Builder(mRS, Element::F16_3(mRS));
-    typeF16_3Builder.setX(1);
-    typeF16_3Builder.setY(6);
-
-    mHalfAllocation = Allocation::createSized(mRS, Element::F16(mRS), mAllocSize);
-    mHalf2Allocation = Allocation::createSized(mRS, Element::F16_2(mRS), mAllocSize / 2);
-    mHalf3Allocation = Allocation::createTyped(mRS, typeF16_3Builder.create());
-    mHalf4Allocation = Allocation::createSized(mRS, Element::F16_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeF32_4Builder(mRS, Element::F32_4(mRS));
-    typeF32_4Builder.setX(3);
-    typeF32_4Builder.setY(2);
-
-    mFloatAllocation = Allocation::createSized(mRS, Element::F32(mRS), mAllocSize);
-    mFloat2Allocation = Allocation::createSized(mRS, Element::F32_2(mRS), mAllocSize / 2);
-    mFloat3Allocation = Allocation::createSized(mRS, Element::F32_3(mRS), mAllocSize / 4);
-    mFloat4Allocation = Allocation::createTyped(mRS, typeF32_4Builder.create());
-
-    Type::Builder typeF64_2Builder(mRS, Element::F64_2(mRS));
-    typeF64_2Builder.setX(4);
-    typeF64_2Builder.setY(1);
-    typeF64_2Builder.setZ(3);
-
-    mDoubleAllocation = Allocation::createSized(mRS, Element::F64(mRS), mAllocSize);
-    mDouble2Allocation = Allocation::createTyped(mRS, typeF64_2Builder.create());
-
-    Type::Builder typeF64_3Builder(mRS, Element::F64_3(mRS));
-    typeF64_3Builder.setX(1);
-    typeF64_3Builder.setY(2);
-    typeF64_3Builder.setZ(3);
-
-    Type::Builder typeF64_4Builder(mRS, Element::F64_4(mRS));
-    typeF64_4Builder.setX(1);
-    typeF64_4Builder.setY(2);
-    typeF64_4Builder.setZ(3);
-
-    mDouble3Allocation = Allocation::createTyped(mRS, typeF64_3Builder.create());
-    mDouble4Allocation = Allocation::createTyped(mRS, typeF64_4Builder.create());
-}
-
-void initFloatAllocations() {
-    __fp16 *buffer_half = new __fp16[mAllocSize];
-    float *buffer_float = new float[mAllocSize];
-    double *buffer_double = new double[mAllocSize];
-
-    for(int i = 0; i < mAllocSize; ++i) {
-        buffer_half[i] = (__fp16) 1 / i;
-        buffer_float[i] = (float) 1 / i;
-        buffer_double[i] = (double) 1 / i;
-    }
-
-    mHalfAllocation->copy1DRangeFrom(0, mAllocSize, buffer_half);
-    mHalf2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_half);
-    mHalf3Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_half);
-    mHalf4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_half);
-
-    delete [] buffer_half;
-
-    mFloatAllocation->copy1DRangeFrom(0, mAllocSize, buffer_float);
-    mFloat2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_float);
-    mFloat3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_float);
-    mFloat4Allocation->copy2DRangeFrom(0, 0, 3, 2, buffer_float);
-
-    delete [] buffer_float;
-
-    mDoubleAllocation->copy1DRangeFrom(0, mAllocSize, buffer_double);
-    mDouble2Allocation->copy3DRangeFrom(0, 0, 0, 4, 1, 3, buffer_double);
-    mDouble3Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, buffer_double);
-    mDouble4Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, buffer_double);
-
-    delete [] buffer_double;
-}
-
-int main()
-{
-    mRS = new RS();
-
-    mRS->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-
-    sp<ScriptC_allocs> mScript = new ScriptC_allocs(mRS);
-
-    Type::Builder typeRGBA_888Builder(mRS, Element::RGBA_8888(mRS));
-    typeRGBA_888Builder.setX(mBitmapSize);
-    typeRGBA_888Builder.setY(mBitmapSize);
-
-    sp<Allocation> mInAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
-
-    const int image_area = mBitmapSize*mBitmapSize;
-    const int image_size = image_area*sizeof(int);
-
-    char *zero_buffer = new char[image_size];
-    memset(zero_buffer, 0, image_size);
-    mInAllocation->copy1DRangeFrom(0, image_area, zero_buffer);
-    delete [] zero_buffer;
-
-    sp<Allocation> mOutAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
-    createSignedAllocations();
-    initSignedAllocations();
-
-    mRS->finish();
-    mScript->forEach_swizzle_kernel(mInAllocation, mOutAllocation);
-    mRS->finish();
-
-    mCharAllocation.clear();
-    mChar2Allocation.clear();
-    mChar3Allocation.clear();
-    mChar4Allocation.clear();
-
-    mShort2Allocation.clear();
-    mShort3Allocation.clear();
-    mShort4Allocation.clear();
-
-    mIntAllocation.clear();
-    mInt2Allocation.clear();
-    mInt3Allocation.clear();
-    mInt4Allocation.clear();
-
-    mLongAllocation.clear();
-    mLong2Allocation.clear();
-    mLong3Allocation.clear();
-    mLong4Allocation.clear();
-
-    mBoolAllocation.clear();
-
-    createUnsignedAllocations();
-    initUnsignedAllocations();
-
-    mInAllocation = mUShortAllocation; // Host side assignment
-
-    mRS->finish();
-    mScript->forEach_square_kernel(mInAllocation, mUIntAllocation);
-    mRS->finish();
-
-    mUCharAllocation.clear();
-    mUChar2Allocation.clear();
-    mUChar3Allocation.clear();
-    mUChar4Allocation.clear();
-
-    mUShortAllocation.clear();
-    mUShort2Allocation.clear();
-    mUShort3Allocation.clear();
-    mUShort4Allocation.clear();
-
-    mUInt2Allocation.clear();
-    mUInt3Allocation.clear();
-    mUInt4Allocation.clear();
-
-    mULongAllocation.clear();
-    mULong2Allocation.clear();
-    mULong3Allocation.clear();
-    mULong4Allocation.clear();
-
-    createFloatAllocations();
-    initFloatAllocations();
-
-    mRS->finish();
-    mScript->forEach_add_half_kernel(mDouble4Allocation, mDouble3Allocation);
-    mRS->finish();
-
-    return 0;
-}
-
diff --git a/tests/lldb/cpp/Allocations/Android.mk b/tests/lldb/cpp/Allocations/Android.mk
deleted file mode 100644
index 9f72923..0000000
--- a/tests/lldb/cpp/Allocations/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppAllocations
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	Allocations.cpp \
-	allocs.rscript
-
-LOCAL_STATIC_LIBRARIES := libcompiler_rt
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/lldb/cpp/Allocations/allocs.rscript b/tests/lldb/cpp/Allocations/allocs.rscript
deleted file mode 100644
index eff7977..0000000
--- a/tests/lldb/cpp/Allocations/allocs.rscript
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppallocations)
-
-// Kernel performs basic vector swizzle
-uchar4 __attribute__((kernel)) swizzle_kernel(uchar4 in)
-{
-    return in.wzyx;
-}
-
-// Kernel squares every element in allocation
-uint __attribute__((kernel)) square_kernel(ushort in)
-{
-    uint result = (uint)in * (uint)in;
-    return result;
-}
-
-// Helper function adding 1/2 to passed in double
-static double half_helper(double in)
-{
-    return (in + 0.5);
-}
-
-// Kernel returns first 3 elements of a double4 plus 1/2
-double3 __attribute__((kernel)) add_half_kernel(double4 in)
-{
-    double3 result;
-    result.x = half_helper(in.x);
-    result.y = half_helper(in.y);
-    result.z = half_helper(in.z);
-    return result;
-}
diff --git a/tests/lldb/cpp/Android.mk b/tests/lldb/cpp/Android.mk
deleted file mode 100644
index 8338432..0000000
--- a/tests/lldb/cpp/Android.mk
+++ /dev/null
@@ -1,2 +0,0 @@
-include $(call all-subdir-makefiles)
-
diff --git a/tests/lldb/cpp/BranchingFunCalls/Android.mk b/tests/lldb/cpp/BranchingFunCalls/Android.mk
deleted file mode 100644
index c06e6af..0000000
--- a/tests/lldb/cpp/BranchingFunCalls/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppBranchingFunCalls
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	BranchingFunCalls.cpp \
-	scalars.rscript
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/lldb/cpp/BranchingFunCalls/BranchingFunCalls.cpp b/tests/lldb/cpp/BranchingFunCalls/BranchingFunCalls.cpp
deleted file mode 100644
index f0eee09..0000000
--- a/tests/lldb/cpp/BranchingFunCalls/BranchingFunCalls.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <RenderScript.h>
-
-#include "ScriptC_scalars.h"
-
-int main()
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    rs->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-
-    auto e = Element::I32(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    int * input = new int[size*size];
-    for(int i = 0; i < size*size; ++i) {
-        input[i] = i - (size*size / 2);
-    }
-    a->copy2DRangeFrom(0, 0, size, size, input);
-    delete [] input;
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_scalars> s = new ScriptC_scalars(rs);
-    s->invoke_addToGlobal(234);
-    s->forEach_simple_kernel(a, b);
-    rs->finish();
-    int32_t * output = new int32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    return 0;
-}
-
diff --git a/tests/lldb/cpp/BranchingFunCalls/scalars.rscript b/tests/lldb/cpp/BranchingFunCalls/scalars.rscript
deleted file mode 100644
index 279694d..0000000
--- a/tests/lldb/cpp/BranchingFunCalls/scalars.rscript
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppbranchingfuncalls)
-
-static bool is_neg(int a)
-{
-    if(a < 0)
-        return true;
-    else
-        return false;
-}
-
-static bool is_pos(int a)
-{
-    if(a > 0)
-        return true;
-    else
-        return false;
-}
-
-static void set_i(int * a, int b)
-{
-    int tmp = b;
-    *a = tmp;
-}
-
-static void modify_f(float * f)
-{
-    *f *= 0.5f;
-}
-
-static void modify_i(int * i)
-{
-    int j = *i;
-    int cutoff = 2 << 6;
-    if(j > cutoff)
-        j = cutoff;
-    if(is_neg(j))
-        set_i(i, 0);
-    else if(is_pos(j))
-        set_i(i, j);
-    else
-        set_i(i, cutoff);
-}
-
-int __attribute__((kernel)) simple_kernel(int in)
-{
-    int i = in;
-    float f = (float) i;
-    modify_f(&f);
-    modify_i(&i);
-    int ret = (int) f;
-    return in * ret;
-}
-
-int glob = 123;
-
-void addToGlobal(int arg)
-{
-    glob += arg;
-}
diff --git a/tests/lldb/cpp/InfiniteLoop/Android.mk b/tests/lldb/cpp/InfiniteLoop/Android.mk
deleted file mode 100644
index 86a4ec5..0000000
--- a/tests/lldb/cpp/InfiniteLoop/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppInfiniteLoop
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	InfiniteLoop.cpp \
-	infiniteloop.rscript
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/lldb/cpp/InfiniteLoop/InfiniteLoop.cpp b/tests/lldb/cpp/InfiniteLoop/InfiniteLoop.cpp
deleted file mode 100644
index 2b55d4e..0000000
--- a/tests/lldb/cpp/InfiniteLoop/InfiniteLoop.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <thread>
-#include <chrono>
-
-#include <RenderScript.h>
-
-#include "ScriptC_infiniteloop.h"
-
-int main()
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    rs->init("/data/rscache", RS_INIT_LOW_LATENCY);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    sp<ScriptC_infiniteloop> s = new ScriptC_infiniteloop(rs);
-
-    // Test is designed to loop forever, waits for two seconds
-    // between each invocation of the kernel
-    bool forever = true;
-    while(forever)
-    {
-        s->forEach_simple_kernel(a, b);
-        std::this_thread::sleep_for(std::chrono::seconds(2));
-    }
-
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    return 0;
-}
-
diff --git a/tests/lldb/cpp/InfiniteLoop/infiniteloop.rscript b/tests/lldb/cpp/InfiniteLoop/infiniteloop.rscript
deleted file mode 100644
index 7eff95c..0000000
--- a/tests/lldb/cpp/InfiniteLoop/infiniteloop.rscript
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppinfiniteloop)
-
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 out = rsUnpackColor8888(in);
-
-    out.r = gColor.r;
-    out.g = gColor.g;
-    out.b = gColor.b;
-    out.a = gColor.a;
-
-    uchar4 result = rsPackColorTo8888(out);
-    return result;
-}
-
diff --git a/tests/lldb/cpp/KernelVariables/Android.mk b/tests/lldb/cpp/KernelVariables/Android.mk
deleted file mode 100644
index 7a68c93..0000000
--- a/tests/lldb/cpp/KernelVariables/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppKernelVariables
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	KernelVariables.cpp \
-	simple.rscript
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/lldb/cpp/KernelVariables/KernelVariables.cpp b/tests/lldb/cpp/KernelVariables/KernelVariables.cpp
deleted file mode 100644
index e289005..0000000
--- a/tests/lldb/cpp/KernelVariables/KernelVariables.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <RenderScript.h>
-
-#include "ScriptC_simple.h"
-
-int main()
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    rs->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    sp<ScriptC_simple> s = new ScriptC_simple(rs);
-
-    static const int buffer_int[] = {1, 2, 3, 4};
-    sp<Allocation> int_allocation = Allocation::createSized(rs, Element::I32(rs), 4);
-    int_allocation->copy1DRangeFrom(0, 4, buffer_int);
-    s->set_allocation_1D_global(int_allocation);
-
-    static const int buffer_int2[] = {5, 6, 7, 8};
-
-    Type::Builder typeI32Builder2D(rs, Element::I32(rs));
-    typeI32Builder2D.setX(2);
-    typeI32Builder2D.setY(2);
-
-    sp<Allocation> int_allocation2 = Allocation::createTyped(rs, typeI32Builder2D.create());
-    int_allocation2->copy2DRangeFrom(0, 0, 2, 2, buffer_int2);
-    s->set_allocation_1D_global2(int_allocation2);
-
-    s->set_allocation_2D_global(a);
-    s->set_allocation_2D_global2(b);
-
-    static const int buffer_int3[] = {9, 10, 11, 12, 13, 14, 15, 16};
-
-    Type::Builder typeI32Builder3D(rs, Element::I32(rs));
-    typeI32Builder3D.setX(2);
-    typeI32Builder3D.setY(2);
-    typeI32Builder3D.setZ(2);
-
-    sp<Allocation> int_allocation3 = Allocation::createTyped(rs, typeI32Builder3D.create());
-    int_allocation3->copy3DRangeFrom(0, 0, 0, 2, 2, 2, buffer_int3);
-    s->set_allocation_3D_global(int_allocation3);
-
-    Type::Builder yuvTypeBuilder(rs, Element::YUV(rs));
-    yuvTypeBuilder.setX(4);
-    yuvTypeBuilder.setY(4);
-    yuvTypeBuilder.setYuvFormat(RS_YUV_YV12);
-
-    sp<Allocation> yuv_allocation = Allocation::createTyped(rs, yuvTypeBuilder.create());
-    s->set_allocation_YUV_2D_global(yuv_allocation);
-
-    s->set_sampler_global(Sampler::CLAMP_LINEAR(rs));
-
-    // Script is executed once, then the data is copied back when finished
-    s->forEach_kernel(a, b);
-    rs->finish();
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    return 0;
-}
-
diff --git a/tests/lldb/cpp/KernelVariables/simple.rscript b/tests/lldb/cpp/KernelVariables/simple.rscript
deleted file mode 100644
index bad675e..0000000
--- a/tests/lldb/cpp/KernelVariables/simple.rscript
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppkernelvariables)
-
-char char_global = 12;
-uchar uchar_global = 234;
-short short_global = -321;
-ushort ushort_global = 432;
-int int_global = 1234;
-uint uint_global = 2345;
-float float_global = 4.5f;
-long long_global = -77777;
-ulong ulong_global = 8888;
-double double_global = -456.5f;
-
-char2 char2_global = {11, -22};
-uchar2 uchar2_global = {33, 44};
-short2 short2_global = {-555, 666};
-ushort2 ushort2_global = {777, 888};
-int2 int2_global = {999, -1111};
-uint2 uint2_global = {2222, 3333};
-float2 float2_global = {4.5f, -5.0f};
-long2 long2_global = {-4444, 5555};
-ulong2 ulong2_global = {6666, 7777};
-double2 double2_global = {88.5f, -99.0f};
-
-char3 char3_global = {11, -22, -33};
-uchar3 uchar3_global = {33, 44, 55};
-short3 short3_global = {-555, 666, 777};
-ushort3 ushort3_global = {777, 888, 999};
-int3 int3_global = {999, -1111, 2222};
-uint3 uint3_global = {2222, 3333, 4444};
-float3 float3_global = {4.5f, -5.0f, -6.5f};
-long3 long3_global = {-4444, 5555, 6666};
-ulong3 ulong3_global = {6666, 7777, 8888};
-double3 double3_global = {88.5f, -99.0f, 111.5f};
-
-char4 char4_global = {55, 11, -22, -33};
-uchar4 uchar4_global = {222, 33, 44, 55};
-short4 short4_global = {-444, -555, 666, 777};
-ushort4 ushort4_global = {666, 777, 888, 999};
-int4 int4_global = {888, 999, -1111, 2222};
-uint4 uint4_global = {1111, 2222, 3333, 4444};
-float4 float4_global = {3.0f, 4.5f, -5.0f, -6.5f};
-long4 long4_global = {-3333, -4444, 5555, 6666};
-ulong4 ulong4_global = {5555, 6666, 7777, 8888};
-double4 double4_global = {-77.0f, 88.5f, -99.0f, 111.5f};
-
-rs_matrix2x2 matrix2x2_global;
-rs_matrix3x3 matrix3x3_global;
-rs_matrix4x4 matrix4x4_global;
-
-rs_quaternion quaternion_global;
-
-rs_allocation allocation_1D_global;
-rs_allocation allocation_1D_global2;
-rs_allocation allocation_2D_global;
-rs_allocation allocation_2D_global2;
-rs_allocation allocation_3D_global;
-rs_allocation allocation_YUV_2D_global;
-
-rs_allocation_cubemap_face cubemap_face_global;
-rs_sampler sampler_global;
-
-uchar4 __attribute__((kernel)) kernel(uchar4 in)
-{
-    char char_local = 'a';
-    uchar uchar_local = 'b';
-    short short_local = -321;
-    ushort ushort_local = 432;
-    int int_local = 1234;
-    uint uint_local = 2345;
-    float float_local = 4.5f;
-    long long_local = -77777;
-    ulong ulong_local = 8888;
-    double double_local = -456.5f;
-
-    char2 char2_local = {-11, -22};
-    uchar2 uchar2_local = {33, 44};
-    short2 short2_local = {-555, 666};
-    ushort2 ushort2_local = {777, 888};
-    int2 int2_local = {999, -1111};
-    uint2 uint2_local = {2222, 3333};
-    float2 float2_local = {4.5f, -5.0f};
-    long2 long2_local = {-4444, 5555};
-    ulong2 ulong2_local = {6666, 7777};
-    double2 double2_local = {88.5f, -99.0f};
-
-    char3 char3_local = {11, -22, -33};
-    uchar3 uchar3_local = {33, 44, 55};
-    short3 short3_local = {-555, 666, 777};
-    ushort3 ushort3_local = {777, 888, 999};
-    int3 int3_local = {999, -1111, 2222};
-    uint3 uint3_local = {2222, 3333, 4444};
-    float3 float3_local = {4.5f, -5.0f, -6.5f};
-    long3 long3_local = {-4444, 5555, 6666};
-    ulong3 ulong3_local = {6666, 7777, 8888};
-    double3 double3_local = {88.5f, -99.0f, 111.5f};
-
-    char4 char4_local = {55, 11, -22, -33};
-    uchar4 uchar4_local = {22, 33, 44, 55};
-    short4 short4_local = {-444, -555, 666, 777};
-    ushort4 ushort4_local = {666, 777, 888, 999};
-    int4 int4_local = {888, 999, -1111, 2222};
-    uint4 uint4_local = {1111, 2222, 3333, 4444};
-    float4 float4_local = {3.0f, 4.5f, -5.0f, -6.5f};
-    long4 long4_local = {-3333, -4444, 5555, 6666};
-    ulong4 ulong4_local = {5555, 6666, 7777, 8888};
-    double4 double4_local = {-77.0f, 88.5f, -99.0f, 111.5f};
-
-    rs_matrix2x2 matrix2x2_local = {{1., 2.5,
-                                     3., 4.5}};
-    rs_matrix3x3 matrix3x3_local = {{5., 6.5, 7.,
-                                     8.5, 9., 1.5,
-                                     2., 3.5, 4.}};
-    rs_matrix4x4 matrix4x4_local = {{5.5, 6., 7.5, 8.,
-                                     9., 1.5, 2., 3.5,
-                                     4.5, 5.5, 6.5, 7.,
-                                     8., 9.5, 1.5, 2.5}};
-
-    matrix2x2_global = matrix2x2_local;
-    matrix3x3_global = matrix3x3_local;
-    matrix4x4_global = matrix4x4_local;
-
-    rsQuaternionSet(&quaternion_global, 3.0, 4.5, 5.5, 6.0);
-
-    rs_quaternion quaternion_local;
-    rsQuaternionSet(&quaternion_local, 7.5, 8.0, 9.0, 0.5);
-
-    char char_combined = char_local + (char)uchar_local + char2_local.x +
-        (char)uchar2_local.x + char3_local.x - (char)uchar3_local.x +
-        char4_local.x + (char)uchar4_local.x;
-
-    short short_combined = short_local + (short)ushort_local + short2_local.x +
-        (short)ushort2_local.x + short3_local.x + (short)ushort3_local.x +
-        short4_local.x + (short)ushort4_local.x;
-
-    int int_combined = int_local + (int)uint_local + int2_local.x +
-        (int)uint2_local.x + int3_local.x + (int)uint3_local.x + int4_local.x +
-        (int)uint4_local.x;
-
-    float float_combined = float_local + float2_local.x + float3_local.x +
-        float4_local.x;
-
-    long long_combined = long_local + (long)ulong_local + long2_local.x +
-        (long)ulong2_local.x + long3_local.x + (long)ulong3_local.x +
-        long4_local.x + (long)ulong4_local.x;
-
-    double double_combined = double_local + double2_local.x + double3_local.x +
-        double4_local.x;
-
-    char_global = char_combined;
-    short_global = short_combined;
-    int_global = int_combined;
-    float_global = float_combined;
-    long_global = long_combined;
-    double_global = double_combined;
-
-    uchar4 result = {1,2,3,4};
-    return result;
-}
-
-float use_constants_global;
-
-void setup(void)
-{
-  use_constants_global =
-      M_1_PI +
-      M_2_PI +
-      M_2_PIl +
-      M_2_SQRTPI +
-      M_E +
-      M_LN10 +
-      M_LN2 +
-      M_LOG10E +
-      M_LOG2E +
-      M_PI +
-      M_PI_2 +
-      M_PI_4 +
-      M_SQRT1_2 +
-      M_SQRT2;
-}
diff --git a/tests/lldb/cpp/MultipleRSFiles/Android.mk b/tests/lldb/cpp/MultipleRSFiles/Android.mk
deleted file mode 100644
index d4b5b35..0000000
--- a/tests/lldb/cpp/MultipleRSFiles/Android.mk
+++ /dev/null
@@ -1,14 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppMultipleRSFiles
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	MultipleRSFiles.cpp \
-	first.rscript \
-	second.rscript
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-include $(BUILD_EXECUTABLE)
diff --git a/tests/lldb/cpp/MultipleRSFiles/MultipleRSFiles.cpp b/tests/lldb/cpp/MultipleRSFiles/MultipleRSFiles.cpp
deleted file mode 100644
index 9d0d4e7..0000000
--- a/tests/lldb/cpp/MultipleRSFiles/MultipleRSFiles.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <RenderScript.h>
-
-#include "ScriptC_first.h"
-#include "ScriptC_second.h"
-
-int main()
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    rs->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_first> s1 = new ScriptC_first(rs);
-    sp<ScriptC_second> s2 = new ScriptC_second(rs);
-
-    s1->forEach_first_kernel(a, b);
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    s2->forEach_second_kernel(a, b);
-
-    rs->finish();
-    return 0;
-}
diff --git a/tests/lldb/cpp/MultipleRSFiles/first.rscript b/tests/lldb/cpp/MultipleRSFiles/first.rscript
deleted file mode 100644
index 2f84c46..0000000
--- a/tests/lldb/cpp/MultipleRSFiles/first.rscript
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppmultiplersfiles)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) first_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
diff --git a/tests/lldb/cpp/MultipleRSFiles/second.rscript b/tests/lldb/cpp/MultipleRSFiles/second.rscript
deleted file mode 100644
index 0279d0f..0000000
--- a/tests/lldb/cpp/MultipleRSFiles/second.rscript
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppmultiplersfiles)
-
-/* RenderScript kernel that just returns the swizzled input. */
-uchar4 __attribute__((kernel)) second_kernel(uchar4 in)
-{
-    uchar4 result = in.wzyx;
-    return result;
-}
diff --git a/tests/lldb/cpp/WaitAttach/Android.mk b/tests/lldb/cpp/WaitAttach/Android.mk
deleted file mode 100644
index 00bf745..0000000
--- a/tests/lldb/cpp/WaitAttach/Android.mk
+++ /dev/null
@@ -1,27 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppDebugWaitAttach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	WaitAttach.cpp \
-	simple.rscript
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-include $(BUILD_EXECUTABLE)
-
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := CppNoDebugWaitAttach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := \
-	WaitAttach.cpp \
-	simple.rscript
-
-include frameworks/rs/tests/lldb/cpp/common.mk
-LOCAL_RENDERSCRIPT_FLAGS := $(filter-out -g,$(LOCAL_RENDERSCRIPT_FLAGS))
-include $(BUILD_EXECUTABLE)
diff --git a/tests/lldb/cpp/WaitAttach/WaitAttach.cpp b/tests/lldb/cpp/WaitAttach/WaitAttach.cpp
deleted file mode 100644
index c6c1980..0000000
--- a/tests/lldb/cpp/WaitAttach/WaitAttach.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <RenderScript.h>
-
-#include "ScriptC_simple.h"
-
-int main()
-{
-    static const int size = 8;
-    sp<RS> rs = new RS();
-
-    rs->init("/data/rscache", RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_simple> s = new ScriptC_simple(rs);
-    s->forEach_simple_kernel(a, b);
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    s->forEach_other_kernel(a, b);
-
-    rs->finish();
-    return 0;
-}
diff --git a/tests/lldb/cpp/WaitAttach/simple.rscript b/tests/lldb/cpp/WaitAttach/simple.rscript
deleted file mode 100644
index 1e9780a..0000000
--- a/tests/lldb/cpp/WaitAttach/simple.rscript
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.cppwaitattach)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
-
-// Extra kernel to test lldb setting breakpoints on all the RS kernels.
-uchar4 __attribute__((kernel)) other_kernel(uchar4 in)
-{
-    uchar4 result = in.wzyx;
-    return result;
-}
diff --git a/tests/lldb/cpp/common.mk b/tests/lldb/cpp/common.mk
deleted file mode 100644
index 3b65741..0000000
--- a/tests/lldb/cpp/common.mk
+++ /dev/null
@@ -1,11 +0,0 @@
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_RENDERSCRIPT_FLAGS += -g -O0 -target-api 0
-LOCAL_CFLAGS := -Werror -Wall -Wextra -std=c++11
-LOCAL_LDFLAGS += -llog
-
-LOCAL_STATIC_LIBRARIES += libRScpp_static
-
-intermediates += $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
-
-LOCAL_C_INCLUDES += $(intermediates)
diff --git a/tests/lldb/java/Allocations/Android.mk b/tests/lldb/java/Allocations/Android.mk
deleted file mode 100644
index cff322f..0000000
--- a/tests/lldb/java/Allocations/Android.mk
+++ /dev/null
@@ -1,17 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := \
-	$(call all-java-files-under, src) \
-	$(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := Allocations
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -Wno-unused -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/Allocations/AndroidManifest.xml b/tests/lldb/java/Allocations/AndroidManifest.xml
deleted file mode 100644
index 8650cb0..0000000
--- a/tests/lldb/java/Allocations/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.allocations">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="Allocations"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/java/Allocations/res/layout/main_layout.xml b/tests/lldb/java/Allocations/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/java/Allocations/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/java/Allocations/src/com/android/rs/allocations/MainActivity.java b/tests/lldb/java/Allocations/src/com/android/rs/allocations/MainActivity.java
deleted file mode 100644
index f06f2b9..0000000
--- a/tests/lldb/java/Allocations/src/com/android/rs/allocations/MainActivity.java
+++ /dev/null
@@ -1,552 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.allocations;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.Bitmap;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private RenderScript mRS;
-
-    private Allocation mInAllocation;      // script input
-    private Allocation mOutAllocation;     // script output
-
-    private Allocation mStructInAlloc;     // complexStruct input
-    private Allocation mStructOutAlloc;    // complexStruct output
-
-    private Allocation mBoolAllocation;    // boolean
-
-    private Allocation mCharAllocation;    // char
-    private Allocation mChar2Allocation;   // char2
-    private Allocation mChar3Allocation;   // char3
-    private Allocation mChar4Allocation;   // char4
-
-    private Allocation mUCharAllocation;   // uchar
-    private Allocation mUChar2Allocation;  // uchar2
-    private Allocation mUChar3Allocation;  // uchar3
-    private Allocation mUChar4Allocation;  // uchar4
-
-    private Allocation mShortAllocation;   // short
-    private Allocation mShort2Allocation;  // short2
-    private Allocation mShort3Allocation;  // short3
-    private Allocation mShort4Allocation;  // short4
-
-    private Allocation mUShortAllocation;  // ushort
-    private Allocation mUShort2Allocation; // ushort2
-    private Allocation mUShort3Allocation; // ushort3
-    private Allocation mUShort4Allocation; // ushort4
-
-    private Allocation mIntAllocation;     // int
-    private Allocation mInt2Allocation;    // int2
-    private Allocation mInt3Allocation;    // int3
-    private Allocation mInt4Allocation;    // int4
-
-    private Allocation mUIntAllocation;    // uint
-    private Allocation mUInt2Allocation;   // uint2
-    private Allocation mUInt3Allocation;   // uint3
-    private Allocation mUInt4Allocation;   // uint4
-
-    private Allocation mLongAllocation;    // long
-    private Allocation mLong2Allocation;   // long2
-    private Allocation mLong3Allocation;   // long3
-    private Allocation mLong4Allocation;   // long4
-
-    private Allocation mULongAllocation;   // ulong
-    private Allocation mULong2Allocation;  // ulong2
-    private Allocation mULong3Allocation;  // ulong3
-    private Allocation mULong4Allocation;  // ulong4
-
-    private Allocation mHalfAllocation;    // half
-    private Allocation mHalf2Allocation;   // half2
-    private Allocation mHalf3Allocation;   // half3
-    private Allocation mHalf4Allocation;   // half4
-
-    private Allocation mFloatAllocation;   // float
-    private Allocation mFloat2Allocation;  // float2
-    private Allocation mFloat3Allocation;  // float3
-    private Allocation mFloat4Allocation;  // float4
-
-    private Allocation mDoubleAllocation;  // double
-    private Allocation mDouble2Allocation; // double2
-    private Allocation mDouble3Allocation; // double3
-    private Allocation mDouble4Allocation; // double4
-
-    private ScriptC_allocs mScript;
-
-    private int mAllocSize = 24; // Chosen as allocation size since it's easily divisible
-
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-
-        mBitmapIn = Bitmap.createBitmap(64, 64, Bitmap.Config.ARGB_8888);
-        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(), mBitmapIn.getHeight(), mBitmapIn.getConfig());
-
-        createScript();
-        runScript();
-    }
-
-    private void createScript() {
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY |
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-
-        mScript = new ScriptC_allocs(mRS);
-    }
-
-    private void createSignedAllocations() {
-        Type.Builder typeI8Builder = new Type.Builder(mRS, Element.I8(mRS));
-        typeI8Builder.setX(1); // One element here to test 16 byte memory alignment
-        typeI8Builder.setY(3);
-        typeI8Builder.setZ(8);
-
-        mCharAllocation = Allocation.createTyped(mRS, typeI8Builder.create());
-        mRS.finish();
-        mChar2Allocation = Allocation.createSized(mRS, Element.I8_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mChar3Allocation = Allocation.createSized(mRS, Element.I8_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mChar4Allocation = Allocation.createSized(mRS, Element.I8_4(mRS), mAllocSize / 4);
-        mRS.finish();
-
-        Type.Builder typeI16_2Builder = new Type.Builder(mRS, Element.I16_2(mRS));
-        typeI16_2Builder.setX(6);
-        typeI16_2Builder.setY(1);
-        typeI16_2Builder.setZ(2);
-
-        mShortAllocation = Allocation.createSized(mRS, Element.I16(mRS), mAllocSize);
-        mRS.finish();
-        mShort2Allocation = Allocation.createTyped(mRS, typeI16_2Builder.create());
-        mRS.finish();
-        mShort3Allocation = Allocation.createSized(mRS, Element.I16_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mShort4Allocation = Allocation.createSized(mRS, Element.I16_4(mRS), mAllocSize / 4);
-        mRS.finish();
-
-        Type.Builder typeI32_3Builder = new Type.Builder(mRS, Element.I32_3(mRS));
-        typeI32_3Builder.setX(3);
-        typeI32_3Builder.setY(2);
-
-        mIntAllocation = Allocation.createSized(mRS, Element.I32(mRS), mAllocSize);
-        mRS.finish();
-        mInt2Allocation = Allocation.createSized(mRS, Element.I32_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mInt3Allocation = Allocation.createTyped(mRS, typeI32_3Builder.create());
-        mRS.finish();
-        mInt4Allocation = Allocation.createSized(mRS, Element.I32_4(mRS), mAllocSize / 4);
-        mRS.finish();
-
-        Type.Builder typeI64_4Builder = new Type.Builder(mRS, Element.I64_4(mRS));
-        typeI64_4Builder.setX(1);
-        typeI64_4Builder.setY(6);
-
-        mLongAllocation = Allocation.createSized(mRS, Element.I64(mRS), mAllocSize);
-        mRS.finish();
-        mLong2Allocation = Allocation.createSized(mRS, Element.I64_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mLong3Allocation = Allocation.createSized(mRS, Element.I64_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mLong4Allocation = Allocation.createTyped(mRS, typeI64_4Builder.create());
-        mRS.finish();
-
-        mBoolAllocation = Allocation.createSized(mRS, Element.BOOLEAN(mRS), mAllocSize);
-        mRS.finish();
-    }
-
-    private void initSignedAllocations() {
-        byte[] buffer_char = new byte[mAllocSize];
-        short[] buffer_short = new short[mAllocSize];
-        int[] buffer_int = new int[mAllocSize];
-        long[] buffer_long = new long[mAllocSize];
-        byte[] buffer_bool = new byte[mAllocSize];
-
-        for(int i = 0; i < mAllocSize; ++i) {
-            buffer_char[i] = (byte) i;
-            buffer_short[i] = (short) i;
-            buffer_int[i] = (int) i;
-            buffer_long[i] = (long) i;
-            buffer_bool[i] =  (byte) (0x01 & i);
-        }
-
-        mCharAllocation.copyFrom(buffer_char);
-        mChar2Allocation.copyFrom(buffer_char);
-        mChar3Allocation.copyFrom(buffer_char);
-        mChar4Allocation.copyFrom(buffer_char);
-
-        mShortAllocation.copyFrom(buffer_short);
-        mShort2Allocation.copyFrom(buffer_short);
-        mShort3Allocation.copyFrom(buffer_short);
-        mShort4Allocation.copyFrom(buffer_short);
-
-        mIntAllocation.copyFrom(buffer_int);
-        mInt2Allocation.copyFrom(buffer_int);
-        mInt3Allocation.copyFrom(buffer_int);
-        mInt4Allocation.copyFrom(buffer_int);
-
-        mLongAllocation.copyFrom(buffer_long);
-        mLong2Allocation.copyFrom(buffer_long);
-        mLong3Allocation.copyFrom(buffer_long);
-        mLong4Allocation.copyFrom(buffer_long);
-
-        mBoolAllocation.copyFromUnchecked(buffer_bool);
-    }
-
-    private void createUnsignedAllocations() {
-        Type.Builder typeU8_2Builder = new Type.Builder(mRS, Element.U8_2(mRS));
-        typeU8_2Builder.setX(2);
-        typeU8_2Builder.setY(6);
-
-        mUCharAllocation = Allocation.createSized(mRS, Element.U8(mRS), mAllocSize);
-        mRS.finish();
-        mUChar2Allocation = Allocation.createTyped(mRS, typeU8_2Builder.create());
-        mRS.finish();
-        mUChar3Allocation = Allocation.createSized(mRS, Element.U8_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mUChar4Allocation = Allocation.createSized(mRS, Element.U8_4(mRS), mAllocSize / 4);
-        mRS.finish();
-
-        Type.Builder typeU16_3Builder = new Type.Builder(mRS, Element.U16_3(mRS));
-        typeU16_3Builder.setX(1);
-        typeU16_3Builder.setY(6);
-
-        mUShortAllocation = Allocation.createSized(mRS, Element.U16(mRS), mAllocSize);
-        mRS.finish();
-        mUShort2Allocation = Allocation.createSized(mRS, Element.U16_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mUShort3Allocation = Allocation.createTyped(mRS, typeU16_3Builder.create());
-        mRS.finish();
-        mUShort4Allocation = Allocation.createSized(mRS, Element.U16_4(mRS), mAllocSize / 4);
-        mRS.finish();
-
-        Type.Builder typeU32_4Builder = new Type.Builder(mRS, Element.U32_4(mRS));
-        typeU32_4Builder.setX(1);
-        typeU32_4Builder.setY(1);
-        typeU32_4Builder.setZ(6);
-
-        mUIntAllocation = Allocation.createSized(mRS, Element.U32(mRS), mAllocSize);
-        mRS.finish();
-        mUInt2Allocation = Allocation.createSized(mRS, Element.U32_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mUInt3Allocation = Allocation.createSized(mRS, Element.U32_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mUInt4Allocation = Allocation.createTyped(mRS, typeU32_4Builder.create());
-        mRS.finish();
-
-        Type.Builder typeU64Builder = new Type.Builder(mRS, Element.U64(mRS));
-        typeU64Builder.setX(4);
-        typeU64Builder.setY(3);
-        typeU64Builder.setZ(2);
-
-        mULongAllocation = Allocation.createTyped(mRS, typeU64Builder.create());
-        mRS.finish();
-        mULong2Allocation = Allocation.createSized(mRS, Element.U64_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mULong3Allocation = Allocation.createSized(mRS, Element.U64_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mULong4Allocation = Allocation.createSized(mRS, Element.U64_4(mRS), mAllocSize / 4);
-        mRS.finish();
-    }
-
-    private void initUnsignedAllocations() {
-        byte[] buffer_char = new byte[mAllocSize];
-        short[] buffer_short = new short[mAllocSize];
-        int[] buffer_int = new int[mAllocSize];
-        long[] buffer_long = new long[mAllocSize];
-
-        for(int i = 0; i < mAllocSize; ++i) {
-            buffer_char[i] = (byte) i;
-            buffer_short[i] = (short) i;
-            buffer_int[i] = (int) i;
-            buffer_long[i] = (long) i;
-        }
-
-        mUCharAllocation.copyFrom(buffer_char);
-        mUChar2Allocation.copyFrom(buffer_char);
-        mUChar3Allocation.copyFrom(buffer_char);
-        mUChar4Allocation.copyFrom(buffer_char);
-
-        mUShortAllocation.copyFrom(buffer_short);
-        mUShort2Allocation.copyFrom(buffer_short);
-        mUShort3Allocation.copyFrom(buffer_short);
-        mUShort4Allocation.copyFrom(buffer_short);
-
-        mUIntAllocation.copyFrom(buffer_int);
-        mUInt2Allocation.copyFrom(buffer_int);
-        mUInt3Allocation.copyFrom(buffer_int);
-        mUInt4Allocation.copyFrom(buffer_int);
-
-        mULongAllocation.copyFrom(buffer_long);
-        mULong2Allocation.copyFrom(buffer_long);
-        mULong3Allocation.copyFrom(buffer_long);
-        mULong4Allocation.copyFrom(buffer_long);
-    }
-
-    private void createFloatAllocations() {
-        Type.Builder typeF16_3Builder = new Type.Builder(mRS, Element.F16_3(mRS));
-        typeF16_3Builder.setX(1);
-        typeF16_3Builder.setY(6);
-
-        mHalfAllocation = Allocation.createSized(mRS, Element.F16(mRS), mAllocSize);
-        mRS.finish();
-        mHalf2Allocation = Allocation.createSized(mRS, Element.F16_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mHalf3Allocation = Allocation.createTyped(mRS, typeF16_3Builder.create());
-        mRS.finish();
-        mHalf4Allocation = Allocation.createSized(mRS, Element.F16_4(mRS), mAllocSize / 4);
-        mRS.finish();
-
-        Type.Builder typeF32_4Builder = new Type.Builder(mRS, Element.F32_4(mRS));
-        typeF32_4Builder.setX(3);
-        typeF32_4Builder.setY(2);
-
-        mFloatAllocation = Allocation.createSized(mRS, Element.F32(mRS), mAllocSize);
-        mRS.finish();
-        mFloat2Allocation = Allocation.createSized(mRS, Element.F32_2(mRS), mAllocSize / 2);
-        mRS.finish();
-        mFloat3Allocation = Allocation.createSized(mRS, Element.F32_3(mRS), mAllocSize / 4);
-        mRS.finish();
-        mFloat4Allocation = Allocation.createTyped(mRS, typeF32_4Builder.create());
-        mRS.finish();
-
-        Type.Builder typeF64_2Builder = new Type.Builder(mRS, Element.F64_2(mRS));
-        typeF64_2Builder.setX(4);
-        typeF64_2Builder.setY(1);
-        typeF64_2Builder.setZ(3);
-
-        mDoubleAllocation = Allocation.createSized(mRS, Element.F64(mRS), mAllocSize);
-        mRS.finish();
-        mDouble2Allocation = Allocation.createTyped(mRS, typeF64_2Builder.create());
-        mRS.finish();
-
-        Type.Builder typeF64_3Builder = new Type.Builder(mRS, Element.F64_3(mRS));
-        typeF64_3Builder.setX(1);
-        typeF64_3Builder.setY(2);
-        typeF64_3Builder.setZ(3);
-
-        Type.Builder typeF64_4Builder = new Type.Builder(mRS, Element.F64_4(mRS));
-        typeF64_4Builder.setX(1);
-        typeF64_4Builder.setY(2);
-        typeF64_4Builder.setZ(3);
-
-        mDouble3Allocation = Allocation.createTyped(mRS, typeF64_3Builder.create());
-        mRS.finish();
-        mDouble4Allocation = Allocation.createTyped(mRS, typeF64_4Builder.create());
-        mRS.finish();
-    }
-
-    private void initFloatAllocations() {
-        // No java type for half precision float, so bitcast 16-bit int
-        short[] buffer_half = new short[mAllocSize];
-        float[] buffer_float = new float[mAllocSize];
-        double[] buffer_double = new double[mAllocSize];
-
-        for(int i = 0; i < mAllocSize; ++i) {
-            // Construct IEEE 754 half with increasing fraction.
-            byte mantissa = (byte)(i);
-            byte exponent = 0b00111100; // keep exponent constant at one
-            buffer_half[i] = (short)((exponent << 8) | mantissa);
-
-            buffer_float[i] = (float) 1 / i;
-            buffer_double[i] = (double) 1 / i;
-        }
-
-        mHalfAllocation.copyFromUnchecked(buffer_half);
-        mHalf2Allocation.copyFromUnchecked(buffer_half);
-        mHalf3Allocation.copyFromUnchecked(buffer_half);
-        mHalf4Allocation.copyFromUnchecked(buffer_half);
-
-        mFloatAllocation.copyFrom(buffer_float);
-        mFloat2Allocation.copyFrom(buffer_float);
-        mFloat3Allocation.copyFrom(buffer_float);
-        mFloat4Allocation.copyFrom(buffer_float);
-
-        mDoubleAllocation.copyFrom(buffer_double);
-        mDouble2Allocation.copyFrom(buffer_double);
-        mDouble3Allocation.copyFrom(buffer_double);
-        mDouble4Allocation.copyFrom(buffer_double);
-    }
-
-    private void createStructAllocations() {
-        ScriptField_complexStruct complex_struct;
-
-        complex_struct = new ScriptField_complexStruct(mRS, mAllocSize);
-        mRS.finish();
-        mScript.bind_g_complexStruct_in(complex_struct);
-        mRS.finish();
-        mStructInAlloc = complex_struct.getAllocation();
-        mRS.finish();
-
-        complex_struct = new ScriptField_complexStruct(mRS, mAllocSize);
-        mRS.finish();
-        mScript.bind_g_complexStruct_out(complex_struct);
-        mRS.finish();
-        mStructOutAlloc = complex_struct.getAllocation();
-        mRS.finish();
-    }
-
-    private void overwriteFloatAllocations() {
-        float[] buffer_float = new float[mAllocSize];
-
-        // Set float allocations to -1/n
-        for(int i = 0; i < mAllocSize; ++i) {
-            buffer_float[i] = -1f / i;
-        }
-
-        mFloatAllocation.copyFrom(buffer_float);
-        mFloat2Allocation.copyFrom(buffer_float);
-        mFloat3Allocation.copyFrom(buffer_float);
-        mFloat4Allocation.copyFrom(buffer_float);
-    }
-
-    private void runScript() {
-        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mRS.finish();
-        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
-        mRS.finish();
-
-
-        createSignedAllocations();
-        initSignedAllocations();
-
-        mRS.finish();
-        mScript.forEach_swizzle_kernel(mInAllocation, mOutAllocation);
-        mRS.finish();
-
-        mOutAllocation.copyTo(mBitmapOut);
-
-        mCharAllocation.destroy();
-        mRS.finish();
-        mChar2Allocation.destroy();
-        mRS.finish();
-        mChar3Allocation.destroy();
-        mRS.finish();
-        mChar4Allocation.destroy();
-        mRS.finish();
-
-        mShort2Allocation.destroy();
-        mRS.finish();
-        mShort3Allocation.destroy();
-        mRS.finish();
-        mShort4Allocation.destroy();
-        mRS.finish();
-
-        mIntAllocation.destroy();
-        mRS.finish();
-        mInt2Allocation.destroy();
-        mRS.finish();
-        mInt3Allocation.destroy();
-        mRS.finish();
-        mInt4Allocation.destroy();
-        mRS.finish();
-
-        mLongAllocation.destroy();
-        mRS.finish();
-        mLong2Allocation.destroy();
-        mRS.finish();
-        mLong3Allocation.destroy();
-        mRS.finish();
-        mLong4Allocation.destroy();
-        mRS.finish();
-
-        mBoolAllocation.destroy();
-        mRS.finish();
-
-
-        createUnsignedAllocations();
-        initUnsignedAllocations();
-
-        mInAllocation = mUShortAllocation; // Host side assignment
-
-        mRS.finish();
-        mScript.forEach_square_kernel(mInAllocation, mUIntAllocation);
-        mRS.finish();
-
-        mUCharAllocation.destroy();
-        mRS.finish();
-        mUChar2Allocation.destroy();
-        mRS.finish();
-        mUChar3Allocation.destroy();
-        mRS.finish();
-        mUChar4Allocation.destroy();
-        mRS.finish();
-
-        mUShortAllocation.destroy();
-        mRS.finish();
-        mUShort2Allocation.destroy();
-        mRS.finish();
-        mUShort3Allocation.destroy();
-        mRS.finish();
-        mUShort4Allocation.destroy();
-        mRS.finish();
-
-        mUInt2Allocation.destroy();
-        mRS.finish();
-        mUInt3Allocation.destroy();
-        mRS.finish();
-        mUInt4Allocation.destroy();
-        mRS.finish();
-
-        mULongAllocation.destroy();
-        mRS.finish();
-        mULong2Allocation.destroy();
-        mRS.finish();
-        mULong3Allocation.destroy();
-        mRS.finish();
-        mULong4Allocation.destroy();
-        mRS.finish();
-
-
-        createFloatAllocations();
-        initFloatAllocations();
-
-        mRS.finish();
-        mScript.forEach_add_half_kernel(mDouble4Allocation, mDouble3Allocation);
-        mRS.finish();
-
-        mHalfAllocation.destroy();
-        mRS.finish();
-        mHalf2Allocation.destroy();
-        mRS.finish();
-        mHalf3Allocation.destroy();
-        mRS.finish();
-        mHalf4Allocation.destroy();
-        mRS.finish();
-
-        mDoubleAllocation.destroy();
-        mRS.finish();
-        mDouble2Allocation.destroy();
-        mRS.finish();
-        mDouble4Allocation.destroy();
-        mRS.finish();
-
-        overwriteFloatAllocations();
-
-        createStructAllocations();
-
-        mRS.finish();
-        mScript.forEach_struct_kernel(mStructInAlloc, mStructOutAlloc);
-        mRS.finish();
-    }
-}
diff --git a/tests/lldb/java/Allocations/src/rs/allocs.rscript b/tests/lldb/java/Allocations/src/rs/allocs.rscript
deleted file mode 100644
index 0ec39c5..0000000
--- a/tests/lldb/java/Allocations/src/rs/allocs.rscript
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.allocations)
-
-struct simpleStruct {
-   int i;
-   unsigned int j;
-};
-
-struct complexStruct {
-    struct simpleStruct s;
-    uchar4 c;
-    float f[2];
-};
-
-struct complexStruct *g_complexStruct_in;
-struct complexStruct *g_complexStruct_out;
-
-// Kernel performs basic vector swizzle
-uchar4 __attribute__((kernel)) swizzle_kernel(uchar4 in)
-{
-    return in.wzyx;
-}
-
-// Kernel squares every element in allocation
-uint __attribute__((kernel)) square_kernel(ushort in)
-{
-    uint result = (uint)(in) * (uint)in;
-    return result;
-}
-
-// Helper function adding 1/2 to passed in double
-static double half_helper(double in)
-{
-    return (in + 0.5);
-}
-
-// Kernel returns first 3 elements of a double4 plus 1/2
-double3 __attribute__((kernel)) add_half_kernel(double4 in)
-{
-    double3 result;
-    result.x = half_helper(in.x);
-    result.y = half_helper(in.y);
-    result.z = half_helper(in.z);
-    return result;
-}
-
-// Kernel for testing structs
-struct complexStruct __attribute__((kernel))
-struct_kernel(struct complexStruct in, uint32_t x)
-{
-   struct complexStruct complex_out;
-   struct simpleStruct simple_out;
-   simple_out.i = (int) x;
-   simple_out.j = x;
-   complex_out.s = simple_out;
-
-   complex_out.f[0] = (float) x;
-   complex_out.f[1] = (float) x + 0.5;
-
-   complex_out.c.x = (uchar) (x % 128);
-   complex_out.c.y = 'A';
-   complex_out.c.z = 'B';
-   complex_out.c.w = 'C';
-
-   return complex_out;
-}
diff --git a/tests/lldb/java/Android.mk b/tests/lldb/java/Android.mk
deleted file mode 100644
index 5053e7d..0000000
--- a/tests/lldb/java/Android.mk
+++ /dev/null
@@ -1 +0,0 @@
-include $(call all-subdir-makefiles)
diff --git a/tests/lldb/java/BranchingFunCalls/Android.mk b/tests/lldb/java/BranchingFunCalls/Android.mk
deleted file mode 100644
index 94f9d6f..0000000
--- a/tests/lldb/java/BranchingFunCalls/Android.mk
+++ /dev/null
@@ -1,17 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := \
-	$(call all-java-files-under, src) \
-	$(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := BranchingFunCalls
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/BranchingFunCalls/AndroidManifest.xml b/tests/lldb/java/BranchingFunCalls/AndroidManifest.xml
deleted file mode 100644
index fb83a04..0000000
--- a/tests/lldb/java/BranchingFunCalls/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.branchingfuncalls">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="BranchingFunCalls"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/java/BranchingFunCalls/res/layout/main_layout.xml b/tests/lldb/java/BranchingFunCalls/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/java/BranchingFunCalls/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/java/BranchingFunCalls/src/com/android/rs/branchingfuncalls/MainActivity.java b/tests/lldb/java/BranchingFunCalls/src/com/android/rs/branchingfuncalls/MainActivity.java
deleted file mode 100644
index a2c1f82..0000000
--- a/tests/lldb/java/BranchingFunCalls/src/com/android/rs/branchingfuncalls/MainActivity.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.branchingfuncalls;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private RenderScript mRS;
-    private Allocation mInAllocation;
-    private Allocation mOutAllocation;
-    private ScriptC_scalars mScript;
-    private int mAllocSize = 256;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-        createScript();
-        runScript();
-    }
-
-    private void createScript() {
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY |
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-
-        Element e = Element.I32(mRS);
-        mInAllocation = Allocation.createSized(mRS, e, mAllocSize);
-        mOutAllocation = Allocation.createSized(mRS, e, mAllocSize);
-
-        mScript = new ScriptC_scalars(mRS);
-    }
-
-    private void runScript() {
-        mScript.invoke_addToGlobal(234);
-
-        int[] init = new int[mAllocSize];
-        for(int i = 0; i < mAllocSize; ++i) {
-            init[i] = i - (mAllocSize / 2);
-        }
-        mInAllocation.copy1DRangeFrom(0, mAllocSize, init);
-        mScript.forEach_simple_kernel(mInAllocation, mOutAllocation);
-    }
-}
-
diff --git a/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rscript b/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rscript
deleted file mode 100644
index 7206330..0000000
--- a/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rscript
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.branchingfuncalls)
-
-static bool is_neg(int a)
-{
-    if(a < 0)
-        return true;
-    else
-        return false;
-}
-
-static bool is_pos(int a)
-{
-    if(a > 0)
-        return true;
-    else
-        return false;
-}
-
-static void set_i(int * a, int b)
-{
-    int tmp = b;
-    *a = tmp;
-}
-
-static void modify_f(float * f)
-{
-    *f *= 0.5f;
-}
-
-static void modify_i(int * i)
-{
-    int j = *i;
-    int cutoff = 2 << 6;
-    if(j > cutoff)
-        j = cutoff;
-    if(is_neg(j))
-        set_i(i, 0);
-    else if(is_pos(j))
-        set_i(i, j);
-    else
-        set_i(i, cutoff);
-}
-
-int __attribute__((kernel)) simple_kernel(int in)
-{
-    int i = in;
-    float f = (float) i;
-    modify_f(&f);
-    modify_i(&i);
-    int ret = (int) f;
-    return in * ret;
-}
-
-int glob = 123;
-
-void addToGlobal(int arg)
-{
-    glob += arg;
-}
diff --git a/tests/lldb/java/DebugWaitAttach/Android.mk b/tests/lldb/java/DebugWaitAttach/Android.mk
deleted file mode 100644
index 3486b78..0000000
--- a/tests/lldb/java/DebugWaitAttach/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JavaDebugWaitAttach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/DebugWaitAttach/AndroidManifest.xml b/tests/lldb/java/DebugWaitAttach/AndroidManifest.xml
deleted file mode 100644
index 862af28..0000000
--- a/tests/lldb/java/DebugWaitAttach/AndroidManifest.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.waitattachdebug">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JavaDebugWaitAttach"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/java/DebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/java/DebugWaitAttach/res/layout/main_layout.xml
deleted file mode 100644
index 4ef172f..0000000
--- a/tests/lldb/java/DebugWaitAttach/res/layout/main_layout.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
diff --git a/tests/lldb/java/DebugWaitAttach/src/com/android/rs/waitattachdebug/MainActivity.java b/tests/lldb/java/DebugWaitAttach/src/com/android/rs/waitattachdebug/MainActivity.java
deleted file mode 100644
index 3aeefd3..0000000
--- a/tests/lldb/java/DebugWaitAttach/src/com/android/rs/waitattachdebug/MainActivity.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.waitattachdebug;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-    private ImageView mImageView;
-
-    private RenderScript mRS;
-    private Allocation mInAllocation;
-    private Allocation mOutAllocation;
-    private ScriptC_simple mScript;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-
-        mBitmapIn = Bitmap.createBitmap(8, 8, Bitmap.Config.ARGB_8888);
-        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(),
-                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
-
-        mImageView = findViewById(R.id.imageView);
-        mImageView.setImageBitmap(mBitmapOut);
-
-        createScript();
-        updateImage(1.0f);
-    }
-
-    private void createScript() {
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY |
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-
-        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
-
-        mScript = new ScriptC_simple(mRS);
-    }
-
-
-    private void updateImage(final float f) {
-        mScript.set_gColor(new Float4(0.9f, 0.8f, 0.5f, 1.0f));
-        mScript.forEach_simple_kernel(mInAllocation, mOutAllocation);
-        mOutAllocation.copyTo(mBitmapOut);
-        mScript.forEach_other_kernel(mInAllocation, mOutAllocation);
-    }
-}
-
diff --git a/tests/lldb/java/DebugWaitAttach/src/rs/simple.rscript b/tests/lldb/java/DebugWaitAttach/src/rs/simple.rscript
deleted file mode 100644
index 6b564c0..0000000
--- a/tests/lldb/java/DebugWaitAttach/src/rs/simple.rscript
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.waitattachdebug)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
-
-// Extra kernel to test lldb setting breakpoints on all the RS kernels.
-uchar4 __attribute__((kernel)) other_kernel(uchar4 in)
-{
-    uchar4 result = in.wzyx;
-    return result;
-}
diff --git a/tests/lldb/java/InfiniteLoop/Android.mk b/tests/lldb/java/InfiniteLoop/Android.mk
deleted file mode 100644
index 02e2790..0000000
--- a/tests/lldb/java/InfiniteLoop/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JavaInfiniteLoop
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/InfiniteLoop/AndroidManifest.xml b/tests/lldb/java/InfiniteLoop/AndroidManifest.xml
deleted file mode 100644
index ba846cd..0000000
--- a/tests/lldb/java/InfiniteLoop/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.infiniteloop">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="InfiniteLoop"
-                 android:hardwareAccelerated="true">
-
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/java/InfiniteLoop/res/layout/main_layout.xml b/tests/lldb/java/InfiniteLoop/res/layout/main_layout.xml
deleted file mode 100755
index 4ef172f..0000000
--- a/tests/lldb/java/InfiniteLoop/res/layout/main_layout.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
diff --git a/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/MainActivity.java b/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/MainActivity.java
deleted file mode 100644
index 7243cc5..0000000
--- a/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/MainActivity.java
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.infiniteloop;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.os.AsyncTask;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-import java.util.Random;
-import java.util.Timer;
-import java.util.TimerTask;
-
-public class MainActivity extends Activity {
-    /* Number of bitmaps that are used for renderScript thread and UI thread synchronization.
-       Ideally, this can be reduced to 2, however in some devices, 2 buffers still showing tearing on UI.
-     */
-    private final int NUM_BITMAPS = 3;
-    private int mCurrentBitmap = 0;
-    private Bitmap mBitmapIn;
-    private Bitmap[] mBitmapsOut;
-    private ImageView mImageView;
-    private Random mRand;
-
-    private RenderScript mRS;
-    private Allocation mInAllocation;
-    private Allocation[] mOutAllocations;
-    private ScriptC_infiniteloop mScript;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-        mRand = new Random();
-
-        /*
-         * Initialize UI
-         */
-        mBitmapIn = Bitmap.createBitmap(500, 500, Bitmap.Config.ARGB_8888);
-        mBitmapsOut = new Bitmap[NUM_BITMAPS];
-        for (int i = 0; i < NUM_BITMAPS; ++i) {
-            mBitmapsOut[i] = Bitmap.createBitmap(mBitmapIn.getWidth(),
-                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
-        }
-
-        mImageView = findViewById(R.id.imageView);
-        mImageView.setImageBitmap(mBitmapsOut[mCurrentBitmap]);
-        mCurrentBitmap += (mCurrentBitmap + 1) % NUM_BITMAPS;
-
-        /*
-         * Create renderScript
-         */
-        createScript();
-
-        /*
-         * Invoke renderScript kernel and update imageView
-         */
-        updateImage(1.0f);
-
-        Timer t = new Timer();
-        t.schedule(new TimerTask() {
-            @Override
-            public void run() {
-                updateImage(1.f);
-            }
-        }, 2000, 2000);
-    }
-
-    /*
-     * Initialize RenderScript
-     * In the sample, it creates RenderScript kernel that performs saturation manipulation.
-     */
-    private void createScript() {
-        // Initialize RS
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY);
-
-        // Allocate buffers
-        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mOutAllocations = new Allocation[NUM_BITMAPS];
-        for (int i = 0; i < NUM_BITMAPS; ++i) {
-            mOutAllocations[i] = Allocation.createFromBitmap(mRS, mBitmapsOut[i]);
-        }
-
-        // Load script
-        mScript = new ScriptC_infiniteloop(mRS);
-    }
-
-    /*
-     * In the AsyncTask, it invokes a simple RenderScript kernel.
-     * After the kernel is done, an operation blocks at Allocation.copyTo() in AsyncTask thread.
-     * Once operations are finished and we reach onPostExecute() in the UI thread,
-     * it can invalidate and update the ImageView UI.
-     */
-    private class RenderScriptTask extends AsyncTask<Float, Integer, Integer> {
-        Boolean issued = false;
-
-        protected Integer doInBackground(Float... values) {
-            int index = -1;
-            if (isCancelled() == false) {
-                issued = true;
-                index = mCurrentBitmap;
-
-                /*
-                 * Set global variable in RS
-                 */
-                mScript.set_gColour(new Float4(mRand.nextFloat(), mRand.nextFloat(),
-                        mRand.nextFloat(), 1.f));
-
-                /*
-                 * Invoke saturation filter kernel
-                 */
-                mScript.forEach_simple_kernel(mInAllocation, mOutAllocations[index]);
-
-                /*
-                 * Copy to bitmap and invalidate image view
-                 */
-                mOutAllocations[index].copyTo(mBitmapsOut[index]);
-                mCurrentBitmap = (mCurrentBitmap + 1) % NUM_BITMAPS;
-            }
-            return index;
-        }
-
-        void updateView(Integer result) {
-            if (result != -1) {
-                // Request UI update
-                mImageView.setImageBitmap(mBitmapsOut[result]);
-                mImageView.invalidate();
-            }
-        }
-
-        protected void onPostExecute(Integer result) {
-            updateView(result);
-        }
-
-        protected void onCancelled(Integer result) {
-            if (issued) {
-                updateView(result);
-            }
-        }
-    }
-
-    RenderScriptTask currentTask = null;
-
-    /*
-     * Invoke AsyncTask and cancel the previous task.
-     * When AsyncTasks are piled up (typically in slow device with heavy kernel),
-     * only the latest (and already started) task invokes RenderScript operation.
-     */
-    private void updateImage(final float f) {
-        if (currentTask != null)
-            currentTask.cancel(false);
-        currentTask = new RenderScriptTask();
-        currentTask.execute(f);
-    }
-}
diff --git a/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/infiniteloop.rscript b/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/infiniteloop.rscript
deleted file mode 100644
index 5042ebf..0000000
--- a/tests/lldb/java/InfiniteLoop/src/com/android/rs/infiniteloop/infiniteloop.rscript
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.infiniteloop)
-#pragma rs_fp_relaxed
-
-float4 gColour = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColour;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
diff --git a/tests/lldb/java/KernelVariables/Android.mk b/tests/lldb/java/KernelVariables/Android.mk
deleted file mode 100644
index c642a9d..0000000
--- a/tests/lldb/java/KernelVariables/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := KernelVariables
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/KernelVariables/AndroidManifest.xml b/tests/lldb/java/KernelVariables/AndroidManifest.xml
deleted file mode 100644
index a1e2a74..0000000
--- a/tests/lldb/java/KernelVariables/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.kernelvariables">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="KernelVariables"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/java/KernelVariables/res/layout/main_layout.xml b/tests/lldb/java/KernelVariables/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/java/KernelVariables/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/java/KernelVariables/src/com/android/rs/kernelvariables/MainActivity.java b/tests/lldb/java/KernelVariables/src/com/android/rs/kernelvariables/MainActivity.java
deleted file mode 100644
index 728b872..0000000
--- a/tests/lldb/java/KernelVariables/src/com/android/rs/kernelvariables/MainActivity.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.kernelvariables;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.graphics.ImageFormat;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-    private ImageView mImageView;
-
-    private RenderScript mRS;
-    private Allocation mInAllocation;
-    private Allocation mOutAllocation;
-    private ScriptC_simple mScript;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-
-        mBitmapIn = Bitmap.createBitmap(500, 500, Bitmap.Config.ARGB_8888);
-        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(),
-                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
-
-        mImageView = findViewById(R.id.imageView);
-        mImageView.setImageBitmap(mBitmapOut);
-
-        createScript();
-        updateImage();
-    }
-
-    private void createScript() {
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY |
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-
-        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
-
-        mScript = new ScriptC_simple(mRS);
-    }
-
-    private void updateImage() {
-        int[] buffer_int = {1, 2, 3, 4};
-        Allocation int_allocation = Allocation.createSized(mRS, Element.I32(mRS), 4);
-        int_allocation.copyFrom(buffer_int);
-        mScript.set_allocation_1D_global(int_allocation);
-
-        int[] buffer_int2 = {5, 6, 7, 8};
-
-        Type.Builder typeI32Builder2D = new Type.Builder(mRS, Element.I32(mRS));
-        typeI32Builder2D.setX(2);
-        typeI32Builder2D.setY(2);
-
-        Allocation int_allocation2 = Allocation.createTyped(mRS, typeI32Builder2D.create());
-        int_allocation2.copyFrom(buffer_int2);
-        mScript.set_allocation_1D_global2(int_allocation2);
-
-        mScript.set_allocation_2D_global(mInAllocation);
-        mScript.set_allocation_2D_global2(mOutAllocation);
-
-        int[] buffer_int3 = new int[64];
-
-        for (int i=0; i<4*4*4; ++i)
-            buffer_int3[i] = 9 + i;
-
-        Type.Builder typeI32Builder3D = new Type.Builder(mRS, Element.I32(mRS));
-        typeI32Builder3D.setX(4);
-        typeI32Builder3D.setY(4);
-        typeI32Builder3D.setZ(4);
-
-        Allocation int_allocation3 = Allocation.createTyped(mRS, typeI32Builder3D.create());
-        int_allocation3.copyFrom(buffer_int3);
-        mScript.set_allocation_3D_global(int_allocation3);
-
-        Type.Builder yuvTypeBuilder = new Type.Builder(mRS, Element.YUV(mRS));
-        yuvTypeBuilder.setX(4);
-        yuvTypeBuilder.setY(4);
-        yuvTypeBuilder.setYuvFormat(ImageFormat.YV12);
-        Allocation yuv_allocation = Allocation.createTyped(mRS, yuvTypeBuilder.create());
-        mScript.set_allocation_YUV_2D_global(yuv_allocation);
-
-        mScript.set_sampler_global(Sampler.CLAMP_LINEAR(mRS));
-
-        mScript.forEach_kernel(mInAllocation, mOutAllocation);
-        mOutAllocation.copyTo(mBitmapOut);
-    }
-}
diff --git a/tests/lldb/java/KernelVariables/src/rs/simple.rscript b/tests/lldb/java/KernelVariables/src/rs/simple.rscript
deleted file mode 100644
index 26b6aff..0000000
--- a/tests/lldb/java/KernelVariables/src/rs/simple.rscript
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.kernelvariables)
-
-char char_global = 12;
-uchar uchar_global = 234;
-short short_global = -321;
-ushort ushort_global = 432;
-int int_global = 1234;
-uint uint_global = 2345;
-float float_global = 4.5f;
-long long_global = -77777;
-ulong ulong_global = 8888;
-double double_global = -456.5f;
-
-char2 char2_global = {11, -22};
-uchar2 uchar2_global = {33, 44};
-short2 short2_global = {-555, 666};
-ushort2 ushort2_global = {777, 888};
-int2 int2_global = {999, -1111};
-uint2 uint2_global = {2222, 3333};
-float2 float2_global = {4.5f, -5.0f};
-long2 long2_global = {-4444, 5555};
-ulong2 ulong2_global = {6666, 7777};
-double2 double2_global = {88.5f, -99.0f};
-
-char3 char3_global = {11, -22, -33};
-uchar3 uchar3_global = {33, 44, 55};
-short3 short3_global = {-555, 666, 777};
-ushort3 ushort3_global = {777, 888, 999};
-int3 int3_global = {999, -1111, 2222};
-uint3 uint3_global = {2222, 3333, 4444};
-float3 float3_global = {4.5f, -5.0f, -6.5f};
-long3 long3_global = {-4444, 5555, 6666};
-ulong3 ulong3_global = {6666, 7777, 8888};
-double3 double3_global = {88.5f, -99.0f, 111.5f};
-
-char4 char4_global = {55, 11, -22, -33};
-uchar4 uchar4_global = {222, 33, 44, 55};
-short4 short4_global = {-444, -555, 666, 777};
-ushort4 ushort4_global = {666, 777, 888, 999};
-int4 int4_global = {888, 999, -1111, 2222};
-uint4 uint4_global = {1111, 2222, 3333, 4444};
-float4 float4_global = {3.0f, 4.5f, -5.0f, -6.5f};
-long4 long4_global = {-3333, -4444, 5555, 6666};
-ulong4 ulong4_global = {5555, 6666, 7777, 8888};
-double4 double4_global = {-77.0f, 88.5f, -99.0f, 111.5f};
-
-rs_matrix2x2 matrix2x2_global;
-rs_matrix3x3 matrix3x3_global;
-rs_matrix4x4 matrix4x4_global;
-
-rs_quaternion quaternion_global;
-
-rs_allocation allocation_1D_global;
-rs_allocation allocation_1D_global2;
-rs_allocation allocation_2D_global;
-rs_allocation allocation_2D_global2;
-rs_allocation allocation_3D_global;
-rs_allocation allocation_YUV_2D_global;
-
-rs_allocation_cubemap_face cubemap_face_global;
-rs_sampler sampler_global;
-
-uchar4 __attribute__((kernel)) kernel(uchar4 in)
-{
-    char char_local = 'a';
-    uchar uchar_local = 'b';
-    short short_local = -321;
-    ushort ushort_local = 432;
-    int int_local = 1234;
-    uint uint_local = 2345;
-    float float_local = 4.5f;
-    long long_local = -77777;
-    ulong ulong_local = 8888;
-    double double_local = -456.5f;
-
-    char2 char2_local = {-11, -22};
-    uchar2 uchar2_local = {33, 44};
-    short2 short2_local = {-555, 666};
-    ushort2 ushort2_local = {777, 888};
-    int2 int2_local = {999, -1111};
-    uint2 uint2_local = {2222, 3333};
-    float2 float2_local = {4.5f, -5.0f};
-    long2 long2_local = {-4444, 5555};
-    ulong2 ulong2_local = {6666, 7777};
-    double2 double2_local = {88.5f, -99.0f};
-
-    char3 char3_local = {11, -22, -33};
-    uchar3 uchar3_local = {33, 44, 55};
-    short3 short3_local = {-555, 666, 777};
-    ushort3 ushort3_local = {777, 888, 999};
-    int3 int3_local = {999, -1111, 2222};
-    uint3 uint3_local = {2222, 3333, 4444};
-    float3 float3_local = {4.5f, -5.0f, -6.5f};
-    long3 long3_local = {-4444, 5555, 6666};
-    ulong3 ulong3_local = {6666, 7777, 8888};
-    double3 double3_local = {88.5f, -99.0f, 111.5f};
-
-    char4 char4_local = {55, 11, -22, -33};
-    uchar4 uchar4_local = {22, 33, 44, 55};
-    short4 short4_local = {-444, -555, 666, 777};
-    ushort4 ushort4_local = {666, 777, 888, 999};
-    int4 int4_local = {888, 999, -1111, 2222};
-    uint4 uint4_local = {1111, 2222, 3333, 4444};
-    float4 float4_local = {3.0f, 4.5f, -5.0f, -6.5f};
-    long4 long4_local = {-3333, -4444, 5555, 6666};
-    ulong4 ulong4_local = {5555, 6666, 7777, 8888};
-    double4 double4_local = {-77.0f, 88.5f, -99.0f, 111.5f};
-
-    rs_matrix2x2 matrix2x2_local = {{1., 2.5,
-                                     3., 4.5}};
-    rs_matrix3x3 matrix3x3_local = {{5., 6.5, 7.,
-                                     8.5, 9., 1.5,
-                                     2., 3.5, 4.}};
-    rs_matrix4x4 matrix4x4_local = {{5.5, 6., 7.5, 8.,
-                                     9., 1.5, 2., 3.5,
-                                     4.5, 5.5, 6.5, 7.,
-                                     8., 9.5, 1.5, 2.5}};
-
-    matrix2x2_global = matrix2x2_local;
-    matrix3x3_global = matrix3x3_local;
-    matrix4x4_global = matrix4x4_local;
-
-    rsQuaternionSet(&quaternion_global, 3.0, 4.5, 5.5, 6.0);
-
-    rs_quaternion quaternion_local;
-    rsQuaternionSet(&quaternion_local, 7.5, 8.0, 9.0, 0.5);
-
-    char char_combined = char_local + (char)uchar_local + char2_local.x +
-        (char)uchar2_local.x + char3_local.x - (char)uchar3_local.x +
-        char4_local.x + (char)uchar4_local.x;
-
-    short short_combined = short_local + (short)ushort_local + short2_local.x +
-        (short)ushort2_local.x + short3_local.x + (short)ushort3_local.x +
-        short4_local.x + (short)ushort4_local.x;
-
-    int int_combined = int_local + (int)uint_local + int2_local.x +
-        (int)uint2_local.x + int3_local.x + (int)uint3_local.x + int4_local.x +
-        (int)uint4_local.x;
-
-    float float_combined = float_local + float2_local.x + float3_local.x +
-        float4_local.x;
-
-    long long_combined = long_local + (long)ulong_local + long2_local.x +
-        (long)ulong2_local.x + long3_local.x + (long)ulong3_local.x +
-        long4_local.x + (long)ulong4_local.x;
-
-    double double_combined = double_local + double2_local.x + double3_local.x +
-        double4_local.x;
-
-    char_global = char_combined;
-    short_global = short_combined;
-    int_global = int_combined;
-    float_global = float_combined;
-    long_global = long_combined;
-    double_global = double_combined;
-
-    uchar4 result = {1,2,3,4};
-    return result;
-}
-
-float use_constants_global;
-
-void setup(void)
-{
-  use_constants_global =
-      M_1_PI +
-      M_2_PI +
-      M_2_PIl +
-      M_2_SQRTPI +
-      M_E +
-      M_LN10 +
-      M_LN2 +
-      M_LOG10E +
-      M_LOG2E +
-      M_PI +
-      M_PI_2 +
-      M_PI_4 +
-      M_SQRT1_2 +
-      M_SQRT2;
-}
diff --git a/tests/lldb/java/MultipleRSFiles/Android.mk b/tests/lldb/java/MultipleRSFiles/Android.mk
deleted file mode 100644
index b45cc51..0000000
--- a/tests/lldb/java/MultipleRSFiles/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := MultipleRSFiles
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/MultipleRSFiles/AndroidManifest.xml b/tests/lldb/java/MultipleRSFiles/AndroidManifest.xml
deleted file mode 100644
index 5a83901..0000000
--- a/tests/lldb/java/MultipleRSFiles/AndroidManifest.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.multiplersfiles">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="MultipleRSFiles"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/java/MultipleRSFiles/res/layout/main_layout.xml b/tests/lldb/java/MultipleRSFiles/res/layout/main_layout.xml
deleted file mode 100644
index 4ef172f..0000000
--- a/tests/lldb/java/MultipleRSFiles/res/layout/main_layout.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
diff --git a/tests/lldb/java/MultipleRSFiles/src/com/android/rs/multiplersfiles/MainActivity.java b/tests/lldb/java/MultipleRSFiles/src/com/android/rs/multiplersfiles/MainActivity.java
deleted file mode 100644
index 7895484..0000000
--- a/tests/lldb/java/MultipleRSFiles/src/com/android/rs/multiplersfiles/MainActivity.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.multiplersfiles;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-    private ImageView mImageView;
-
-    private RenderScript mRS;
-    private Allocation mInAllocation;
-    private Allocation mOutAllocation;
-    private ScriptC_first mFirstScript;
-    private ScriptC_second mSecondScript;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-
-        mBitmapIn = Bitmap.createBitmap(500, 500, Bitmap.Config.ARGB_8888);
-        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(),
-                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
-
-        mImageView = findViewById(R.id.imageView);
-        mImageView.setImageBitmap(mBitmapOut);
-
-        createScript();
-        updateImage(1.0f);
-    }
-
-    private void createScript() {
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY |
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-
-        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
-
-        mFirstScript = new ScriptC_first(mRS);
-        mSecondScript = new ScriptC_second(mRS);
-    }
-
-
-    private void updateImage(final float f) {
-        mFirstScript.set_gColor(new Float4(0.9f, 0.8f, 0.5f, 1.0f));
-        mFirstScript.forEach_first_kernel(mInAllocation, mOutAllocation);
-        mOutAllocation.copyTo(mBitmapOut);
-        mSecondScript.forEach_second_kernel(mInAllocation, mOutAllocation);
-    }
-}
-
diff --git a/tests/lldb/java/MultipleRSFiles/src/rs/first.rscript b/tests/lldb/java/MultipleRSFiles/src/rs/first.rscript
deleted file mode 100644
index cbed426..0000000
--- a/tests/lldb/java/MultipleRSFiles/src/rs/first.rscript
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.multiplersfiles)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) first_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
diff --git a/tests/lldb/java/MultipleRSFiles/src/rs/second.rscript b/tests/lldb/java/MultipleRSFiles/src/rs/second.rscript
deleted file mode 100644
index c87d7ae..0000000
--- a/tests/lldb/java/MultipleRSFiles/src/rs/second.rscript
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.multiplersfiles)
-
-/* RenderScript kernel that just returns the swizzled input. */
-uchar4 __attribute__((kernel)) second_kernel(uchar4 in)
-{
-    uchar4 result = in.wzyx;
-    return result;
-}
diff --git a/tests/lldb/java/NoDebugWaitAttach/Android.mk b/tests/lldb/java/NoDebugWaitAttach/Android.mk
deleted file mode 100644
index 355ffb7..0000000
--- a/tests/lldb/java/NoDebugWaitAttach/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JavaNoDebugWaitAttach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/NoDebugWaitAttach/AndroidManifest.xml b/tests/lldb/java/NoDebugWaitAttach/AndroidManifest.xml
deleted file mode 100644
index dbc31a5..0000000
--- a/tests/lldb/java/NoDebugWaitAttach/AndroidManifest.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.waitattachnodebug">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JavaNoDebugWaitAttach"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/java/NoDebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/java/NoDebugWaitAttach/res/layout/main_layout.xml
deleted file mode 100644
index 4ef172f..0000000
--- a/tests/lldb/java/NoDebugWaitAttach/res/layout/main_layout.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
diff --git a/tests/lldb/java/NoDebugWaitAttach/src/com/android/rs/waitattachnodebug/MainActivity.java b/tests/lldb/java/NoDebugWaitAttach/src/com/android/rs/waitattachnodebug/MainActivity.java
deleted file mode 100644
index a43b387..0000000
--- a/tests/lldb/java/NoDebugWaitAttach/src/com/android/rs/waitattachnodebug/MainActivity.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.waitattachnodebug;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-    private ImageView mImageView;
-
-    private RenderScript mRS;
-    private Allocation mInAllocation;
-    private Allocation mOutAllocation;
-    private ScriptC_simple mScript;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-
-        mBitmapIn = Bitmap.createBitmap(8, 8, Bitmap.Config.ARGB_8888);
-        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(),
-                    mBitmapIn.getHeight(), mBitmapIn.getConfig());
-
-        mImageView = findViewById(R.id.imageView);
-        mImageView.setImageBitmap(mBitmapOut);
-
-        createScript();
-        updateImage(1.0f);
-    }
-
-    private void createScript() {
-        mRS = RenderScript.create(this,
-            RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_LOW_LATENCY |
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-
-        mInAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mOutAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
-
-        mScript = new ScriptC_simple(mRS);
-    }
-
-
-    private void updateImage(final float f) {
-        mScript.set_gColor(new Float4(0.9f, 0.8f, 0.5f, 1.0f));
-        mScript.forEach_simple_kernel(mInAllocation, mOutAllocation);
-        mOutAllocation.copyTo(mBitmapOut);
-    }
-}
-
diff --git a/tests/lldb/java/NoDebugWaitAttach/src/rs/simple.rscript b/tests/lldb/java/NoDebugWaitAttach/src/rs/simple.rscript
deleted file mode 100644
index 6652675..0000000
--- a/tests/lldb/java/NoDebugWaitAttach/src/rs/simple.rscript
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.waitattachnodebug)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
-
diff --git a/tests/lldb/java/Reduction/Android.mk b/tests/lldb/java/Reduction/Android.mk
deleted file mode 100644
index 6e71659..0000000
--- a/tests/lldb/java/Reduction/Android.mk
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_RENDERSCRIPT_FLAGS := -target-api 0 -O0 -g
-
-LOCAL_PACKAGE_NAME := Reduction
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/MainActivity.java b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/MainActivity.java
deleted file mode 100644
index b10d5d2..0000000
--- a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/MainActivity.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This application is a simple scheduler for testing the lldb debugger
- * implementation for general reduction kernels.
- *
- * It launches one of two simple reductions in a loop
- */
-
-package com.android.rs.lldbreductiontest;
-
-import android.app.Activity;
-import android.content.Context;
-import android.content.res.Resources;
-import android.os.Handler;
-import android.os.Bundle;
-import android.util.Log;
-import android.renderscript.*;
-
-import java.lang.Float;
-import java.lang.Math;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Random;
-
-public class MainActivity extends Activity {
-  static private int idxOffset = 10;
-  static private int mX = 128;
-  static private int mY = 2;
-  static private int mZ = 2;
-  static private float mMultiplier = 2.f;
-  private RenderScript mRS;
-  private ScriptC_reduce mScript;
-  private ScriptC_reduce_auto_comb mScript_auto_comb;
-
-  @Override
-  protected void onCreate(Bundle savedInstanceState) {
-    super.onCreate(savedInstanceState);
-    initRS();
-
-    int loopDelayMillis = 1000;
-    Handler loopHandler = new Handler();
-    loopHandler.postDelayed(new Runnable() {
-      @Override
-      public void run() {
-        runRS();
-        loopHandler.postDelayed(this, loopDelayMillis);
-      }
-    }, loopDelayMillis);
-  }
-
-  private float findMinUserTypeAutoComb(
-      RenderScript rs, ScriptC_reduce_auto_comb s, Allocation alloc) {
-    s.set_a_startval(mX);
-    s.set_b_startval(mY);
-    s.set_multiplier(mMultiplier);
-
-    return s.reduce_find_min_user_type_auto_comb(alloc).get();
-  }
-
-  private float findMinUserType(RenderScript rs, ScriptC_reduce s, Allocation alloc) {
-    s.set_a_startval(mX);
-    s.set_b_startval(mY);
-    s.set_multiplier(mMultiplier);
-
-    return s.reduce_find_min_user_type(alloc).get();
-  }
-
-  private float findMinUserType1DAutoComb(RenderScript rs, ScriptC_reduce_auto_comb s, int xCount) {
-    ScriptField_MinUserType minUserType = new ScriptField_MinUserType(rs, xCount);
-    for (int i = 0; i < xCount; i++) {
-      ScriptField_MinUserType.Item val = new ScriptField_MinUserType.Item();
-      val.a = i + idxOffset;
-      val.b = i + idxOffset;
-      minUserType.set(val, i, true);
-    }
-
-    Allocation alloc = minUserType.getAllocation();
-
-    return findMinUserTypeAutoComb(rs, s, alloc);
-  }
-
-  private float findMinUserType1D(RenderScript rs, ScriptC_reduce s, int xCount) {
-    ScriptField_MinUserType minUserType = new ScriptField_MinUserType(rs, xCount);
-    for (int i = 0; i < xCount; i++) {
-      ScriptField_MinUserType.Item val = new ScriptField_MinUserType.Item();
-      val.a = i + idxOffset;
-      val.b = i + idxOffset;
-      minUserType.set(val, i, true);
-    }
-
-    Allocation alloc = minUserType.getAllocation();
-
-    return findMinUserType(rs, s, alloc);
-  }
-
-  public void initRS() {
-    mRS = RenderScript.create(this, RenderScript.ContextType.NORMAL,
-        RenderScript.CREATE_FLAG_LOW_LATENCY | RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH);
-    mScript = new ScriptC_reduce(mRS);
-    mScript_auto_comb = new ScriptC_reduce_auto_comb(mRS);
-  }
-
-  public void runRS() {
-    findMinUserType1D(mRS, mScript, mX);
-    findMinUserType1DAutoComb(mRS, mScript_auto_comb, mX);
-  }
-
-  public void onDestroy() {
-    mRS.finish();
-    mRS.destroy();
-  }
-}
diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce.rscript b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce.rscript
deleted file mode 100644
index c8fb088..0000000
--- a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce.rscript
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This application is a simple scheduler for testing the lldb debugger
- * implementation for general reduction kernels.
- *
- * It launches one of two simple reductions in a loop
- */
-
-#pragma rs java_package_name(com.android.rs.lldbreductiontest)
-#pragma version(1)
-#pragma rs reduce(find_min_user_type) initializer(find_min_user_type_init) \
-    accumulator(find_min_user_type_accum)                                  \
-        outconverter(find_min_user_type_outc)                              \
-            combiner(find_min_user_type_comb)
-
-#define RSTESTS_USER_COMBINER
-#include "reduce_common.rsh"
diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_auto_comb.rscript b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_auto_comb.rscript
deleted file mode 100644
index 5a9de90..0000000
--- a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_auto_comb.rscript
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* This application is a simple scheduler for testing the lldb debugger
- * implementation for general reduction kernels.
- *
- * It launches one of two simple reductions in a loop
- */
-
-#pragma rs java_package_name(com.android.rs.lldbreductiontest)
-#pragma version(1)
-#pragma rs reduce(find_min_user_type_auto_comb)                                \
-    initializer(find_min_user_type_init) accumulator(find_min_user_type_accum) \
-        outconverter(find_min_user_type_outc)                                  \
-
-#include "reduce_common.rsh"
diff --git a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_common.rsh b/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_common.rsh
deleted file mode 100644
index 846d530..0000000
--- a/tests/lldb/java/Reduction/src/com/android/rs/lldbreductiontest/reduce_common.rsh
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-int __attribute__((kernel)) my_foreach_kernel(int a) {
-  // This kernel is unused, but we want to make sure it is not listed as a
-  // reduction kernel by the debugger
-  return a + 1;
-}
-
-typedef struct MinUserType {
-  int32_t a;
-  int32_t b;
-} user_t;
-
-int32_t b_startval;
-int32_t a_startval;
-float multiplier;
-
-static void find_min_user_type_init(user_t *alloc) {
-  alloc->a = a_startval;
-  alloc->b = b_startval;
-}
-
-static void find_min_user_type_accum(user_t *accum, const user_t val) {
-  if (val.a + val.b * multiplier < accum->a + accum->b * multiplier) {
-    accum->a = val.a;
-    accum->b = val.b;
-  }
-}
-
-// Combiners are autogenerated if the user has not defined the combiner.
-// We specialise the tests for lldb's handling of this behaviour as well,
-// generating two test apps from the same source.
-// This combiner is equivalent to the accumulator.
-#if defined(RSTESTS_USER_COMBINER)
-static void find_min_user_type_comb(user_t *accum, const user_t *val) {
-  if (val->a + val->b * multiplier < accum->a + accum->b * multiplier) {
-    accum->a = val->a;
-    accum->b = val->b;
-  }
-}
-#endif
-
-static void find_min_user_type_outc(float *output, const user_t *val) {
-  *output = val->a + val->b * multiplier;
-}
diff --git a/tests/lldb/java/ScriptGroup/Android.mk b/tests/lldb/java/ScriptGroup/Android.mk
deleted file mode 100644
index 18a1cd5..0000000
--- a/tests/lldb/java/ScriptGroup/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := ScriptGroup
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/ScriptGroup/AndroidManifest.xml b/tests/lldb/java/ScriptGroup/AndroidManifest.xml
deleted file mode 100644
index 5288c74..0000000
--- a/tests/lldb/java/ScriptGroup/AndroidManifest.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.scriptgroup">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="scriptgroup"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/java/ScriptGroup/res/layout/main_layout.xml b/tests/lldb/java/ScriptGroup/res/layout/main_layout.xml
deleted file mode 100644
index 4ef172f..0000000
--- a/tests/lldb/java/ScriptGroup/res/layout/main_layout.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
diff --git a/tests/lldb/java/ScriptGroup/src/com/android/rs/scriptgroup/MainActivity.java b/tests/lldb/java/ScriptGroup/src/com/android/rs/scriptgroup/MainActivity.java
deleted file mode 100644
index 01e8a13..0000000
--- a/tests/lldb/java/ScriptGroup/src/com/android/rs/scriptgroup/MainActivity.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.scriptgroup;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-    private static final int ARRAY_SIZE = 8;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-
-        // create renderscript context
-        RenderScript pRS = RenderScript.create(this, RenderScript.ContextType.NORMAL,
-            RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH | RenderScript.CREATE_FLAG_LOW_LATENCY);
-
-        ScriptC_scriptgroup script = new ScriptC_scriptgroup(pRS);
-
-        // create and initalize a simple input allocation
-        int[] array = new int[ARRAY_SIZE];
-        for (int i = 0; i < ARRAY_SIZE; i++) {
-            array[i] = i;
-        }
-        Allocation input = Allocation.createSized(pRS, Element.I32(pRS), ARRAY_SIZE);
-        input.copyFrom(array);
-
-        ScriptGroup.Builder2 builder = new ScriptGroup.Builder2(pRS);
-
-        ScriptGroup.Input unbound = builder.addInput();
-
-        ScriptGroup.Closure c0 = builder.addKernel(
-            script.getKernelID_foo(), Type.createX(pRS, Element.I32(pRS), ARRAY_SIZE), unbound);
-
-        ScriptGroup.Closure c1 = builder.addKernel(script.getKernelID_goo(),
-            Type.createX(pRS, Element.I32(pRS), ARRAY_SIZE), c0.getReturn());
-
-        ScriptGroup group = builder.create("scriptgroup_test", c1.getReturn());
-
-        int[] a = new int[ARRAY_SIZE];
-        ((Allocation) group.execute(input)[0]).copyTo(a);
-
-        pRS.finish();
-        pRS.destroy();
-    }
-}
diff --git a/tests/lldb/java/ScriptGroup/src/rs/scriptgroup.rscript b/tests/lldb/java/ScriptGroup/src/rs/scriptgroup.rscript
deleted file mode 100644
index 29089e1..0000000
--- a/tests/lldb/java/ScriptGroup/src/rs/scriptgroup.rscript
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.scriptgroup)
-#pragma rs_fp_full
-
-int __attribute__((kernel)) foo(int a) {
-    return a * a;
-}
-
-int __attribute__((kernel)) goo(int a) {
-    return a + a;
-}
diff --git a/tests/lldb/java/SingleSource/Android.mk b/tests/lldb/java/SingleSource/Android.mk
deleted file mode 100644
index 202c2a7..0000000
--- a/tests/lldb/java/SingleSource/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := SingleSource
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_PRIVATE_PLATFORM_APIS := true
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
diff --git a/tests/lldb/java/SingleSource/AndroidManifest.xml b/tests/lldb/java/SingleSource/AndroidManifest.xml
deleted file mode 100644
index 8820c93..0000000
--- a/tests/lldb/java/SingleSource/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.singlesource">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="SingleSource"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/java/SingleSource/res/layout/main_layout.xml b/tests/lldb/java/SingleSource/res/layout/main_layout.xml
deleted file mode 100644
index 4ef172f..0000000
--- a/tests/lldb/java/SingleSource/res/layout/main_layout.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
diff --git a/tests/lldb/java/SingleSource/src/com/android/rs/singlesource/MainActivity.java b/tests/lldb/java/SingleSource/src/com/android/rs/singlesource/MainActivity.java
deleted file mode 100644
index 8308043..0000000
--- a/tests/lldb/java/SingleSource/src/com/android/rs/singlesource/MainActivity.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.singlesource;
-
-import android.app.Activity;
-import android.graphics.Bitmap;
-import android.os.Bundle;
-import android.widget.ImageView;
-import android.renderscript.*;
-
-public class MainActivity extends Activity {
-
-    private RenderScript mRS;
-    private Allocation mAllocIn1;
-    private Allocation mAllocIn2;
-    private Allocation mAllocOut;
-    private ScriptC_rs_single_source mScript;
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-
-        super.onCreate(savedInstanceState);
-
-        setContentView(R.layout.main_layout);
-
-        // create renderscript context
-        mRS = RenderScript.create(
-              this,
-              RenderScript.ContextType.NORMAL,
-              RenderScript.CREATE_FLAG_WAIT_FOR_ATTACH |
-              RenderScript.CREATE_FLAG_LOW_LATENCY);
-
-        // create a new instance of the script
-        mScript = new ScriptC_rs_single_source(mRS);
-
-        // create the first input allocation
-        mAllocIn1 = Allocation.createSized(mRS, Element.F32(mRS), 4);
-        float [] in1 = new float[]{ 1.f, 2.f, 3.f, 4.f };
-        mAllocIn1.copyFrom(in1);
-
-        // create second input allocation
-        mAllocIn2 = Allocation.createSized(mRS, Element.F32(mRS), 4);
-        float [] in2 = new float[]{ 5.f, 6.f, 7.f, 8.f };
-        mAllocIn2.copyFrom(in2);
-
-        // create output allocation
-        mAllocOut = Allocation.createSized(mRS, Element.F32(mRS), 4);
-
-        // setup the global output allocation
-        mScript.set_global_alloc(Allocation.createSized(mRS, Element.F32(mRS), 4));
-
-        // invoke static function 1
-        mScript.invoke_script_invoke_1(mAllocOut, mAllocIn1, mAllocIn2);
-
-        // invoke static function 2
-        mScript.invoke_script_invoke_2();
-
-        // invoke void kernel
-        Script.LaunchOptions options = new Script.LaunchOptions();
-        options.setX(0, 4);
-        mScript.forEach_void_kernel_1(options);
-    }
-}
diff --git a/tests/lldb/java/SingleSource/src/rs/rs_single_source.rscript b/tests/lldb/java/SingleSource/src/rs/rs_single_source.rscript
deleted file mode 100644
index 15c35ef..0000000
--- a/tests/lldb/java/SingleSource/src/rs/rs_single_source.rscript
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.singlesource)
-#pragma rs_fp_full
-
-// global allocation used for void kernel
-rs_allocation global_alloc;
-
-static void check_in()
-{
-    // debugger check point
-    return;
-}
-
-float __attribute__((kernel)) kernel_1(float a)
-{
-    // square
-    return a * a;
-}
-
-float __attribute__((kernel)) kernel_2(float a, float b)
-{
-    // product
-    return a * b;
-}
-
-void __attribute__((kernel)) void_kernel_1(uint32_t x)
-{
-    // allocation[x] = x
-    rsSetElementAt_float(global_alloc, (float)x, x);
-}
-
-void script_invoke_1(rs_allocation out, rs_allocation in1, rs_allocation in2)
-{
-    // invoke kernel taking one argument
-    rsForEach(kernel_1, out, in1);
-
-    check_in();
-
-    // invoke kernel taking two arguments
-    rsForEach(kernel_2, out, in1, in2);
-
-    check_in();
-}
-
-void script_invoke_2()
-{
-    // invoke kernel that takes no arguments and no return type
-    rs_script_call_t options = {
-        .strategy=RS_FOR_EACH_STRATEGY_DONT_CARE,
-        .xStart=0,
-        .xEnd=4
-    };
-    rsForEachWithOptions(void_kernel_1, &options);
-
-    check_in();
-}
diff --git a/tests/lldb/jni/Allocations/Android.mk b/tests/lldb/jni/Allocations/Android.mk
deleted file mode 100644
index 67ef2d9..0000000
--- a/tests/lldb/jni/Allocations/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNIAllocations
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjniallocations
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jniallocations/Android.mk
diff --git a/tests/lldb/jni/Allocations/AndroidManifest.xml b/tests/lldb/jni/Allocations/AndroidManifest.xml
deleted file mode 100644
index e73799f..0000000
--- a/tests/lldb/jni/Allocations/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jniallocations">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNIAllocations"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/jni/Allocations/jniallocations/Android.mk b/tests/lldb/jni/Allocations/jniallocations/Android.mk
deleted file mode 100644
index e52e1a3..0000000
--- a/tests/lldb/jni/Allocations/jniallocations/Android.mk
+++ /dev/null
@@ -1,15 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjniallocations
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jniallocations.cpp allocs.rscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -g
-
-LOCAL_STATIC_LIBRARIES := libcompiler_rt
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/Allocations/jniallocations/allocs.rscript b/tests/lldb/jni/Allocations/jniallocations/allocs.rscript
deleted file mode 100644
index c5ebcbf..0000000
--- a/tests/lldb/jni/Allocations/jniallocations/allocs.rscript
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jniallocations)
-
-// Kernel performs basic vector swizzle
-uchar4 __attribute__((kernel)) swizzle_kernel(uchar4 in)
-{
-    return in.wzyx;
-}
-
-// Kernel squares every element in allocation
-uint __attribute__((kernel)) square_kernel(ushort in)
-{
-    uint result = (uint)(in) * (uint)in;
-    return result;
-}
-
-// Helper function adding 1/2 to passed in double
-static double half_helper(double in)
-{
-    return (in + 0.5);
-}
-
-// Kernel returns first 3 elements of a double4 plus 1/2
-double3 __attribute__((kernel)) add_half_kernel(double4 in)
-{
-    double3 result;
-    result.x = half_helper(in.x);
-    result.y = half_helper(in.y);
-    result.z = half_helper(in.z);
-    return result;
-}
diff --git a/tests/lldb/jni/Allocations/jniallocations/jniallocations.cpp b/tests/lldb/jni/Allocations/jniallocations/jniallocations.cpp
deleted file mode 100644
index f86b64c..0000000
--- a/tests/lldb/jni/Allocations/jniallocations/jniallocations.cpp
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_allocs.h"
-
-sp<RS> mRS;
-
-sp<Allocation> mBoolAllocation;  // boolean
-
-sp<Allocation> mCharAllocation;  // char
-sp<Allocation> mChar2Allocation;   // char2
-sp<Allocation> mChar3Allocation;   // char3
-sp<Allocation> mChar4Allocation;   // char4
-
-sp<Allocation> mUCharAllocation;   // uchar
-sp<Allocation> mUChar2Allocation;  // uchar2
-sp<Allocation> mUChar3Allocation;  // uchar3
-sp<Allocation> mUChar4Allocation;  // uchar4
-
-sp<Allocation> mShortAllocation;   // short
-sp<Allocation> mShort2Allocation;  // short2
-sp<Allocation> mShort3Allocation;  // short3
-sp<Allocation> mShort4Allocation;  // short4
-
-sp<Allocation> mUShortAllocation;  // ushort
-sp<Allocation> mUShort2Allocation; // ushort2
-sp<Allocation> mUShort3Allocation; // ushort3
-sp<Allocation> mUShort4Allocation; // ushort4
-
-sp<Allocation> mIntAllocation;   // int
-sp<Allocation> mInt2Allocation;  // int2
-sp<Allocation> mInt3Allocation;  // int3
-sp<Allocation> mInt4Allocation;  // int4
-
-sp<Allocation> mUIntAllocation;  // uint
-sp<Allocation> mUInt2Allocation;   // uint2
-sp<Allocation> mUInt3Allocation;   // uint3
-sp<Allocation> mUInt4Allocation;   // uint4
-
-sp<Allocation> mLongAllocation;  // long
-sp<Allocation> mLong2Allocation;   // long2
-sp<Allocation> mLong3Allocation;   // long3
-sp<Allocation> mLong4Allocation;   // long4
-
-sp<Allocation> mULongAllocation;   // ulong
-sp<Allocation> mULong2Allocation;  // ulong2
-sp<Allocation> mULong3Allocation;  // ulong3
-sp<Allocation> mULong4Allocation;  // ulong4
-
-sp<Allocation> mHalfAllocation;  // half
-sp<Allocation> mHalf2Allocation;   // half2
-sp<Allocation> mHalf3Allocation;   // half3
-sp<Allocation> mHalf4Allocation;   // half4
-
-sp<Allocation> mFloatAllocation;   // float
-sp<Allocation> mFloat2Allocation;  // float2
-sp<Allocation> mFloat3Allocation;  // float3
-sp<Allocation> mFloat4Allocation;  // float4
-
-sp<Allocation> mDoubleAllocation;  // double
-sp<Allocation> mDouble2Allocation; // double2
-sp<Allocation> mDouble3Allocation; // double3
-sp<Allocation> mDouble4Allocation; // double4
-
-const int mAllocSize = 24; // Needs to be < CHAR_MAX and divisible by 4.
-const int mBitmapSize = 64;
-
-void createSignedAllocations() {
-    Type::Builder typeI8Builder(mRS, Element::I8(mRS));
-    typeI8Builder.setX(1); // One element here to test 16 byte memory alignment
-    typeI8Builder.setY(3);
-    typeI8Builder.setZ(8);
-
-    mCharAllocation = Allocation::createTyped(mRS, typeI8Builder.create());
-    mChar2Allocation = Allocation::createSized(mRS, Element::I8_2(mRS), mAllocSize / 2);
-    mChar3Allocation = Allocation::createSized(mRS, Element::I8_3(mRS), mAllocSize / 4);
-    mChar4Allocation = Allocation::createSized(mRS, Element::I8_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeI16_2Builder(mRS, Element::I16_2(mRS));
-    typeI16_2Builder.setX(6);
-    typeI16_2Builder.setY(1);
-    typeI16_2Builder.setZ(2);
-
-    mShortAllocation = Allocation::createSized(mRS, Element::I16(mRS), mAllocSize);
-    mShort2Allocation = Allocation::createTyped(mRS, typeI16_2Builder.create());
-    mShort3Allocation = Allocation::createSized(mRS, Element::I16_3(mRS), mAllocSize / 4);
-    mShort4Allocation = Allocation::createSized(mRS, Element::I16_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeI32_3Builder(mRS, Element::I32_3(mRS));
-    typeI32_3Builder.setX(3);
-    typeI32_3Builder.setY(2);
-
-    mIntAllocation = Allocation::createSized(mRS, Element::I32(mRS), mAllocSize);
-    mInt2Allocation = Allocation::createSized(mRS, Element::I32_2(mRS), mAllocSize / 2);
-    mInt3Allocation = Allocation::createTyped(mRS, typeI32_3Builder.create());
-    mInt4Allocation = Allocation::createSized(mRS, Element::I32_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeI64_4Builder(mRS, Element::I64_4(mRS));
-    typeI64_4Builder.setX(1);
-    typeI64_4Builder.setY(6);
-
-    mLongAllocation = Allocation::createSized(mRS, Element::I64(mRS), mAllocSize);
-    mLong2Allocation = Allocation::createSized(mRS, Element::I64_2(mRS), mAllocSize / 2);
-    mLong3Allocation = Allocation::createSized(mRS, Element::I64_3(mRS), mAllocSize / 4);
-    mLong4Allocation = Allocation::createTyped(mRS, typeI64_4Builder.create());
-
-    mBoolAllocation = Allocation::createSized(mRS, Element::BOOLEAN(mRS), mAllocSize);
-}
-
-void initSignedAllocations() {
-    char *buffer_char = new char[mAllocSize];
-    short *buffer_short = new short[mAllocSize];
-    int *buffer_int = new int[mAllocSize];
-    int64_t *buffer_long = new int64_t[mAllocSize];
-    char *buffer_bool = new char[mAllocSize];
-
-    for(int i = 0; i < mAllocSize; ++i) {
-        buffer_char[i] = (char) i;
-        buffer_short[i] = (short) i;
-        buffer_int[i] = (int) i;
-        buffer_long[i] = (int64_t) i;
-        buffer_bool[i] =  (char) (0x01 & i);
-    }
-
-    mCharAllocation->copy3DRangeFrom(0, 0, 0, 1, 3, 8, buffer_char);
-    mChar2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_char);
-    mChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-    mChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-
-    delete [] buffer_char;
-
-    mShortAllocation->copy1DRangeFrom(0, mAllocSize, buffer_short);
-    mShort2Allocation->copy3DRangeFrom(0, 0, 0, 6, 1, 2, buffer_short);
-    mShort3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
-    mShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
-
-    delete [] buffer_short;
-
-    mIntAllocation->copy1DRangeFrom(0, mAllocSize, buffer_int);
-    mInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_int);
-    mInt3Allocation->copy2DRangeFrom(0, 0, 3, 2, buffer_int);
-    mInt4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_int);
-
-    delete [] buffer_int;
-
-    mLongAllocation->copy1DRangeFrom(0, mAllocSize, buffer_long);
-    mLong2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_long);
-    mLong3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
-    mLong4Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_long);
-
-    delete [] buffer_long;
-
-    mBoolAllocation->copy1DRangeFrom(0, mAllocSize, buffer_bool);
-
-    delete [] buffer_bool;
-}
-
-void createUnsignedAllocations() {
-    Type::Builder typeU8_2Builder(mRS, Element::U8_2(mRS));
-    typeU8_2Builder.setX(2);
-    typeU8_2Builder.setY(6);
-
-    mUCharAllocation = Allocation::createSized(mRS, Element::U8(mRS), mAllocSize);
-    mUChar2Allocation = Allocation::createTyped(mRS, typeU8_2Builder.create());
-    mUChar3Allocation = Allocation::createSized(mRS, Element::U8_3(mRS), mAllocSize / 4);
-    mUChar4Allocation = Allocation::createSized(mRS, Element::U8_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeU16_3Builder(mRS, Element::U16_3(mRS));
-    typeU16_3Builder.setX(1);
-    typeU16_3Builder.setY(6);
-
-    mUShortAllocation = Allocation::createSized(mRS, Element::U16(mRS), mAllocSize);
-    mUShort2Allocation = Allocation::createSized(mRS, Element::U16_2(mRS), mAllocSize / 2);
-    mUShort3Allocation = Allocation::createTyped(mRS, typeU16_3Builder.create());
-    mUShort4Allocation = Allocation::createSized(mRS, Element::U16_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeU32_4Builder(mRS, Element::U32_4(mRS));
-    typeU32_4Builder.setX(1);
-    typeU32_4Builder.setY(1);
-    typeU32_4Builder.setZ(6);
-
-    mUIntAllocation = Allocation::createSized(mRS, Element::U32(mRS), mAllocSize);
-    mUInt2Allocation = Allocation::createSized(mRS, Element::U32_2(mRS), mAllocSize / 2);
-    mUInt3Allocation = Allocation::createSized(mRS, Element::U32_3(mRS), mAllocSize / 4);
-    mUInt4Allocation = Allocation::createTyped(mRS, typeU32_4Builder.create());
-
-    Type::Builder typeU64Builder(mRS, Element::U64(mRS));
-    typeU64Builder.setX(4);
-    typeU64Builder.setY(3);
-    typeU64Builder.setZ(2);
-
-    mULongAllocation = Allocation::createTyped(mRS, typeU64Builder.create());
-    mULong2Allocation = Allocation::createSized(mRS, Element::U64_2(mRS), mAllocSize / 2);
-    mULong3Allocation = Allocation::createSized(mRS, Element::U64_3(mRS), mAllocSize / 4);
-    mULong4Allocation = Allocation::createSized(mRS, Element::U64_4(mRS), mAllocSize / 4);
-}
-
-void initUnsignedAllocations() {
-    char *buffer_char = new char[mAllocSize];
-    short *buffer_short = new short[mAllocSize];
-    int *buffer_int = new int[mAllocSize];
-    uint64_t *buffer_long = new uint64_t[mAllocSize];
-
-    for(int i = 0; i < mAllocSize; ++i) {
-        buffer_char[i] = (char) i;
-        buffer_short[i] = (short) i;
-        buffer_int[i] = (int) i;
-        buffer_long[i] = (uint64_t) i;
-    }
-
-    mUCharAllocation->copy1DRangeFrom(0, mAllocSize, buffer_char);
-    mUChar2Allocation->copy2DRangeFrom(0, 0, 2, 6, buffer_char);
-    mUChar3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-    mUChar4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_char);
-
-    delete [] buffer_char;
-
-    mUShortAllocation->copy1DRangeFrom(0, mAllocSize, buffer_short);
-    mUShort2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_short);
-    mUShort3Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_short);
-    mUShort4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_short);
-
-    delete [] buffer_short;
-
-    mUIntAllocation->copy1DRangeFrom(0, mAllocSize, buffer_int);
-    mUInt2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_int);
-    mUInt3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_int);
-    mUInt4Allocation->copy3DRangeFrom(0, 0, 0, 1, 1, 6, buffer_int);
-
-    delete [] buffer_int;
-
-    mULongAllocation->copy3DRangeFrom(0, 0, 0, 4, 3, 2, buffer_long);
-    mULong2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_long);
-    mULong3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
-    mULong4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_long);
-
-    delete [] buffer_long;
-}
-
-void createFloatAllocations() {
-    Type::Builder typeF16_3Builder(mRS, Element::F16_3(mRS));
-    typeF16_3Builder.setX(1);
-    typeF16_3Builder.setY(6);
-
-    mHalfAllocation = Allocation::createSized(mRS, Element::F16(mRS), mAllocSize);
-    mHalf2Allocation = Allocation::createSized(mRS, Element::F16_2(mRS), mAllocSize / 2);
-    mHalf3Allocation = Allocation::createTyped(mRS, typeF16_3Builder.create());
-    mHalf4Allocation = Allocation::createSized(mRS, Element::F16_4(mRS), mAllocSize / 4);
-
-    Type::Builder typeF32_4Builder(mRS, Element::F32_4(mRS));
-    typeF32_4Builder.setX(3);
-    typeF32_4Builder.setY(2);
-
-    mFloatAllocation = Allocation::createSized(mRS, Element::F32(mRS), mAllocSize);
-    mFloat2Allocation = Allocation::createSized(mRS, Element::F32_2(mRS), mAllocSize / 2);
-    mFloat3Allocation = Allocation::createSized(mRS, Element::F32_3(mRS), mAllocSize / 4);
-    mFloat4Allocation = Allocation::createTyped(mRS, typeF32_4Builder.create());
-
-    Type::Builder typeF64_2Builder(mRS, Element::F64_2(mRS));
-    typeF64_2Builder.setX(4);
-    typeF64_2Builder.setY(1);
-    typeF64_2Builder.setZ(3);
-
-    mDoubleAllocation = Allocation::createSized(mRS, Element::F64(mRS), mAllocSize);
-    mDouble2Allocation = Allocation::createTyped(mRS, typeF64_2Builder.create());
-
-    Type::Builder typeF64_3Builder(mRS, Element::F64_3(mRS));
-    typeF64_3Builder.setX(1);
-    typeF64_3Builder.setY(2);
-    typeF64_3Builder.setZ(3);
-
-    Type::Builder typeF64_4Builder(mRS, Element::F64_4(mRS));
-    typeF64_4Builder.setX(1);
-    typeF64_4Builder.setY(2);
-    typeF64_4Builder.setZ(3);
-
-    mDouble3Allocation = Allocation::createTyped(mRS, typeF64_3Builder.create());
-    mDouble4Allocation = Allocation::createTyped(mRS, typeF64_4Builder.create());
-}
-
-void initFloatAllocations() {
-    __fp16 *buffer_half = new __fp16[mAllocSize];
-    float *buffer_float = new float[mAllocSize];
-    double *buffer_double = new double[mAllocSize];
-
-    for(int i = 0; i < mAllocSize; ++i) {
-        buffer_half[i] = (__fp16) 1 / i;
-        buffer_float[i] = (float) 1 / i;
-        buffer_double[i] = (double) 1 / i;
-    }
-
-    mHalfAllocation->copy1DRangeFrom(0, mAllocSize, buffer_half);
-    mHalf2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_half);
-    mHalf3Allocation->copy2DRangeFrom(0, 0, 1, 6, buffer_half);
-    mHalf4Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_half);
-
-    delete [] buffer_half;
-
-    mFloatAllocation->copy1DRangeFrom(0, mAllocSize, buffer_float);
-    mFloat2Allocation->copy1DRangeFrom(0, mAllocSize/2, buffer_float);
-    mFloat3Allocation->copy1DRangeFrom(0, mAllocSize/4, buffer_float);
-    mFloat4Allocation->copy2DRangeFrom(0, 0, 3, 2, buffer_float);
-
-    delete [] buffer_float;
-
-    mDoubleAllocation->copy1DRangeFrom(0, mAllocSize, buffer_double);
-    mDouble2Allocation->copy3DRangeFrom(0, 0, 0, 4, 1, 3, buffer_double);
-    mDouble3Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, buffer_double);
-    mDouble4Allocation->copy3DRangeFrom(0, 0, 0, 1, 2, 3, buffer_double);
-
-    delete [] buffer_double;
-}
-
-extern "C" void JNICALL
-Java_com_android_rs_jniallocations_MainActivity_nativeRS(
-	JNIEnv * env,
-	jclass,
-	jstring pathObj)
-{
-    mRS = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    mRS->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    sp<ScriptC_allocs> mScript = new ScriptC_allocs(mRS);
-
-    Type::Builder typeRGBA_888Builder(mRS, Element::RGBA_8888(mRS));
-    typeRGBA_888Builder.setX(mBitmapSize);
-    typeRGBA_888Builder.setY(mBitmapSize);
-
-    sp<Allocation> mInAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
-
-    const int image_area = mBitmapSize*mBitmapSize;
-    const int image_size = image_area*sizeof(int);
-
-    char *zero_buffer = new char[image_size];
-    memset(zero_buffer, 0, image_size);
-    mInAllocation->copy1DRangeFrom(0, image_area, zero_buffer);
-    delete [] zero_buffer;
-
-    sp<Allocation> mOutAllocation = Allocation::createTyped(mRS, typeRGBA_888Builder.create());
-    createSignedAllocations();
-    initSignedAllocations();
-
-    mRS->finish();
-    mScript->forEach_swizzle_kernel(mInAllocation, mOutAllocation);
-    mRS->finish();
-
-    mCharAllocation.clear();
-    mChar2Allocation.clear();
-    mChar3Allocation.clear();
-    mChar4Allocation.clear();
-
-    mShort2Allocation.clear();
-    mShort3Allocation.clear();
-    mShort4Allocation.clear();
-
-    mIntAllocation.clear();
-    mInt2Allocation.clear();
-    mInt3Allocation.clear();
-    mInt4Allocation.clear();
-
-    mLongAllocation.clear();
-    mLong2Allocation.clear();
-    mLong3Allocation.clear();
-    mLong4Allocation.clear();
-
-    mBoolAllocation.clear();
-
-    createUnsignedAllocations();
-    initUnsignedAllocations();
-
-    mInAllocation = mUShortAllocation; // Host side assignment
-
-    mRS->finish();
-    mScript->forEach_square_kernel(mInAllocation, mUIntAllocation);
-    mRS->finish();
-
-    mUCharAllocation.clear();
-    mUChar2Allocation.clear();
-    mUChar3Allocation.clear();
-    mUChar4Allocation.clear();
-
-    mUShortAllocation.clear();
-    mUShort2Allocation.clear();
-    mUShort3Allocation.clear();
-    mUShort4Allocation.clear();
-
-    mUInt2Allocation.clear();
-    mUInt3Allocation.clear();
-    mUInt4Allocation.clear();
-
-    mULongAllocation.clear();
-    mULong2Allocation.clear();
-    mULong3Allocation.clear();
-    mULong4Allocation.clear();
-
-    createFloatAllocations();
-    initFloatAllocations();
-
-    mRS->finish();
-    mScript->forEach_add_half_kernel(mDouble4Allocation, mDouble3Allocation);
-    mRS->finish();
-}
-
diff --git a/tests/lldb/jni/Allocations/res/layout/main_layout.xml b/tests/lldb/jni/Allocations/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/Allocations/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/Allocations/src/com/android/rs/jniallocations/MainActivity.java b/tests/lldb/jni/Allocations/src/com/android/rs/jniallocations/MainActivity.java
deleted file mode 100644
index f13682f..0000000
--- a/tests/lldb/jni/Allocations/src/com/android/rs/jniallocations/MainActivity.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jniallocations;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jniallocations");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
-
diff --git a/tests/lldb/jni/Android.mk b/tests/lldb/jni/Android.mk
deleted file mode 100644
index 5053e7d..0000000
--- a/tests/lldb/jni/Android.mk
+++ /dev/null
@@ -1 +0,0 @@
-include $(call all-subdir-makefiles)
diff --git a/tests/lldb/jni/BranchingFunCalls/Android.mk b/tests/lldb/jni/BranchingFunCalls/Android.mk
deleted file mode 100644
index a5ee3b4..0000000
--- a/tests/lldb/jni/BranchingFunCalls/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNIBranchingFunCalls
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjnibranchingfuncalls
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jnibranchingfuncalls/Android.mk
diff --git a/tests/lldb/jni/BranchingFunCalls/AndroidManifest.xml b/tests/lldb/jni/BranchingFunCalls/AndroidManifest.xml
deleted file mode 100644
index 3b616fa..0000000
--- a/tests/lldb/jni/BranchingFunCalls/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jnibranchingfuncalls">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNIBranchingFunCalls"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/Android.mk b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/Android.mk
deleted file mode 100644
index 69f9162..0000000
--- a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjnibranchingfuncalls
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jnibranchingfuncalls.cpp scalars.rscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -g
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/jnibranchingfuncalls.cpp b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/jnibranchingfuncalls.cpp
deleted file mode 100644
index 4e2c4cf..0000000
--- a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/jnibranchingfuncalls.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_scalars.h"
-
-extern "C" void JNICALL
-Java_com_android_rs_jnibranchingfuncalls_MainActivity_nativeRS(
-    JNIEnv * env,
-    jclass,
-    jstring pathObj)
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    auto e = Element::I32(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    int * input = new int[size*size];
-    for(int i = 0; i < size*size; ++i) {
-        input[i] = i - (size*size / 2);
-    }
-    a->copy2DRangeFrom(0, 0, size, size, input);
-    delete [] input;
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_scalars> s = new ScriptC_scalars(rs);
-    s->invoke_addToGlobal(234);
-    s->forEach_simple_kernel(a, b);
-    rs->finish();
-    int32_t * output = new int32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-}
-
diff --git a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/scalars.rscript b/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/scalars.rscript
deleted file mode 100644
index b98df28..0000000
--- a/tests/lldb/jni/BranchingFunCalls/jnibranchingfuncalls/scalars.rscript
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jnibranchingfuncalls)
-
-static bool is_neg(int a)
-{
-    if(a < 0)
-        return true;
-    else
-        return false;
-}
-
-static bool is_pos(int a)
-{
-    if(a > 0)
-        return true;
-    else
-        return false;
-}
-
-static void set_i(int * a, int b)
-{
-    int tmp = b;
-    *a = tmp;
-}
-
-static void modify_f(float * f)
-{
-    *f *= 0.5f;
-}
-
-static void modify_i(int * i)
-{
-    int j = *i;
-    int cutoff = 2 << 6;
-    if(j > cutoff)
-        j = cutoff;
-    if(is_neg(j))
-        set_i(i, 0);
-    else if(is_pos(j))
-        set_i(i, j);
-    else
-        set_i(i, cutoff);
-}
-
-int __attribute__((kernel)) simple_kernel(int in)
-{
-    int i = in;
-    float f = (float) i;
-    modify_f(&f);
-    modify_i(&i);
-    int ret = (int) f;
-    return in * ret;
-}
-
-int glob = 123;
-
-void addToGlobal(int arg)
-{
-    glob += arg;
-}
diff --git a/tests/lldb/jni/BranchingFunCalls/res/layout/main_layout.xml b/tests/lldb/jni/BranchingFunCalls/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/BranchingFunCalls/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/BranchingFunCalls/src/com/android/rs/jnibranchingfuncalls/MainActivity.java b/tests/lldb/jni/BranchingFunCalls/src/com/android/rs/jnibranchingfuncalls/MainActivity.java
deleted file mode 100644
index b0ac283..0000000
--- a/tests/lldb/jni/BranchingFunCalls/src/com/android/rs/jnibranchingfuncalls/MainActivity.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jnibranchingfuncalls;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jnibranchingfuncalls");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
-
diff --git a/tests/lldb/jni/DebugWaitAttach/Android.mk b/tests/lldb/jni/DebugWaitAttach/Android.mk
deleted file mode 100644
index 14cf20d..0000000
--- a/tests/lldb/jni/DebugWaitAttach/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNIDebugWaitAttach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjnidebugwaitattach
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jnidebugwaitattach/Android.mk
diff --git a/tests/lldb/jni/DebugWaitAttach/AndroidManifest.xml b/tests/lldb/jni/DebugWaitAttach/AndroidManifest.xml
deleted file mode 100644
index 12e544b..0000000
--- a/tests/lldb/jni/DebugWaitAttach/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jnidebugwaitattach">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNIDebugWaitAttach"
-                 android:hardwareAccelerated="true">
-
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/Android.mk b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/Android.mk
deleted file mode 100644
index af7d578..0000000
--- a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjnidebugwaitattach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jnidebugwaitattach.cpp simple.rscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -g
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/jnidebugwaitattach.cpp b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/jnidebugwaitattach.cpp
deleted file mode 100644
index f8151f4..0000000
--- a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/jnidebugwaitattach.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_simple.h"
-
-extern "C" void JNICALL
-Java_com_android_rs_jnidebugwaitattach_MainActivity_nativeRS(
-    JNIEnv * env,
-    jclass,
-    jstring pathObj)
-{
-    static const int size = 8;
-    sp<RS> rs = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_simple> s = new ScriptC_simple(rs);
-    s->forEach_simple_kernel(a, b);
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    s->forEach_other_kernel(a, b);
-
-    rs->finish();
-}
-
diff --git a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/simple.rscript b/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/simple.rscript
deleted file mode 100644
index a89c1f2..0000000
--- a/tests/lldb/jni/DebugWaitAttach/jnidebugwaitattach/simple.rscript
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jnidebugwaitattach)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
-
-// Extra kernel to test lldb setting breakpoints on all the RS kernels.
-uchar4 __attribute__((kernel)) other_kernel(uchar4 in)
-{
-    uchar4 result = in.wzyx;
-    return result;
-}
diff --git a/tests/lldb/jni/DebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/jni/DebugWaitAttach/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/DebugWaitAttach/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/DebugWaitAttach/src/com/android/rs/jniwaitattachdebug/MainActivity.java b/tests/lldb/jni/DebugWaitAttach/src/com/android/rs/jniwaitattachdebug/MainActivity.java
deleted file mode 100644
index b858cf7..0000000
--- a/tests/lldb/jni/DebugWaitAttach/src/com/android/rs/jniwaitattachdebug/MainActivity.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jnidebugwaitattach;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jnidebugwaitattach");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
-
diff --git a/tests/lldb/jni/InfiniteLoop/Android.mk b/tests/lldb/jni/InfiniteLoop/Android.mk
deleted file mode 100644
index 892e1e9..0000000
--- a/tests/lldb/jni/InfiniteLoop/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNIInfiniteLoop
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjniinfiniteloop
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jniinfiniteloop/Android.mk
diff --git a/tests/lldb/jni/InfiniteLoop/AndroidManifest.xml b/tests/lldb/jni/InfiniteLoop/AndroidManifest.xml
deleted file mode 100644
index 23031f0..0000000
--- a/tests/lldb/jni/InfiniteLoop/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jniinfiniteloop">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNIInfiniteLoop"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/Android.mk b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/Android.mk
deleted file mode 100644
index 2171722..0000000
--- a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjniinfiniteloop
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jniinfiniteloop.cpp infiniteloop.rscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -g
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/infiniteloop.rscript b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/infiniteloop.rscript
deleted file mode 100644
index 142b27b..0000000
--- a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/infiniteloop.rscript
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jniinfiniteloop)
-#pragma rs_fp_relaxed
-
-float4 gColour = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColour;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
diff --git a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/jniinfiniteloop.cpp b/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/jniinfiniteloop.cpp
deleted file mode 100644
index 73d1cbb..0000000
--- a/tests/lldb/jni/InfiniteLoop/jniinfiniteloop/jniinfiniteloop.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-#include <unistd.h>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_infiniteloop.h"
-
-extern "C" void JNICALL
-Java_com_android_rs_jniinfiniteloop_MainActivity_nativeRS(
-    JNIEnv * env,
-    jclass,
-    jstring pathObj)
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    rs->init(path, RS_INIT_LOW_LATENCY);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    sp<ScriptC_infiniteloop> s = new ScriptC_infiniteloop(rs);
-
-    // Test is designed to loop forever, waits for two seconds
-    // between each invocation of the kernel
-    bool forever = true;
-    while(forever)
-    {
-        s->forEach_simple_kernel(a, b);
-        sleep(2);
-    }
-
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-}
-
diff --git a/tests/lldb/jni/InfiniteLoop/res/layout/main_layout.xml b/tests/lldb/jni/InfiniteLoop/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/InfiniteLoop/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/InfiniteLoop/src/com/android/rs/jniinfiniteloop/MainActivity.java b/tests/lldb/jni/InfiniteLoop/src/com/android/rs/jniinfiniteloop/MainActivity.java
deleted file mode 100644
index a18c420..0000000
--- a/tests/lldb/jni/InfiniteLoop/src/com/android/rs/jniinfiniteloop/MainActivity.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jniinfiniteloop;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jniinfiniteloop");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
-
diff --git a/tests/lldb/jni/KernelVariables/Android.mk b/tests/lldb/jni/KernelVariables/Android.mk
deleted file mode 100644
index 12017d5..0000000
--- a/tests/lldb/jni/KernelVariables/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNIKernelVariables
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjnikernelvariables
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jnikernelvariables/Android.mk
diff --git a/tests/lldb/jni/KernelVariables/AndroidManifest.xml b/tests/lldb/jni/KernelVariables/AndroidManifest.xml
deleted file mode 100644
index b719892..0000000
--- a/tests/lldb/jni/KernelVariables/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jnikernelvariables">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNIKernelVariables"
-                 android:hardwareAccelerated="true">
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
-
diff --git a/tests/lldb/jni/KernelVariables/jnikernelvariables/Android.mk b/tests/lldb/jni/KernelVariables/jnikernelvariables/Android.mk
deleted file mode 100644
index 10afc4b..0000000
--- a/tests/lldb/jni/KernelVariables/jnikernelvariables/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjnikernelvariables
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jnikernelvariables.cpp simple.rscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -g
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/KernelVariables/jnikernelvariables/jnikernelvariables.cpp b/tests/lldb/jni/KernelVariables/jnikernelvariables/jnikernelvariables.cpp
deleted file mode 100644
index 94917bf..0000000
--- a/tests/lldb/jni/KernelVariables/jnikernelvariables/jnikernelvariables.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_simple.h"
-
-extern "C" void JNICALL
-Java_com_android_rs_jnikernelvariables_MainActivity_nativeRS(
-    JNIEnv * env,
-    jclass,
-    jstring pathObj)
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    sp<ScriptC_simple> s = new ScriptC_simple(rs);
-
-    static const int buffer_int[] = {1, 2, 3, 4};
-    sp<Allocation> int_allocation = Allocation::createSized(rs, Element::I32(rs), 4);
-    int_allocation->copy1DRangeFrom(0, 4, buffer_int);
-    s->set_allocation_1D_global(int_allocation);
-
-    static const int buffer_int2[] = {5, 6, 7, 8};
-
-    Type::Builder typeI32Builder2D(rs, Element::I32(rs));
-    typeI32Builder2D.setX(2);
-    typeI32Builder2D.setY(2);
-
-    sp<Allocation> int_allocation2 = Allocation::createTyped(rs, typeI32Builder2D.create());
-    int_allocation2->copy2DRangeFrom(0, 0, 2, 2, buffer_int2);
-    s->set_allocation_1D_global2(int_allocation2);
-
-    s->set_allocation_2D_global(a);
-    s->set_allocation_2D_global2(b);
-
-    static const int buffer_int3[] = {9, 10, 11, 12, 13, 14, 15, 16};
-
-    Type::Builder typeI32Builder3D(rs, Element::I32(rs));
-    typeI32Builder3D.setX(2);
-    typeI32Builder3D.setY(2);
-    typeI32Builder3D.setZ(2);
-
-    sp<Allocation> int_allocation3 = Allocation::createTyped(rs, typeI32Builder3D.create());
-    int_allocation3->copy3DRangeFrom(0, 0, 0, 2, 2, 2, buffer_int3);
-    s->set_allocation_3D_global(int_allocation3);
-
-    Type::Builder yuvTypeBuilder(rs, Element::YUV(rs));
-    yuvTypeBuilder.setX(4);
-    yuvTypeBuilder.setY(4);
-    yuvTypeBuilder.setYuvFormat(RS_YUV_YV12);
-
-    sp<Allocation> yuv_allocation = Allocation::createTyped(rs, yuvTypeBuilder.create());
-    s->set_allocation_YUV_2D_global(yuv_allocation);
-
-    s->set_sampler_global(Sampler::CLAMP_LINEAR(rs));
-
-    // Script is executed once, then the data is copied back when finished
-    s->forEach_kernel(a, b);
-    rs->finish();
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-}
-
diff --git a/tests/lldb/jni/KernelVariables/jnikernelvariables/simple.rscript b/tests/lldb/jni/KernelVariables/jnikernelvariables/simple.rscript
deleted file mode 100644
index 30feb00..0000000
--- a/tests/lldb/jni/KernelVariables/jnikernelvariables/simple.rscript
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jnikernelvariables)
-
-char char_global = 12;
-uchar uchar_global = 234;
-short short_global = -321;
-ushort ushort_global = 432;
-int int_global = 1234;
-uint uint_global = 2345;
-float float_global = 4.5f;
-long long_global = -77777;
-ulong ulong_global = 8888;
-double double_global = -456.5f;
-
-char2 char2_global = {11, -22};
-uchar2 uchar2_global = {33, 44};
-short2 short2_global = {-555, 666};
-ushort2 ushort2_global = {777, 888};
-int2 int2_global = {999, -1111};
-uint2 uint2_global = {2222, 3333};
-float2 float2_global = {4.5f, -5.0f};
-long2 long2_global = {-4444, 5555};
-ulong2 ulong2_global = {6666, 7777};
-double2 double2_global = {88.5f, -99.0f};
-
-char3 char3_global = {11, -22, -33};
-uchar3 uchar3_global = {33, 44, 55};
-short3 short3_global = {-555, 666, 777};
-ushort3 ushort3_global = {777, 888, 999};
-int3 int3_global = {999, -1111, 2222};
-uint3 uint3_global = {2222, 3333, 4444};
-float3 float3_global = {4.5f, -5.0f, -6.5f};
-long3 long3_global = {-4444, 5555, 6666};
-ulong3 ulong3_global = {6666, 7777, 8888};
-double3 double3_global = {88.5f, -99.0f, 111.5f};
-
-char4 char4_global = {55, 11, -22, -33};
-uchar4 uchar4_global = {222, 33, 44, 55};
-short4 short4_global = {-444, -555, 666, 777};
-ushort4 ushort4_global = {666, 777, 888, 999};
-int4 int4_global = {888, 999, -1111, 2222};
-uint4 uint4_global = {1111, 2222, 3333, 4444};
-float4 float4_global = {3.0f, 4.5f, -5.0f, -6.5f};
-long4 long4_global = {-3333, -4444, 5555, 6666};
-ulong4 ulong4_global = {5555, 6666, 7777, 8888};
-double4 double4_global = {-77.0f, 88.5f, -99.0f, 111.5f};
-
-rs_matrix2x2 matrix2x2_global;
-rs_matrix3x3 matrix3x3_global;
-rs_matrix4x4 matrix4x4_global;
-
-rs_quaternion quaternion_global;
-
-rs_allocation allocation_1D_global;
-rs_allocation allocation_1D_global2;
-rs_allocation allocation_2D_global;
-rs_allocation allocation_2D_global2;
-rs_allocation allocation_3D_global;
-rs_allocation allocation_YUV_2D_global;
-
-rs_allocation_cubemap_face cubemap_face_global;
-rs_sampler sampler_global;
-
-uchar4 __attribute__((kernel)) kernel(uchar4 in)
-{
-    char char_local = 'a';
-    uchar uchar_local = 'b';
-    short short_local = -321;
-    ushort ushort_local = 432;
-    int int_local = 1234;
-    uint uint_local = 2345;
-    float float_local = 4.5f;
-    long long_local = -77777;
-    ulong ulong_local = 8888;
-    double double_local = -456.5f;
-
-    char2 char2_local = {-11, -22};
-    uchar2 uchar2_local = {33, 44};
-    short2 short2_local = {-555, 666};
-    ushort2 ushort2_local = {777, 888};
-    int2 int2_local = {999, -1111};
-    uint2 uint2_local = {2222, 3333};
-    float2 float2_local = {4.5f, -5.0f};
-    long2 long2_local = {-4444, 5555};
-    ulong2 ulong2_local = {6666, 7777};
-    double2 double2_local = {88.5f, -99.0f};
-
-    char3 char3_local = {11, -22, -33};
-    uchar3 uchar3_local = {33, 44, 55};
-    short3 short3_local = {-555, 666, 777};
-    ushort3 ushort3_local = {777, 888, 999};
-    int3 int3_local = {999, -1111, 2222};
-    uint3 uint3_local = {2222, 3333, 4444};
-    float3 float3_local = {4.5f, -5.0f, -6.5f};
-    long3 long3_local = {-4444, 5555, 6666};
-    ulong3 ulong3_local = {6666, 7777, 8888};
-    double3 double3_local = {88.5f, -99.0f, 111.5f};
-
-    char4 char4_local = {55, 11, -22, -33};
-    uchar4 uchar4_local = {22, 33, 44, 55};
-    short4 short4_local = {-444, -555, 666, 777};
-    ushort4 ushort4_local = {666, 777, 888, 999};
-    int4 int4_local = {888, 999, -1111, 2222};
-    uint4 uint4_local = {1111, 2222, 3333, 4444};
-    float4 float4_local = {3.0f, 4.5f, -5.0f, -6.5f};
-    long4 long4_local = {-3333, -4444, 5555, 6666};
-    ulong4 ulong4_local = {5555, 6666, 7777, 8888};
-    double4 double4_local = {-77.0f, 88.5f, -99.0f, 111.5f};
-
-    rs_matrix2x2 matrix2x2_local = {{1., 2.5,
-                                     3., 4.5}};
-    rs_matrix3x3 matrix3x3_local = {{5., 6.5, 7.,
-                                     8.5, 9., 1.5,
-                                     2., 3.5, 4.}};
-    rs_matrix4x4 matrix4x4_local = {{5.5, 6., 7.5, 8.,
-                                     9., 1.5, 2., 3.5,
-                                     4.5, 5.5, 6.5, 7.,
-                                     8., 9.5, 1.5, 2.5}};
-
-    matrix2x2_global = matrix2x2_local;
-    matrix3x3_global = matrix3x3_local;
-    matrix4x4_global = matrix4x4_local;
-
-    rsQuaternionSet(&quaternion_global, 3.0, 4.5, 5.5, 6.0);
-
-    rs_quaternion quaternion_local;
-    rsQuaternionSet(&quaternion_local, 7.5, 8.0, 9.0, 0.5);
-
-    char char_combined = char_local + (char)uchar_local + char2_local.x +
-        (char)uchar2_local.x + char3_local.x - (char)uchar3_local.x +
-        char4_local.x + (char)uchar4_local.x;
-
-    short short_combined = short_local + (short)ushort_local + short2_local.x +
-        (short)ushort2_local.x + short3_local.x + (short)ushort3_local.x +
-        short4_local.x + (short)ushort4_local.x;
-
-    int int_combined = int_local + (int)uint_local + int2_local.x +
-        (int)uint2_local.x + int3_local.x + (int)uint3_local.x + int4_local.x +
-        (int)uint4_local.x;
-
-    float float_combined = float_local + float2_local.x + float3_local.x +
-        float4_local.x;
-
-    long long_combined = long_local + (long)ulong_local + long2_local.x +
-        (long)ulong2_local.x + long3_local.x + (long)ulong3_local.x +
-        long4_local.x + (long)ulong4_local.x;
-
-    double double_combined = double_local + double2_local.x + double3_local.x +
-        double4_local.x;
-
-    char_global = char_combined;
-    short_global = short_combined;
-    int_global = int_combined;
-    float_global = float_combined;
-    long_global = long_combined;
-    double_global = double_combined;
-
-    uchar4 result = {1,2,3,4};
-    return result;
-}
-
-float use_constants_global;
-
-void setup(void)
-{
-  use_constants_global =
-      M_1_PI +
-      M_2_PI +
-      M_2_PIl +
-      M_2_SQRTPI +
-      M_E +
-      M_LN10 +
-      M_LN2 +
-      M_LOG10E +
-      M_LOG2E +
-      M_PI +
-      M_PI_2 +
-      M_PI_4 +
-      M_SQRT1_2 +
-      M_SQRT2;
-}
diff --git a/tests/lldb/jni/KernelVariables/res/layout/main_layout.xml b/tests/lldb/jni/KernelVariables/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/KernelVariables/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/KernelVariables/src/com/android/rs/jnikernelvariables/MainActivity.java b/tests/lldb/jni/KernelVariables/src/com/android/rs/jnikernelvariables/MainActivity.java
deleted file mode 100644
index 11e41f2..0000000
--- a/tests/lldb/jni/KernelVariables/src/com/android/rs/jnikernelvariables/MainActivity.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jnikernelvariables;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jnikernelvariables");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
diff --git a/tests/lldb/jni/MultipleRSFiles/Android.mk b/tests/lldb/jni/MultipleRSFiles/Android.mk
deleted file mode 100644
index ff24154..0000000
--- a/tests/lldb/jni/MultipleRSFiles/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNIMultipleRSFiles
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjnimultiplersfiles
-
-LOCAL_RENDERSCRIPT_FLAGS := -g -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jnimultiplersfiles/Android.mk
diff --git a/tests/lldb/jni/MultipleRSFiles/AndroidManifest.xml b/tests/lldb/jni/MultipleRSFiles/AndroidManifest.xml
deleted file mode 100644
index 19bb65c..0000000
--- a/tests/lldb/jni/MultipleRSFiles/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jnimultiplersfiles">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNIMultipleRSFiles"
-                 android:hardwareAccelerated="true">
-
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/Android.mk b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/Android.mk
deleted file mode 100644
index b3c335f..0000000
--- a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/Android.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjnimultiplersfiles
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jnimultiplersfiles.cpp first.rscript second.rscript
-
-LOCAL_RENDERSCRIPT_FLAGS := -g
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/first.rscript b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/first.rscript
deleted file mode 100644
index 7c4a852..0000000
--- a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/first.rscript
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jnimultiplersfiles)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) first_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/jnimultiplersfiles.cpp b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/jnimultiplersfiles.cpp
deleted file mode 100644
index 6d28a4b..0000000
--- a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/jnimultiplersfiles.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_first.h"
-#include "ScriptC_second.h"
-
-extern "C" void JNICALL
-Java_com_android_rs_jnimultiplersfiles_MainActivity_nativeRS(
-    JNIEnv * env,
-    jclass,
-    jstring pathObj)
-{
-    static const int size = 64;
-    sp<RS> rs = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_first> s1 = new ScriptC_first(rs);
-    sp<ScriptC_second> s2 = new ScriptC_second(rs);
-
-    s1->forEach_first_kernel(a, b);
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    s2->forEach_second_kernel(a, b);
-
-    rs->finish();
-}
-
diff --git a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/second.rscript b/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/second.rscript
deleted file mode 100644
index ab898f9..0000000
--- a/tests/lldb/jni/MultipleRSFiles/jnimultiplersfiles/second.rscript
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jnimultiplersfiles)
-
-/* RenderScript kernel that just returns the swizzled input. */
-uchar4 __attribute__((kernel)) second_kernel(uchar4 in)
-{
-    uchar4 result = in.wzyx;
-    return result;
-}
diff --git a/tests/lldb/jni/MultipleRSFiles/res/layout/main_layout.xml b/tests/lldb/jni/MultipleRSFiles/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/MultipleRSFiles/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/MultipleRSFiles/src/com/android/rs/jnimultiplersfiles/MainActivity.java b/tests/lldb/jni/MultipleRSFiles/src/com/android/rs/jnimultiplersfiles/MainActivity.java
deleted file mode 100644
index ea743a6..0000000
--- a/tests/lldb/jni/MultipleRSFiles/src/com/android/rs/jnimultiplersfiles/MainActivity.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jnimultiplersfiles;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jnimultiplersfiles");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
-
diff --git a/tests/lldb/jni/NoDebugWaitAttach/Android.mk b/tests/lldb/jni/NoDebugWaitAttach/Android.mk
deleted file mode 100644
index 887b199..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/Android.mk
+++ /dev/null
@@ -1,19 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_SRC_FILES := $(call all-java-files-under, src) \
-                   $(call all-renderscript-files-under, src)
-
-LOCAL_PACKAGE_NAME := JNINoDebugWaitAttach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-LOCAL_SDK_VERSION := current
-
-LOCAL_JNI_SHARED_LIBRARIES := libjninodebugwaitattach
-
-LOCAL_RENDERSCRIPT_FLAGS := -O0 -target-api 0
-
-include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/jninodebugwaitattach/Android.mk
diff --git a/tests/lldb/jni/NoDebugWaitAttach/AndroidManifest.xml b/tests/lldb/jni/NoDebugWaitAttach/AndroidManifest.xml
deleted file mode 100644
index 846eddd..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/AndroidManifest.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.jninodebugwaitattach">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="JNINoDebugWaitAttach"
-                 android:hardwareAccelerated="true">
-
-        <activity android:name="MainActivity">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-</manifest>
diff --git a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/Android.mk b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/Android.mk
deleted file mode 100644
index 19b0807..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/Android.mk
+++ /dev/null
@@ -1,11 +0,0 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_MODULE := libjninodebugwaitattach
-LOCAL_LICENSE_KINDS := SPDX-license-identifier-Apache-2.0
-LOCAL_LICENSE_CONDITIONS := notice
-
-LOCAL_SRC_FILES := jninodebugwaitattach.cpp simple.rscript
-
-include frameworks/rs/tests/lldb/jni/common.mk
-include $(BUILD_SHARED_LIBRARY)
diff --git a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/jninodebugwaitattach.cpp b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/jninodebugwaitattach.cpp
deleted file mode 100644
index 72ec36e..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/jninodebugwaitattach.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-
-#include <jni.h>
-#include <RenderScript.h>
-
-#include "ScriptC_simple.h"
-
-extern "C" void JNICALL
-Java_com_android_rs_jninodebugwaitattach_MainActivity_nativeRS(
-    JNIEnv * env,
-    jclass,
-    jstring pathObj)
-{
-    static const int size = 8;
-    sp<RS> rs = new RS();
-
-    const char * path = env->GetStringUTFChars(pathObj, nullptr);
-    rs->init(path, RS_INIT_LOW_LATENCY | RS_INIT_WAIT_FOR_ATTACH);
-    env->ReleaseStringUTFChars(pathObj, path);
-
-    auto e = Element::RGBA_8888(rs);
-    Type::Builder tb(rs, e);
-    tb.setX(size);
-    tb.setY(size);
-    auto t = tb.create();
-
-    auto a = Allocation::createTyped(rs, t);
-    auto b = Allocation::createTyped(rs, t);
-
-    // Script is executed once, then the data is copied back when finished
-    sp<ScriptC_simple> s = new ScriptC_simple(rs);
-    s->forEach_simple_kernel(a, b);
-    uint32_t * output = new uint32_t[size*size];
-    b->copy2DRangeTo(0, 0, size, size, output);
-    delete [] output;
-
-    rs->finish();
-}
-
diff --git a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/simple.rscript b/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/simple.rscript
deleted file mode 100644
index c55e0b5..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/jninodebugwaitattach/simple.rscript
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma version(1)
-#pragma rs java_package_name(com.android.rs.jninodebugwaitattach)
-
-float4 gColor = {0.299f, 0.587f, 0.114f, 1.f};
-
-/* RenderScript kernel that just sets the colour of the screen and does some
- * simple operations so it is not completely empty
- * (and can therefore be debugged).
- */
-uchar4 __attribute__((kernel)) simple_kernel(uchar4 in)
-{
-    float4 temp = rsUnpackColor8888(in);
-    temp = gColor;
-    uchar4 result = rsPackColorTo8888(temp);
-    return result;
-}
\ No newline at end of file
diff --git a/tests/lldb/jni/NoDebugWaitAttach/res/layout/main_layout.xml b/tests/lldb/jni/NoDebugWaitAttach/res/layout/main_layout.xml
deleted file mode 100644
index 131c3b5..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/res/layout/main_layout.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#0099cc"
-    tools:context=".MainActivity">
-
-    <ImageView
-        android:id="@+id/imageView"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:scaleType="fitCenter" />
-
-</FrameLayout>
-
diff --git a/tests/lldb/jni/NoDebugWaitAttach/src/com/android/rs/jninodebugwaitattach/MainActivity.java b/tests/lldb/jni/NoDebugWaitAttach/src/com/android/rs/jninodebugwaitattach/MainActivity.java
deleted file mode 100644
index 937e4d0..0000000
--- a/tests/lldb/jni/NoDebugWaitAttach/src/com/android/rs/jninodebugwaitattach/MainActivity.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-* Copyright (C) 2016 The Android Open Source Project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.android.rs.jninodebugwaitattach;
-
-import android.app.Activity;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.widget.ImageView;
-
-public class MainActivity extends Activity {
-    private Bitmap mBitmapIn;
-    private Bitmap mBitmapOut;
-
-    static {
-        System.loadLibrary("RS");
-        System.loadLibrary("jninodebugwaitattach");
-    }
-
-    native void nativeRS(String cacheDir);
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.main_layout);
-        nativeRS(this.getCacheDir().toString());
-    }
-}
-
diff --git a/tests/lldb/jni/common.mk b/tests/lldb/jni/common.mk
deleted file mode 100644
index 7441307..0000000
--- a/tests/lldb/jni/common.mk
+++ /dev/null
@@ -1,13 +0,0 @@
-LOCAL_MODULE_TAGS := tests
-
-LOCAL_CPP_FEATURES += exceptions
-
-LOCAL_CFLAGS := -Werror -Wall -Wextra -std=c++11
-LOCAL_RENDERSCRIPT_FLAGS += -O0 -target-api 0
-
-LOCAL_HEADER_LIBRARIES := jni_headers
-LOCAL_SHARED_LIBRARIES += libdl liblog
-LOCAL_STATIC_LIBRARIES += libRScpp_static
-
-LOCAL_SDK_VERSION := 23
-LOCAL_NDK_STL_VARIANT := c++_static
diff --git a/tests/lldb/run_tests.py b/tests/lldb/run_tests.py
deleted file mode 100755
index d1700bb..0000000
--- a/tests/lldb/run_tests.py
+++ /dev/null
@@ -1,839 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Main test suite execution script.'''
-import argparse
-import inspect
-import logging
-import os
-import signal
-import subprocess
-import sys
-import time
-import collections
-import xml.etree.ElementTree as ET
-
-from config import Config
-from tests.harness import util_constants
-from tests.harness.exception import TestSuiteException, FailFastException
-from tests.harness import UtilAndroid
-from tests.harness import UtilBundle
-from tests.harness import util_log
-from tests.harness.util_functions import load_py_module
-from tests.harness.decorators import deprecated
-
-# For some reason pylint is not able to understand the class returned by
-# from util_log.get_logger() and generates a lot of false warnings
-#pylint: disable=maybe-no-member
-
-EMU_PROC = None
-
-def _parse_args():
-    '''Parse the command line arguments.
-
-    Returns:
-        A namespace object that contains the options specified to run_tests on
-        the command line.
-    '''
-
-    parser = argparse.ArgumentParser(description='Run the test suite.')
-
-    parser.add_argument('--config', '-c',
-                        metavar='path',
-                        help='Path to a custom config file.')
-    parser.add_argument('--device', '-d',
-                        help='Specify the device id of the device to test on.')
-    parser.add_argument('--test', '-t',
-                        metavar='path',
-                        help='Specify a specific test to run.')
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('--wimpy', '-w',
-                        action='store_true',
-                        default=None,
-                        help='Test only a core subset of features.')
-    group.add_argument('--app-types',
-                        default=['java', 'cpp', 'jni'],
-                        nargs='*',
-                        help='Specify a list of Android app types against which'
-                             ' to run the tests',
-                        dest='bundle_types')
-    parser.add_argument('--install-only',
-                        action='store_true',
-                        default=False,
-                        help='It only runs the pre-run stage of the test suite.'
-                             ' It installs the required APKs but does not '
-                             'execute the tests.',
-                        dest='install_only')
-    parser.add_argument('--no-install', '-n',
-                        action='store_true',
-                        default=False,
-                        help='Stop the test suite installing apks to device.',
-                        dest='noinstall')
-    parser.add_argument('--no-uninstall',
-                        action='store_true',
-                        default=False,
-                        help='Stop the test suite uninstalling apks after '
-                             'completion.',
-                        dest='nouninstall')
-    parser.add_argument('--print-to-stdout',
-                        action='store_true',
-                        default=False,
-                        help='Print all logging information to standard out.',
-                        dest='print_to_stdout')
-    parser.add_argument('--verbose', '-v',
-                        action='store_true',
-                        default=None,
-                        help='Store extra info in the log.')
-    parser.add_argument('--fail-fast',
-                        action='store_true',
-                        default=False,
-                        help='Exit the test suite immediately on the first failure.')
-    parser.add_argument('--run-emu',
-                        action='store_true',
-                        default=None,
-                        help='Spawn an emulator and run the test suite on that.'
-                             ' Specify the emulator command line in the config'
-                             ' file or with -emu-cmd.',
-                        dest='run_emu')
-
-    # Get the properties of the Config class and add a command line argument
-    # for each.
-    this_module = sys.modules[__name__]
-    for member_name, member_obj in inspect.getmembers(Config):
-        if (inspect.isdatadescriptor(member_obj) and
-            member_name not in ['__weakref__', 'device', 'verbose']):
-
-            # List type properties can take one or more arguments
-            num_args = None
-            if (isinstance(member_obj, property)
-                and isinstance(member_obj.fget(Config), list)):
-                num_args = '+'
-
-            opt_name = member_name.replace('_', '-')
-
-            setattr(this_module, opt_name, '')
-
-            parser.add_argument('--' + opt_name,
-                                nargs=num_args,
-                                help=member_obj.__doc__,
-                                dest=member_name)
-
-    return parser.parse_args()
-
-
-def _choice(first_choice, second_choice):
-    '''Return first_choice if it is not None otherwise return second_choice.
-
-    Args:
-        first_choice: The first choice value.
-        second_choice: The alternative value.
-
-    Returns:
-        The first argument if it is not None, and the second otherwise.
-    '''
-    return first_choice if first_choice else second_choice
-
-
-class State(object):
-    '''This class manages all objects required by the test suite.'''
-
-    # pylint: disable=too-many-instance-attributes
-    # Since this is a state class many attributes are expected.
-
-    def __init__(self):
-        '''State constructor.
-
-        Raises:
-            TestSuiteException: When unable to load config file.
-
-            AssertionError: When assertions fail.
-        '''
-
-        # Parse the command line options
-        args = _parse_args()
-
-        # create a config instance
-        if args.config:
-            # use the user supplied
-            config = State.load_user_configuration(args.config)
-        else:
-            # use the default configuration
-            config = Config()
-
-        # save the test denylist
-        self.blocklist = _choice(args.blocklist, config.blocklist)
-
-        # Allow any of the command line arguments to override the
-        # values in the config file.
-        self.adb_path = _choice(args.adb_path, config.adb_path)
-
-        self.host_port = int(_choice(args.host_port, config.host_port))
-
-        self.device = _choice(args.device, config.device)
-
-        self.user_specified_device = self.device
-
-        self.device_port = int(_choice(args.device_port, config.device_port))
-
-        self.lldb_server_path_device = _choice(args.lldb_server_path_device,
-                                               config.lldb_server_path_device)
-
-        self.lldb_server_path_host = _choice(args.lldb_server_path_host,
-                                             config.lldb_server_path_host)
-
-        self.aosp_product_path = _choice(args.aosp_product_path,
-                                         config.aosp_product_path)
-
-        self.log_file_path = _choice(args.log_file_path, config.log_file_path)
-
-        self.results_file_path = _choice(args.results_file_path,
-                                         config.results_file_path)
-
-        self.lldb_path = _choice(args.lldb_path, config.lldb_path)
-        self.print_to_stdout = args.print_to_stdout
-        self.verbose = _choice(args.verbose, config.verbose)
-        self.timeout = int(_choice(args.timeout, config.timeout))
-        self.emu_cmd = _choice(args.emu_cmd, config.emu_cmd)
-        self.run_emu = args.run_emu
-        self.wimpy = args.wimpy
-        self.bundle_types = args.bundle_types if not self.wimpy else ['java']
-        self.fail_fast = args.fail_fast
-
-        # validate the param "verbose"
-        if not isinstance(self.verbose, bool):
-            raise TestSuiteException('The parameter "verbose" should be a '
-                                     'boolean: {0}'.format(self.verbose))
-
-        # create result array
-        self.results = dict()
-        self.single_test = args.test
-
-        # initialise the logging facility
-        log_level = logging.INFO if not self.verbose else logging.DEBUG
-        util_log.initialise("driver",
-                            print_to_stdout=self.print_to_stdout,
-                            level=log_level,
-                            file_mode='w', # open for write
-                            file_path=self.log_file_path
-                            )
-        log = util_log.get_logger()
-
-        if self.run_emu and not self.emu_cmd:
-            log.TestSuiteException('Need to specify --emu-cmd (or specify a'
-                ' value in the config file) if using --run-emu.')
-
-        # create a results file
-        self.results_file = open(self.results_file_path, 'w')
-
-        # create an android helper object
-        self.android = UtilAndroid(self.adb_path,
-                                   self.lldb_server_path_device,
-                                   self.device)
-        assert self.android
-
-        # create a test bundle
-        self.bundle = UtilBundle(self.android,
-                                 self.aosp_product_path)
-        assert self.bundle
-
-        # save the no pushing option
-        assert isinstance(args.noinstall, bool)
-        self.noinstall = args.noinstall
-
-        assert isinstance(args.nouninstall, bool)
-        self.nouninstall = args.nouninstall
-
-        # install only option
-        assert type(args.install_only) is bool
-        self.install_only = args.install_only
-        if self.install_only:
-            log.log_and_print('Option --install-only set. The test APKs will '
-                              'be installed on the device but the tests will '
-                              'not be executed.')
-            if self.noinstall:
-                raise TestSuiteException('Conflicting options given: '
-                                         '--install-only and --no-install')
-
-        # TCP port modifier which is used to increment the port number used for
-        # each test case to avoid collisions.
-        self.port_mod = 0
-
-        # total number of test files that have been executed
-        self.test_count = 0
-
-    def get_android(self):
-        '''Return the android ADB helper instance.
-
-        Returns:
-            The android ADB helper, instance of UtilAndroid.
-        '''
-        assert self.android
-        return self.android
-
-    def get_bundle(self):
-        '''Return the test executable bundle.
-
-        Returns:
-            The test exectable collection, instance of UtilBundle.
-        '''
-        return self.bundle
-
-    def add_result(self, name, app_type, result):
-        '''Add a test result to the collection.
-
-        Args:
-            name: String name of the test that has executed.
-            app_type: type of app i.e. java, jni, or cpp
-            result: String result of the test, "pass", "fail", "error".
-        '''
-        key = (name, app_type)
-        assert key not in self.results
-        self.results[key] = result
-
-    def get_single_test(self):
-        '''Get the name of the single test to run.
-
-        Returns:
-            A string that is the name of the python file containing the test to
-            be run. If all tests are to be run this returns None.
-        '''
-        return self.single_test
-
-    @staticmethod
-    def load_user_configuration(path):
-        '''Load the test suite config from the give path.
-
-        Instantiate the Config class found in the module at the given path.
-        If no suitable class is available, it raises a TestSuiteException.
-
-        Args:
-            path: String location of the module.
-
-        Returns:
-            an instance of the Config class, defined in the module.
-
-        Raises:
-            TestSuiteException: when unable to import the module or when a
-                                subclass of Config is not found inside it.
-        '''
-
-        # load the module
-        config_module = load_py_module(path)
-        if not config_module:
-            raise TestSuiteException('Unable to import the module from "%s"'
-                                     % (path))
-
-        # look for a subclass of Config
-        for name, value in inspect.getmembers(config_module):
-            if (inspect.isclass(value)
-                and name != 'Config'
-                and issubclass(value, Config)):
-                # that's our candidate
-                return value()
-
-        # otherwise there are no valid candidates
-        raise TestSuiteException('The provided user configuration is not '
-                                 'valid. The module must define a subclass '
-                                 'of Config')
-
-
-def _kill_emulator():
-    ''' Kill the emulator process. '''
-    global EMU_PROC
-    if EMU_PROC:
-        try:
-            EMU_PROC.terminate()
-        except OSError:
-            # can't kill a dead proc
-            log = util_log.get_logger()
-            log.debug('Trying to kill an emulator but it is already dead.')
-
-
-def _check_emulator_terminated():
-    ''' Throw an exception if the emulator process has ended.
-
-    Raises:
-        TestSuiteException: If the emulator process has ended.
-    '''
-    global EMU_PROC
-    assert EMU_PROC
-    if EMU_PROC.poll():
-        stdout, stderr = EMU_PROC.communicate()
-        raise TestSuiteException('The emulator terminated with output:'
-            '\nstderr: {0}\nstdout: {1}.'.format(stderr, stdout))
-
-
-@deprecated()
-def _launch_emulator(state):
-    '''Launch the emulator and wait for it to boot.
-
-    Args:
-        emu_cmd: The command line to run the emulator.
-
-    Raises:
-        TestSuiteException: If an emulator already exists or the emulator
-                            process terminated before we could connect to it, or
-                            we failed to copy lldb-server to the emulator.
-    '''
-    global EMU_PROC
-    android = state.android
-    if state.user_specified_device:
-        if android.device_with_substring_exists(state.user_specified_device):
-            raise TestSuiteException(
-                'A device with name {0} already exists.',
-                state.user_specified_device)
-    else:
-        if android.device_with_substring_exists('emulator'):
-            raise TestSuiteException('An emulator already exists.')
-
-    assert state.emu_cmd
-    EMU_PROC = subprocess.Popen(state.emu_cmd.split(),
-                                stdout=None,
-                                stderr=subprocess.STDOUT)
-
-    log = util_log.get_logger()
-    log.info('Launching emulator with command line {0}'.format(state.emu_cmd))
-
-    tries_number = 180
-    tries = tries_number
-    found_device = False
-    while not found_device:
-        try:
-            android.validate_device(False, 'emulator')
-            found_device = True
-        except TestSuiteException as ex:
-            tries -= 1
-            if tries == 0:
-                # Avoid infinitely looping if the emulator won't boot
-                log.warning(
-                    'Giving up trying to validate device after {0} tries.'
-                    .format(tries_number))
-                raise ex
-            _check_emulator_terminated()
-            # wait a bit and try again, maybe it has now booted
-            time.sleep(10)
-
-    tries = 500
-    while not android.is_booted():
-        tries -= 1
-        if tries == 0:
-            # Avoid infinitely looping if the emulator won't boot
-            raise TestSuiteException('The emulator has failed to boot.')
-        _check_emulator_terminated()
-        time.sleep(5)
-
-    # Need to be root before we can push lldb-server
-    android.adb_root()
-    android.wait_for_device()
-
-    # Push the lldb-server executable to the device.
-    output = android.adb('push {0} {1}'.format(state.lldb_server_path_host,
-                                               state.lldb_server_path_device))
-
-    if 'failed to copy' in output or 'No such file or directory' in output:
-        raise TestSuiteException(
-            'unable to push lldb-server to the emulator: {0}.'
-            .format(output))
-
-    output = android.shell('chmod a+x {0}'
-                           .format(state.lldb_server_path_device))
-
-    if 'No such file or directory' in output:
-        raise TestSuiteException('Failed to copy lldb-server to the emulator.')
-
-
-def _restart_emulator(state):
-    '''Kill the emulator and start a new instance.
-
-    Args:
-        state: Test suite state collection, instance of State.
-    '''
-    _kill_emulator()
-    _launch_emulator(state)
-
-
-def _run_test(state, name, bundle_type):
-    '''Execute a single test case.
-
-    Args:
-        state: Test suite state collection, instance of State.
-        name: String file name of the test to execute.
-        bundle_type: string for the installed app type (cpp|jni|java)
-
-    Raises:
-        AssertionError: When assertion fails.
-    '''
-    assert isinstance(name, str)
-
-    try:
-        state.android.check_adb_alive()
-    except TestSuiteException as expt:
-        global EMU_PROC
-        if EMU_PROC:
-            _restart_emulator(state)
-        else:
-            raise expt
-
-    log = util_log.get_logger()
-    sys.stdout.write('Running {0}\r'.format(name))
-    sys.stdout.flush()
-    log.info('Running {0}'.format(name))
-
-    run_tests_dir = os.path.dirname(os.path.realpath(__file__))
-    run_test_path = os.path.join(run_tests_dir, 'tests', 'run_test.py')
-
-    # Forward port for lldb-server on the device to our host
-    hport = int(state.host_port) + state.port_mod
-    dport = int(state.device_port) + state.port_mod
-    state.android.forward_port(hport, dport)
-    state.port_mod += 1
-
-    log.debug('Giving up control to {0}...'.format(name))
-
-    params = map(str, [
-        sys.executable,
-        run_test_path,
-        name,
-        state.log_file_path,
-        state.adb_path,
-        state.lldb_server_path_device,
-        state.aosp_product_path,
-        dport,
-        state.android.get_device_id(),
-        state.print_to_stdout,
-        state.verbose,
-        state.wimpy,
-        state.timeout,
-        bundle_type
-    ])
-
-    return_code = subprocess.call(params)
-    state.test_count += 1
-    state.android.remove_port_forwarding()
-    log.seek_to_end()
-
-    # report in sys.stdout the result
-    success = return_code == util_constants.RC_TEST_OK
-    status_handlers = collections.defaultdict(lambda: ('error', log.error), (
-            (util_constants.RC_TEST_OK, ('pass', log.info)),
-            (util_constants.RC_TEST_TIMEOUT, ('timeout', log.error)),
-            (util_constants.RC_TEST_IGNORED, ('ignored', log.info)),
-            (util_constants.RC_TEST_FAIL, ('fail', log.critical))
-        )
-    )
-    status_name, status_logger = status_handlers[return_code]
-    log.info('Running %s: %s', name, status_name.upper())
-    status_logger("Test %r: %s", name, status_name)
-
-    # Special case for ignored tests - just return now
-    if return_code == util_constants.RC_TEST_IGNORED:
-        return
-
-    state.add_result(name, bundle_type, status_name)
-
-    if state.fail_fast and not success:
-        raise FailFastException(name)
-
-    # print a running total pass rate
-    passes = sum(1 for key, value in state.results.items() if value == 'pass')
-    log.info('Current pass rate: %s of %s executed.', passes, len(state.results))
-
-
-def _check_lldbserver_exists(state):
-    '''Check lldb-server exists on the target device and it is executable.
-
-    Raises:
-        TestSuiteError: If lldb-server does not exist on the target.
-    '''
-    assert state
-
-    message = 'Unable to verify valid lldb-server on target'
-
-    android = state.get_android()
-    assert android
-
-    cmd = state.lldb_server_path_device
-    out = android.shell(cmd, False)
-    if not isinstance(out, str):
-        raise TestSuiteException(message)
-    if out.find('Usage:') < 0:
-        raise TestSuiteException(message)
-
-
-def _suite_pre_run(state):
-    '''This function is executed before the test cases are run (setup).
-
-    Args:
-        state: Test suite state collection, instance of State.
-
-    Return:
-        True if the pre_run step completes without error.
-        Checks made:
-            - Validating that adb exists and runs.
-            - Validating that a device is attached.
-            - We have root access to the device.
-            - All test binaries were pushed to the device.
-            - The port for lldb-server was forwarded correctly.
-
-    Raises:
-        AssertionError: When assertions fail.
-    '''
-    assert state
-    log = util_log.get_logger()
-
-    try:
-        android = state.get_android()
-        bundle = state.get_bundle()
-        assert android
-        assert bundle
-
-        # validate ADB helper class
-        android.validate_adb()
-        log.log_and_print('Located ADB')
-
-        if state.run_emu:
-            log.log_and_print('Launching emulator...')
-            _launch_emulator(state)
-            log.log_and_print('Started emulator ' + android.device)
-        else:
-            android.validate_device()
-            log.log_and_print('Located device ' + android.device)
-
-        if state.noinstall and not state.single_test:
-            bundle.check_apps_installed(state.wimpy)
-
-        # elevate to root user
-        android.adb_root()
-        android.wait_for_device()
-        # check that lldb-server exists on device
-        android.kill_servers()
-        _check_lldbserver_exists(state)
-
-        if not state.noinstall:
-            # push all tests to the device
-            log.log_and_print('Pushing all tests...')
-            bundle.push_all()
-            log.log_and_print('Pushed all tests')
-        log.log_and_print('Pre run complete')
-
-    except TestSuiteException as expt:
-        log.exception('Test suite pre run failure')
-
-        # Even if we are logging the error, it may be helpful and more
-        # immediate to find out the error into the terminal
-        log.log_and_print('ERROR: Unable to set up the test suite: %s\n'
-                          % expt.message, logging.ERROR)
-
-        return False
-    return True
-
-
-def _suite_post_run(state):
-    '''This function is executed after the test cases have run (teardown).
-
-    Args:
-        state: Test suite state collection, instance of State.
-    Returns:
-        Number of failures
-    '''
-    log = util_log.get_logger()
-
-    if not state.noinstall and not state.nouninstall:
-        if state.wimpy:
-            state.bundle.uninstall_all_apk()
-        else:
-            state.bundle.uninstall_all()
-        log.log_and_print('Uninstalled/Deleted all tests')
-
-    total = 0
-    passes = 0
-    failures = 0
-
-    results = ET.Element('testsuite')
-    results.attrib['name'] = 'LLDB RS Test Suite'
-
-    for key, value in state.results.items():
-        total += 1
-        if value == 'pass':
-            passes += 1
-        else:
-            failures += 1
-
-        # test case name, followed by pass, failure or error elements
-        testcase = ET.Element('testcase')
-        testcase.attrib['name'] = "%s:%s" % key
-        result_element = ET.Element(value)
-        result_element.text = "%s:%s" % key
-        testcase.append(result_element)
-        results.append(testcase)
-
-    assert passes + failures == total, 'Invalid test results status'
-    if failures:
-        log.log_and_print(
-            'The following failures occurred:\n%s\n' %
-            '\n'.join('failed: %s:%s' % test_spec
-                for test_spec, result in state.results.items() if result != 'pass'
-        ))
-
-    log.log_and_print('{0} of {1} passed'.format(passes, total))
-    if total:
-        log.log_and_print('{0}% rate'.format((passes*100)/total))
-
-    results.attrib['tests'] = str(total)
-    state.results_file.write(ET.tostring(results, encoding='iso-8859-1'))
-
-    return failures
-
-
-def _discover_tests(state):
-    '''Discover all tests in the tests directory.
-
-    Returns:
-        List of strings, test file names from the 'tests' directory.
-    '''
-    tests = []
-
-    single_test = state.get_single_test()
-    if single_test is None:
-        file_dir = os.path.dirname(os.path.realpath(__file__))
-        tests_dir = os.path.join(file_dir, 'tests')
-
-        for sub_dir in os.listdir(tests_dir):
-            current_test_dir = os.path.join(tests_dir, sub_dir)
-            if os.path.isdir(current_test_dir):
-                dir_name = os.path.basename(current_test_dir)
-
-                if dir_name == 'harness':
-                    continue
-
-                for item in os.listdir(current_test_dir):
-                    if (item.startswith('test')
-                        and item.endswith('.py')
-                        and not item in state.blocklist):
-                        tests.append(item)
-    else:
-        if single_test.endswith('.py'):
-            tests.append(single_test)
-        else:
-            tests.append(single_test + '.py')
-
-    return tests
-
-
-def _deduce_python_path(state):
-    '''Try to deduce the PYTHONPATH environment variable via the LLDB binary.
-
-    Args:
-        state: Test suite state collection, instance of State.
-
-    Returns:
-        True if PYTHONPATH has been updated, False otherwise.
-
-    Raises:
-        TestSuiteException: If lldb path provided in the config or command line
-                            is incorrect.
-        AssertionError: If an assertion fails.
-    '''
-
-    lldb_path = state.lldb_path
-    if not lldb_path:
-        # lldb may not be provided in preference of a manual $PYTHONPATH
-        return False
-
-    params = [lldb_path, '-P']
-
-    try:
-        proc = subprocess.Popen(params, stdout=subprocess.PIPE)
-    except OSError as err:
-        error_string = 'Could not run lldb at %s: %s' % (lldb_path, str(err))
-        raise TestSuiteException(error_string)
-
-    stdout = proc.communicate()[0]
-    if stdout:
-        os.environ['PYTHONPATH'] = stdout.strip()
-        return True
-
-    return False
-
-
-def main():
-    '''The lldb-renderscript test suite entry point.'''
-    log = None
-
-    try:
-        # parse the command line
-        state = State()
-        assert state
-
-        # logging is initialised in State()
-        log = util_log.get_logger()
-
-        # if we can, set PYTHONPATH for lldb bindings
-        if not _deduce_python_path(state):
-            log.log_and_print('Unable to deduce PYTHONPATH', logging.WARN)
-
-        # pre run step
-        if not _suite_pre_run(state):
-            raise TestSuiteException('Test suite pre-run step failed')
-        # discover all tests and execute them
-        tests = _discover_tests(state)
-        log.log_and_print('Found {0} tests'.format(len(tests)))
-        if state.install_only:
-            log.log_and_print('Test applications installed. Terminating due to '
-                              '--install-only option')
-        else:
-            # run the tests
-            for bundle_type in state.bundle_types:
-                log.info("Running bundle type '%s'", bundle_type)
-                for item in tests:
-                    _run_test(state, item, bundle_type)
-                # post run step
-            quit(0 if _suite_post_run(state) == 0 else 1)
-
-    except AssertionError:
-        if log:
-            log.exception('Internal test suite error')
-
-        print('Internal test suite error')
-        quit(1)
-
-    except FailFastException:
-        log.exception('Early exit after first test failure')
-        quit(1)
-
-    except TestSuiteException as error:
-        if log:
-            log.exception('Test suite exception')
-
-        print('{0}'.format(str(error)))
-        quit(2)
-
-    finally:
-        _kill_emulator()
-        logging.shutdown()
-
-def signal_handler(_, _unused):
-    '''Signal handler for SIGINT, caused by the user typing Ctrl-C.'''
-    # pylint: disable=unused-argument
-    # pylint: disable=protected-access
-    print('Ctrl+C!')
-    os._exit(1)
-
-
-# execution trampoline
-if __name__ == '__main__':
-    signal.signal(signal.SIGINT, signal_handler)
-    main()
diff --git a/tests/lldb/tests/__init__.py b/tests/lldb/tests/__init__.py
deleted file mode 100644
index d91549f..0000000
--- a/tests/lldb/tests/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''This module contains a test runner, tests and utility code'''
-
-
-
-from . import harness
diff --git a/tests/lldb/tests/harness/RS_funs.py b/tests/lldb/tests/harness/RS_funs.py
deleted file mode 100644
index a5a0539..0000000
--- a/tests/lldb/tests/harness/RS_funs.py
+++ /dev/null
@@ -1,1401 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''A list of signatures for RS builtin functions and util functions for them.
-
-from __future__ import absolute_import
-
-Function signature syntax is usually C-like, however, fixed values can also be
-specified for those functions where the input range is restricted.
-Lines in the function table beginning with - are comments.
-Also contains utility functions to build an LLDB expression from a single
-function line.
-'''
-
-import re
-
-# Remove blank and comment lines using a lambda.
-FUNC_LIST = filter(lambda line: line.strip()
-                                and not line.strip().startswith('-'), '''
-- math functions
-
-uchar abs(char v);
-uchar2 abs(char2 v);
-uchar3 abs(char3 v);
-uchar4 abs(char4 v);
-uint abs(int v);
-uint2 abs(int2 v);
-uint3 abs(int3 v);
-uint4 abs(int4 v);
-ushort abs(short v);
-ushort2 abs(short2 v);
-ushort3 abs(short3 v);
-ushort4 abs(short4 v);
-
-float acos(float v);
-float2 acos(float2 v);
-float3 acos(float3 v);
-float4 acos(float4 v);
-
-float acosh(float v);
-float2 acosh(float2 v);
-float3 acosh(float3 v);
-float4 acosh(float4 v);
-
-float acospi(float v);
-float2 acospi(float2 v);
-float3 acospi(float3 v);
-float4 acospi(float4 v);
-
-float asin(float v);
-float2 asin(float2 v);
-float3 asin(float3 v);
-float4 asin(float4 v);
-
-float asinh(float v);
-float2 asinh(float2 v);
-float3 asinh(float3 v);
-float4 asinh(float4 v);
-
-float asinpi(float v);
-float2 asinpi(float2 v);
-float3 asinpi(float3 v);
-float4 asinpi(float4 v);
-
-float atan(float v);
-float2 atan(float2 v);
-float3 atan(float3 v);
-float4 atan(float4 v);
-
-float atan2(float numerator, float denominator);
-float2 atan2(float2 numerator, float2 denominator);
-float3 atan2(float3 numerator, float3 denominator);
-float4 atan2(float4 numerator, float4 denominator);
-
-float atan2pi(float numerator, float denominator);
-float2 atan2pi(float2 numerator, float2 denominator);
-float3 atan2pi(float3 numerator, float3 denominator);
-float4 atan2pi(float4 numerator, float4 denominator);
-
-float atanh(float v);
-float2 atanh(float2 v);
-float3 atanh(float3 v);
-float4 atanh(float4 v);
-
-float atanpi(float v);
-float2 atanpi(float2 v);
-float3 atanpi(float3 v);
-float4 atanpi(float4 v);
-
-float cbrt(float v);
-float2 cbrt(float2 v);
-float3 cbrt(float3 v);
-float4 cbrt(float4 v);
-
-float ceil(float v);
-float2 ceil(float2 v);
-float3 ceil(float3 v);
-float4 ceil(float4 v);
-
-char clamp(char value, char min_value, char max_value);
-char2 clamp(char2 value, char min_value, char max_value);
-char2 clamp(char2 value, char2 min_value, char2 max_value);
-char3 clamp(char3 value, char min_value, char max_value);
-char3 clamp(char3 value, char3 min_value, char3 max_value);
-char4 clamp(char4 value, char min_value, char max_value);
-char4 clamp(char4 value, char4 min_value, char4 max_value);
-float clamp(float value, float min_value, float max_value);
-float2 clamp(float2 value, float min_value, float max_value);
-float2 clamp(float2 value, float2 min_value, float2 max_value);
-float3 clamp(float3 value, float min_value, float max_value);
-float3 clamp(float3 value, float3 min_value, float3 max_value);
-float4 clamp(float4 value, float min_value, float max_value);
-float4 clamp(float4 value, float4 min_value, float4 max_value);
-int clamp(int value, int min_value, int max_value);
-int2 clamp(int2 value, int min_value, int max_value);
-int2 clamp(int2 value, int2 min_value, int2 max_value);
-int3 clamp(int3 value, int min_value, int max_value);
-int3 clamp(int3 value, int3 min_value, int3 max_value);
-int4 clamp(int4 value, int min_value, int max_value);
-int4 clamp(int4 value, int4 min_value, int4 max_value);
-long clamp(long value, long min_value, long max_value);
-long2 clamp(long2 value, long min_value, long max_value);
-long2 clamp(long2 value, long2 min_value, long2 max_value);
-long3 clamp(long3 value, long min_value, long max_value);
-long3 clamp(long3 value, long3 min_value, long3 max_value);
-long4 clamp(long4 value, long min_value, long max_value);
-long4 clamp(long4 value, long4 min_value, long4 max_value);
-short clamp(short value, short min_value, short max_value);
-short2 clamp(short2 value, short min_value, short max_value);
-short2 clamp(short2 value, short2 min_value, short2 max_value);
-short3 clamp(short3 value, short min_value, short max_value);
-short3 clamp(short3 value, short3 min_value, short3 max_value);
-short4 clamp(short4 value, short min_value, short max_value);
-short4 clamp(short4 value, short4 min_value, short4 max_value);
-uchar clamp(uchar value, uchar min_value, uchar max_value);
-uchar2 clamp(uchar2 value, uchar min_value, uchar max_value);
-uchar2 clamp(uchar2 value, uchar2 min_value, uchar2 max_value);
-uchar3 clamp(uchar3 value, uchar min_value, uchar max_value);
-uchar3 clamp(uchar3 value, uchar3 min_value, uchar3 max_value);
-uchar4 clamp(uchar4 value, uchar min_value, uchar max_value);
-uchar4 clamp(uchar4 value, uchar4 min_value, uchar4 max_value);
-uint clamp(uint value, uint min_value, uint max_value);
-uint2 clamp(uint2 value, uint min_value, uint max_value);
-uint2 clamp(uint2 value, uint2 min_value, uint2 max_value);
-uint3 clamp(uint3 value, uint min_value, uint max_value);
-uint3 clamp(uint3 value, uint3 min_value, uint3 max_value);
-uint4 clamp(uint4 value, uint min_value, uint max_value);
-uint4 clamp(uint4 value, uint4 min_value, uint4 max_value);
-ulong clamp(ulong value, ulong min_value, ulong max_value);
-ulong2 clamp(ulong2 value, ulong min_value, ulong max_value);
-ulong2 clamp(ulong2 value, ulong2 min_value, ulong2 max_value);
-ulong3 clamp(ulong3 value, ulong min_value, ulong max_value);
-ulong3 clamp(ulong3 value, ulong3 min_value, ulong3 max_value);
-ulong4 clamp(ulong4 value, ulong min_value, ulong max_value);
-ulong4 clamp(ulong4 value, ulong4 min_value, ulong4 max_value);
-ushort clamp(ushort value, ushort min_value, ushort max_value);
-ushort2 clamp(ushort2 value, ushort min_value, ushort max_value);
-ushort2 clamp(ushort2 value, ushort2 min_value, ushort2 max_value);
-ushort3 clamp(ushort3 value, ushort min_value, ushort max_value);
-ushort3 clamp(ushort3 value, ushort3 min_value, ushort3 max_value);
-ushort4 clamp(ushort4 value, ushort min_value, ushort max_value);
-ushort4 clamp(ushort4 value, ushort4 min_value, ushort4 max_value);
-
-char clz(char value);
-char2 clz(char2 value);
-char3 clz(char3 value);
-char4 clz(char4 value);
-int clz(int value);
-int2 clz(int2 value);
-int3 clz(int3 value);
-int4 clz(int4 value);
-short clz(short value);
-short2 clz(short2 value);
-short3 clz(short3 value);
-short4 clz(short4 value);
-uchar clz(uchar value);
-uchar2 clz(uchar2 value);
-uchar3 clz(uchar3 value);
-uchar4 clz(uchar4 value);
-uint clz(uint value);
-uint2 clz(uint2 value);
-uint3 clz(uint3 value);
-uint4 clz(uint4 value);
-ushort clz(ushort value);
-ushort2 clz(ushort2 value);
-ushort3 clz(ushort3 value);
-ushort4 clz(ushort4 value);
-
-float copysign(float magnitude_value, float sign_value);
-float2 copysign(float2 magnitude_value, float2 sign_value);
-float3 copysign(float3 magnitude_value, float3 sign_value);
-float4 copysign(float4 magnitude_value, float4 sign_value);
-
-float cos(float v);
-float2 cos(float2 v);
-float3 cos(float3 v);
-float4 cos(float4 v);
-
-float cosh(float v);
-float2 cosh(float2 v);
-float3 cosh(float3 v);
-float4 cosh(float4 v);
-
-float cospi(float v);
-float2 cospi(float2 v);
-float3 cospi(float3 v);
-float4 cospi(float4 v);
-
-float erf(float v);
-float2 erf(float2 v);
-float3 erf(float3 v);
-float4 erf(float4 v);
-
-float erfc(float v);
-float2 erfc(float2 v);
-float3 erfc(float3 v);
-float4 erfc(float4 v);
-
-float exp(float v);
-float2 exp(float2 v);
-float3 exp(float3 v);
-float4 exp(float4 v);
-
-float exp10(float v);
-float2 exp10(float2 v);
-float3 exp10(float3 v);
-float4 exp10(float4 v);
-
-float exp2(float v);
-float2 exp2(float2 v);
-float3 exp2(float3 v);
-float4 exp2(float4 v);
-
-float expm1(float v);
-float2 expm1(float2 v);
-float3 expm1(float3 v);
-float4 expm1(float4 v);
-
-float fabs(float v);
-float2 fabs(float2 v);
-float3 fabs(float3 v);
-float4 fabs(float4 v);
-
-float fdim(float a, float b);
-float2 fdim(float2 a, float2 b);
-float3 fdim(float3 a, float3 b);
-float4 fdim(float4 a, float4 b);
-
-float floor(float v);
-float2 floor(float2 v);
-float3 floor(float3 v);
-float4 floor(float4 v);
-
-float fma(float multiplicand1, float multiplicand2, float offset);
-float2 fma(float2 multiplicand1, float2 multiplicand2, float2 offset);
-float3 fma(float3 multiplicand1, float3 multiplicand2, float3 offset);
-float4 fma(float4 multiplicand1, float4 multiplicand2, float4 offset);
-
-float fmax(float a, float b);
-float2 fmax(float2 a, float b);
-float2 fmax(float2 a, float2 b);
-float3 fmax(float3 a, float b);
-float3 fmax(float3 a, float3 b);
-float4 fmax(float4 a, float b);
-float4 fmax(float4 a, float4 b);
-
-float fmin(float a, float b);
-float2 fmin(float2 a, float b);
-float2 fmin(float2 a, float2 b);
-float3 fmin(float3 a, float b);
-float3 fmin(float3 a, float3 b);
-float4 fmin(float4 a, float b);
-float4 fmin(float4 a, float4 b);
-
-float fmod(float numerator, float denominator);
-float2 fmod(float2 numerator, float2 denominator);
-float3 fmod(float3 numerator, float3 denominator);
-float4 fmod(float4 numerator, float4 denominator);
-
-float fract(float v, float* floor);
-float2 fract(float2 v, float2* floor);
-float3 fract(float3 v, float3* floor);
-float4 fract(float4 v, float4* floor);
-
-float frexp(float v, int* exponent);
-float2 frexp(float2 v, int2* exponent);
-float3 frexp(float3 v, int3* exponent);
-float4 frexp(float4 v, int4* exponent);
-
-float half_recip(float v);
-float2 half_recip(float2 v);
-float3 half_recip(float3 v);
-float4 half_recip(float4 v);
-
-float half_rsqrt(float v);
-float2 half_rsqrt(float2 v);
-float3 half_rsqrt(float3 v);
-float4 half_rsqrt(float4 v);
-
-float half_sqrt(float v);
-float2 half_sqrt(float2 v);
-float3 half_sqrt(float3 v);
-float4 half_sqrt(float4 v);
-
-float hypot(float a, float b);
-float2 hypot(float2 a, float2 b);
-float3 hypot(float3 a, float3 b);
-float4 hypot(float4 a, float4 b);
-
-int ilogb(float v);
-int2 ilogb(float2 v);
-int3 ilogb(float3 v);
-int4 ilogb(float4 v);
-
-float ldexp(float mantissa, int exponent);
-float2 ldexp(float2 mantissa, int exponent);
-float2 ldexp(float2 mantissa, int2 exponent);
-float3 ldexp(float3 mantissa, int exponent);
-float3 ldexp(float3 mantissa, int3 exponent);
-float4 ldexp(float4 mantissa, int exponent);
-float4 ldexp(float4 mantissa, int4 exponent);
-
-float lgamma(float v);
-float lgamma(float v, int* sign_of_gamma);
-float2 lgamma(float2 v);
-float2 lgamma(float2 v, int2* sign_of_gamma);
-float3 lgamma(float3 v);
-float3 lgamma(float3 v, int3* sign_of_gamma);
-float4 lgamma(float4 v);
-float4 lgamma(float4 v, int4* sign_of_gamma);
-
-float log(float v);
-float2 log(float2 v);
-float3 log(float3 v);
-float4 log(float4 v);
-
-float log10(float v);
-float2 log10(float2 v);
-float3 log10(float3 v);
-float4 log10(float4 v);
-
-float log1p(float v);
-float2 log1p(float2 v);
-float3 log1p(float3 v);
-float4 log1p(float4 v);
-
-float log2(float v);
-float2 log2(float2 v);
-float3 log2(float3 v);
-float4 log2(float4 v);
-
-float logb(float v);
-float2 logb(float2 v);
-float3 logb(float3 v);
-float4 logb(float4 v);
-
-float mad(float multiplicand1, float multiplicand2, float offset);
-float2 mad(float2 multiplicand1, float2 multiplicand2, float2 offset);
-float3 mad(float3 multiplicand1, float3 multiplicand2, float3 offset);
-float4 mad(float4 multiplicand1, float4 multiplicand2, float4 offset);
-
-char max(char a, char b);
-char2 max(char2 a, char2 b);
-char3 max(char3 a, char3 b);
-char4 max(char4 a, char4 b);
-float max(float a, float b);
-float2 max(float2 a, float2 b);
-float3 max(float3 a, float3 b);
-float4 max(float4 a, float4 b);
-int max(int a, int b);
-int2 max(int2 a, int2 b);
-int3 max(int3 a, int3 b);
-int4 max(int4 a, int4 b);
-long max(long a, long b);
-long2 max(long2 a, long2 b);
-long3 max(long3 a, long3 b);
-long4 max(long4 a, long4 b);
-short max(short a, short b);
-short2 max(short2 a, short2 b);
-short3 max(short3 a, short3 b);
-short4 max(short4 a, short4 b);
-uchar max(uchar a, uchar b);
-uchar2 max(uchar2 a, uchar2 b);
-uchar3 max(uchar3 a, uchar3 b);
-uchar4 max(uchar4 a, uchar4 b);
-uint max(uint a, uint b);
-uint2 max(uint2 a, uint2 b);
-uint3 max(uint3 a, uint3 b);
-uint4 max(uint4 a, uint4 b);
-ulong max(ulong a, ulong b);
-ulong2 max(ulong2 a, ulong2 b);
-ulong3 max(ulong3 a, ulong3 b);
-ulong4 max(ulong4 a, ulong4 b);
-ushort max(ushort a, ushort b);
-ushort2 max(ushort2 a, ushort2 b);
-ushort3 max(ushort3 a, ushort3 b);
-ushort4 max(ushort4 a, ushort4 b);
-
-char min(char a, char b);
-char2 min(char2 a, char2 b);
-char3 min(char3 a, char3 b);
-char4 min(char4 a, char4 b);
-float min(float a, float b);
-float2 min(float2 a, float2 b);
-float3 min(float3 a, float3 b);
-float4 min(float4 a, float4 b);
-int min(int a, int b);
-int2 min(int2 a, int2 b);
-int3 min(int3 a, int3 b);
-int4 min(int4 a, int4 b);
-long min(long a, long b);
-long2 min(long2 a, long2 b);
-long3 min(long3 a, long3 b);
-long4 min(long4 a, long4 b);
-short min(short a, short b);
-short2 min(short2 a, short2 b);
-short3 min(short3 a, short3 b);
-short4 min(short4 a, short4 b);
-uchar min(uchar a, uchar b);
-uchar2 min(uchar2 a, uchar2 b);
-uchar3 min(uchar3 a, uchar3 b);
-uchar4 min(uchar4 a, uchar4 b);
-uint min(uint a, uint b);
-uint2 min(uint2 a, uint2 b);
-uint3 min(uint3 a, uint3 b);
-uint4 min(uint4 a, uint4 b);
-ulong min(ulong a, ulong b);
-ulong2 min(ulong2 a, ulong2 b);
-ulong3 min(ulong3 a, ulong3 b);
-ulong4 min(ulong4 a, ulong4 b);
-ushort min(ushort a, ushort b);
-ushort2 min(ushort2 a, ushort2 b);
-ushort3 min(ushort3 a, ushort3 b);
-ushort4 min(ushort4 a, ushort4 b);
-
-float mix(float start, float stop, float fraction);
-float2 mix(float2 start, float2 stop, float fraction);
-float2 mix(float2 start, float2 stop, float2 fraction);
-float3 mix(float3 start, float3 stop, float fraction);
-float3 mix(float3 start, float3 stop, float3 fraction);
-float4 mix(float4 start, float4 stop, float fraction);
-float4 mix(float4 start, float4 stop, float4 fraction);
-
-float modf(float v, float* integral_part);
-float2 modf(float2 v, float2* integral_part);
-float3 modf(float3 v, float3* integral_part);
-float4 modf(float4 v, float4* integral_part);
-
-float nan(uint v);
-
-float native_acos(float v);
-float2 native_acos(float2 v);
-float3 native_acos(float3 v);
-float4 native_acos(float4 v);
-
-float native_acosh(float v);
-float2 native_acosh(float2 v);
-float3 native_acosh(float3 v);
-float4 native_acosh(float4 v);
-
-float native_acospi(float v);
-float2 native_acospi(float2 v);
-float3 native_acospi(float3 v);
-float4 native_acospi(float4 v);
-
-float native_asin(float v);
-float2 native_asin(float2 v);
-float3 native_asin(float3 v);
-float4 native_asin(float4 v);
-
-float native_asinh(float v);
-float2 native_asinh(float2 v);
-float3 native_asinh(float3 v);
-float4 native_asinh(float4 v);
-
-float native_asinpi(float v);
-float2 native_asinpi(float2 v);
-float3 native_asinpi(float3 v);
-float4 native_asinpi(float4 v);
-
-float native_atan(float v);
-float2 native_atan(float2 v);
-float3 native_atan(float3 v);
-float4 native_atan(float4 v);
-
-float native_atan2(float numerator, float denominator);
-float2 native_atan2(float2 numerator, float2 denominator);
-float3 native_atan2(float3 numerator, float3 denominator);
-float4 native_atan2(float4 numerator, float4 denominator);
-
-float native_atan2pi(float numerator, float denominator);
-float2 native_atan2pi(float2 numerator, float2 denominator);
-float3 native_atan2pi(float3 numerator, float3 denominator);
-float4 native_atan2pi(float4 numerator, float4 denominator);
-
-float native_atanh(float v);
-float2 native_atanh(float2 v);
-float3 native_atanh(float3 v);
-float4 native_atanh(float4 v);
-
-float native_atanpi(float v);
-float2 native_atanpi(float2 v);
-float3 native_atanpi(float3 v);
-float4 native_atanpi(float4 v);
-
-float native_cbrt(float v);
-float2 native_cbrt(float2 v);
-float3 native_cbrt(float3 v);
-float4 native_cbrt(float4 v);
-
-float native_cos(float v);
-float2 native_cos(float2 v);
-float3 native_cos(float3 v);
-float4 native_cos(float4 v);
-
-float native_cosh(float v);
-float2 native_cosh(float2 v);
-float3 native_cosh(float3 v);
-float4 native_cosh(float4 v);
-
-float native_cospi(float v);
-float2 native_cospi(float2 v);
-float3 native_cospi(float3 v);
-float4 native_cospi(float4 v);
-
-float native_divide(float left_vector, float right_vector);
-float2 native_divide(float2 left_vector, float2 right_vector);
-float3 native_divide(float3 left_vector, float3 right_vector);
-float4 native_divide(float4 left_vector, float4 right_vector);
-
-float native_exp(float v);
-float2 native_exp(float2 v);
-float3 native_exp(float3 v);
-float4 native_exp(float4 v);
-
-float native_exp10(float v);
-float2 native_exp10(float2 v);
-float3 native_exp10(float3 v);
-float4 native_exp10(float4 v);
-
-float native_exp2(float v);
-float2 native_exp2(float2 v);
-float3 native_exp2(float3 v);
-float4 native_exp2(float4 v);
-
-float native_expm1(float v);
-float2 native_expm1(float2 v);
-float3 native_expm1(float3 v);
-float4 native_expm1(float4 v);
-
-float native_hypot(float a, float b);
-float2 native_hypot(float2 a, float2 b);
-float3 native_hypot(float3 a, float3 b);
-float4 native_hypot(float4 a, float4 b);
-
-float native_log(float v);
-float2 native_log(float2 v);
-float3 native_log(float3 v);
-float4 native_log(float4 v);
-
-float native_log10(float v);
-float2 native_log10(float2 v);
-float3 native_log10(float3 v);
-float4 native_log10(float4 v);
-
-float native_log1p(float v);
-float2 native_log1p(float2 v);
-float3 native_log1p(float3 v);
-float4 native_log1p(float4 v);
-
-float native_log2(float v);
-float2 native_log2(float2 v);
-float3 native_log2(float3 v);
-float4 native_log2(float4 v);
-
-float native_powr(float base, float exponent);
-float2 native_powr(float2 base, float2 exponent);
-float3 native_powr(float3 base, float3 exponent);
-float4 native_powr(float4 base, float4 exponent);
-
-float native_recip(float v);
-float2 native_recip(float2 v);
-float3 native_recip(float3 v);
-float4 native_recip(float4 v);
-
-float native_rootn(float v, int n);
-float2 native_rootn(float2 v, int2 n);
-float3 native_rootn(float3 v, int3 n);
-float4 native_rootn(float4 v, int4 n);
-
-float native_rsqrt(float v);
-float2 native_rsqrt(float2 v);
-float3 native_rsqrt(float3 v);
-float4 native_rsqrt(float4 v);
-
-float native_sin(float v);
-float2 native_sin(float2 v);
-float3 native_sin(float3 v);
-float4 native_sin(float4 v);
-
-float native_sincos(float v, float* cos);
-float2 native_sincos(float2 v, float2* cos);
-float3 native_sincos(float3 v, float3* cos);
-float4 native_sincos(float4 v, float4* cos);
-
-float native_sinh(float v);
-float2 native_sinh(float2 v);
-float3 native_sinh(float3 v);
-float4 native_sinh(float4 v);
-
-float native_sinpi(float v);
-float2 native_sinpi(float2 v);
-float3 native_sinpi(float3 v);
-float4 native_sinpi(float4 v);
-
-float native_sqrt(float v);
-float2 native_sqrt(float2 v);
-float3 native_sqrt(float3 v);
-float4 native_sqrt(float4 v);
-
-float native_tan(float v);
-float2 native_tan(float2 v);
-float3 native_tan(float3 v);
-float4 native_tan(float4 v);
-
-float native_tanh(float v);
-float2 native_tanh(float2 v);
-float3 native_tanh(float3 v);
-float4 native_tanh(float4 v);
-
-float native_tanpi(float v);
-float2 native_tanpi(float2 v);
-float3 native_tanpi(float3 v);
-float4 native_tanpi(float4 v);
-
-float nextafter(float v, float target);
-float2 nextafter(float2 v, float2 target);
-float3 nextafter(float3 v, float3 target);
-float4 nextafter(float4 v, float4 target);
-
-float pow(float base, float exponent);
-float2 pow(float2 base, float2 exponent);
-float3 pow(float3 base, float3 exponent);
-float4 pow(float4 base, float4 exponent);
-
-float pown(float base, int exponent);
-float2 pown(float2 base, int2 exponent);
-float3 pown(float3 base, int3 exponent);
-float4 pown(float4 base, int4 exponent);
-
-float powr(float base, float exponent);
-float2 powr(float2 base, float2 exponent);
-float3 powr(float3 base, float3 exponent);
-float4 powr(float4 base, float4 exponent);
-
-float radians(float v);
-float2 radians(float2 v);
-float3 radians(float3 v);
-float4 radians(float4 v);
-
-float remainder(float numerator, float denominator);
-float2 remainder(float2 numerator, float2 denominator);
-float3 remainder(float3 numerator, float3 denominator);
-float4 remainder(float4 numerator, float4 denominator);
-
-float remquo(float numerator, float denominator, int* quotient);
-float2 remquo(float2 numerator, float2 denominator, int2* quotient);
-float3 remquo(float3 numerator, float3 denominator, int3* quotient);
-float4 remquo(float4 numerator, float4 denominator, int4* quotient);
-
-float rint(float v);
-float2 rint(float2 v);
-float3 rint(float3 v);
-float4 rint(float4 v);
-
-float rootn(float v, int n);
-float2 rootn(float2 v, int2 n);
-float3 rootn(float3 v, int3 n);
-float4 rootn(float4 v, int4 n);
-
-float round(float v);
-float2 round(float2 v);
-float3 round(float3 v);
-float4 round(float4 v);
-
-char rsClamp(char amount, char low, char high);
-int rsClamp(int amount, int low, int high);
-short rsClamp(short amount, short low, short high);
-uchar rsClamp(uchar amount, uchar low, uchar high);
-uint rsClamp(uint amount, uint low, uint high);
-ushort rsClamp(ushort amount, ushort low, ushort high);
-
-float rsFrac(float v);
-
-float rsRand(float max_value);
-float rsRand(float min_value, float max_value);
-int rsRand(int max_value);
-int rsRand(int min_value, int max_value);
-
-float rsqrt(float v);
-float2 rsqrt(float2 v);
-float3 rsqrt(float3 v);
-float4 rsqrt(float4 v);
-
-float sign(float v);
-float2 sign(float2 v);
-float3 sign(float3 v);
-float4 sign(float4 v);
-
-float sin(float v);
-float2 sin(float2 v);
-float3 sin(float3 v);
-float4 sin(float4 v);
-
-float sincos(float v, float* cos);
-float2 sincos(float2 v, float2* cos);
-float3 sincos(float3 v, float3* cos);
-float4 sincos(float4 v, float4* cos);
-
-float sinh(float v);
-float2 sinh(float2 v);
-float3 sinh(float3 v);
-float4 sinh(float4 v);
-
-float sinpi(float v);
-float2 sinpi(float2 v);
-float3 sinpi(float3 v);
-float4 sinpi(float4 v);
-
-float sqrt(float v);
-float2 sqrt(float2 v);
-float3 sqrt(float3 v);
-float4 sqrt(float4 v);
-
-float step(float edge, float v);
-float2 step(float edge, float2 v);
-float2 step(float2 edge, float v);
-float2 step(float2 edge, float2 v);
-float3 step(float edge, float3 v);
-float3 step(float3 edge, float v);
-float3 step(float3 edge, float3 v);
-float4 step(float edge, float4 v);
-float4 step(float4 edge, float v);
-float4 step(float4 edge, float4 v);
-
-float tan(float v);
-float2 tan(float2 v);
-float3 tan(float3 v);
-float4 tan(float4 v);
-
-float tanh(float v);
-float2 tanh(float2 v);
-float3 tanh(float3 v);
-float4 tanh(float4 v);
-
-float tanpi(float v);
-float2 tanpi(float2 v);
-float3 tanpi(float3 v);
-float4 tanpi(float4 v);
-
-float tgamma(float v);
-float2 tgamma(float2 v);
-float3 tgamma(float3 v);
-float4 tgamma(float4 v);
-
-float trunc(float v);
-float2 trunc(float2 v);
-float3 trunc(float3 v);
-float4 trunc(float4 v);
-
-uchar4 rsPackColorTo8888(float r, float g, float b);
-uchar4 rsPackColorTo8888(float r, float g, float b, float a);
-uchar4 rsPackColorTo8888(float3 color);
-uchar4 rsPackColorTo8888(float4 color);
-float4 rsUnpackColor8888(uchar4 c);
-
-float4 rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
-uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
-
-- vector functions
-
-float3 cross(float3 left_vector, float3 right_vector);
-float4 cross(float4 left_vector, float4 right_vector);
-
-float distance(float  left_vector, float  right_vector);
-float distance(float2 left_vector, float2 right_vector);
-float distance(float3 left_vector, float3 right_vector);
-float distance(float4 left_vector, float4 right_vector);
-
-float dot(float  left_vector, float  right_vector);
-float dot(float2 left_vector, float2 right_vector);
-float dot(float3 left_vector, float3 right_vector);
-float dot(float4 left_vector, float4 right_vector);
-
-float fast_distance(float  left_vector, float  right_vector);
-float fast_distance(float2 left_vector, float2 right_vector);
-float fast_distance(float3 left_vector, float3 right_vector);
-float fast_distance(float4 left_vector, float4 right_vector);
-
-float fast_length(float  v);
-float fast_length(float2 v);
-float fast_length(float3 v);
-float fast_length(float4 v);
-
-float  fast_normalize(float  v);
-float2 fast_normalize(float2 v);
-float3 fast_normalize(float3 v);
-float4 fast_normalize(float4 v);
-
-float length(float  v);
-float length(float2 v);
-float length(float3 v);
-float length(float4 v);
-
-float native_distance(float  left_vector, float  right_vector);
-float native_distance(float2 left_vector, float2 right_vector);
-float native_distance(float3 left_vector, float3 right_vector);
-float native_distance(float4 left_vector, float4 right_vector);
-
-float native_length(float  v);
-float native_length(float2 v);
-float native_length(float3 v);
-float native_length(float4 v);
-
-float  native_normalize(float  v);
-float2 native_normalize(float2 v);
-float3 native_normalize(float3 v);
-float4 native_normalize(float4 v);
-
-float  normalize(float  v);
-float2 normalize(float2 v);
-float3 normalize(float3 v);
-float4 normalize(float4 v);
-
-- conversion functions
-
-char2 convert_char2(char2 v);
-char2 convert_char2(double2 v);
-char2 convert_char2(float2 v);
-char2 convert_char2(int2 v);
-char2 convert_char2(long2 v);
-char2 convert_char2(short2 v);
-char2 convert_char2(uchar2 v);
-char2 convert_char2(uint2 v);
-char2 convert_char2(ulong2 v);
-char2 convert_char2(ushort2 v);
-
-char3 convert_char3(char3 v);
-char3 convert_char3(double3 v);
-char3 convert_char3(float3 v);
-char3 convert_char3(int3 v);
-char3 convert_char3(long3 v);
-char3 convert_char3(short3 v);
-char3 convert_char3(uchar3 v);
-char3 convert_char3(uint3 v);
-char3 convert_char3(ulong3 v);
-char3 convert_char3(ushort3 v);
-
-char4 convert_char4(char4 v);
-char4 convert_char4(double4 v);
-char4 convert_char4(float4 v);
-char4 convert_char4(int4 v);
-char4 convert_char4(long4 v);
-char4 convert_char4(short4 v);
-char4 convert_char4(uchar4 v);
-char4 convert_char4(uint4 v);
-char4 convert_char4(ulong4 v);
-char4 convert_char4(ushort4 v);
-
-double2 convert_double2(char2 v);
-double2 convert_double2(double2 v);
-double2 convert_double2(float2 v);
-double2 convert_double2(int2 v);
-double2 convert_double2(long2 v);
-double2 convert_double2(short2 v);
-double2 convert_double2(uchar2 v);
-double2 convert_double2(uint2 v);
-double2 convert_double2(ulong2 v);
-double2 convert_double2(ushort2 v);
-
-double3 convert_double3(char3 v);
-double3 convert_double3(double3 v);
-double3 convert_double3(float3 v);
-double3 convert_double3(int3 v);
-double3 convert_double3(long3 v);
-double3 convert_double3(short3 v);
-double3 convert_double3(uchar3 v);
-double3 convert_double3(uint3 v);
-double3 convert_double3(ulong3 v);
-double3 convert_double3(ushort3 v);
-
-double4 convert_double4(char4 v);
-double4 convert_double4(double4 v);
-double4 convert_double4(float4 v);
-double4 convert_double4(int4 v);
-double4 convert_double4(long4 v);
-double4 convert_double4(short4 v);
-double4 convert_double4(uchar4 v);
-double4 convert_double4(uint4 v);
-double4 convert_double4(ulong4 v);
-double4 convert_double4(ushort4 v);
-
-float2 convert_float2(char2 v);
-float2 convert_float2(double2 v);
-float2 convert_float2(float2 v);
-float2 convert_float2(int2 v);
-float2 convert_float2(long2 v);
-float2 convert_float2(short2 v);
-float2 convert_float2(uchar2 v);
-float2 convert_float2(uint2 v);
-float2 convert_float2(ulong2 v);
-float2 convert_float2(ushort2 v);
-
-float3 convert_float3(char3 v);
-float3 convert_float3(double3 v);
-float3 convert_float3(float3 v);
-float3 convert_float3(int3 v);
-float3 convert_float3(long3 v);
-float3 convert_float3(short3 v);
-float3 convert_float3(uchar3 v);
-float3 convert_float3(uint3 v);
-float3 convert_float3(ulong3 v);
-float3 convert_float3(ushort3 v);
-
-float4 convert_float4(char4 v);
-float4 convert_float4(double4 v);
-float4 convert_float4(float4 v);
-float4 convert_float4(int4 v);
-float4 convert_float4(long4 v);
-float4 convert_float4(short4 v);
-float4 convert_float4(uchar4 v);
-float4 convert_float4(uint4 v);
-float4 convert_float4(ulong4 v);
-float4 convert_float4(ushort4 v);
-
-int2 convert_int2(char2 v);
-int2 convert_int2(double2 v);
-int2 convert_int2(float2 v);
-int2 convert_int2(int2 v);
-int2 convert_int2(long2 v);
-int2 convert_int2(short2 v);
-int2 convert_int2(uchar2 v);
-int2 convert_int2(uint2 v);
-int2 convert_int2(ulong2 v);
-int2 convert_int2(ushort2 v);
-
-int3 convert_int3(char3 v);
-int3 convert_int3(double3 v);
-int3 convert_int3(float3 v);
-int3 convert_int3(int3 v);
-int3 convert_int3(long3 v);
-int3 convert_int3(short3 v);
-int3 convert_int3(uchar3 v);
-int3 convert_int3(uint3 v);
-int3 convert_int3(ulong3 v);
-int3 convert_int3(ushort3 v);
-
-int4 convert_int4(char4 v);
-int4 convert_int4(double4 v);
-int4 convert_int4(float4 v);
-int4 convert_int4(int4 v);
-int4 convert_int4(long4 v);
-int4 convert_int4(short4 v);
-int4 convert_int4(uchar4 v);
-int4 convert_int4(uint4 v);
-int4 convert_int4(ulong4 v);
-int4 convert_int4(ushort4 v);
-
-long2 convert_long2(char2 v);
-long2 convert_long2(double2 v);
-long2 convert_long2(float2 v);
-long2 convert_long2(int2 v);
-long2 convert_long2(long2 v);
-long2 convert_long2(short2 v);
-long2 convert_long2(uchar2 v);
-long2 convert_long2(uint2 v);
-long2 convert_long2(ulong2 v);
-long2 convert_long2(ushort2 v);
-
-long3 convert_long3(char3 v);
-long3 convert_long3(double3 v);
-long3 convert_long3(float3 v);
-long3 convert_long3(int3 v);
-long3 convert_long3(long3 v);
-long3 convert_long3(short3 v);
-long3 convert_long3(uchar3 v);
-long3 convert_long3(uint3 v);
-long3 convert_long3(ulong3 v);
-long3 convert_long3(ushort3 v);
-
-long4 convert_long4(char4 v);
-long4 convert_long4(double4 v);
-long4 convert_long4(float4 v);
-long4 convert_long4(int4 v);
-long4 convert_long4(long4 v);
-long4 convert_long4(short4 v);
-long4 convert_long4(uchar4 v);
-long4 convert_long4(uint4 v);
-long4 convert_long4(ulong4 v);
-long4 convert_long4(ushort4 v);
-
-short2 convert_short2(char2 v);
-short2 convert_short2(double2 v);
-short2 convert_short2(float2 v);
-short2 convert_short2(int2 v);
-short2 convert_short2(long2 v);
-short2 convert_short2(short2 v);
-short2 convert_short2(uchar2 v);
-short2 convert_short2(uint2 v);
-short2 convert_short2(ulong2 v);
-short2 convert_short2(ushort2 v);
-
-short3 convert_short3(char3 v);
-short3 convert_short3(double3 v);
-short3 convert_short3(float3 v);
-short3 convert_short3(int3 v);
-short3 convert_short3(long3 v);
-short3 convert_short3(short3 v);
-short3 convert_short3(uchar3 v);
-short3 convert_short3(uint3 v);
-short3 convert_short3(ulong3 v);
-short3 convert_short3(ushort3 v);
-
-short4 convert_short4(char4 v);
-short4 convert_short4(double4 v);
-short4 convert_short4(float4 v);
-short4 convert_short4(int4 v);
-short4 convert_short4(long4 v);
-short4 convert_short4(short4 v);
-short4 convert_short4(uchar4 v);
-short4 convert_short4(uint4 v);
-short4 convert_short4(ulong4 v);
-short4 convert_short4(ushort4 v);
-
-uchar2 convert_uchar2(char2 v);
-uchar2 convert_uchar2(double2 v);
-uchar2 convert_uchar2(float2 v);
-uchar2 convert_uchar2(int2 v);
-uchar2 convert_uchar2(long2 v);
-uchar2 convert_uchar2(short2 v);
-uchar2 convert_uchar2(uchar2 v);
-uchar2 convert_uchar2(uint2 v);
-uchar2 convert_uchar2(ulong2 v);
-uchar2 convert_uchar2(ushort2 v);
-
-uchar3 convert_uchar3(char3 v);
-uchar3 convert_uchar3(double3 v);
-uchar3 convert_uchar3(float3 v);
-uchar3 convert_uchar3(int3 v);
-uchar3 convert_uchar3(long3 v);
-uchar3 convert_uchar3(short3 v);
-uchar3 convert_uchar3(uchar3 v);
-uchar3 convert_uchar3(uint3 v);
-uchar3 convert_uchar3(ulong3 v);
-uchar3 convert_uchar3(ushort3 v);
-
-uchar4 convert_uchar4(char4 v);
-uchar4 convert_uchar4(double4 v);
-uchar4 convert_uchar4(float4 v);
-uchar4 convert_uchar4(int4 v);
-uchar4 convert_uchar4(long4 v);
-uchar4 convert_uchar4(short4 v);
-uchar4 convert_uchar4(uchar4 v);
-uchar4 convert_uchar4(uint4 v);
-uchar4 convert_uchar4(ulong4 v);
-uchar4 convert_uchar4(ushort4 v);
-
-uint2 convert_uint2(char2 v);
-uint2 convert_uint2(double2 v);
-uint2 convert_uint2(float2 v);
-uint2 convert_uint2(int2 v);
-uint2 convert_uint2(long2 v);
-uint2 convert_uint2(short2 v);
-uint2 convert_uint2(uchar2 v);
-uint2 convert_uint2(uint2 v);
-uint2 convert_uint2(ulong2 v);
-uint2 convert_uint2(ushort2 v);
-
-uint3 convert_uint3(char3 v);
-uint3 convert_uint3(double3 v);
-uint3 convert_uint3(float3 v);
-uint3 convert_uint3(int3 v);
-uint3 convert_uint3(long3 v);
-uint3 convert_uint3(short3 v);
-uint3 convert_uint3(uchar3 v);
-uint3 convert_uint3(uint3 v);
-uint3 convert_uint3(ulong3 v);
-uint3 convert_uint3(ushort3 v);
-
-uint4 convert_uint4(char4 v);
-uint4 convert_uint4(double4 v);
-uint4 convert_uint4(float4 v);
-uint4 convert_uint4(int4 v);
-uint4 convert_uint4(long4 v);
-uint4 convert_uint4(short4 v);
-uint4 convert_uint4(uchar4 v);
-uint4 convert_uint4(uint4 v);
-uint4 convert_uint4(ulong4 v);
-uint4 convert_uint4(ushort4 v);
-
-ulong2 convert_ulong2(char2 v);
-ulong2 convert_ulong2(double2 v);
-ulong2 convert_ulong2(float2 v);
-ulong2 convert_ulong2(int2 v);
-ulong2 convert_ulong2(long2 v);
-ulong2 convert_ulong2(short2 v);
-ulong2 convert_ulong2(uchar2 v);
-ulong2 convert_ulong2(uint2 v);
-ulong2 convert_ulong2(ulong2 v);
-ulong2 convert_ulong2(ushort2 v);
-
-ulong3 convert_ulong3(char3 v);
-ulong3 convert_ulong3(double3 v);
-ulong3 convert_ulong3(float3 v);
-ulong3 convert_ulong3(int3 v);
-ulong3 convert_ulong3(long3 v);
-ulong3 convert_ulong3(short3 v);
-ulong3 convert_ulong3(uchar3 v);
-ulong3 convert_ulong3(uint3 v);
-ulong3 convert_ulong3(ulong3 v);
-ulong3 convert_ulong3(ushort3 v);
-
-ulong4 convert_ulong4(char4 v);
-ulong4 convert_ulong4(double4 v);
-ulong4 convert_ulong4(float4 v);
-ulong4 convert_ulong4(int4 v);
-ulong4 convert_ulong4(long4 v);
-ulong4 convert_ulong4(short4 v);
-ulong4 convert_ulong4(uchar4 v);
-ulong4 convert_ulong4(uint4 v);
-ulong4 convert_ulong4(ulong4 v);
-ulong4 convert_ulong4(ushort4 v);
-
-ushort2 convert_ushort2(char2 v);
-ushort2 convert_ushort2(double2 v);
-ushort2 convert_ushort2(float2 v);
-ushort2 convert_ushort2(int2 v);
-ushort2 convert_ushort2(long2 v);
-ushort2 convert_ushort2(short2 v);
-ushort2 convert_ushort2(uchar2 v);
-ushort2 convert_ushort2(uint2 v);
-ushort2 convert_ushort2(ulong2 v);
-ushort2 convert_ushort2(ushort2 v);
-
-ushort3 convert_ushort3(char3 v);
-ushort3 convert_ushort3(double3 v);
-ushort3 convert_ushort3(float3 v);
-ushort3 convert_ushort3(int3 v);
-ushort3 convert_ushort3(long3 v);
-ushort3 convert_ushort3(short3 v);
-ushort3 convert_ushort3(uchar3 v);
-ushort3 convert_ushort3(uint3 v);
-ushort3 convert_ushort3(ulong3 v);
-ushort3 convert_ushort3(ushort3 v);
-
-ushort4 convert_ushort4(char4 v);
-ushort4 convert_ushort4(double4 v);
-ushort4 convert_ushort4(float4 v);
-ushort4 convert_ushort4(int4 v);
-ushort4 convert_ushort4(long4 v);
-ushort4 convert_ushort4(short4 v);
-ushort4 convert_ushort4(uchar4 v);
-ushort4 convert_ushort4(uint4 v);
-ushort4 convert_ushort4(ulong4 v);
-ushort4 convert_ushort4(ushort4 v);
-
-uchar4 rsPackColorTo8888(float r, float g, float b);
-uchar4 rsPackColorTo8888(float r, float g, float b, float a);
-uchar4 rsPackColorTo8888(float3 color);
-uchar4 rsPackColorTo8888(float4 color);
-
-float4 rsUnpackColor8888(uchar4 c);
-
-float4 rsYuvToRGBA_float4(uchar y, uchar u, uchar v);
-
-uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v);
-
-- matrix functions, some of these are not supported yet
-
--bool rsMatrixInverse(rs_matrix4x4* m);
-
--bool rsMatrixInverseTranspose(rs_matrix4x4* m);
-
--void rsMatrixLoad(rs_matrix2x2* destination, float* array);
--void rsMatrixLoad(rs_matrix2x2* destination, rs_matrix2x2* source);
--void rsMatrixLoad(rs_matrix3x3* destination, float* array);
--void rsMatrixLoad(rs_matrix3x3* destination, rs_matrix3x3* source);
--void rsMatrixLoad(rs_matrix4x4* destination, float* array);
--void rsMatrixLoad(rs_matrix4x4* destination, rs_matrix2x2* source);
--void rsMatrixLoad(rs_matrix4x4* destination, rs_matrix3x3* source);
--void rsMatrixLoad(rs_matrix4x4* destination, rs_matrix4x4* source);
-
--void rsMatrixLoadFrustum(rs_matrix4x4* m, float left, float right, float bottom, float top, float near, float far);
-
--void rsMatrixLoadIdentity(rs_matrix2x2* m);
--void rsMatrixLoadIdentity(rs_matrix3x3* m);
--void rsMatrixLoadIdentity(rs_matrix4x4* m);
-
--void rsMatrixLoadMultiply(rs_matrix2x2* m, rs_matrix2x2* lhs, rs_matrix2x2* rhs);
--void rsMatrixLoadMultiply(rs_matrix3x3* m, rs_matrix3x3* lhs, rs_matrix3x3* rhs);
--void rsMatrixLoadMultiply(rs_matrix4x4* m, rs_matrix4x4* lhs, rs_matrix4x4* rhs);
-
--void rsMatrixLoadOrtho(rs_matrix4x4* m, float left, float right, float bottom, float top, float near, float far);
-
--void rsMatrixLoadPerspective(rs_matrix4x4* m, float fovy, float aspect, float near, float far);
-
--void rsMatrixLoadRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
-
--void rsMatrixLoadScale(rs_matrix4x4* m, float x, float y, float z);
-
--void rsMatrixLoadTranslate(rs_matrix4x4* m, float x, float y, float z);
-
-float2 rsMatrixMultiply(rs_matrix2x2* m, float2 in);
-float3 rsMatrixMultiply(rs_matrix3x3* m, float2 in);
-float3 rsMatrixMultiply(rs_matrix3x3* m, float3 in);
-float4 rsMatrixMultiply(rs_matrix4x4* m, float2 in);
-float4 rsMatrixMultiply(rs_matrix4x4* m, float3 in);
-float4 rsMatrixMultiply(rs_matrix4x4* m, float4 in);
-void rsMatrixMultiply(rs_matrix2x2* m, rs_matrix2x2* rhs);
-void rsMatrixMultiply(rs_matrix3x3* m, rs_matrix3x3* rhs);
-void rsMatrixMultiply(rs_matrix4x4* m, rs_matrix4x4* rhs);
-
--void rsMatrixRotate(rs_matrix4x4* m, float rot, float x, float y, float z);
-
--void rsMatrixScale(rs_matrix4x4* m, float x, float y, float z);
-
-void rsMatrixSet(rs_matrix2x2* m, 0, 1, float v);
-void rsMatrixSet(rs_matrix3x3* m, 2, 0, float v);
-void rsMatrixSet(rs_matrix4x4* m, 1, 3, float v);
-
--void rsMatrixTranslate(rs_matrix4x4* m, float x, float y, float z);
-
--void rsMatrixTranspose(rs_matrix2x2* m);
--void rsMatrixTranspose(rs_matrix3x3* m);
--void rsMatrixTranspose(rs_matrix4x4* m);
-
-- quaternion functions
-
-void rsQuaternionAdd(rs_quaternion* q, rs_quaternion* rhs);
-
-void rsQuaternionConjugate(rs_quaternion* q);
-
-float rsQuaternionDot(rs_quaternion* q0, rs_quaternion* q1);
-
-void rsQuaternionGetMatrixUnit(rs_matrix4x4* m, rs_quaternion* q);
-
-void rsQuaternionLoadRotate(rs_quaternion* q, float rot, float x, float y, float z);
-
-void rsQuaternionLoadRotateUnit(rs_quaternion* q, float rot, float x, float y, float z);
-
-void rsQuaternionMultiply(rs_quaternion* q, rs_quaternion* rhs);
-void rsQuaternionMultiply(rs_quaternion* q, float scalar);
-
-void rsQuaternionNormalize(rs_quaternion* q);
-
-void rsQuaternionSet(rs_quaternion* q, rs_quaternion* rhs);
-void rsQuaternionSet(rs_quaternion* q, float w, float x, float y, float z);
-
-void rsQuaternionSlerp(rs_quaternion* q, rs_quaternion* q0, rs_quaternion* q1, float t);
-
-- allocation data access functions, this is a subset, since we don't have all types of allocations
-
--void rsAllocationCopy1DRange(allocation_1D_global, uint dstOff, uint dstMip, uint count, allocation_1D_global2, uint srcOff, uint srcMip);
-
--void rsAllocationCopy2DRange(allocation_2D_global, uint dstXoff, uint dstYoff, uint dstMip, rs_allocation_cubemap_face dstFace, uint width, uint height, allocation_2D_global2, uint srcXoff, uint srcYoff, uint srcMip, rs_allocation_cubemap_face srcFace);
-
-int2 rsAllocationVLoadX_int2(allocation_1D_global, 0);
-int2 rsAllocationVLoadX_int2(allocation_2D_global, 24, 25);
-int2 rsAllocationVLoadX_int2(allocation_3D_global, 0, 1, 0);
-int3 rsAllocationVLoadX_int3(allocation_1D_global, 1);
-int3 rsAllocationVLoadX_int3(allocation_2D_global, 27, 28);
-int3 rsAllocationVLoadX_int3(allocation_3D_global, 1, 0, 1);
-int4 rsAllocationVLoadX_int4(allocation_1D_global, 0);
-int4 rsAllocationVLoadX_int4(allocation_2D_global, 29, 30);
-int4 rsAllocationVLoadX_int4(allocation_3D_global, 0, 1, 0);
-
-void rsAllocationVStoreX_int2(allocation_1D_global, int2 val, 2);
-void rsAllocationVStoreX_int2(allocation_2D_global, int2 val, 6, 7);
-void rsAllocationVStoreX_int2(allocation_3D_global, int2 val, 0, 1, 0);
-void rsAllocationVStoreX_int3(allocation_1D_global, int3 val, 1);
-void rsAllocationVStoreX_int3(allocation_2D_global, int3 val, 12, 13);
-void rsAllocationVStoreX_int3(allocation_3D_global, int3 val, 1, 0, 1);
-void rsAllocationVStoreX_int4(allocation_1D_global, int4 val, 0);
-void rsAllocationVStoreX_int4(allocation_2D_global, int4 val, 18, 19);
-void rsAllocationVStoreX_int4(allocation_3D_global, int4 val, 0, 1, 0);
-
-void* rsGetElementAt(allocation_1D_global, 0);
-void* rsGetElementAt(allocation_2D_global, 20, 21);
-void* rsGetElementAt(allocation_3D_global, 1, 0, 1);
-int rsGetElementAt_int(allocation_1D_global, 1);
-int rsGetElementAt_int(allocation_2D_global, 22, 23);
-int rsGetElementAt_int(allocation_3D_global, 0, 1, 0);
-
-uchar rsGetElementAtYuv_uchar_U(allocation_YUV_2D_global, 0, 1);
-
-uchar rsGetElementAtYuv_uchar_V(allocation_YUV_2D_global, 2, 3);
-
-uchar rsGetElementAtYuv_uchar_Y(allocation_YUV_2D_global, 0, 1);
-
-float4 rsSample(allocation_1D_global, rs_sampler s, float location);
-float4 rsSample(allocation_1D_global, rs_sampler s, float location, float lod);
-float4 rsSample(allocation_1D_global, rs_sampler s, float2 location);
-float4 rsSample(allocation_1D_global, rs_sampler s, float2 location, float lod);
-
-void rsSetElementAt(allocation_1D_global, int* ptr, 2);
-void rsSetElementAt(allocation_2D_global, int* ptr, 24, 25);
-void rsSetElementAt_int(allocation_1D_global, int val, 0);
-void rsSetElementAt_int(allocation_2D_global, int val, 26, 27);
-void rsSetElementAt_int(allocation_3D_global, int val, 1, 0, 1);
-'''.splitlines())
-
-
-TYPE_MAP = {
-    'void' : '',
-    'char' : r'\((signed )?char\)',
-    'uchar' : r'\(uchar\)',
-    'short' : r'\(short\)',
-    'ushort' : r'\(ushort\)',
-    'int' : r'\(int\)',
-    'uint' : r'\(uint\)',
-    'long' : r'\((long )?long\)',
-    'ulong' : r'\(ulong\)',
-    'float' : r'\(float\)',
-    'float2' : r'\(float2\)',
-    'float3' : r'\(float3\)',
-    'float4' : r'\(float4\)'
-    }
-
-
-def _build_arg(token):
-    '''Given a C argument construct an lldb expression for the argument.
-
-    Given a token, which represents a single argument of a C function
-    declaration, construct an lldb expression for the argument.
-
-    Args:
-        token: A string representing a single argument to a function. This
-               can be either [type][name] (e.g. int arg) or [value] (e.g. 5).
-
-    Returns:
-        The string that is the lldb expression for that argument, e.g.
-        int_global or 5.
-    '''
-    if len(token.split()) == 1:
-        # This is a fixed constant. Just take that as output.
-        return token
-
-    result = token.split()[0]
-
-    # Remove the rs_ prefix, because our globals' names don't have them
-    if result[:3] == 'rs_':
-        result = result[3:]
-
-    # If the function expects a pointer, take the address of the global
-    if result[-1] == '*':
-        result = '&' + result
-        result = result[:-1]
-
-    result += '_global'
-    return result
-
-
-def build_expr(line):
-    '''Build an lldb expression given a function prototype.
-
-    Given a function declaration, this function will construct an lldb
-    expression to call it.
-
-    Args:
-        line: A string representing a function declaration.
-
-    Returns:
-        The string that is the lldb expression.
-    '''
-    tokens = re.findall(r"[^(),;]+", line)
-    assert len(tokens) > 0
-    ret_name = tokens[0].split()
-    ret = ret_name[0]
-    name = ret_name[1]
-    expr = 'expr {0}('.format(name)
-
-    first = True
-    for tok in tokens[1:]:
-        if not first:
-            expr += ', '
-        expr += _build_arg(tok)
-        first = False
-
-    expr += ')'
-    return ret, expr
diff --git a/tests/lldb/tests/harness/__init__.py b/tests/lldb/tests/harness/__init__.py
deleted file mode 100644
index 863ac22..0000000
--- a/tests/lldb/tests/harness/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''This module contains utility code used by the test suite'''
-
-from __future__ import absolute_import
-
-from . import exception
-from . import util_constants
-from .exception import TestSuiteException
-from .test_base import TestBase
-from .util_android import UtilAndroid
-from .util_bundle import UtilBundle
-from . import RS_funs
\ No newline at end of file
diff --git a/tests/lldb/tests/harness/assert_mixins.py b/tests/lldb/tests/harness/assert_mixins.py
deleted file mode 100644
index 94a9e22..0000000
--- a/tests/lldb/tests/harness/assert_mixins.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Set of mixins for asserting common RenderScript lldb interactions
-That should cut down boilerplate
-To use these assertions simply inherit from them along with your
-`TestBase`:
-
-    >>> class MyLLDBRenderScriptTest(TestBaseRemote, CoordinateAssertionsMixin):
-    >>>     pass
-
-This will give you access to the useful assertion methods related to Coordinates
-
-NOTE: These are strictly clean mixins for `TestBase`. All classes here should
-strictly inherit only from `object`
-"""
-
-
-class CoordinateAssertionsMixin(object):
-    def assert_coord_bp_set(
-            self, breakpoint_expr, x, y=None, z=None, kernel_type='kernel'
-        ):
-        '''
-        Assert that a breakpoint conditional on a given coordinate is confirmed
-        by the renderscript breakpoint resolver.
-        This does not assert test the breakpoint is hit, only registered.
-            breakpoint_expr: the expression (e.g. the name of a function, or a
-            file and line).
-            kernel_type: The breakpoint resolver to use:
-                (reduction|kernel|scriptgroup)
-                default='kernel'
-            x: x coordinate: required
-            y, z: optional y, and z coordinates
-        '''
-
-        y = 0 if z is not None and y is None else y
-        coord_text = ','.join(map(str, filter(lambda p: p is not None, (x, y, z))))
-        self.try_command(
-            'language renderscript %s breakpoint set %s -c %s' % (
-                kernel_type, breakpoint_expr, coord_text
-            ),
-            [r'Breakpoint(s) created'],
-            expected_regex=[
-                r'Conditional kernel breakpoint on coordinate.+%d,\s*%d,\s*%d' % (
-                    x or 0, y or 0, z or 0
-                )
-            ]
-        )
-
-    def assert_coord_stop(
-            self, soname, func_name, x, y=None, z=None, stopped=True
-        ):
-        '''Run lldb commands to check that coordinates match expected values.
-
-        Args:
-            (x, y, z): The expected coordinates.
-            soname: The name of the renderscript script module e.g. 'allocs'
-            for librs.allocs.so
-            func_name: String that is the name of the kernel function
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-
-        if stopped:
-            self.try_command(
-                'process continue',
-                expected_regex=[
-                    r'resuming',
-                    r'Process \d+ stopped',
-                    r'stop reason = breakpoint',
-                    r'frame #0: (0x[0-9a-fA-F]+ )?librs.%s.so`%s' % (
-                        soname, func_name)
-                ]
-            )
-        else:
-            self.try_command(
-                'bt',
-                expected_regex=[
-                    'stop reason = breakpoint',
-                    'frame #0:',
-                    'librs.*\.so`%s' % kernel
-                ]
-            )
-
-        self.try_command(
-            'language renderscript kernel coordinate',
-            '(%d, %d, %d)' % (x, y or 0, z or 0)
-        )
diff --git a/tests/lldb/tests/harness/decorators.py b/tests/lldb/tests/harness/decorators.py
deleted file mode 100644
index e4a49b3..0000000
--- a/tests/lldb/tests/harness/decorators.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function, absolute_import
-
-import functools
-import warnings
-import inspect
-
-
-class skip_conditional(object):
-    '''
-    Test method decorator that marks a test method as ignorable if the given
-    arguments evaluate as Truthy. If the argument is callable, then it is called
-    and the return value is used as the predicate.
-
-    >>> class MyTestClass(TestBase):
-    ...     def test_something(self):
-    ...         pass
-    ...
-    ...     @skip_conditional(not sys.platform.startswith("linux"))
-    ...     def test_some_linux_behaviour(self):
-    ...         assert "vmlinuz" in open("/proc/cmdline").read()
-    ...
-    ...     @skip_conditional(lambda : True):
-    ...     def test_that_never_runs(self):
-    ...         pass
-    '''
-    def __init__(self, skip_condition, message="skipped"):
-        self._skip_condition = skip_condition
-        self._message = message
-
-    def __call__(self, func):
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            skip_condition = self._skip_condition
-            if callable(skip_condition):
-                # args[0] is ``self``
-                skip_condition = skip_condition(args[0])
-
-            if skip_condition:
-                print("skipping %r - %s" % (func, self._message))
-                return True
-            return func(args[0])
-
-        return inner
-
-
-class skip_test(skip_conditional):
-    ''''
-    Unconditionally skip a test
-    '''
-    def __init__(self, skip_condition, *args, **kwargs):
-        super(skip_test, self).__init__(True, *args, **kwargs)
-
-
-java_only_test = lambda: skip_conditional(lambda self: not self.app_type == 'java')
-
-cpp_only_test = lambda: skip_conditional(lambda self: not self.app_type == 'cpp')
-
-jni_only_test = lambda: skip_conditional(lambda self: not self.app_type == 'jni')
-
-
-def wimpy(func):
-    '''
-    Mark a test as 'wimpy' that is - a function specifically known to be quick-running.
-    This implementation simply adds the `.wimpy` attribute to the decorated function
-    and returns it, otherwise unmodified
-    '''
-    func.wimpy = True
-
-    return func
-
-
-class ordered_test(object):
-    '''Set the ordered attribute on function'''
-    def __init__(self, order):
-        self._order = order
-
-    def __call__(self, func):
-        func.test_order = self._order
-        return func
-
-
-class deprecated(object):
-    """
-    method or function decorator used to warn of pending feature removal:
-
-    >>> @deprecated()
-    ... def myfunc():
-    ...     return 'hello'
-    ...
-    >>> myfunc()
-        DeprecationWarning: `__main__.myfunc()` is deprecated and will be removed soon.
-    'hello'
-    >>> class MyClass(object):
-    ... @deprecated(alternative_feature='print')
-    ... def myprint(self, *args, **kwargs):
-    ...     print(*args, **kwargs)
-    ...
-    >>> obj = MyClass()
-    >>> obj.myprint("hello")
-    DeprecationWarning: `__main__.MyClass.myfunc()` is deprecated and will be removed soon. Use 'print' instead.
-    hello
-    """
-
-    def __init__(
-            self,
-            alternative_feature=None,
-            removal_date='soon',
-            exception=UserWarning
-        ):
-        self.alternative_feature_message = (
-            alternative_feature and 'use %r instead' % alternative_feature or ''
-        )
-        self.exception = exception
-        self.removal_date = removal_date
-
-    def __call__(self, func):
-        class_name = ''
-        if getattr(func, 'im_class', None):
-            class_name = '%s.' % func.im_class.__name__
-
-        if getattr(func, 'im_func', None):
-            func_name = func.im_func.func_name
-        else:
-            func_name = func.func_name
-
-        module_name = getattr(func, '__module__')
-
-        warning = "`%s.%s%s()` is deprecated and will be removed %s. %s" % (
-            module_name,
-            class_name,
-            func_name,
-            self.removal_date,
-            self.alternative_feature_message
-        )
-
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            if not getattr(func, 'deprecation_warned', False):
-                warnings.warn(warning, self.exception, 2)
-                func.deprecation_warned = True
-            return func(*args, **kwargs)
-
-        return inner
diff --git a/tests/lldb/tests/harness/exception.py b/tests/lldb/tests/harness/exception.py
deleted file mode 100644
index 2cfc99f..0000000
--- a/tests/lldb/tests/harness/exception.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains TestSuiteException.'''
-
-from __future__ import absolute_import
-
-class TestSuiteException(Exception):
-    '''Exception that is thrown whenever an internal error is encountered.
-
-    Just contains a message.
-    '''
-    pass
-
-class DisconnectedException(Exception):
-    '''Exception that is thrown if lldb-server unexpectedly disconnected.
-
-    Just contains a message.
-    '''
-    pass
-
-
-class FailFastException(TestSuiteException):
-    '''Quick Bailout'''
-    pass
-
-
-class TestIgnoredException(TestSuiteException):
-    '''Raised when a testcase is ignored.'''
-    pass
diff --git a/tests/lldb/tests/harness/test_base.py b/tests/lldb/tests/harness/test_base.py
deleted file mode 100644
index 9902989..0000000
--- a/tests/lldb/tests/harness/test_base.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains TestBase, the base class of all tests.'''
-
-from __future__ import absolute_import
-
-import logging
-import os
-import re
-import tempfile
-import inspect
-import traceback
-
-from .exception import DisconnectedException, TestSuiteException
-
-from . import util_log
-
-
-class TestBase(object):
-    '''Base class for all tests. Provides some common functionality.'''
-
-    bundle_target = {}
-
-    class TestFail(Exception):
-        '''Exception that is thrown when a line in a test fails.
-
-        This exception is thrown if a lldb command does not return the expected
-        string.
-        '''
-        pass
-
-    def __init__(self, device_port, device, timer, app_type, wimpy=False, **kwargs):
-        # Keep argument names for documentation purposes. This method is
-        # overwritten by test_base_remote.
-        # pylint: disable=unused-argument
-        self._lldb = None # handle to the lldb module
-        self._ci = None # instance of the lldb command interpreter for this test
-        self._timer = timer # timer instance, to check whether the test froze
-        self.app_type = app_type # The type of bundle that is being executed
-        self.wimpy = wimpy
-
-    def setup(self, android):
-        '''Set up environment for the test.
-
-        Override to specify commands to be run before the test APK launch.
-        Useful for setting Android properties or environment variables. See also
-        the teardown method.
-
-        Args:
-            android: Handler to the android device, see the UtilAndroid class.
-        '''
-        pass
-
-    def teardown(self, android):
-        '''Clean up environment after test.
-
-        Override this procedure to specify commands to be run after the test has
-        finished. This method is run regardless the outcome of the test.
-
-        Args:
-            android: Handler to the android device, see the UtilAndroid class.
-        '''
-        pass
-
-    def run(self, dbg, remote_pid, lldb):
-        '''Execute the actual test suite.
-
-        Args:
-            dbg: The instance of the SBDebugger that is used to test commands.
-            remote_pid: The integer that is the process id of the binary that
-                        the debugger is attached to.
-            lldb: A handle to the lldb module.
-
-        Returns:
-            A list of (test, failure) tuples.
-        '''
-        log = util_log.get_logger()
-
-        def predicate(obj):
-            '''check whether we're interested in the function'''
-            if not callable(obj):
-                return False
-            if self.wimpy and not getattr(obj, 'wimpy', False):
-                log.debug("skipping non-wimpy test in wimpy mode:%r", obj)
-                return False
-            return True
-
-        test_methods = [
-            method for name, method in inspect.getmembers(self, predicate)
-            if name.startswith('test_')
-        ]
-        log.debug("Found the following tests %r", test_methods)
-        test_errors = []
-
-        for test in sorted(
-            test_methods,
-            key=lambda item: getattr(item, 'test_order', float('Inf'))
-        ):
-            try:
-                log.info("running test %r", test.__name__)
-                result = test()
-            except (self.TestFail, TestSuiteException) as e:
-                test_errors.append((method, e))
-
-        return test_errors
-
-    def post_run(self):
-        '''Clean up after test execution.'''
-        pass
-
-    def assert_true(self, cond):
-        '''Check a given condition and raise TestFail if it is False.
-
-        Args:
-            cond: The boolean condition to check.
-
-        Raises:
-            TestFail: The condition was false.
-        '''
-        if not cond:
-            raise self.TestFail()
-
-    def assert_lang_renderscript(self):
-        '''Check that LLDB is stopped in a RenderScript frame
-
-        Use the LLDB API to check that the language of the current frame
-        is RenderScript, fail otherwise.
-
-        Raises:
-            TestFail: Detected language not RenderScript.
-        '''
-        assert self._lldb
-        assert self._ci
-
-        proc = self._ci.GetProcess()
-        frame = proc.GetSelectedThread().GetSelectedFrame()
-        lang = frame.GetCompileUnit().GetLanguage()
-
-        if lang != self._lldb.eLanguageTypeExtRenderScript:
-            raise self.TestFail('Frame language not RenderScript, instead {0}'
-                                .format(lang))
-
-    def do_command(self, cmd):
-        '''Run an lldb command and return the output.
-
-        Args:
-            cmd: The string representing the lldb command to run.
-
-        Raises:
-            TestFail: The lldb command failed.
-        '''
-        assert self._lldb
-        assert self._ci
-
-        log = util_log.get_logger()
-        res = self._lldb.SBCommandReturnObject()
-
-        log.info('[Command] {0}'.format(cmd))
-
-        # before issuing the command, restart the current timer to check
-        # whether the command is going to freeze the test
-        if self._timer:
-            self._timer.reset()
-
-        self._ci.HandleCommand(cmd, res)
-
-        if not res.Succeeded():
-            error = res.GetError()
-            error = error if error else res.GetOutput()
-            raise self.TestFail('The command "{0}" failed with the error: {1}'
-                                .format(cmd, error if error else '<N/a>'))
-
-        output = res.GetOutput() or ''
-        log.debug('[Output] {0}'.format(output.rstrip()))
-
-        return output
-
-    def try_command(self, cmd, expected=None, expected_regex=None):
-        '''Run an lldb command and match the expected response.
-
-        Args:
-            cmd: The string representing the lldb command to run.
-            expected: A list of strings that should be present in lldb's
-                      output.
-            expected_regex: A list of regular expressions that should
-                            match lldb's output.
-
-        Raises:
-            TestFail: One of the expected strings were not found in the lldb
-            output.
-
-        Returns:
-            str: raw lldb command output.
-        '''
-        assert self._lldb
-        assert self._ci
-        log = util_log.get_logger()
-        output = ''
-        try:
-            output = self.do_command(cmd)
-
-            if 'lost connection' in output:
-                raise DisconnectedException('Lost connection to lldb-server.')
-
-            # check the expected strings
-            if expected:
-                self._match_literals(output, expected)
-
-            # check the regexp patterns
-            if expected_regex:
-                self._match_regexp_patterns(output, expected_regex)
-
-        except self.TestFail as exception:
-            # if the command failed, ensure the output retrieved from the
-            # command is printed even in verbose mode
-            if log.getEffectiveLevel() > logging.DEBUG:
-                log.error('[Output] {0}'.format(output.rstrip() if output
-                                                else '<empty>'))
-
-            # print the back trace, it should help to identify the error in
-            # the test
-            backtrace = ['[Back trace]']
-            for (filename, line, function, text) in \
-                    traceback.extract_stack()[:-1]:
-                backtrace.append('  [{0} line: {2} fn: {1}] {3}'.format(
-                            filename, function, line, text
-                    )
-                )
-            log.error('\n'.join(backtrace))
-            log.error('[TEST ERROR] {0}'.format(exception.message))
-            raise  # pass through
-
-        return output
-
-    def _match_literals(self, text, literals):
-        '''Checks the text against the array of literals.
-
-        Raises a TestFail exception in case one of the literals is not contained
-        in the text.
-
-        Args:
-            text: String, it represents the text to match.
-            literals: an array of string literals to match in the output.
-
-        Throws: self.TestFail: if it cannot match one of the literals in
-                the output.
-        '''
-        for string in literals:
-            if string not in text:
-                raise self.TestFail('Cannot find "{0}" in the output'
-                                    .format(string))
-
-    def _match_regexp_patterns(self, text, patterns):
-        '''Checks the text against the array of regular expression patterns.
-
-        Raises a TestFail exception in case one of the patterns is not matched
-        in the given text.
-
-        Args:
-            text: String, it represents the text to match.
-            patterns: an array of strings, each of them representing a regular
-                      expression to match in text.
-
-        Throws: self.TestFail: if it cannot match one of the literals in
-                the output.
-        '''
-        log = util_log.get_logger()
-
-        for regex in patterns:
-            match = re.search(regex, text)
-            if not match:
-                raise self.TestFail('Cannot match the regexp "{0}" in '
-                                    'the output'.format(regex))
-            else:
-                msg = 'Found match to regex {0}: {1}'.format(regex,
-                                     match.group())
-                log.debug(msg)
-
-    @staticmethod
-    def get_tmp_file_path():
-        '''Get the path of a temporary file that is then deleted.
-
-        Returns:
-            A string that is the path to a temporary file.
-        '''
-        file_desc, name = tempfile.mkstemp()
-        os.close(file_desc)
-        os.remove(name)
-        return name
-
-
-class TestBaseNoTargetProcess(TestBase):
-    '''lldb target that doesn't require a binary to be running.'''
-
-    def get_bundle_target(self):
-        '''Get bundle executable to run.
-
-        Returns: None
-        '''
-        return None
-
-    @property
-    def bundle_target(self):
-        return self.get_bundle_target()
-
-    def run(self, dbg, remote_pid, lldb):
-        '''Execute the test case.
-
-        Args:
-            dbg: The instance of the SBDebugger that is used to test commands.
-            lldb: A handle to the lldb module.
-
-        Returns:
-            True: test passed, False: test failed.
-        '''
-        self._lldb = lldb
-        self._dbg = dbg
-        self._ci = dbg.GetCommandInterpreter()
-        assert self._ci.IsValid()
-        return super(TestBaseNoTargetProcess, self).run(self, dbg, remote_pid)
diff --git a/tests/lldb/tests/harness/test_base_remote.py b/tests/lldb/tests/harness/test_base_remote.py
deleted file mode 100644
index c430edf..0000000
--- a/tests/lldb/tests/harness/test_base_remote.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the base class TestBaseRemote'''
-
-from __future__ import absolute_import
-
-import os
-import re
-
-from .test_base import TestBase
-from . import util_log
-
-
-class TestBaseRemote(TestBase):
-    '''Base class for all tests that connect to a remote device.
-
-    Provides common functionality to set up the connection and tear it down.
-    '''
-
-    def __init__(self, device_port, device, timer, *args, **kwargs):
-        super(TestBaseRemote, self).__init__(device_port, device, timer, *args, **kwargs)
-        # port used by lldb-server on the device.
-        self._device_port = device_port
-        self._platform = None
-        # id of the device that adb will communicate with.
-        self._device = device
-
-    def set_src_map(self, file_name, new_src_path):
-        '''Call lldb to set the source mapping of a given file.
-
-        Set lldb's source mapping of a given file to a given path. This can be
-        used to make the test suite independent of where an APK was compiled.
-
-        Args:
-            file_name: String, which is the name of the file whose mapping is
-                to be changed
-            new_src_path: String which is the new absolute path to the source
-                file.
-        '''
-        line_table = self.do_command('target modules dump line-table '
-                                     + file_name)
-
-        lines = line_table.split('\n')
-        if 'Line table for' not in lines[0]:
-            raise self.TestFail('Could not determine source path of '
-                                + file_name)
-
-        # Expecting output like:
-        # (lldb) target modules dump line-table scalars.rs
-        # Line table for /home/jenkins/workspace/grd-aosp-parameterised-build/
-        # merge_151216/frameworks/rs/tests/lldb/java/BranchingFunCalls/src/rs/
-        # frameworks/rs/tests/lldb/java/BranchingFunCalls/src/rs/scalars.rs in
-        # `librs.scalars.so
-        # 0xb30f2374: /home/jenkins/workspace/grd-aosp-parameterised-build/
-        # merge_151216/frameworks/rs/tests/lldb/java/BranchingFunCalls/src/rs/
-        # scalars.rs:46
-        # ...
-        # For some reason the first line contains a mangled path?
-        old_path = re.findall(r"[^ :]+", lines[1])[1]
-        old_dir = os.path.dirname(old_path)
-
-        self.try_command('settings set target.source-map %s %s'
-                         % (old_dir, new_src_path), [''])
-
-    def post_run(self):
-        '''Clean up after execution.'''
-        if self._platform:
-            self._platform.DisconnectRemote()
-
-    def _connect_to_platform(self, lldb_module, dbg, remote_pid):
-        '''Connect to an lldb platform that has been started elsewhere.
-
-        Args:
-            lldb_module: A handle to the lldb module.
-            dbg: The instance of the SBDebugger that should connect to the
-                 server.
-            remote_pid: The integer that is the process id of the binary that
-                        the debugger should attach to.
-
-        Returns:
-            True if the debugger successfully attached to the server and
-            process.
-        '''
-        # pylint: disable=too-many-return-statements
-        remote_pid = str(remote_pid)
-
-        log = util_log.get_logger()
-
-        err1 = dbg.SetCurrentPlatform('remote-android')
-        if err1.Fail():
-            log.fatal(err1.GetCString())
-            return False
-
-        self._platform = dbg.GetSelectedPlatform()
-        if not self._platform:
-            return False
-
-        connect_string = \
-            'adb://{0}:{1}'.format(self._device, self._device_port)
-        opts = lldb_module.SBPlatformConnectOptions(connect_string)
-
-        for _ in range(2):
-            err2 = self._platform.ConnectRemote(opts)
-            if err2.Fail():
-                log.error(err2.GetCString())
-
-                if 'Connection refused' in err2.GetCString():
-                    log.warning('Connection to lldb server was refused. '
-                                'Trying again.')
-                else:
-                    # Unknown error. Don't try again.
-                    return False
-            else:
-                # Success
-                break
-        else:
-            log.fatal('Not trying again, maximum retries exceeded.')
-            return False
-
-        target = dbg.CreateTarget(None)
-        if not target:
-            return False
-
-        dbg.SetSelectedTarget(target)
-        listener = lldb_module.SBListener()
-        err3 = lldb_module.SBError()
-        process = target.AttachToProcessWithID(listener, int(remote_pid), err3)
-        if err3.Fail() or not process:
-            log.fatal(err3.GetCString())
-            return False
-
-        return True
-
-    def run(self, dbg, remote_pid, lldb):
-        '''Execute the actual testsuite.
-
-        Args:
-            dbg: The instance of the SBDebugger that is used to test commands.
-            remote_pid: The integer that is the process id of the binary that
-                        the debugger is attached to.
-            lldb: A handle to the lldb module.
-
-        Returns: list of (test, failure) tuples.
-
-        '''
-        assert dbg
-        assert remote_pid
-        assert lldb
-
-        self._lldb = lldb
-
-        self.assert_true(self._connect_to_platform(lldb, dbg, remote_pid))
-        self._ci = dbg.GetCommandInterpreter()
-        assert self._ci
-
-        self.assert_true(self._ci.IsValid())
-        self.assert_true(self._ci.HasCommands())
-
-        return super(TestBaseRemote, self).run(dbg, remote_pid, lldb)
-
diff --git a/tests/lldb/tests/harness/util_android.py b/tests/lldb/tests/harness/util_android.py
deleted file mode 100644
index a0cf700..0000000
--- a/tests/lldb/tests/harness/util_android.py
+++ /dev/null
@@ -1,736 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the class UtilAndroid, providing utility method to
-interface with Android ADB.'''
-
-from __future__ import absolute_import
-
-import logging
-import re
-import subprocess
-import time
-import collections
-import multiprocessing
-try:
-    # Python 3
-    import queue
-except ImportError:
-    import Queue as queue
-
-from .exception import TestSuiteException
-from . import util_log
-
-
-class UtilAndroid(object):
-    '''Provides some utility methods that interface with Android using adb.'''
-    # pylint: disable=too-many-public-methods
-
-    def __init__(self, adb_path, lldb_server_path_device, device):
-        # The path to the adb binary on the local machine
-        self._path_adb = adb_path
-        # The path to the lldb server binary on the device
-        self._path_lldbserver = lldb_server_path_device
-        self._log = util_log.get_logger()
-        self.device = device
-        self._prop_stacks = collections.defaultdict(list)
-        return
-
-    @staticmethod
-    def _validate_string(string):
-        '''Check that a string is valid and not empty.
-
-        Args:
-            string: The string to be checked.
-        '''
-        assert isinstance(string, str)
-        assert len(string) > 0
-
-    def adb(self, args, async=False, device=True, timeout=None):
-        '''Run an adb command (async optional).
-
-        Args:
-            args: The command (including arguments) to run in adb.
-            async: Boolean to specify whether adb should run the command
-                   asynchronously.
-            device: boolean to specify whether the serial id of the android
-                    device should be inserted in the adb command.
-            timeout: it specifies the number of seconds to wait for
-                     a synchronous invocation before aborting. If unspecified or
-                     None it waits indefinitely for the command to complete.
-
-        Raises:
-            ValueError: it can be caused by any of the following situations:
-                        - when both the combination async=True and timeout are
-                          given.
-                        - when a timeout <= 0 is specified.
-
-        Returns:
-            If adb was synchronously run and the command completed by the
-            specified timeout, a string which is the output (standard out and
-            error) from adb. Otherwise it returns None.
-        '''
-
-        # Form the command
-        if device:
-            cmd = '{0} -s {1} {2}'.format(self._path_adb, self.device, args)
-        else:
-            cmd = '{0} {1}'.format(self._path_adb, args)
-
-        self._log.debug('Execute ADB: %s', cmd)
-
-        if timeout is None:
-            # local invocation
-            return_code, output = UtilAndroid._execute_command_local(cmd, async)
-
-        else:
-            # remote invocation
-            if async:
-                raise ValueError('Invalid combination: asynchronous invocation '
-                                 'with timeout specified')
-
-            return_code, output = UtilAndroid._execute_command_remote(cmd,
-                                                                      timeout)
-
-            if return_code is None:
-                self._log.warn('[ADB] The command timed out: %s', cmd)
-
-        # log the output message
-        if output is not None:
-            self._adb_log_output(cmd, output, return_code)
-
-        return output
-
-    def adb_retry(self, args, max_num_attempts, timeout):
-        '''Attempt to execute the given adb command a certain number of times.
-
-        The function executes the given command through adb, waiting for its
-        completion up to 'timeout' seconds. If the command completes then it
-        returns its output. Otherwise it aborts the execution of the adb
-        command and re-issues it anew with the same parameters. In case of
-        timeout this process is repeated up to 'max_num_attempts'.
-
-        The purpose of this function is to handle the cases when, for some
-        reason, a command sent to 'adb' freezes, blocking the whole test suite
-        indefinitely.
-
-        Args:
-            args: The command (including arguments) to run in adb.
-            max_num_attempts: the max number of attempts to repeat the command
-                              in case of timeout.
-            timeout: it specifies the number of seconds to wait for the adb
-                     command to complete.
-
-        Raises:
-            ValueError: when the parameter timeout is invalid (None or <= 0).
-
-        Returns:
-            If adb was synchronously run and the command completes by the
-            specified timeout, a string which is the output (standard out and
-            error) from adb. Otherwise it returns None.
-        '''
-        if timeout is None or timeout <= 0:
-            raise ValueError('Invalid value for timeout')
-
-        output = None
-
-        for attempt in range(max_num_attempts):
-            self._log.debug('[ADB] Attempt #%d: %s', attempt + 1, args)
-            output = self.adb(args, False, True, timeout)
-            if output:
-                break
-
-        return output
-
-    def _adb_log_output(self, cmd, output, return_code):
-        '''Save in the log the command & output from `adb`.
-
-        Internal function, helper to record in the log the issued adb command
-        together with its output and return code.
-
-        Params:
-            cmd: string, the command issued to `adb`.
-            output: string, the output retrieved from `adb`.
-            return_code: int, the return code from `adb`.
-        '''
-
-        message = output.strip()
-
-        # if return_code != 0, we wish to also record the command executed
-        # (which occurs if and only if we are in verbose mode)
-        is_warning = return_code != 0
-        threshold = self._log.getEffectiveLevel()
-        if is_warning and threshold > logging.DEBUG:
-            self._log.warn("[ADB] Command executed: {0}".format(cmd))
-
-        level = logging.WARNING if is_warning else logging.DEBUG
-        if message:
-            # if message is composed by multiple lines, then print it after
-            # the log preamble
-            if re.search('\n', message):
-                message = '\n' + message
-        else:
-            message = '<empty>'
-
-        self._log.log(level, 'RC: {0}, Output: {1}'.format(return_code,
-                                                           message))
-
-    def check_adb_alive(self):
-        '''Ping the device and raise an exception in case of timeout.
-
-        It sends a ping message through 'adb shell'. The emulator/device should
-        echo the same message back by one minute. If it does not, it raises
-        a TestSuiteException.
-
-        Purpose of this method is to check whether 'adb' became frozen or
-        stuck.
-
-        Raises:
-            TestSuiteException: in case the device/emulator does not reply by
-                                one minute or the `ping' message is not echoed
-                                back.
-        '''
-        token = 'PING'
-        log = util_log.get_logger()
-        cmd = "echo {0}".format(token)
-
-        tries = 10
-        try_number = tries
-        while try_number > 0:
-            log.debug('Sending a ping through "adb shell" (try #%s)...',
-                      try_number)
-            output = self.shell(cmd, False, 60)
-
-            if output is None:
-                raise TestSuiteException(
-                    'Timeout when pinging the device/emulator through '
-                    '"adb shell".  Is "adb" stuck or dead?')
-            elif token not in output:
-                log.debug('Ping failed. Cannot match the token "%s" in "adb '
-                          'shell %s"', token, cmd)
-            else:
-                log.debug('Pong message received')
-                return
-
-            try_number -= 1
-            time.sleep(5)
-
-        raise TestSuiteException('Cannot ping the device/emulator through '
-                                 '"adb shell". Tried %s times. Is "adb" stuck '
-                                 'or dead?' % tries)
-
-    def shell(self, cmd, async=False, timeout=None):
-        '''Run a command via the adb shell.
-
-        Args:
-            cmd: The command (including arguments) to run in the adb shell.
-            async: Boolean to specify whether adb should run the command
-                   asynchronously.
-            timeout: it specifies the number of seconds to wait for
-                     a synchronous invocation before aborting. If unspecified or
-                     None it waits indefinitely for the command to complete
-
-        Returns:
-            If adb was synchronously run, a string which is the output (standard
-            out and error) from adb. Otherwise None.
-        '''
-        return self.adb('shell "{0}"'.format(cmd), async, True, timeout)
-
-    def find_app_pid(self, process_name):
-        '''Find the process ID of a process with a given name.
-
-        If more than one instance of the process is running return the first pid
-        it finds.
-
-        Args:
-            process_name: A string representing the name of the package or
-                          binary for which the id should be found. I.e. the
-                          string or part of the string that shows up in the "ps"
-                          command.
-
-        Returns:
-            An integer representing the id of the process, or None if it was not
-            found.
-        '''
-        self._validate_string(process_name)
-
-        pid_output = self.shell('pidof ' + process_name)
-        pid_output = re.sub(r'\*.+\*', '', pid_output)
-        pids = pid_output.split()
-
-        if len(pids) < 1:
-            self._log.warn('Unable to find pid of: {0}'.format(process_name))
-            return None
-
-        if len(pids) > 1:
-            self._log.warn('Found multiple instances of {0} running: {1}'
-                           .format(process_name, pids))
-
-        try:
-            pid = int(pids[0])
-            self._log.info('App pid found: {0}'.format(pids[0]))
-            return pid
-        except ValueError:
-            return None
-
-    def adb_root(self):
-        '''Set adb to be in root mode.'''
-        self.adb('root')
-
-    def _adb_remount(self):
-        '''Remount the filesystem of the device.'''
-        self.adb('remount')
-
-    def validate_adb(self):
-        '''Validate adb that it can be run.
-
-        Raises:
-            TestSuiteException: Unable to validate that adb exists and runs
-                                successfully.
-        '''
-        out = self.adb('version', False, False)
-        if out and 'Android' in out and 'version' in out:
-            self._log.info('adb found: {0}'.format(out))
-            return None
-        raise TestSuiteException('unable to validate adb')
-
-    def is_booted(self):
-        ''' Check if the device/emulator has finished booting.
-
-        Returns: True if the property sys.boot_completed is true, False
-                 otherwise.
-        '''
-        return self._get_prop('sys.boot_completed').strip() == '1'
-
-    def validate_device(self, check_boot=True, device_substring=''):
-        '''Validate that there is at least one device.
-
-        Args:
-            check_boot: Boolean to specify whether to check whether the device
-                        has finished booting as well as being present.
-            device_substring: String that needs to be part of the name of the
-                              device.
-
-        Raises:
-            TestSuiteException: There was a failure to run adb to list the
-                                devices or there is no device connected or
-                                multiple devices connected without the user
-                                having specified the device to use.
-        '''
-
-        out = self.adb('devices', False, False)
-        if not 'List of devices attached' in out:
-            raise TestSuiteException('Unable to list devices')
-
-        lines = out.split('\n')
-        found_device = False # True if the specified device is found
-        devices = []
-
-        for line in lines[1:]:
-            if '\tdevice' in line and device_substring in line:
-                device = line.split()[0]
-                devices.append(device)
-                if self.device:
-                    if self.device == device:
-                        found_device = True
-
-        if len(devices) == 0:
-            raise TestSuiteException('adb is unable to find a connected '
-                                     'device/emulator to test.')
-
-        if not self.device:
-            if len(devices) == 1:
-                self.device = devices[0]
-            else:
-                raise TestSuiteException('Multiple devices connected,'
-                                         'specify -d device id.')
-        else:
-            if not found_device:
-                raise TestSuiteException('Couldn\'t find the device {0} that '
-                                         'was specified, please check -d '
-                                         'argument'.format(self.device))
-
-        if check_boot and not self.is_booted():
-            raise TestSuiteException(
-                'The device {0} has not yet finished booting.'
-                .format(self.device))
-
-    def device_with_substring_exists(self, device_substring):
-        '''Check whether a device exists whose name contains a given string.
-
-        Args:
-            device_substring: String that is part of the name of the device to
-                              look for.
-
-        Raises:
-            TestSuiteException: There was a failure to run adb to list the
-                                devices.
-        '''
-        out = self.adb('devices', False, False)
-        if not 'List of devices attached' in out:
-            raise TestSuiteException('Unable to list devices')
-
-        lines = out.split('\n')
-
-        for line in lines[1:]:
-            if '\tdevice' in line:
-                device = line.split()[0]
-                if device.find(device_substring) != -1:
-                    return True
-
-        return False
-
-    def get_device_id(self):
-        '''Return ID of the device that will be used for running the tests on.
-
-        Returns:
-            String representing device ID.
-        '''
-        return self.device
-
-    def _kill_pid(self, pid):
-        '''Kill a process identified by its pid by issuing a "kill" command.
-
-        Args:
-            pid: The integer that is the process id of the process to be killed.
-        '''
-        self.shell('kill -9 ' + str(pid))
-
-    def stop_app(self, package_name):
-        '''Terminate an app by calling am force-stop.
-
-        Args:
-            package_name: The string representing the name of the package of the
-                          app that is to be stopped.
-        '''
-        self._validate_string(package_name)
-        self.shell('am force-stop ' + package_name)
-
-    def kill_process(self, name):
-        '''Kill a process identified by its name (package name in case of apk).
-
-        Issues the "kill" command.
-
-        Args:
-            name: The string representing the name of the binary of the process
-                  that is to be killed.
-
-        Returns:
-            True if the kill command was executed, False if it could not be
-            found.
-        '''
-        pid = self.find_app_pid(name)
-        if pid:
-            self._kill_pid(pid)
-            return True
-        return False
-
-    def kill_all_processes(self, name):
-        '''Repeatedly try to call "kill" on a process to ensure it is gone.
-
-        If the process is still there after 5 attempts reboot the device.
-
-        Args:
-            name: The string representing the name of the binary of the process
-                  that is to be killed.
-
-        Raises:
-            TestSuiteException: If the process could not be killed after 5
-                                attempts and the device then failed to boot
-                                after rebooting.
-        '''
-
-        # try 5 times to kill this process
-        for _ in range(1, 5):
-            if not self.kill_process(name):
-                return
-        # stalled process must reboot
-        self._reboot_device()
-
-    def kill_servers(self):
-        '''Kill all gdbserver and lldb-server instances.
-
-        Raises:
-            TestSuiteException: If gdbserver or lldb-server could not be killed
-                                after 5 attempts and the device then failed to
-                                boot after rebooting.
-        '''
-        self.kill_all_processes('gdbserver')
-        self.kill_all_processes('lldb-server')
-
-    def launch_elf(self, binary_name):
-        '''Launch a binary (compiled with the NDK).
-
-        Args:
-            binary_name: The string representing the name of the binary that is
-                         to be launched.
-
-        Returns:
-            Boolean, failure if the app is not installed, success otherwise.
-        '''
-        # Ensure the apk is actually installed.
-        output = self.shell('ls /data/ | grep ' + binary_name)
-        if binary_name not in output:
-            return False
-
-        stdout = self.shell('exec /data/' + binary_name, True)
-        self._log.info(str(stdout))
-
-        return True
-
-    def wait_for_device(self):
-        '''Ask ADB to wait for a device to become ready.'''
-        self.adb('wait-for-device')
-
-    def _reboot_device(self):
-        '''Reboot the remote device.
-
-        Raises:
-            TestSuiteException: If the device failed to boot after rebooting.
-        '''
-        self.adb('reboot')
-        self.wait_for_device()
-        # Allow 20  mins boot time to give emulators such as MIPS enough time
-        sleeping_countdown = 60*20
-        while not self.is_booted():
-            time.sleep(1)
-            sleeping_countdown -= 1
-            if sleeping_countdown == 0:
-                raise TestSuiteException('Failed to reboot. Terminating.')
-
-        self.adb_root()
-        self.wait_for_device()
-        self._adb_remount()
-        self.wait_for_device()
-
-    def launch_app(self, name, activity):
-        '''Launch a Renderscript application.
-
-        Args:
-            name: The string representing the name of the app that is to be
-                  launched.
-            activity: The string representing the activity of the app that is to
-                      be started.
-
-        Returns:
-            Boolean, failure if the apk is not installed, success otherwise.
-        '''
-        assert name and activity
-
-        # Ensure the apk is actually installed.
-        output = self.shell('pm list packages ' + name)
-        if not output:
-            return False
-
-        cmd = 'am start -S -W {0}/{0}.{1}'.format(name, activity)
-        stdout = self.shell(cmd)
-
-        self._log.info(str(stdout))
-
-        return True
-
-    def launch_lldb_platform(self, port):
-        '''Launch lldb server and attach to target app.
-
-        Args:
-            port: The integer that is the port on which lldb should listen.
-        '''
-        cmd = "export LLDB_DEBUGSERVER_PATH='{0}';{0} p --listen *:{1}"\
-            .format(self._path_lldbserver, port)
-        self.shell(cmd, True)
-        time.sleep(5)
-
-    def forward_port(self, local, remote):
-        '''Use adb to forward a device port onto the local machine.
-
-        Args:
-            local: The integer that is the local port to forward.
-            remote: The integer that is the remote port to which to forward.
-        '''
-        cmd = 'forward tcp:%s tcp:%s' % (str(local), str(remote))
-        self.adb(cmd)
-
-    def remove_port_forwarding(self):
-        '''Remove all of the forward socket connections open in adb.
-
-        Avoids a windows adb error where we can't bind to a listener
-        because too many files are open.
-        '''
-        self.adb('forward --remove-all')
-
-    def _get_prop(self, name):
-        '''Get the value of an Android system property.
-
-        Args:
-            name: Name of the property of interest [string].
-
-        Returns:
-            Current value of the property [string].
-        '''
-        return self.shell('getprop %s' % str(name))
-
-    def _set_prop(self, name, value):
-        '''Set the value of an Android system property.
-
-        Args:
-            name: Name of the property of interest [string].
-            value: Desired new value for the property [string or integer].
-        '''
-        self.shell("setprop %s '%s'" % (str(name), str(value)))
-
-    def push_prop(self, name, new_value):
-        '''Save the value of an Android system property and set a new value.
-
-        Saves the old value onto a stack so it can be restored later.
-
-        Args:
-            name: Name of the property of interest [string].
-            new_value: Desired new value for the property [string or integer].
-        '''
-        old_value = self._get_prop(name)
-        self._set_prop(name, new_value)
-        self._prop_stacks[name].append(old_value.strip())
-
-    def pop_prop(self, name):
-        '''Restore the value of an Android system property previously set by
-        push_prop.
-
-        Args:
-            name: Name of the property of interest [string].
-
-        Returns:
-            Current value of the property [string].
-        '''
-        old_value = self._prop_stacks[name].pop()
-        self._set_prop(name, old_value)
-
-    def reset_all_props(self):
-        '''Restore all the android properties to the state before the first push
-
-        This is equivalent to popping each property the number of times it has
-        been pushed.
-        '''
-        for name in self._prop_stacks:
-            if self._prop_stacks[name] != []:
-                self._set_prop(name, self._prop_stacks[name][0])
-                self._prop_stacks[name] = []
-
-    def make_device_writeable(self):
-        ''' Ensure the device is full writable, in particular the system folder.
-
-        This disables verity and remounts.
-        '''
-        output = self.adb('disable-verity')
-
-        # if the remote is an emulator do not even try to reboot
-        # otherwise check whether a reboot is advised
-        if (self._get_prop('ro.boot.qemu') != '1' and output and
-                'Now reboot your device for settings to take effect' in output):
-            self._reboot_device()
-
-        self._adb_remount()
-        self.wait_for_device()
-        self.adb_root()
-        self.wait_for_device()
-
-    @staticmethod
-    def _execute_command_local(command, async=False):
-        '''Execute the given shell command in the same process.
-
-        Args:
-            command: String, the command to execute
-            async: Boolean to specify whether adb should run the command
-                   asynchronously.
-
-        Returns:
-            if async == False, it returns a tuple with the return code and
-            the output from the executed command. Otherwise the tuple
-            (None, None).
-        '''
-        proc = subprocess.Popen(command,
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.STDOUT,
-                                shell=True)
-        if async:
-            return None, None
-
-        # read the whole output from the command
-        with proc.stdout as file_proc:
-            output = ''.join(line for line in file_proc)
-
-        # release the process state
-        proc.terminate()
-        return_code = proc.wait()
-
-        return return_code, output
-
-    @staticmethod
-    def _execute_command_remote(command, timeout):
-        '''Execute the given shell command remotely, in a separate process.
-
-        It spawns an ad hoc process to execute the given command. It waits up
-        to timeout for the command to complete, otherwise it aborts the
-        execution and returns None.
-
-        Args:
-            command: String, the command to execute.
-            timeout: the number of seconds to wait for the command to complete.
-
-        Returns:
-            a pair with the return code and the output from the command, if it
-            completed by the specified 'timeout' seconds. Otherwise the tuple
-            (None, None).
-        '''
-
-        channel = multiprocessing.Queue()
-        proc = multiprocessing.Process(
-            target=_handle_remote_request,
-            name="Executor of `{0}'".format(command),
-            args=(command, channel)
-        )
-
-        # execute the command
-        proc.start()
-        return_code = None
-        output = None
-
-        # wait for the result
-        try:
-            return_code, output = channel.get(True, timeout)
-        except queue.Empty:
-            # timeout hit, the remote process has not fulfilled our request by
-            # the given time. We are going to return <None, None>, nothing to
-            # do here as it already holds return_code = output = None.
-            pass
-
-        # terminate the helper process
-        proc.terminate()
-
-        return return_code, output
-
-
-def _handle_remote_request(command, channel):
-    '''Entry point for the remote process.
-
-    It executes the given command and reports the result into the channel.
-    This function is supposed to be only called by
-    UtilAndroid._execute_command_remote to handle the inter-process
-    communication.
-
-    Args:
-        command: the command to execute.
-        channel: the channel to communicate with the caller process.
-    '''
-    channel.put(UtilAndroid._execute_command_local(command))
-
diff --git a/tests/lldb/tests/harness/util_bundle.py b/tests/lldb/tests/harness/util_bundle.py
deleted file mode 100644
index 68954cb..0000000
--- a/tests/lldb/tests/harness/util_bundle.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the class UtilBundle, representing a collection of RS
-binaries.'''
-
-from __future__ import absolute_import
-
-import os
-import time
-from . import util_constants
-from . import util_log
-from .exception import TestSuiteException
-
-
-class UtilBundle(object):
-    '''Represents the collection of RS binaries that are debugged.'''
-
-    # Map of binary name to package name of all Java apps debugged
-    _tests_apk = {
-        'JavaInfiniteLoop': 'com.android.rs.infiniteloop',
-        'JavaDebugWaitAttach': 'com.android.rs.waitattachdebug',
-        'JavaNoDebugWaitAttach': 'com.android.rs.waitattachnodebug',
-        'BranchingFunCalls': 'com.android.rs.branchingfuncalls',
-        'KernelVariables': 'com.android.rs.kernelvariables',
-        'Allocations': 'com.android.rs.allocations',
-        'MultipleRSFiles': 'com.android.rs.multiplersfiles',
-        'SingleSource': 'com.android.rs.singlesource',
-        'ScriptGroup': 'com.android.rs.scriptgroup',
-        'Reduction': 'com.android.rs.lldbreductiontest',
-    }
-
-    _tests_jni = {
-        'JNIInfiniteLoop': 'com.android.rs.jniinfiniteloop',
-        'JNIDebugWaitAttach': 'com.android.rs.jnidebugwaitattach',
-        'JNINoDebugWaitAttach': 'com.android.rs.jninodebugwaitattach',
-        'JNIBranchingFunCalls': 'com.android.rs.jnibranchingfuncalls',
-        'JNIKernelVariables': 'com.android.rs.jnikernelvariables',
-        'JNIAllocations': 'com.android.rs.jniallocations',
-        'JNIMultipleRSFiles': 'com.android.rs.jnimultiplersfiles'
-    }
-
-    _tests_ndk = {'CppInfiniteLoop', 'CppNoDebugWaitAttach',
-                  'CppDebugWaitAttach', 'CppBranchingFunCalls',
-                  'CppKernelVariables', 'CppAllocations', 'CppMultipleRSFiles'}
-
-    _missing_path_msg = (
-        'No product path has been provided. If using `lunch` ensure '
-        'the `ANDROID_PRODUCT_OUT` environment variable has been set correctly. '
-        'Alternatively, include it in the config file or specify it explicitly '
-        'on the command line (`--aosp-product-path`)'
-    )
-
-    def __init__(self, android, aosp_product_path):
-        assert android
-        self._android = android # Link to the android module
-        self._aosp_product_path = aosp_product_path
-        self._log = util_log.get_logger()
-
-    def is_apk(self, name):
-        '''Checks if a binary of a given name is an apk.
-
-        Checks whether the name of the apk is in the dictionary of apks.
-
-        Args:
-            name: The string that is the name of the binary to check.
-
-        Returns:
-            True if the binary is an apk, False if it is not.
-
-        Raises:
-            TestSuiteException: The string does not match any item in the list
-            of APK or NDK binaries.
-        '''
-        if name in self._tests_apk:
-            return True
-        if name not in self._tests_ndk and name not in self._tests_jni:
-            raise TestSuiteException('test not apk or ndk')
-        return False
-
-    def uninstall_all(self):
-        '''Uninstall/Delete all the testsuite's apks and binaries on the device.
-
-        Raises:
-            TestSuiteException: One or more apks could not be uninstalled.
-        '''
-        self.uninstall_all_apk()
-        self._delete_all_ndk()
-        self._uninstall_all_jni()
-
-    def uninstall_all_apk(self):
-        '''Uninstall all apks used by the test suite from the device.
-
-        Raises:
-            TestSuiteException: An apk could not be uninstalled.
-        '''
-        max_num_attempts = 3
-        timeout = 180
-
-        for app, package in self._tests_apk.items():
-            self._log.info('Uninstalling the application: %s', app)
-            output = self._android.adb_retry('uninstall ' + package,
-                                             max_num_attempts, timeout)
-
-            if output is None:
-                raise TestSuiteException('Repeated timeouts when uninstalling '
-                                         'the application: ' + app)
-            elif 'Success' not in output:
-                outmsg = '\n' + output.rstrip() if output else '<empty>'
-                self._log.error('Cannot match the string "Success" in the '
-                                'output: %s', outmsg)
-                raise TestSuiteException('Unable to uninstall app ' + app)
-            else:
-                self._log.debug('Application uninstalled: %r', app)
-
-            if 'Success' not in output:
-                self._log.warning('unable to uninstall app ' + app)
-
-    def _uninstall_all_jni(self):
-        '''Uninstall all apks used by the test suite from the device.
-
-        Raises:
-            TestSuiteException: An apk could not be uninstalled.
-        '''
-        for app, package in self._tests_jni.items():
-            output = self._android.adb('uninstall ' + package)
-
-            if 'Success' not in output:
-                raise TestSuiteException('unable to uninstall app ' + app)
-
-    def _delete_all_ndk(self):
-        '''Delete all ndk binaries that were pushed to the device.
-
-        Raises:
-            TestSuiteException: A binary could not be deleted from the device.
-        '''
-        for app in self._tests_ndk:
-            output = self._android.shell('rm /data/' + app)
-            if 'No such file or directory' in output:
-                self._log.warning('unable to uninstall app ' + app)
-
-
-    def push_all(self):
-        '''Push all apk and ndk binaries required by the testsuite to the device
-
-        Raises:
-            TestSuiteException: One or more apks could not be installed or
-                                previously running processes thereof could not
-                                be killed.
-        '''
-        self._push_all_java()
-        self._push_all_ndk()
-        self._push_all_jni()
-
-    def _install_apk(self, app, package):
-        '''Push an apk files to the device.
-
-        This involves uninstalling any old installation and installing again.
-
-        Args:
-            app: A string that is the name of the apk.
-            package: A string that is the name of the package of the apk.
-
-        Raises:
-            TestSuiteException: The apk could not be installed.
-        '''
-        self._log.info('pushing {0}'.format(app))
-
-        self._android.stop_app(package)
-
-        self._android.adb('uninstall ' + package)
-        # Ignore the output of uninstall.
-        # The app may not have been installed in the first place. That's ok.
-
-        flags = ''
-
-        product_folder = self._aosp_product_path
-        if not product_folder:
-            raise TestSuiteException(self._missing_path_msg)
-
-        app_folder = os.path.join(product_folder, 'data/app')
-
-        cmd = 'install {0} {1}/{2}/{2}.apk'.format(flags, app_folder, app)
-        output = self._android.adb(cmd, False, True,
-                                   util_constants.PUSH_TIMEOUT)
-        if ('Success' not in output) or ("can't find" in output):
-            raise TestSuiteException('unable to install app {}: {}'.format(
-                app, output))
-
-    def _push_all_java(self):
-        '''Push all apk files to the device.
-
-        This involves uninstalling any old installations and installing again.
-
-        Raises:
-            TestSuiteException: An apk could not be installed.
-        '''
-        for app, package in self._tests_apk.items():
-            self._install_apk(app, package)
-
-    def _push_all_ndk(self):
-        '''Push all ndk binaries to the device.
-
-        Raises:
-            TestSuiteException: A binary could not be pushed to the device or
-                                a previous process could not be killed.
-        '''
-        product_folder = self._aosp_product_path
-        if not product_folder:
-            raise TestSuiteException(self._missing_path_msg)
-
-        bin_folder = os.path.join(product_folder, 'system/bin')
-
-        for app in self._tests_ndk:
-            self._log.info('pushing {0}'.format(app))
-
-            self._android.kill_all_processes(app)
-
-            cmd = 'push %s/%s /data' % (bin_folder, app)
-            output = self._android.adb(cmd, False, True,
-                                       util_constants.PUSH_TIMEOUT)
-            if ('failed to copy' in output or
-                'No such file or directory' in output):
-                raise TestSuiteException('unable to push binary ' + app)
-
-            # be sure to set the execute bit for NDK binaries
-            self._android.shell('chmod 777 /data/{0}'.format(app))
-
-    def _push_all_jni(self):
-        '''Push all JNI apk files to the device.
-
-        This involves uninstalling any old installations and installing again.
-
-        Raises:
-            TestSuiteException: An apk could not be installed.
-        '''
-        product_folder = self._aosp_product_path
-        if not product_folder:
-            raise TestSuiteException(self._missing_path_msg)
-
-        app_folder = os.path.join(product_folder, 'system/lib')
-
-        # Ensure the system/lib directory is writable
-        self._android.make_device_writeable()
-
-        for app, package in self._tests_jni.items():
-            self._install_apk(app, package)
-
-    def delete_ndk_cache(self):
-        '''Deletes NDK cached scripts from the device.
-
-        The NDK caches compiled scripts as shared libraries in
-        the folder specified when calling `rs->init()`.
-
-        For all out tests this is set to '/data/rscache'.
-        '''
-        self._android.shell('rm -r /data/rscache')
-
-    def get_package(self, app_name):
-        '''From a given apk name get the name of its package.
-
-        Args:
-            app_name: The string that is the name of the apk.
-
-        Returns:
-            A string representing the name of the package of the app.
-
-        Raises:
-            TestSuiteException: The app name is not in the list of apks.
-        '''
-        if app_name in self._tests_apk:
-            return self._tests_apk[app_name]
-        elif app_name in self._tests_jni:
-            return self._tests_jni[app_name]
-        else:
-            msg = ('unknown app %s. (Do you need to add an '
-                  'entry to bundle.py :: test_apps_?)' % app_name)
-            raise TestSuiteException(msg)
-        return self._tests_apk[app_name]
-
-    def launch(self, app_name):
-        '''Launch an apk/ndk app on a remote device.
-
-        Args:
-            app_name: The string that is the name of the APK or NDK executable.
-
-        Returns:
-            The Process ID of the launched executable, otherwise None
-
-        Raises:
-            TestSuiteException: Previous processes of this apk could not be
-                                killed.
-        '''
-        process_name = ''
-        success = False
-        if app_name in self._tests_apk:
-            process_name = self._tests_apk[app_name]
-
-            self._android.kill_all_processes(process_name)
-
-            success = self._android.launch_app(process_name, 'MainActivity')
-        elif app_name in self._tests_ndk:
-            process_name = app_name
-            self._android.kill_all_processes(process_name)
-            success = self._android.launch_elf(process_name)
-        elif app_name in self._tests_jni:
-            package = self._tests_jni[app_name]
-
-            self._android.kill_process(package)
-
-            success = self._android.launch_app(package, 'MainActivity')
-            if not success:
-                self._log.log_and_print(app_name +
-                    ' is not installed. Try removing the --no-install option?')
-                return None
-
-            return self._android.find_app_pid(package)
-        else:
-            self._log.error('Executable {0} neither Java nor NDK.'
-                            .format(app_name))
-
-            self._log.fatal('Failed to launch test executable {0}'
-                            .format(app_name))
-            return None
-
-        if not success:
-            self._log.log_and_print(app_name +
-                ' is not installed. Try removing the --no-install option?')
-            return None
-
-        return self._android.find_app_pid(process_name)
-
-    def check_apps_installed(self, java_only):
-        ''' Check whether all Java/JNI/NDK apps are installed on the device.
-
-        Args:
-            java_only: Boolean to specify whether only the Java apks should be
-                       checked (in case of --wimpy mode for example).
-
-        Raises:
-            TestSuiteException: Not all apps are installed.
-        '''
-        java_and_jni_apks = self._tests_apk.copy()
-
-        if not java_only:
-            java_and_jni_apks.update(self._tests_jni)
-
-        installed = self._android.shell('pm list packages -f')
-
-        for app, package in java_and_jni_apks.items():
-            if package not in installed:
-                raise TestSuiteException('apk %s is not installed.' % app)
-
-        if not java_only:
-            ls_data = self._android.shell('ls /data')
-            for app in self._tests_ndk:
-                if app not in ls_data:
-                    raise TestSuiteException('app %s is not installed.' % app)
diff --git a/tests/lldb/tests/harness/util_constants.py b/tests/lldb/tests/harness/util_constants.py
deleted file mode 100644
index 9c7b18c..0000000
--- a/tests/lldb/tests/harness/util_constants.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''This file contains constants shared between the test suite runner and the
-individual test runner.'''
-
-from __future__ import absolute_import
-
-RC_TEST_OK = 0
-RC_TEST_TIMEOUT = 64
-RC_TEST_FAIL = 65
-RC_TEST_FATAL = 66
-RC_TEST_IGNORED = 67
-PUSH_TIMEOUT = 60*5
-
diff --git a/tests/lldb/tests/harness/util_functions.py b/tests/lldb/tests/harness/util_functions.py
deleted file mode 100644
index 32dca1c..0000000
--- a/tests/lldb/tests/harness/util_functions.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''This file contains utility functions used by both the test suite and the
-single test executor.'''
-
-from __future__ import absolute_import
-
-import os
-import importlib
-import sys
-
-
-def load_py_module(path):
-    '''Load a python file from disk.
-
-    Args:
-        path: String path to python file.
-
-    Returns:
-        python module if success, None otherwise.
-    '''
-    assert isinstance(path, str)
-    try:
-        if not os.path.exists(path):
-            print('Path does not exist: ' + path)
-            return None
-        path = os.path.abspath(path)
-        module_dir, module_file = os.path.split(path)
-        module_name, _ = os.path.splitext(module_file)
-        # adjust sys.path, runtime counterpart of PYTHONPATH, to temporarily
-        # include the folder containing the user configuration module
-        sys.path.append(module_dir)
-        module_obj = importlib.import_module(module_name)
-        sys.path.pop(0)
-        return module_obj
-    except ImportError as err:
-        print(str(err))
-        print("Looking in directory ")
-        print(module_dir)
-        return None
diff --git a/tests/lldb/tests/harness/util_lldb.py b/tests/lldb/tests/harness/util_lldb.py
deleted file mode 100644
index 2d1adca..0000000
--- a/tests/lldb/tests/harness/util_lldb.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the class UtilLLDB, which provides lldb utility
-methods.'''
-
-from __future__ import absolute_import
-
-from . import util_constants
-
-try:
-    import lldb
-except ImportError:
-    print('unable to import lldb')
-    print('please run "lldb -P" and add to $PYTHONPATH')
-    quit(util_constants.RC_TEST_FATAL)
-
-
-class UtilLLDB(object):
-    '''Provides utility methods to interface with lldb's python bindings.'''
-
-    @staticmethod
-    def start():
-        '''Initialise the lldb debugger framework.'''
-        lldb.SBDebugger_Initialize()
-
-    @staticmethod
-    def stop():
-        '''Terminate the lldb debugger framework.
-
-        Raises:
-            AssertionError: If an assertion fails.
-        '''
-        assert lldb
-        lldb.SBDebugger_Terminate()
-
-    @staticmethod
-    def create_debugger():
-        '''Create an lldb debugger instance.
-
-        Returns:
-            The SBDebugger instance that was created.
-
-        Raises:
-            AssertionError: If an assertion fails.
-        '''
-        assert lldb
-        inst = lldb.SBDebugger_Create()
-        inst.SetAsync(False)
-        return inst
-
-    @staticmethod
-    def destroy_debugger(dbg):
-        '''Destroy the lldb debugger instance.
-
-        Args:
-            dbg: Instance of SBDebugger that is to be destroyed.
-
-        Raises:
-            AssertionError: If an assertion fails.
-        '''
-        assert lldb
-        lldb.SBDebugger_Destroy(dbg)
-
-    @staticmethod
-    def get_module():
-        '''Get the lldb module.
-
-        Returns:
-            The lldb module.
-
-        Raises:
-            AssertionError: If an assertion fails.
-        '''
-        assert lldb
-        return lldb
diff --git a/tests/lldb/tests/harness/util_log.py b/tests/lldb/tests/harness/util_log.py
deleted file mode 100644
index fec0703..0000000
--- a/tests/lldb/tests/harness/util_log.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Initialise the Python logging facility for the test suite.
-
-from __future__ import absolute_import
-
-It provides the function to initialise the logging facility and retrieve an
-instance of the logger class. It also contains the definition of the internal
-logger class.
-'''
-from __future__ import print_function
-
-import io
-import sys
-import logging
-
-
-INITIALISED = False
-NAMESPACE = 'RS_LLDB_TESTSUITE'
-
-def initialise(identifier, level=logging.INFO, print_to_stdout=False,
-               file_path=None, file_mode='a'):
-    '''Initialise the logging facility for the test suite.
-
-    This function should be invoked only once, at the start of the program, and
-    before emitting any log.
-
-    Args:
-        identifier: String, a label that will be part of each record. It is
-                    usually the test case name.
-        level: Integer, all messages above this log level will be discarded.
-               Valid values are those recognised by the python logging module:
-               https://docs.python.org/2/library/logging.html#levels .
-        print_to_stdout: Boolean, whether the logs should be redirected to
-                         sys.stdout (true) or stored into a text file (false).
-        file_path: String, path to the text file in which to store the logs.
-                   This option is only meaningful when print_to_stdout = False.
-        file_mode: String, the mode to open the text file. Valid modes are
-                   those recognised by the standard Python `open' function.
-                   This option is only meaningful when print_to_stdout = False.
-
-    Raises:
-        RuntimeError: If the logging has already been initialised
-        ValueError: If the argument "file_path" has not been provided when
-                    print_to_stdout=False
-    '''
-    # pylint: disable=global-statement
-    global INITIALISED
-    if INITIALISED:
-        raise RuntimeError('Already initialised')
-
-    # set the logging class
-    old_logger_class = logging.getLoggerClass()
-    logging.setLoggerClass(RsLogger)
-
-    # initialise the Logger
-    log = logging.getLogger(NAMESPACE)
-    log.setLevel(level) # reject all logs below
-
-    # don't propagate the log records to the logging root
-    log.propagate = False
-
-    # restore the previous class
-    logging.setLoggerClass(old_logger_class)
-
-    # handler
-    if print_to_stdout:
-        handler_default = logging.StreamHandler(sys.stdout)
-    else:
-        if file_path is None:
-            raise ValueError('Missing mandatory argument "file_path"')
-
-        handler_default = logging.FileHandler(file_path, file_mode)
-
-    # Do not filter records in the handler because of the level
-    handler_default.setLevel(logging.NOTSET)
-
-    # format the message
-    handler_default.setFormatter(
-        logging.Formatter(
-            '%(asctime)s [{0}] [%(levelname)s] %(message)s'
-                .format(identifier)
-    ))
-
-    log.addHandler(handler_default)
-
-    INITIALISED = True
-
-
-class RsLogger(logging.getLoggerClass()):
-    '''Internal logging class.
-
-    This is an internal class to enhance the logging facility with the methods
-    "log_and_print" and "seek_to_end".
-    '''
-    # pylint: disable=too-many-public-methods
-
-    def log_and_print(self, msg, level=logging.INFO):
-        '''Print "msg" to stdout and emit a log record.
-
-        Args:
-            msg: The message to emit.
-            level: The level to use. By default it is logging.INFO.
-        '''
-        print(msg)
-        self.log(level, msg)
-
-    def seek_to_end(self):
-        '''Reset the cursor position to the end for all handlers that are
-        Text File managers.'''
-        for hndlr in self.handlers:
-            if isinstance(hndlr, logging.FileHandler):
-                hndlr.stream.seek(0, io.SEEK_END)
-
-
-def get_logger():
-    '''Retrieves the Logger instance related to the testsuite.
-
-    Throws:
-        RuntimeError: If the logging facility has not been initialised with
-                      "initialise" beforehand.
-
-    Returns:
-        An instance of logging.Logger to write the logs.
-    '''
-    if not INITIALISED:
-        raise RuntimeError('Logging facility not initialised')
-
-    return logging.getLogger(NAMESPACE)
diff --git a/tests/lldb/tests/harness/util_timer.py b/tests/lldb/tests/harness/util_timer.py
deleted file mode 100644
index b83a76f..0000000
--- a/tests/lldb/tests/harness/util_timer.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Timer utility'''
-
-from __future__ import absolute_import
-
-import threading
-
-
-class Timer(object):
-    '''A Timer utility to execute a callback after a certain interval.'''
-
-    def __init__(self, interval, callback):
-        '''Initialise the Timer without starting it.
-
-        Args:
-            interval: int or float, interval in seconds to count, before
-                invoking the callback
-            callback: function, it handles the function to call once
-                the timeout expires.
-        '''
-
-        # validate input parameters
-        if not isinstance(interval, (int, float)):
-            raise TypeError('Argument "interval" is not a number: '
-                             '{0}'.format(type(interval)))
-        if not callable(callback):
-            raise TypeError('Argument "callback" is not a function: '
-                             '{0}'.format(type(callback)))
-
-        self._timer = None
-        self._callback = callback
-        self._interval = interval
-
-    def _is_running(self):
-        '''Checks whether the timer is executing.
-
-        Returns:
-            boolean, true if the timer is currently running, false otherwise
-        '''
-        return self._timer is not None
-
-    def start(self):
-        '''Starts the timer.
-
-        Returns:
-            self, the Timer instance
-
-        Throws:
-            RuntimeError: if the timer is already running
-        '''
-        if self._is_running():
-            raise RuntimeError('Timer already running')
-
-        self._timer = threading.Timer(self._interval, self._callback)
-        self._timer.start()
-        return self # so that we can perform Timer(...).start()
-
-    def stop(self):
-        '''Stops the timer if it's executing.
-
-        Returns:
-            self, the Timer instance
-        '''
-
-        if self._is_running():
-            self._timer.cancel()
-            self._timer = None
-        return self
-
-    def reset(self):
-        '''Restart the timer.
-
-        Returns:
-            self, the Timer instance
-        '''
-
-        self.stop()
-        self.start()
-        return self
diff --git a/tests/lldb/tests/harness/util_warnings.py b/tests/lldb/tests/harness/util_warnings.py
deleted file mode 100644
index dd52740..0000000
--- a/tests/lldb/tests/harness/util_warnings.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Redirect the Python warnings into the log.'''
-
-from __future__ import absolute_import
-
-import warnings
-
-from . import util_log
-
-_OLD_WARNINGS_HANDLER = None
-
-
-def redirect_warnings():
-    '''Redirect all warnings issued by warnings::warn to the log.
-
-    By default all python warnings are printed into sys.stderr. This method
-    will force to redirect them into the test suite logger.
-    '''
-
-    # pylint: disable=global-statement
-    global _OLD_WARNINGS_HANDLER
-
-    # Already redirecting?
-    if _OLD_WARNINGS_HANDLER:
-        return None
-
-    _OLD_WARNINGS_HANDLER = warnings.showwarning
-
-    log = util_log.get_logger()
-
-    def _redirect_warnings_to_log(*args):
-        '''Redirect the warnings to the Logger.'''
-        log.warn(warnings.formatwarning(*args).rstrip())
-
-    warnings.showwarning = _redirect_warnings_to_log
-
-
-def restore_warnings():
-    '''Restore the reporting of warnings::warn as before.'''
-
-    # pylint: disable=global-statement
-    global _OLD_WARNINGS_HANDLER
-
-    if _OLD_WARNINGS_HANDLER:
-        warnings.showwarning = _OLD_WARNINGS_HANDLER
-        _OLD_WARNINGS_HANDLER = None
-
diff --git a/tests/lldb/tests/run_test.py b/tests/lldb/tests/run_test.py
deleted file mode 100644
index 50a0530..0000000
--- a/tests/lldb/tests/run_test.py
+++ /dev/null
@@ -1,422 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''This script will run one specific test.'''
-from __future__ import print_function, absolute_import
-
-import os
-import sys
-import atexit
-import inspect
-import logging
-import argparse
-import warnings
-
-import harness
-from harness import util_constants
-from harness import util_log
-from harness import util_warnings
-from harness.util_functions import load_py_module
-from harness.util_lldb import UtilLLDB
-from harness.exception import DisconnectedException
-from harness.exception import TestSuiteException, TestIgnoredException
-from harness.util_timer import Timer
-
-
-class TestState(object):
-    '''Simple mutable mapping (like namedtuple)'''
-    def __init__(self, **kwargs):
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-def _test_pre_run(state):
-    '''This function is called before a test is executed (setup).
-
-    Args:
-        state: Test suite state collection, instance of TestState.
-
-    Returns:
-        True if the pre_run step completed without error. Currently the pre-run
-        will launch the target test binary on the device and attach an
-        lldb-server to it in platform mode.
-
-    Raises:
-        AssertionError: If an assertion fails.
-        TestSuiteException: Previous processes of this apk required for this
-                            test could not be killed.
-    '''
-    assert state.test
-    assert state.bundle
-
-    log = util_log.get_logger()
-    log.info('running: {0}'.format(state.name))
-
-    # Remove any cached NDK scripts between tests
-    state.bundle.delete_ndk_cache()
-
-    # query our test case for the remote target app it needs
-    # First try the legacy behaviour
-    try:
-        target_name = state.test.get_bundle_target()
-        warnings.warn("get_bundle_target() is deprecated and will be removed soon"
-                      " - use the `bundle_target` dictionary attribute instead")
-    except AttributeError:
-        try:
-            target_name = state.test.bundle_target[state.bundle_type]
-        except KeyError:
-            raise TestIgnoredException()
-
-    if target_name is None:
-        # test case doesn't require a remote process to debug
-        return True
-    else:
-        # find the pid of our remote test process
-        state.pid = state.bundle.launch(target_name)
-        if not state.pid:
-            log.error('unable to get pid of target')
-            return False
-        state.android.kill_servers()
-        # spawn lldb platform on the target device
-        state.android.launch_lldb_platform(state.device_port)
-        return True
-
-
-def _test_post_run(state):
-    '''This function is called after a test is executed (cleanup).
-
-    Args:
-        state: Test suite state collection, instance of TestState.
-
-    Raises:
-        AssertionError: If an assertion fails.
-    '''
-    assert state.test
-    assert state.bundle
-
-    try:
-        target_name = state.test.get_bundle_target()
-        warnings.warn("get_bundle_target() is deprecated and will be removed soon"
-                      " - use the `bundle_target` dictionary attribute instead")
-    except AttributeError:
-        try:
-            target_name = state.test.bundle_target[state.bundle_type]
-        except KeyError:
-            raise TestIgnoredException()
-
-
-    if target_name:
-        if state.bundle.is_apk(target_name):
-            state.android.stop_app(state.bundle.get_package(target_name))
-        else:
-            state.android.kill_process(target_name)
-
-
-def _test_run(state):
-    '''Execute a single test suite.
-
-    Args:
-        state: test suite state collection, instance of TestState.
-
-    Returns:
-        True: if the test case ran successfully and passed.
-        False: if the test case failed or suffered an error.
-
-    Raises:
-        AssertionError: If an assertion fails.
-    '''
-    assert state.lldb
-    assert state.lldb_module
-    assert state.test
-
-    test_failures = state.test.run(state.lldb, state.pid, state.lldb_module)
-
-    if test_failures:
-        log = util_log.get_logger()
-        for test, err in test_failures:
-            log.error('test %s:%s failed: %r' % (state.name, test, err))
-
-        return False
-
-    return True
-
-
-def _initialise_timer(android, interval):
-    '''Start a 'timeout' timer, to catch stalled execution.
-
-    This function will start a timer that will act as a timeout killing this
-    test session if a test becomes un-responsive.
-
-    Args:
-        android: current instance of harness.UtilAndroid
-        interval: the interval for the timeout, in seconds
-
-    Returns:
-        The instance of the Timer class that was created.
-    '''
-
-    def on_timeout():
-        '''This is a callback function that will fire if a test takes longer
-        then a threshold time to complete.'''
-        # Clean up the android properties
-        android.reset_all_props()
-        # pylint: disable=protected-access
-        sys.stdout.flush()
-        # hard exit to force kill all threads that may block our exit
-        os._exit(util_constants.RC_TEST_TIMEOUT)
-
-    timer = Timer(interval, on_timeout)
-    timer.start()
-    atexit.register(Timer.stop, timer)
-    return timer
-
-
-def _quit_test(num, timer):
-    '''This function will exit making sure the timeout thread is killed.
-
-    Args:
-        num: An integer specifying the exit status, 0 meaning "successful
-             termination".
-        timer: The current Timer instance.
-    '''
-    if timer:
-        timer.stop()
-    sys.stdout.flush()
-    sys.exit(num)
-
-
-def _execute_test(state):
-    '''Execute a test suite.
-
-    Args:
-        state: The current TestState object.
-    '''
-    log = util_log.get_logger()
-
-    state.test.setup(state.android)
-    try:
-        if not _test_pre_run(state):
-            raise TestSuiteException('test_pre_run() failed')
-        if not _test_run(state):
-            raise TestSuiteException('test_run() failed')
-        _test_post_run(state)
-        log.info('Test passed')
-
-    finally:
-        state.test.post_run()
-        state.test.teardown(state.android)
-
-
-def _get_test_case_class(module):
-    '''Inspect a test case module and return the test case class.
-
-    Args:
-        module: A loaded test case module.
-    '''
-    # We consider only subclasses of TestCase that have `test_` methods`
-    log = util_log.get_logger()
-    log.debug("loading test suites from %r", module)
-    for name, klass in inspect.getmembers(module, inspect.isclass):
-        for attr in dir(klass):
-            if attr.startswith('test_'):
-                log.info("Found test class %r", name)
-                return klass
-        else:
-            log.debug("class %r has no test_ methods", name)
-    return None
-
-
-def get_test_dir(test_name):
-    ''' Get the directory that contains a test with a given name.
-
-    Returns:
-        A string that is the directory containing the test.
-
-    Raises:
-        TestSuiteException: If a test with this name does not exist.
-    '''
-    tests_dir = os.path.dirname(os.path.realpath(__file__))
-    for sub_dir in os.listdir(tests_dir):
-        current_test_dir = os.path.join(tests_dir, sub_dir)
-        if (os.path.isdir(current_test_dir) and
-            test_name in os.listdir(current_test_dir)):
-            return current_test_dir
-
-    raise TestSuiteException(
-        'unable to find test: {0}'.format(test_name))
-
-
-def main():
-    '''Test runner entry point.'''
-
-    # re-open stdout with no buffering
-    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
-
-    android = None
-    timer = None
-    log = None
-
-    # parse the command line (positional arguments only)
-    truthy = lambda x: x.lower() in ('true', '1')
-    parser = argparse.ArgumentParser("Run a single RenderScript TestSuite against lldb")
-    for name, formatter in (
-       ('test_name', str),
-       ('log_file_path', str),
-       ('adb_path', str),
-       ('lldb_server_path_device', str),
-       ('aosp_product_path', str),
-       ('device_port', int),
-       ('device', str),
-       ('print_to_stdout', truthy),
-       ('verbose', truthy),
-       ('wimpy', truthy),
-       ('timeout', int),
-       ('bundle_type', str),
-    ):
-        parser.add_argument(name, type=formatter)
-
-    args = parser.parse_args()
-
-    try:
-        # create utility classes
-        harness.util_log.initialise(
-            '%s(%s)' % (args.test_name, args.bundle_type),
-            print_to_stdout=args.print_to_stdout,
-            level=logging.INFO if not args.verbose else logging.DEBUG,
-            file_path=args.log_file_path,
-            file_mode='a'
-        )
-        log = util_log.get_logger()
-        log.debug('Logger initialised')
-
-        android = harness.UtilAndroid(args.adb_path,
-                                      args.lldb_server_path_device,
-                                      args.device)
-
-        # start the timeout counter
-        timer = _initialise_timer(android, args.timeout)
-
-        # startup lldb and register teardown handler
-        atexit.register(UtilLLDB.stop)
-        UtilLLDB.start()
-
-        current_test_dir = get_test_dir(args.test_name)
-
-        # load a test case module
-        test_module = load_py_module(os.path.join(current_test_dir,
-                                                  args.test_name))
-
-
-        # inspect the test module and locate our test case class
-        test_class = _get_test_case_class(test_module)
-
-        # if our test inherits from TestBaseRemote, check we have a valid device
-        if (hasattr(test_module, "TestBaseRemote") and
-            issubclass(test_class, test_module.TestBaseRemote)):
-            android.validate_device()
-
-        # create an instance of our test case
-        test_inst = test_class(
-            args.device_port,
-            args.device,
-            timer,
-            args.bundle_type,
-            wimpy=args.wimpy
-        )
-
-        # instantiate a test target bundle
-        bundle = harness.UtilBundle(android, args.aosp_product_path)
-
-        # execute the test case
-        try:
-            for _ in range(2):
-                try:
-                    # create an lldb instance
-                    lldb = UtilLLDB.create_debugger()
-
-                    # create state object to encapsulate instances
-
-                    state = TestState(
-                         android=android,
-                         bundle=bundle,
-                         lldb=lldb,
-                         lldb_module=UtilLLDB.get_module(),
-                         test=test_inst,
-                         pid=None,
-                         name=args.test_name,
-                         device_port=args.device_port,
-                         bundle_type=args.bundle_type
-                    )
-
-                    util_warnings.redirect_warnings()
-
-                    _execute_test(state)
-
-                    # tear down the lldb instance
-                    UtilLLDB.destroy_debugger(lldb)
-                    break
-                except DisconnectedException as error:
-                    log.warning(error)
-                    log.warning('Trying again.')
-            else:
-                log.fatal('Not trying again, maximum retries exceeded.')
-                raise TestSuiteException('Lost connection to lldb-server')
-
-        finally:
-            util_warnings.restore_warnings()
-
-        _quit_test(util_constants.RC_TEST_OK, timer)
-
-    except AssertionError:
-        if log:
-            log.critical('Internal test suite error', exc_info=1)
-        print('Internal test suite error', file=sys.stderr)
-        _quit_test(util_constants.RC_TEST_FATAL, timer)
-
-    except TestIgnoredException:
-        if log:
-            log.warn("test ignored")
-        _quit_test(util_constants.RC_TEST_IGNORED, timer)
-
-    except TestSuiteException as error:
-        if log:
-            log.exception(str(error))
-        else:
-            print(error, file=sys.stderr)
-        _quit_test(util_constants.RC_TEST_FAIL, timer)
-
-    # use a global exception handler to be sure that we will
-    # exit safely and correctly
-    except Exception:
-        if log:
-            log.exception('INTERNAL ERROR')
-        else:
-            import traceback
-            print('Exception {0}'.format(traceback.format_exc()),
-                  file=sys.stderr)
-        _quit_test(util_constants.RC_TEST_FATAL, timer)
-
-    finally:
-        if android:
-            android.reset_all_props()
-        if timer:
-            timer.stop()
-
-
-# execution trampoline
-if __name__ == '__main__':
-    print(' '.join(sys.argv))
-    main()
diff --git a/tests/lldb/tests/testcases/reduce_common.py b/tests/lldb/tests/testcases/reduce_common.py
deleted file mode 100644
index 462d0b3..0000000
--- a/tests/lldb/tests/testcases/reduce_common.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-
-REDUCE_ITERATIONS = 128  # This is in MainActivity.java
-REDUCE_STARTVAL = 10 # This is in MainActivity.java
-REDUCE_AUTO_COMB_SCRIPT = "reduce_common.rsh"
-REDUCE_SCRIPT = "reduce_common.rsh"
-X_TESTS = 100
-Y_TESTS = 2
-Z_TESTS = 2
-
-
-class ReductionMixin(object):
-    def _test_func_role_combinations(self, func_role_combinations):
-        """
-        Assert that when a reduction breakpoint is conditional on a function
-        role, that breakpoints are only set on the the given functions.
-        We do this by setting breakpoints on all possible pairs of functions
-        and check that the resolved breakpoints are on functions that are part
-        of the given pair
-        """
-        for combination in func_role_combinations:
-            self._delete_breakpoints()
-            self.try_command(
-                'language renderscript reduction breakpoint set '
-                'find_min_user_type --function-role %s' % (
-                    ','.join(combination)
-                ),
-                [r'Breakpoint(s) created']
-            )
-            func_suffixes = [combination[0][:4], combination[1][:4]]
-            # just match the first 4 chars of the roles prefix
-            funcs_match = 'find_min_user_type_((%s|%s))' % tuple(func_suffixes)
-            # now check we stop on both functions for each coordinate in the
-            # allocation
-            for x in range(REDUCE_ITERATIONS):
-                output = self.try_command(
-                    'process continue',
-                    expected_regex=[
-                        r'resuming',
-                        r'Process \d+ stopped',
-                        r'frame #0: (0x[0-9a-fA-F]+ )?librs.reduce.so`%s' % funcs_match
-                    ]
-                )
-                for line in output.splitlines():
-                    match = re.search(funcs_match, line)
-                    if match:
-                        try:
-                            func_suffixes.remove(match.group(1))
-                        except ValueError:
-                            # The outconverter may only be called in the final
-                            # step but the accumulator will be called for every
-                            # input index
-                            continue
-                        break
-                if len(func_suffixes) == 0:
-                    # We've popped the functions we're interested in off the list
-                    break
-            else:
-                raise self.TestFail(
-                    "unable to match function roles for " + repr(combination))
-
-    def _reduction_breakpoint_set_single_type(
-            self, script_soname, script_basename, reduce_name, funcname_types):
-        """
-        Assert - for each function role - that the correct symbol is resolved
-        and trapped by the debugger.
-        """
-        for func, typename in funcname_types:
-            self._delete_breakpoints()
-            breakpoint_match = r'Breakpoint \d+: where = librs.%s.so`%s'
-            # Autogenerated combiners don't have a filename in the debugger
-            if not func.endswith(".combiner"):
-                breakpoint_match = r'%s (\+ \d+ )?at %s' % (
-                        breakpoint_match, script_basename)
-            self.try_command(
-                'language renderscript reduction breakpoint set %s'
-                ' --function-role %s' % (reduce_name, typename),
-                expected_regex=[breakpoint_match % (script_soname, func)]
-            )
-            self.try_command(
-                'process continue',
-                expected_regex=[
-                    r'resuming',
-                    r'Process \d+ stopped',
-                    r'frame #0: (0x[0-9a-fA-F]+ )?librs.%s.so`%s' % (
-                        script_soname, func)
-                ]
-            )
diff --git a/tests/lldb/tests/testcases/test_allocation_dump_1.py b/tests/lldb/tests/testcases/test_allocation_dump_1.py
deleted file mode 100644
index 53e77fc..0000000
--- a/tests/lldb/tests/testcases/test_allocation_dump_1.py
+++ /dev/null
@@ -1,348 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationDump1.'''
-from __future__ import absolute_import
-
-import os
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    wimpy,
-    cpp_only_test,
-)
-
-
-class TestAllocationDump1(TestBaseRemote):
-    '''Tests printing the contents of allocations.'''
-
-    bundle_target = {
-        'java': 'Allocations',
-        'jni': 'JNIAllocations',
-        'cpp': 'CppAllocations'
-    }
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_dump_to_file1(self):
-        # Test dumping large allocations to file
-        output_file_1 = self.get_tmp_file_path()
-
-        self.try_command('language renderscript allocation dump 1 -f ' +
-                         output_file_1,
-                         ["Results written to '%s'" % output_file_1])
-
-        # Check the file was created
-        self.assert_true(os.path.isfile(output_file_1))
-        os.remove(output_file_1)
-
-    def test_dump_to_file2(self):
-        output_file_2 = self.get_tmp_file_path()
-
-        self.try_command('language renderscript allocation dump 2 -f ' +
-                         output_file_2,
-                         ["Results written to '%s'" % output_file_2])
-
-        self.assert_true(os.path.isfile(output_file_2))
-        os.remove(output_file_2)
-
-    @wimpy
-    def test_dump_char(self):
-        self.try_command('language renderscript allocation dump 3',
-                         ['(0, 0, 0) = 0',
-                          '(0, 1, 0) = 1',
-                          '(0, 2, 0) = 2',
-                          '(0, 0, 1) = 3',
-                          '(0, 1, 1) = 4',
-                          '(0, 2, 1) = 5',
-                          '(0, 0, 2) = 6',
-                          '(0, 1, 2) = 7',
-                          '(0, 2, 2) = 8',
-                          '(0, 0, 3) = 9',
-                          '(0, 1, 3) = 10',
-                          '(0, 2, 3) = 11',
-                          '(0, 0, 4) = 12',
-                          '(0, 1, 4) = 13',
-                          '(0, 2, 4) = 14',
-                          '(0, 0, 5) = 15',
-                          '(0, 1, 5) = 16',
-                          '(0, 2, 5) = 17',
-                          '(0, 0, 6) = 18',
-                          '(0, 1, 6) = 19',
-                          '(0, 2, 6) = 20',
-                          '(0, 0, 7) = 21',
-                          '(0, 1, 7) = 22',
-                          '(0, 2, 7) = 23'])
-
-    def test_dump_char2(self):
-        self.try_command('language renderscript allocation dump 4',
-                         ['(0, 0, 0) = {0 1}',
-                          '(1, 0, 0) = {2 3}',
-                          '(2, 0, 0) = {4 5}',
-                          '(3, 0, 0) = {6 7}',
-                          '(4, 0, 0) = {8 9}',
-                          '(5, 0, 0) = {10 11}',
-                          '(6, 0, 0) = {12 13}',
-                          '(7, 0, 0) = {14 15}',
-                          '(8, 0, 0) = {16 17}',
-                          '(9, 0, 0) = {18 19}',
-                          '(10, 0, 0) = {20 21}',
-                          '(11, 0, 0) = {22 23}'])
-
-    def test_dump_char3(self):
-        self.try_command('language renderscript allocation dump 5',
-                         ['(0, 0, 0) = {0 1 2}',
-                          '(1, 0, 0) = {4 5 6}',
-                          '(2, 0, 0) = {8 9 10}',
-                          '(3, 0, 0) = {12 13 14}',
-                          '(4, 0, 0) = {16 17 18}',
-                          '(5, 0, 0) = {20 21 22}'])
-
-    def test_dump_char4(self):
-        self.try_command('language renderscript allocation dump 6',
-                         ['(0, 0, 0) = {0 1 2 3}',
-                          '(1, 0, 0) = {4 5 6 7}',
-                          '(2, 0, 0) = {8 9 10 11}',
-                          '(3, 0, 0) = {12 13 14 15}',
-                          '(4, 0, 0) = {16 17 18 19}',
-                          '(5, 0, 0) = {20 21 22 23}'])
-
-    def test_dump_short(self):
-        self.try_command('language renderscript allocation dump 7',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-    def test_dump_short2(self):
-        self.try_command('language renderscript allocation dump 8',
-                         ['(0, 0, 0) = {0 1}',
-                          '(1, 0, 0) = {2 3}',
-                          '(2, 0, 0) = {4 5}',
-                          '(3, 0, 0) = {6 7}',
-                          '(4, 0, 0) = {8 9}',
-                          '(5, 0, 0) = {10 11}',
-                          '(0, 0, 1) = {12 13}',
-                          '(1, 0, 1) = {14 15}',
-                          '(2, 0, 1) = {16 17}',
-                          '(3, 0, 1) = {18 19}',
-                          '(4, 0, 1) = {20 21}',
-                          '(5, 0, 1) = {22 23}'])
-
-    def test_dump_short3(self):
-        self.try_command('language renderscript allocation dump 9',
-                         ['(0, 0, 0) = {0 1 2}',
-                          '(1, 0, 0) = {4 5 6}',
-                          '(2, 0, 0) = {8 9 10}',
-                          '(3, 0, 0) = {12 13 14}',
-                          '(4, 0, 0) = {16 17 18}',
-                          '(5, 0, 0) = {20 21 22}'])
-
-    def test_dump_short4(self):
-        self.try_command('language renderscript allocation dump 10',
-                         ['(0, 0, 0) = {0 1 2 3}',
-                          '(1, 0, 0) = {4 5 6 7}',
-                          '(2, 0, 0) = {8 9 10 11}',
-                          '(3, 0, 0) = {12 13 14 15}',
-                          '(4, 0, 0) = {16 17 18 19}',
-                          '(5, 0, 0) = {20 21 22 23}'])
-
-    def test_dump_int(self):
-        self.try_command('language renderscript allocation dump 11',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-    def test_dump_int2(self):
-        self.try_command('language renderscript allocation dump 12',
-                         ['(0, 0, 0) = {0 1}',
-                          '(1, 0, 0) = {2 3}',
-                          '(2, 0, 0) = {4 5}',
-                          '(3, 0, 0) = {6 7}',
-                          '(4, 0, 0) = {8 9}',
-                          '(5, 0, 0) = {10 11}',
-                          '(6, 0, 0) = {12 13}',
-                          '(7, 0, 0) = {14 15}',
-                          '(8, 0, 0) = {16 17}',
-                          '(9, 0, 0) = {18 19}',
-                          '(10, 0, 0) = {20 21}',
-                          '(11, 0, 0) = {22 23}'])
-
-    def test_dump_int3(self):
-        self.try_command('language renderscript allocation dump 13',
-                         ['(0, 0, 0) = {0 1 2}',
-                          '(1, 0, 0) = {4 5 6}',
-                          '(2, 0, 0) = {8 9 10}',
-                          '(0, 1, 0) = {12 13 14}',
-                          '(1, 1, 0) = {16 17 18}',
-                          '(2, 1, 0) = {20 21 22}'])
-
-    def test_dump_int4(self):
-        self.try_command('language renderscript allocation dump 14',
-                         ['(0, 0, 0) = {0 1 2 3}',
-                          '(1, 0, 0) = {4 5 6 7}',
-                          '(2, 0, 0) = {8 9 10 11}',
-                          '(3, 0, 0) = {12 13 14 15}',
-                          '(4, 0, 0) = {16 17 18 19}',
-                          '(5, 0, 0) = {20 21 22 23}'])
-
-    def test_dump_int5(self):
-        self.try_command('language renderscript allocation dump 15',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-    def test_dump_long2(self):
-        self.try_command('language renderscript allocation dump 16',
-                         ['(0, 0, 0) = {0 1}',
-                          '(1, 0, 0) = {2 3}',
-                          '(2, 0, 0) = {4 5}',
-                          '(3, 0, 0) = {6 7}',
-                          '(4, 0, 0) = {8 9}',
-                          '(5, 0, 0) = {10 11}',
-                          '(6, 0, 0) = {12 13}',
-                          '(7, 0, 0) = {14 15}',
-                          '(8, 0, 0) = {16 17}',
-                          '(9, 0, 0) = {18 19}',
-                          '(10, 0, 0) = {20 21}',
-                          '(11, 0, 0) = {22 23}'])
-
-    def test_dump_long3(self):
-        self.try_command('language renderscript allocation dump 17',
-                         ['(0, 0, 0) = {0 1 2}',
-                          '(1, 0, 0) = {4 5 6}',
-                          '(2, 0, 0) = {8 9 10}',
-                          '(3, 0, 0) = {12 13 14}',
-                          '(4, 0, 0) = {16 17 18}',
-                          '(5, 0, 0) = {20 21 22}'])
-
-    def test_dump_long4(self):
-        self.try_command('language renderscript allocation dump 18',
-                         ['(0, 0, 0) = {0 1 2 3}',
-                          '(0, 1, 0) = {4 5 6 7}',
-                          '(0, 2, 0) = {8 9 10 11}',
-                          '(0, 3, 0) = {12 13 14 15}',
-                          '(0, 4, 0) = {16 17 18 19}',
-                          '(0, 5, 0) = {20 21 22 23}'])
-
-    def test_dump_bool(self):
-        self.try_command('language renderscript allocation dump 19',
-                         ['(0, 0, 0) = false',
-                          '(1, 0, 0) = true',
-                          '(2, 0, 0) = false',
-                          '(3, 0, 0) = true',
-                          '(4, 0, 0) = false',
-                          '(5, 0, 0) = true',
-                          '(6, 0, 0) = false',
-                          '(7, 0, 0) = true',
-                          '(8, 0, 0) = false',
-                          '(9, 0, 0) = true',
-                          '(10, 0, 0) = false',
-                          '(11, 0, 0) = true',
-                          '(12, 0, 0) = false',
-                          '(13, 0, 0) = true',
-                          '(14, 0, 0) = false',
-                          '(15, 0, 0) = true',
-                          '(16, 0, 0) = false',
-                          '(17, 0, 0) = true',
-                          '(18, 0, 0) = false',
-                          '(19, 0, 0) = true',
-                          '(20, 0, 0) = false',
-                          '(21, 0, 0) = true',
-                          '(22, 0, 0) = false',
-                          '(23, 0, 0) = true'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup_breakpoints(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_allocation_dump_2.py b/tests/lldb/tests/testcases/test_allocation_dump_2.py
deleted file mode 100644
index 13123ec..0000000
--- a/tests/lldb/tests/testcases/test_allocation_dump_2.py
+++ /dev/null
@@ -1,604 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationDump2'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test
-)
-
-
-class TestAllocationDump2(TestBaseRemote):
-    '''Tests printing the contents of allocations.'''
-
-    bundle_target = {
-        'java': 'Allocations'
-    }
-
-    @wimpy
-    @ordered_test(0)
-    def test_allocation_dump1(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript kernel breakpoint all enable',
-            ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint del 1',
-                         ['1 breakpoints deleted'])
-
-        # Hit second kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # uchar
-        self.try_command('language renderscript allocation dump 20',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-    @ordered_test(1)
-    def test_allocation_dump_unsigned_types(self):
-        # uchar2
-        self.try_command('language renderscript allocation dump 21',
-                         ['(0, 0, 0) = {0x00 0x01}',
-                          '(1, 0, 0) = {0x02 0x03}',
-                          '(0, 1, 0) = {0x04 0x05}',
-                          '(1, 1, 0) = {0x06 0x07}',
-                          '(0, 2, 0) = {0x08 0x09}',
-                          '(1, 2, 0) = {0x0a 0x0b}',
-                          '(0, 3, 0) = {0x0c 0x0d}',
-                          '(1, 3, 0) = {0x0e 0x0f}',
-                          '(0, 4, 0) = {0x10 0x11}',
-                          '(1, 4, 0) = {0x12 0x13}',
-                          '(0, 5, 0) = {0x14 0x15}',
-                          '(1, 5, 0) = {0x16 0x17}'])
-
-        # uchar3
-        self.try_command('language renderscript allocation dump 22',
-                         ['(0, 0, 0) = {0x00 0x01 0x02}',
-                          '(1, 0, 0) = {0x04 0x05 0x06}',
-                          '(2, 0, 0) = {0x08 0x09 0x0a}',
-                          '(3, 0, 0) = {0x0c 0x0d 0x0e}',
-                          '(4, 0, 0) = {0x10 0x11 0x12}',
-                          '(5, 0, 0) = {0x14 0x15 0x16}'])
-
-        # uchar4
-        self.try_command('language renderscript allocation dump 23',
-                         ['(0, 0, 0) = {0x00 0x01 0x02 0x03}',
-                          '(1, 0, 0) = {0x04 0x05 0x06 0x07}',
-                          '(2, 0, 0) = {0x08 0x09 0x0a 0x0b}',
-                          '(3, 0, 0) = {0x0c 0x0d 0x0e 0x0f}',
-                          '(4, 0, 0) = {0x10 0x11 0x12 0x13}',
-                          '(5, 0, 0) = {0x14 0x15 0x16 0x17}'])
-
-        # ushort
-        self.try_command('language renderscript allocation dump 24',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # ushort2
-        self.try_command('language renderscript allocation dump 25',
-                         ['(0, 0, 0) = {0x0000 0x0001}',
-                          '(1, 0, 0) = {0x0002 0x0003}',
-                          '(2, 0, 0) = {0x0004 0x0005}',
-                          '(3, 0, 0) = {0x0006 0x0007}',
-                          '(4, 0, 0) = {0x0008 0x0009}',
-                          '(5, 0, 0) = {0x000a 0x000b}',
-                          '(6, 0, 0) = {0x000c 0x000d}',
-                          '(7, 0, 0) = {0x000e 0x000f}',
-                          '(8, 0, 0) = {0x0010 0x0011}',
-                          '(9, 0, 0) = {0x0012 0x0013}',
-                          '(10, 0, 0) = {0x0014 0x0015}',
-                          '(11, 0, 0) = {0x0016 0x0017}'])
-
-        # ushort3
-        self.try_command('language renderscript allocation dump 26',
-                         ['(0, 0, 0) = {0x0000 0x0001 0x0002}',
-                          '(0, 1, 0) = {0x0004 0x0005 0x0006}',
-                          '(0, 2, 0) = {0x0008 0x0009 0x000a}',
-                          '(0, 3, 0) = {0x000c 0x000d 0x000e}',
-                          '(0, 4, 0) = {0x0010 0x0011 0x0012}',
-                          '(0, 5, 0) = {0x0014 0x0015 0x0016}'])
-
-        # ushort4
-        self.try_command('language renderscript allocation dump 27',
-                         ['(0, 0, 0) = {0x0000 0x0001 0x0002 0x0003}',
-                          '(1, 0, 0) = {0x0004 0x0005 0x0006 0x0007}',
-                          '(2, 0, 0) = {0x0008 0x0009 0x000a 0x000b}',
-                          '(3, 0, 0) = {0x000c 0x000d 0x000e 0x000f}',
-                          '(4, 0, 0) = {0x0010 0x0011 0x0012 0x0013}',
-                          '(5, 0, 0) = {0x0014 0x0015 0x0016 0x0017}'])
-
-        # uint
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # uint2
-        self.try_command('language renderscript allocation dump 29',
-                         ['(0, 0, 0) = {0x00000000 0x00000001}',
-                          '(1, 0, 0) = {0x00000002 0x00000003}',
-                          '(2, 0, 0) = {0x00000004 0x00000005}',
-                          '(3, 0, 0) = {0x00000006 0x00000007}',
-                          '(4, 0, 0) = {0x00000008 0x00000009}',
-                          '(5, 0, 0) = {0x0000000a 0x0000000b}',
-                          '(6, 0, 0) = {0x0000000c 0x0000000d}',
-                          '(7, 0, 0) = {0x0000000e 0x0000000f}',
-                          '(8, 0, 0) = {0x00000010 0x00000011}',
-                          '(9, 0, 0) = {0x00000012 0x00000013}',
-                          '(10, 0, 0) = {0x00000014 0x00000015}',
-                          '(11, 0, 0) = {0x00000016 0x00000017}'])
-
-        # uint3
-        self.try_command('language renderscript allocation dump 30',
-                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002}',
-                          '(1, 0, 0) = {0x00000004 0x00000005 0x00000006}',
-                          '(2, 0, 0) = {0x00000008 0x00000009 0x0000000a}',
-                          '(3, 0, 0) = {0x0000000c 0x0000000d 0x0000000e}',
-                          '(4, 0, 0) = {0x00000010 0x00000011 0x00000012}',
-                          '(5, 0, 0) = {0x00000014 0x00000015 0x00000016}'])
-
-        # uint4
-        self.try_command('language renderscript allocation dump 31',
-                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002 0x00000003}',
-                          '(0, 0, 1) = {0x00000004 0x00000005 0x00000006 0x00000007}',
-                          '(0, 0, 2) = {0x00000008 0x00000009 0x0000000a 0x0000000b}',
-                          '(0, 0, 3) = {0x0000000c 0x0000000d 0x0000000e 0x0000000f}',
-                          '(0, 0, 4) = {0x00000010 0x00000011 0x00000012 0x00000013}',
-                          '(0, 0, 5) = {0x00000014 0x00000015 0x00000016 0x00000017}'])
-
-        # ulong
-        self.try_command('language renderscript allocation dump 32',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(0, 1, 0) = 4',
-                          '(1, 1, 0) = 5',
-                          '(2, 1, 0) = 6',
-                          '(3, 1, 0) = 7',
-                          '(0, 2, 0) = 8',
-                          '(1, 2, 0) = 9',
-                          '(2, 2, 0) = 10',
-                          '(3, 2, 0) = 11',
-                          '(0, 0, 1) = 12',
-                          '(1, 0, 1) = 13',
-                          '(2, 0, 1) = 14',
-                          '(3, 0, 1) = 15',
-                          '(0, 1, 1) = 16',
-                          '(1, 1, 1) = 17',
-                          '(2, 1, 1) = 18',
-                          '(3, 1, 1) = 19',
-                          '(0, 2, 1) = 20',
-                          '(1, 2, 1) = 21',
-                          '(2, 2, 1) = 22',
-                          '(3, 2, 1) = 23'])
-
-        # ulong2
-        self.try_command('language renderscript allocation dump 33',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001}',
-                          '(1, 0, 0) = {0x0000000000000002 0x0000000000000003}',
-                          '(2, 0, 0) = {0x0000000000000004 0x0000000000000005}',
-                          '(3, 0, 0) = {0x0000000000000006 0x0000000000000007}',
-                          '(4, 0, 0) = {0x0000000000000008 0x0000000000000009}',
-                          '(5, 0, 0) = {0x000000000000000a 0x000000000000000b}',
-                          '(6, 0, 0) = {0x000000000000000c 0x000000000000000d}',
-                          '(7, 0, 0) = {0x000000000000000e 0x000000000000000f}',
-                          '(8, 0, 0) = {0x0000000000000010 0x0000000000000011}',
-                          '(9, 0, 0) = {0x0000000000000012 0x0000000000000013}',
-                          '(10, 0, 0) = {0x0000000000000014 0x0000000000000015}',
-                          '(11, 0, 0) = {0x0000000000000016 0x0000000000000017}'])
-
-        # ulong3
-        self.try_command('language renderscript allocation dump 34',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 0x0000000000000002}',
-                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 0x0000000000000006}',
-                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 0x000000000000000a}',
-                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d 0x000000000000000e}',
-                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 0x0000000000000012}',
-                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 0x0000000000000016}'])
-
-        # ulong4
-        self.try_command('language renderscript allocation dump 35',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 '
-                                       '0x0000000000000002 0x0000000000000003}',
-                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 '
-                                       '0x0000000000000006 0x0000000000000007}',
-                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 '
-                                       '0x000000000000000a 0x000000000000000b}',
-                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d '
-                                       '0x000000000000000e 0x000000000000000f}',
-                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 '
-                                       '0x0000000000000012 0x0000000000000013}',
-                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 '
-                                       '0x0000000000000016 0x0000000000000017}'])
-
-    @wimpy
-    @ordered_test(3)
-    def test_dump_square_kernel(self):
-        self.try_command('breakpoint del 2',
-                         ['1 breakpoints deleted'])
-
-        # Hit third kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Test that uint allocation has been squared by square_kernel
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 4',
-                          '(3, 0, 0) = 9',
-                          '(4, 0, 0) = 16',
-                          '(5, 0, 0) = 25',
-                          '(6, 0, 0) = 36',
-                          '(7, 0, 0) = 49',
-                          '(8, 0, 0) = 64',
-                          '(9, 0, 0) = 81',
-                          '(10, 0, 0) = 100',
-                          '(11, 0, 0) = 121',
-                          '(12, 0, 0) = 144',
-                          '(13, 0, 0) = 169',
-                          '(14, 0, 0) = 196',
-                          '(15, 0, 0) = 225',
-                          '(16, 0, 0) = 256',
-                          '(17, 0, 0) = 289',
-                          '(18, 0, 0) = 324',
-                          '(19, 0, 0) = 361',
-                          '(20, 0, 0) = 400',
-                          '(21, 0, 0) = 441',
-                          '(22, 0, 0) = 484',
-                          '(23, 0, 0) = 529'])
-
-    @ordered_test(4)
-    def test_alloction_dump_floating_types(self):
-        # half
-        self.try_command('language renderscript allocation dump 36',
-                         ['(0, 0, 0) = 1',
-                          '(1, 0, 0) = 1.00098',
-                          '(2, 0, 0) = 1.00195',
-                          '(3, 0, 0) = 1.00293',
-                          '(4, 0, 0) = 1.00391',
-                          '(5, 0, 0) = 1.00488',
-                          '(6, 0, 0) = 1.00586',
-                          '(7, 0, 0) = 1.00684',
-                          '(8, 0, 0) = 1.00781',
-                          '(9, 0, 0) = 1.00879',
-                          '(10, 0, 0) = 1.00977',
-                          '(11, 0, 0) = 1.01074',
-                          '(12, 0, 0) = 1.01172',
-                          '(13, 0, 0) = 1.0127',
-                          '(14, 0, 0) = 1.01367',
-                          '(15, 0, 0) = 1.01465',
-                          '(16, 0, 0) = 1.0156',
-                          '(17, 0, 0) = 1.0166',
-                          '(18, 0, 0) = 1.01758',
-                          '(19, 0, 0) = 1.01855',
-                          '(20, 0, 0) = 1.01953',
-                          '(21, 0, 0) = 1.02051',
-                          '(22, 0, 0) = 1.02148',
-                          '(23, 0, 0) = 1.02246'])
-
-        # half2
-        self.try_command('language renderscript allocation dump 37',
-                         ['(0, 0, 0) = {1 1.00098}',
-                          '(1, 0, 0) = {1.00195 1.00293}',
-                          '(2, 0, 0) = {1.00391 1.00488}',
-                          '(3, 0, 0) = {1.00586 1.00684}',
-                          '(4, 0, 0) = {1.00781 1.00879}',
-                          '(5, 0, 0) = {1.00977 1.01074}',
-                          '(6, 0, 0) = {1.01172 1.0127}',
-                          '(7, 0, 0) = {1.01367 1.01465}',
-                          '(9, 0, 0) = {1.01758 1.01855}',
-                          '(10, 0, 0) = {1.01953 1.02051}',
-                          '(11, 0, 0) = {1.02148 1.02246}'],
-                          [r'\(8, 0, 0\) = \{1\.0156[23] 1\.0166\}'])
-
-        # half3
-        self.try_command('language renderscript allocation dump 38',
-                         ['(0, 0, 0) = {1 1.00098 1.00195}',
-                          '(0, 1, 0) = {1.00391 1.00488 1.00586}',
-                          '(0, 2, 0) = {1.00781 1.00879 1.00977}',
-                          '(0, 3, 0) = {1.01172 1.0127 1.01367}',
-                          '(0, 5, 0) = {1.01953 1.02051 1.02148}'],
-                        [r'\(0, 4, 0\) = \{1\.0156[23] 1\.0166 1\.01758\}'])
-
-        # half4
-        self.try_command('language renderscript allocation dump 39',
-                         ['(0, 0, 0) = {1 1.00098 1.00195 1.00293}',
-                          '(1, 0, 0) = {1.00391 1.00488 1.00586 1.00684}',
-                          '(2, 0, 0) = {1.00781 1.00879 1.00977 1.01074}',
-                          '(3, 0, 0) = {1.01172 1.0127 1.01367 1.01465}',
-                          '(5, 0, 0) = {1.01953 1.02051 1.02148 1.02246}'],
-                         [r'\(4, 0, 0\) = \{1\.0156[23] 1\.0166 1\.01758 1\.01855\}'])
-
-        # float
-        self.try_command('language renderscript allocation dump 40',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333333',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.2',
-                          '(6, 0, 0) = 0.166667',
-                          '(7, 0, 0) = 0.142857',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111111',
-                          '(10, 0, 0) = 0.1',
-                          '(11, 0, 0) = 0.0909091',
-                          '(12, 0, 0) = 0.0833333',
-                          '(13, 0, 0) = 0.0769231',
-                          '(14, 0, 0) = 0.0714286',
-                          '(15, 0, 0) = 0.0666667',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588235',
-                          '(18, 0, 0) = 0.0555556',
-                          '(19, 0, 0) = 0.0526316',
-                          '(20, 0, 0) = 0.05',
-                          '(21, 0, 0) = 0.047619',
-                          '(22, 0, 0) = 0.0454545',
-                          '(23, 0, 0) = 0.0434783'])
-
-        # float2
-        self.try_command('language renderscript allocation dump 41',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333333}',
-                          '(2, 0, 0) = {0.25 0.2}',
-                          '(3, 0, 0) = {0.166667 0.142857}',
-                          '(4, 0, 0) = {0.125 0.111111}',
-                          '(5, 0, 0) = {0.1 0.0909091}',
-                          '(6, 0, 0) = {0.0833333 0.0769231}',
-                          '(7, 0, 0) = {0.0714286 0.0666667}',
-                          '(8, 0, 0) = {0.0625 0.0588235}',
-                          '(9, 0, 0) = {0.0555556 0.0526316}',
-                          '(10, 0, 0) = {0.05 0.047619}',
-                          '(11, 0, 0) = {0.0454545 0.0434783}'])
-
-        # float3
-        self.try_command('language renderscript allocation dump 42',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(1, 0, 0) = {0.25 0.2 0.166667}',
-                          '(2, 0, 0) = {0.125 0.111111 0.1}',
-                          '(3, 0, 0) = {0.0833333 0.0769231 0.0714286}',
-                          '(4, 0, 0) = {0.0625 0.0588235 0.0555556}',
-                          '(5, 0, 0) = {0.05 0.047619 0.0454545}'])
-
-        # float4
-        self.try_command('language renderscript allocation dump 43',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333333}',
-                          '(1, 0, 0) = {0.25 0.2 0.166667 0.142857}',
-                          '(2, 0, 0) = {0.125 0.111111 0.1 0.0909091}',
-                          '(0, 1, 0) = {0.0833333 0.0769231 0.0714286 0.0666667}',
-                          '(1, 1, 0) = {0.0625 0.0588235 0.0555556 0.0526316}',
-                          '(2, 1, 0) = {0.05 0.047619 0.0454545 0.0434783}'])
-
-        # double
-        self.try_command('language renderscript allocation dump 44',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333333333333333',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.2',
-                          '(6, 0, 0) = 0.166666666666667',
-                          '(7, 0, 0) = 0.142857142857143',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111111111111111',
-                          '(10, 0, 0) = 0.1',
-                          '(11, 0, 0) = 0.0909090909090909',
-                          '(12, 0, 0) = 0.0833333333333333',
-                          '(13, 0, 0) = 0.0769230769230769',
-                          '(14, 0, 0) = 0.0714285714285714',
-                          '(15, 0, 0) = 0.0666666666666667',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588235294117647',
-                          '(18, 0, 0) = 0.0555555555555556',
-                          '(19, 0, 0) = 0.0526315789473684',
-                          '(20, 0, 0) = 0.05',
-                          '(21, 0, 0) = 0.0476190476190476',
-                          '(22, 0, 0) = 0.0454545454545455',
-                          '(23, 0, 0) = 0.0434782608695652'])
-
-        # double2
-        self.try_command('language renderscript allocation dump 45',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333333333333333}',
-                          '(2, 0, 0) = {0.25 0.2}',
-                          '(3, 0, 0) = {0.166666666666667 0.142857142857143}',
-                          '(0, 0, 1) = {0.125 0.111111111111111}',
-                          '(1, 0, 1) = {0.1 0.0909090909090909}',
-                          '(2, 0, 1) = {0.0833333333333333 0.0769230769230769}',
-                          '(3, 0, 1) = {0.0714285714285714 0.0666666666666667}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647}',
-                          '(1, 0, 2) = {0.0555555555555556 0.0526315789473684}',
-                          '(2, 0, 2) = {0.05 0.0476190476190476}',
-                          '(3, 0, 2) = {0.0454545454545455 0.0434782608695652}'])
-
-        # double3
-        self.try_command('language renderscript allocation dump 46',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(0, 1, 0) = {0.25 0.2 0.166666666666667}',
-                          '(0, 0, 1) = {0.125 0.111111111111111 0.1}',
-                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
-                                       '0.0714285714285714}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647 0.0555555555555556}',
-                          '(0, 1, 2) = {0.05 0.0476190476190476 0.0454545454545455}'])
-
-        # double4
-        self.try_command('language renderscript allocation dump 47',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333333333333333}',
-                          '(0, 1, 0) = {0.25 0.2 0.166666666666667 0.142857142857143}',
-                          '(0, 0, 1) = {0.125 0.111111111111111 0.1 0.0909090909090909}',
-                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
-                                       '0.0714285714285714 0.0666666666666667}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647 '
-                                       '0.0555555555555556 0.0526315789473684}',
-                          '(0, 1, 2) = {0.05 0.0476190476190476 '
-                                       '0.0454545454545455 0.0434782608695652}'])
-
-    @wimpy
-    @ordered_test(5)
-    def test_allocation_dump_half_kernel(self):
-        # Delete kernel breakpoint on add_half_kernel
-        self.try_command('breakpoint del 3',
-                         ['1 breakpoints deleted'])
-
-        # Hit struct_kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Double 3 has been modified by add_half_kernel
-        self.try_command('language renderscript allocation dump 46',
-                         ['(0, 0, 0) = {inf 1.5 1}',
-                          '(0, 1, 0) = {0.75 0.7 0.666666666666667}',
-                          '(0, 0, 1) = {0.625 0.611111111111111 0.6}',
-                          '(0, 1, 1) = {0.583333333333333 0.576923076923077 0.571428571428571}',
-                          '(0, 0, 2) = {0.5625 0.558823529411765 0.555555555555556}',
-                          '(0, 1, 2) = {0.55 0.547619047619048 0.545454545454545}'])
-
-        # Floating point allocation data should have been overwritten
-        self.try_command('language renderscript allocation dump 40',
-                         ['(0, 0, 0) = -inf',
-                          '(1, 0, 0) = -1',
-                          '(2, 0, 0) = -0.5',
-                          '(3, 0, 0) = -0.333333',
-                          '(4, 0, 0) = -0.25',
-                          '(5, 0, 0) = -0.2',
-                          '(6, 0, 0) = -0.166667',
-                          '(7, 0, 0) = -0.142857',
-                          '(8, 0, 0) = -0.125',
-                          '(9, 0, 0) = -0.111111',
-                          '(10, 0, 0) = -0.1',
-                          '(11, 0, 0) = -0.0909091',
-                          '(12, 0, 0) = -0.0833333',
-                          '(13, 0, 0) = -0.0769231',
-                          '(14, 0, 0) = -0.0714286',
-                          '(15, 0, 0) = -0.0666667',
-                          '(16, 0, 0) = -0.0625',
-                          '(17, 0, 0) = -0.0588235',
-                          '(18, 0, 0) = -0.0555556',
-                          '(19, 0, 0) = -0.0526316',
-                          '(20, 0, 0) = -0.05',
-                          '(21, 0, 0) = -0.047619',
-                          '(22, 0, 0) = -0.0454545',
-                          '(23, 0, 0) = -0.0434783'])
-
-        self.try_command('language renderscript allocation dump 41',
-                         ['(0, 0, 0) = {-inf -1}',
-                          '(1, 0, 0) = {-0.5 -0.333333}',
-                          '(2, 0, 0) = {-0.25 -0.2}',
-                          '(3, 0, 0) = {-0.166667 -0.142857}',
-                          '(4, 0, 0) = {-0.125 -0.111111}',
-                          '(5, 0, 0) = {-0.1 -0.0909091}',
-                          '(6, 0, 0) = {-0.0833333 -0.0769231}',
-                          '(7, 0, 0) = {-0.0714286 -0.0666667}',
-                          '(8, 0, 0) = {-0.0625 -0.0588235}',
-                          '(9, 0, 0) = {-0.0555556 -0.0526316}',
-                          '(10, 0, 0) = {-0.05 -0.047619}',
-                          '(11, 0, 0) = {-0.0454545 -0.0434783}'])
-
-        self.try_command('language renderscript allocation dump 42',
-                         ['(0, 0, 0) = {-inf -1 -0.5}',
-                          '(1, 0, 0) = {-0.25 -0.2 -0.166667}',
-                          '(2, 0, 0) = {-0.125 -0.111111 -0.1}',
-                          '(3, 0, 0) = {-0.0833333 -0.0769231 -0.0714286}',
-                          '(4, 0, 0) = {-0.0625 -0.0588235 -0.0555556}',
-                          '(5, 0, 0) = {-0.05 -0.047619 -0.0454545}'])
-
-        self.try_command('language renderscript allocation dump 43',
-                         ['(0, 0, 0) = {-inf -1 -0.5 -0.333333}',
-                          '(1, 0, 0) = {-0.25 -0.2 -0.166667 -0.142857}',
-                          '(2, 0, 0) = {-0.125 -0.111111 -0.1 -0.0909091}',
-                          '(0, 1, 0) = {-0.0833333 -0.0769231 -0.0714286 -0.0666667}',
-                          '(1, 1, 0) = {-0.0625 -0.0588235 -0.0555556 -0.0526316}',
-                          '(2, 1, 0) = {-0.05 -0.047619 -0.0454545 -0.0434783}'])
diff --git a/tests/lldb/tests/testcases/test_allocation_dump_2_cpp.py b/tests/lldb/tests/testcases/test_allocation_dump_2_cpp.py
deleted file mode 100644
index b103ccb..0000000
--- a/tests/lldb/tests/testcases/test_allocation_dump_2_cpp.py
+++ /dev/null
@@ -1,525 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationDump2Cpp.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-
-
-class TestAllocationDump2Cpp(TestBaseRemote):
-    '''Tests printing the contents of allocations in an NDK app.'''
-
-    bundle_target = {
-        'cpp': 'CppAllocations'
-    }
-
-    def test_case(self):
-        '''Run the lldb commands that are being tested.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-            output.
-        '''
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint del 1',
-                         ['1 breakpoints deleted'])
-
-        # Hit second kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # uchar
-        self.try_command('language renderscript allocation dump 20',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # uchar2
-        self.try_command('language renderscript allocation dump 21',
-                         ['(0, 0, 0) = {0x00 0x01}',
-                          '(1, 0, 0) = {0x02 0x03}',
-                          '(0, 1, 0) = {0x04 0x05}',
-                          '(1, 1, 0) = {0x06 0x07}',
-                          '(0, 2, 0) = {0x08 0x09}',
-                          '(1, 2, 0) = {0x0a 0x0b}',
-                          '(0, 3, 0) = {0x0c 0x0d}',
-                          '(1, 3, 0) = {0x0e 0x0f}',
-                          '(0, 4, 0) = {0x10 0x11}',
-                          '(1, 4, 0) = {0x12 0x13}',
-                          '(0, 5, 0) = {0x14 0x15}',
-                          '(1, 5, 0) = {0x16 0x17}'])
-
-        # uchar3
-        self.try_command('language renderscript allocation dump 22',
-                         ['(0, 0, 0) = {0x00 0x01 0x02}',
-                          '(1, 0, 0) = {0x04 0x05 0x06}',
-                          '(2, 0, 0) = {0x08 0x09 0x0a}',
-                          '(3, 0, 0) = {0x0c 0x0d 0x0e}',
-                          '(4, 0, 0) = {0x10 0x11 0x12}',
-                          '(5, 0, 0) = {0x14 0x15 0x16}'])
-
-        # uchar4
-        self.try_command('language renderscript allocation dump 23',
-                         ['(0, 0, 0) = {0x00 0x01 0x02 0x03}',
-                          '(1, 0, 0) = {0x04 0x05 0x06 0x07}',
-                          '(2, 0, 0) = {0x08 0x09 0x0a 0x0b}',
-                          '(3, 0, 0) = {0x0c 0x0d 0x0e 0x0f}',
-                          '(4, 0, 0) = {0x10 0x11 0x12 0x13}',
-                          '(5, 0, 0) = {0x14 0x15 0x16 0x17}'])
-
-        # ushort
-        self.try_command('language renderscript allocation dump 24',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # ushort2
-        self.try_command('language renderscript allocation dump 25',
-                         ['(0, 0, 0) = {0x0000 0x0001}',
-                          '(1, 0, 0) = {0x0002 0x0003}',
-                          '(2, 0, 0) = {0x0004 0x0005}',
-                          '(3, 0, 0) = {0x0006 0x0007}',
-                          '(4, 0, 0) = {0x0008 0x0009}',
-                          '(5, 0, 0) = {0x000a 0x000b}',
-                          '(6, 0, 0) = {0x000c 0x000d}',
-                          '(7, 0, 0) = {0x000e 0x000f}',
-                          '(8, 0, 0) = {0x0010 0x0011}',
-                          '(9, 0, 0) = {0x0012 0x0013}',
-                          '(10, 0, 0) = {0x0014 0x0015}',
-                          '(11, 0, 0) = {0x0016 0x0017}'])
-
-        # ushort3
-        self.try_command('language renderscript allocation dump 26',
-                         ['(0, 0, 0) = {0x0000 0x0001 0x0002}',
-                          '(0, 1, 0) = {0x0004 0x0005 0x0006}',
-                          '(0, 2, 0) = {0x0008 0x0009 0x000a}',
-                          '(0, 3, 0) = {0x000c 0x000d 0x000e}',
-                          '(0, 4, 0) = {0x0010 0x0011 0x0012}',
-                          '(0, 5, 0) = {0x0014 0x0015 0x0016}'])
-
-        # ushort4
-        self.try_command('language renderscript allocation dump 27',
-                         ['(0, 0, 0) = {0x0000 0x0001 0x0002 0x0003}',
-                          '(1, 0, 0) = {0x0004 0x0005 0x0006 0x0007}',
-                          '(2, 0, 0) = {0x0008 0x0009 0x000a 0x000b}',
-                          '(3, 0, 0) = {0x000c 0x000d 0x000e 0x000f}',
-                          '(4, 0, 0) = {0x0010 0x0011 0x0012 0x0013}',
-                          '(5, 0, 0) = {0x0014 0x0015 0x0016 0x0017}'])
-
-        # uint
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # uint2
-        self.try_command('language renderscript allocation dump 29',
-                         ['(0, 0, 0) = {0x00000000 0x00000001}',
-                          '(1, 0, 0) = {0x00000002 0x00000003}',
-                          '(2, 0, 0) = {0x00000004 0x00000005}',
-                          '(3, 0, 0) = {0x00000006 0x00000007}',
-                          '(4, 0, 0) = {0x00000008 0x00000009}',
-                          '(5, 0, 0) = {0x0000000a 0x0000000b}',
-                          '(6, 0, 0) = {0x0000000c 0x0000000d}',
-                          '(7, 0, 0) = {0x0000000e 0x0000000f}',
-                          '(8, 0, 0) = {0x00000010 0x00000011}',
-                          '(9, 0, 0) = {0x00000012 0x00000013}',
-                          '(10, 0, 0) = {0x00000014 0x00000015}',
-                          '(11, 0, 0) = {0x00000016 0x00000017}'])
-
-        # uint3
-        self.try_command('language renderscript allocation dump 30',
-                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002}',
-                          '(1, 0, 0) = {0x00000004 0x00000005 0x00000006}',
-                          '(2, 0, 0) = {0x00000008 0x00000009 0x0000000a}',
-                          '(3, 0, 0) = {0x0000000c 0x0000000d 0x0000000e}',
-                          '(4, 0, 0) = {0x00000010 0x00000011 0x00000012}',
-                          '(5, 0, 0) = {0x00000014 0x00000015 0x00000016}'])
-
-        # uint4
-        self.try_command('language renderscript allocation dump 31',
-                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002 0x00000003}',
-                          '(0, 0, 1) = {0x00000004 0x00000005 0x00000006 0x00000007}',
-                          '(0, 0, 2) = {0x00000008 0x00000009 0x0000000a 0x0000000b}',
-                          '(0, 0, 3) = {0x0000000c 0x0000000d 0x0000000e 0x0000000f}',
-                          '(0, 0, 4) = {0x00000010 0x00000011 0x00000012 0x00000013}',
-                          '(0, 0, 5) = {0x00000014 0x00000015 0x00000016 0x00000017}'])
-
-        # ulong
-        self.try_command('language renderscript allocation dump 32',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(0, 1, 0) = 4',
-                          '(1, 1, 0) = 5',
-                          '(2, 1, 0) = 6',
-                          '(3, 1, 0) = 7',
-                          '(0, 2, 0) = 8',
-                          '(1, 2, 0) = 9',
-                          '(2, 2, 0) = 10',
-                          '(3, 2, 0) = 11',
-                          '(0, 0, 1) = 12',
-                          '(1, 0, 1) = 13',
-                          '(2, 0, 1) = 14',
-                          '(3, 0, 1) = 15',
-                          '(0, 1, 1) = 16',
-                          '(1, 1, 1) = 17',
-                          '(2, 1, 1) = 18',
-                          '(3, 1, 1) = 19',
-                          '(0, 2, 1) = 20',
-                          '(1, 2, 1) = 21',
-                          '(2, 2, 1) = 22',
-                          '(3, 2, 1) = 23'])
-
-        # ulong2
-        self.try_command('language renderscript allocation dump 33',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001}',
-                          '(1, 0, 0) = {0x0000000000000002 0x0000000000000003}',
-                          '(2, 0, 0) = {0x0000000000000004 0x0000000000000005}',
-                          '(3, 0, 0) = {0x0000000000000006 0x0000000000000007}',
-                          '(4, 0, 0) = {0x0000000000000008 0x0000000000000009}',
-                          '(5, 0, 0) = {0x000000000000000a 0x000000000000000b}',
-                          '(6, 0, 0) = {0x000000000000000c 0x000000000000000d}',
-                          '(7, 0, 0) = {0x000000000000000e 0x000000000000000f}',
-                          '(8, 0, 0) = {0x0000000000000010 0x0000000000000011}',
-                          '(9, 0, 0) = {0x0000000000000012 0x0000000000000013}',
-                          '(10, 0, 0) = {0x0000000000000014 0x0000000000000015}',
-                          '(11, 0, 0) = {0x0000000000000016 0x0000000000000017}'])
-
-        # ulong3
-        self.try_command('language renderscript allocation dump 34',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 0x0000000000000002}',
-                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 0x0000000000000006}',
-                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 0x000000000000000a}',
-                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d 0x000000000000000e}',
-                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 0x0000000000000012}',
-                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 0x0000000000000016}'])
-
-        # ulong4
-        self.try_command('language renderscript allocation dump 35',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 '
-                                       '0x0000000000000002 0x0000000000000003}',
-                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 '
-                                       '0x0000000000000006 0x0000000000000007}',
-                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 '
-                                       '0x000000000000000a 0x000000000000000b}',
-                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d '
-                                       '0x000000000000000e 0x000000000000000f}',
-                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 '
-                                       '0x0000000000000012 0x0000000000000013}',
-                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 '
-                                       '0x0000000000000016 0x0000000000000017}'])
-
-        self.try_command('breakpoint del 2',
-                         ['1 breakpoints deleted'])
-
-        # Hit third kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Test that uint allocation has been squared by square_kernel
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 4',
-                          '(3, 0, 0) = 9',
-                          '(4, 0, 0) = 16',
-                          '(5, 0, 0) = 25',
-                          '(6, 0, 0) = 36',
-                          '(7, 0, 0) = 49',
-                          '(8, 0, 0) = 64',
-                          '(9, 0, 0) = 81',
-                          '(10, 0, 0) = 100',
-                          '(11, 0, 0) = 121',
-                          '(12, 0, 0) = 144',
-                          '(13, 0, 0) = 169',
-                          '(14, 0, 0) = 196',
-                          '(15, 0, 0) = 225',
-                          '(16, 0, 0) = 256',
-                          '(17, 0, 0) = 289',
-                          '(18, 0, 0) = 324',
-                          '(19, 0, 0) = 361',
-                          '(20, 0, 0) = 400',
-                          '(21, 0, 0) = 441',
-                          '(22, 0, 0) = 484',
-                          '(23, 0, 0) = 529'])
-
-        # half
-        self.try_command('language renderscript allocation dump 36',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333252',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.199951',
-                          '(6, 0, 0) = 0.166626',
-                          '(7, 0, 0) = 0.142822',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111084',
-                          '(10, 0, 0) = 0.0999756',
-                          '(11, 0, 0) = 0.0908813',
-                          '(12, 0, 0) = 0.083313',
-                          '(13, 0, 0) = 0.0769043',
-                          '(14, 0, 0) = 0.0714111',
-                          '(15, 0, 0) = 0.0666504',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588379',
-                          '(18, 0, 0) = 0.055542',
-                          '(19, 0, 0) = 0.0526428',
-                          '(20, 0, 0) = 0.0499878',
-                          '(21, 0, 0) = 0.0476074',
-                          '(22, 0, 0) = 0.0454407',
-                          '(23, 0, 0) = 0.0434875'])
-
-        # half2
-        self.try_command('language renderscript allocation dump 37',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333252}',
-                          '(2, 0, 0) = {0.25 0.199951}',
-                          '(3, 0, 0) = {0.166626 0.142822}',
-                          '(4, 0, 0) = {0.125 0.111084}',
-                          '(5, 0, 0) = {0.0999756 0.0908813}',
-                          '(6, 0, 0) = {0.083313 0.0769043}',
-                          '(7, 0, 0) = {0.0714111 0.0666504}',
-                          '(8, 0, 0) = {0.0625 0.0588379}',
-                          '(9, 0, 0) = {0.055542 0.0526428}',
-                          '(10, 0, 0) = {0.0499878 0.0476074}',
-                          '(11, 0, 0) = {0.0454407 0.0434875}'])
-
-        # half3
-        self.try_command('language renderscript allocation dump 38',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(0, 1, 0) = {0.25 0.199951 0.166626}',
-                          '(0, 2, 0) = {0.125 0.111084 0.0999756}',
-                          '(0, 3, 0) = {0.083313 0.0769043 0.0714111}',
-                          '(0, 4, 0) = {0.0625 0.0588379 0.055542}',
-                          '(0, 5, 0) = {0.0499878 0.0476074 0.0454407}'])
-
-        # half4
-        self.try_command('language renderscript allocation dump 39',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333252}',
-                          '(1, 0, 0) = {0.25 0.199951 0.166626 0.142822}',
-                          '(2, 0, 0) = {0.125 0.111084 0.0999756 0.0908813}',
-                          '(3, 0, 0) = {0.083313 0.0769043 0.0714111 0.0666504}',
-                          '(4, 0, 0) = {0.0625 0.0588379 0.055542 0.0526428}',
-                          '(5, 0, 0) = {0.0499878 0.0476074 0.0454407 0.0434875}'])
-
-        # float
-        self.try_command('language renderscript allocation dump 40',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333333',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.2',
-                          '(6, 0, 0) = 0.166667',
-                          '(7, 0, 0) = 0.142857',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111111',
-                          '(10, 0, 0) = 0.1',
-                          '(11, 0, 0) = 0.0909091',
-                          '(12, 0, 0) = 0.0833333',
-                          '(13, 0, 0) = 0.0769231',
-                          '(14, 0, 0) = 0.0714286',
-                          '(15, 0, 0) = 0.0666667',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588235',
-                          '(18, 0, 0) = 0.0555556',
-                          '(19, 0, 0) = 0.0526316',
-                          '(20, 0, 0) = 0.05',
-                          '(21, 0, 0) = 0.047619',
-                          '(22, 0, 0) = 0.0454545',
-                          '(23, 0, 0) = 0.0434783'])
-
-        # float2
-        self.try_command('language renderscript allocation dump 41',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333333}',
-                          '(2, 0, 0) = {0.25 0.2}',
-                          '(3, 0, 0) = {0.166667 0.142857}',
-                          '(4, 0, 0) = {0.125 0.111111}',
-                          '(5, 0, 0) = {0.1 0.0909091}',
-                          '(6, 0, 0) = {0.0833333 0.0769231}',
-                          '(7, 0, 0) = {0.0714286 0.0666667}',
-                          '(8, 0, 0) = {0.0625 0.0588235}',
-                          '(9, 0, 0) = {0.0555556 0.0526316}',
-                          '(10, 0, 0) = {0.05 0.047619}',
-                          '(11, 0, 0) = {0.0454545 0.0434783}'])
-
-        # float3
-        self.try_command('language renderscript allocation dump 42',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(1, 0, 0) = {0.25 0.2 0.166667}',
-                          '(2, 0, 0) = {0.125 0.111111 0.1}',
-                          '(3, 0, 0) = {0.0833333 0.0769231 0.0714286}',
-                          '(4, 0, 0) = {0.0625 0.0588235 0.0555556}',
-                          '(5, 0, 0) = {0.05 0.047619 0.0454545}'])
-
-        # float4
-        self.try_command('language renderscript allocation dump 43',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333333}',
-                          '(1, 0, 0) = {0.25 0.2 0.166667 0.142857}',
-                          '(2, 0, 0) = {0.125 0.111111 0.1 0.0909091}',
-                          '(0, 1, 0) = {0.0833333 0.0769231 0.0714286 0.0666667}',
-                          '(1, 1, 0) = {0.0625 0.0588235 0.0555556 0.0526316}',
-                          '(2, 1, 0) = {0.05 0.047619 0.0454545 0.0434783}'])
-
-        # double
-        self.try_command('language renderscript allocation dump 44',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333333333333333',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.2',
-                          '(6, 0, 0) = 0.166666666666667',
-                          '(7, 0, 0) = 0.142857142857143',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111111111111111',
-                          '(10, 0, 0) = 0.1',
-                          '(11, 0, 0) = 0.0909090909090909',
-                          '(12, 0, 0) = 0.0833333333333333',
-                          '(13, 0, 0) = 0.0769230769230769',
-                          '(14, 0, 0) = 0.0714285714285714',
-                          '(15, 0, 0) = 0.0666666666666667',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588235294117647',
-                          '(18, 0, 0) = 0.0555555555555556',
-                          '(19, 0, 0) = 0.0526315789473684',
-                          '(20, 0, 0) = 0.05',
-                          '(21, 0, 0) = 0.0476190476190476',
-                          '(22, 0, 0) = 0.0454545454545455',
-                          '(23, 0, 0) = 0.0434782608695652'])
-
-        # double2
-        self.try_command('language renderscript allocation dump 45',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333333333333333}',
-                          '(2, 0, 0) = {0.25 0.2}',
-                          '(3, 0, 0) = {0.166666666666667 0.142857142857143}',
-                          '(0, 0, 1) = {0.125 0.111111111111111}',
-                          '(1, 0, 1) = {0.1 0.0909090909090909}',
-                          '(2, 0, 1) = {0.0833333333333333 0.0769230769230769}',
-                          '(3, 0, 1) = {0.0714285714285714 0.0666666666666667}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647}',
-                          '(1, 0, 2) = {0.0555555555555556 0.0526315789473684}',
-                          '(2, 0, 2) = {0.05 0.0476190476190476}',
-                          '(3, 0, 2) = {0.0454545454545455 0.0434782608695652}'])
-
-        # double3
-        self.try_command('language renderscript allocation dump 46',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(0, 1, 0) = {0.25 0.2 0.166666666666667}',
-                          '(0, 0, 1) = {0.125 0.111111111111111 0.1}',
-                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
-                                       '0.0714285714285714}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647 0.0555555555555556}',
-                          '(0, 1, 2) = {0.05 0.0476190476190476 0.0454545454545455}'])
-
-        # double4
-        self.try_command('language renderscript allocation dump 47',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333333333333333}',
-                          '(0, 1, 0) = {0.25 0.2 0.166666666666667 0.142857142857143}',
-                          '(0, 0, 1) = {0.125 0.111111111111111 0.1 0.0909090909090909}',
-                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
-                                       '0.0714285714285714 0.0666666666666667}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647 '
-                                       '0.0555555555555556 0.0526315789473684}',
-                          '(0, 1, 2) = {0.05 0.0476190476190476 '
-                                       '0.0454545454545455 0.0434782608695652}'])
-
-        # Delete kernel breakpoint on add_half_kernel
-        self.try_command('breakpoint del 3',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_allocation_dump_2_jni.py b/tests/lldb/tests/testcases/test_allocation_dump_2_jni.py
deleted file mode 100644
index 41e33d3..0000000
--- a/tests/lldb/tests/testcases/test_allocation_dump_2_jni.py
+++ /dev/null
@@ -1,518 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationDump2JNI.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-
-
-class TestAllocationDump2JNI(TestBaseRemote):
-    '''Tests printing the contents of allocations of a JNI apk.'''
-
-    bundle_target = {
-        'jni': 'JNIAllocations'
-    }
-
-    def test_case(self):
-        '''Run the lldb commands that are being tested.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-            output.
-        '''
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint del 1',
-                         ['1 breakpoints deleted'])
-
-        # Hit second kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # uchar
-        self.try_command('language renderscript allocation dump 20',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # uchar2
-        self.try_command('language renderscript allocation dump 21',
-                         ['(0, 0, 0) = {0x00 0x01}',
-                          '(1, 0, 0) = {0x02 0x03}',
-                          '(0, 1, 0) = {0x04 0x05}',
-                          '(1, 1, 0) = {0x06 0x07}',
-                          '(0, 2, 0) = {0x08 0x09}',
-                          '(1, 2, 0) = {0x0a 0x0b}',
-                          '(0, 3, 0) = {0x0c 0x0d}',
-                          '(1, 3, 0) = {0x0e 0x0f}',
-                          '(0, 4, 0) = {0x10 0x11}',
-                          '(1, 4, 0) = {0x12 0x13}',
-                          '(0, 5, 0) = {0x14 0x15}',
-                          '(1, 5, 0) = {0x16 0x17}'])
-
-        # uchar3
-        self.try_command('language renderscript allocation dump 22',
-                         ['(0, 0, 0) = {0x00 0x01 0x02}',
-                          '(1, 0, 0) = {0x04 0x05 0x06}',
-                          '(2, 0, 0) = {0x08 0x09 0x0a}',
-                          '(3, 0, 0) = {0x0c 0x0d 0x0e}',
-                          '(4, 0, 0) = {0x10 0x11 0x12}',
-                          '(5, 0, 0) = {0x14 0x15 0x16}'])
-
-        # uchar4
-        self.try_command('language renderscript allocation dump 23',
-                         ['(0, 0, 0) = {0x00 0x01 0x02 0x03}',
-                          '(1, 0, 0) = {0x04 0x05 0x06 0x07}',
-                          '(2, 0, 0) = {0x08 0x09 0x0a 0x0b}',
-                          '(3, 0, 0) = {0x0c 0x0d 0x0e 0x0f}',
-                          '(4, 0, 0) = {0x10 0x11 0x12 0x13}',
-                          '(5, 0, 0) = {0x14 0x15 0x16 0x17}'])
-
-        # ushort
-        self.try_command('language renderscript allocation dump 24',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # ushort2
-        self.try_command('language renderscript allocation dump 25',
-                         ['(0, 0, 0) = {0x0000 0x0001}',
-                          '(1, 0, 0) = {0x0002 0x0003}',
-                          '(2, 0, 0) = {0x0004 0x0005}',
-                          '(3, 0, 0) = {0x0006 0x0007}',
-                          '(4, 0, 0) = {0x0008 0x0009}',
-                          '(5, 0, 0) = {0x000a 0x000b}',
-                          '(6, 0, 0) = {0x000c 0x000d}',
-                          '(7, 0, 0) = {0x000e 0x000f}',
-                          '(8, 0, 0) = {0x0010 0x0011}',
-                          '(9, 0, 0) = {0x0012 0x0013}',
-                          '(10, 0, 0) = {0x0014 0x0015}',
-                          '(11, 0, 0) = {0x0016 0x0017}'])
-
-        # ushort3
-        self.try_command('language renderscript allocation dump 26',
-                         ['(0, 0, 0) = {0x0000 0x0001 0x0002}',
-                          '(0, 1, 0) = {0x0004 0x0005 0x0006}',
-                          '(0, 2, 0) = {0x0008 0x0009 0x000a}',
-                          '(0, 3, 0) = {0x000c 0x000d 0x000e}',
-                          '(0, 4, 0) = {0x0010 0x0011 0x0012}',
-                          '(0, 5, 0) = {0x0014 0x0015 0x0016}'])
-
-        # ushort4
-        self.try_command('language renderscript allocation dump 27',
-                         ['(0, 0, 0) = {0x0000 0x0001 0x0002 0x0003}',
-                          '(1, 0, 0) = {0x0004 0x0005 0x0006 0x0007}',
-                          '(2, 0, 0) = {0x0008 0x0009 0x000a 0x000b}',
-                          '(3, 0, 0) = {0x000c 0x000d 0x000e 0x000f}',
-                          '(4, 0, 0) = {0x0010 0x0011 0x0012 0x0013}',
-                          '(5, 0, 0) = {0x0014 0x0015 0x0016 0x0017}'])
-
-        # uint
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-        # uint2
-        self.try_command('language renderscript allocation dump 29',
-                         ['(0, 0, 0) = {0x00000000 0x00000001}',
-                          '(1, 0, 0) = {0x00000002 0x00000003}',
-                          '(2, 0, 0) = {0x00000004 0x00000005}',
-                          '(3, 0, 0) = {0x00000006 0x00000007}',
-                          '(4, 0, 0) = {0x00000008 0x00000009}',
-                          '(5, 0, 0) = {0x0000000a 0x0000000b}',
-                          '(6, 0, 0) = {0x0000000c 0x0000000d}',
-                          '(7, 0, 0) = {0x0000000e 0x0000000f}',
-                          '(8, 0, 0) = {0x00000010 0x00000011}',
-                          '(9, 0, 0) = {0x00000012 0x00000013}',
-                          '(10, 0, 0) = {0x00000014 0x00000015}',
-                          '(11, 0, 0) = {0x00000016 0x00000017}'])
-
-        # uint3
-        self.try_command('language renderscript allocation dump 30',
-                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002}',
-                          '(1, 0, 0) = {0x00000004 0x00000005 0x00000006}',
-                          '(2, 0, 0) = {0x00000008 0x00000009 0x0000000a}',
-                          '(3, 0, 0) = {0x0000000c 0x0000000d 0x0000000e}',
-                          '(4, 0, 0) = {0x00000010 0x00000011 0x00000012}',
-                          '(5, 0, 0) = {0x00000014 0x00000015 0x00000016}'])
-
-        # uint4
-        self.try_command('language renderscript allocation dump 31',
-                         ['(0, 0, 0) = {0x00000000 0x00000001 0x00000002 0x00000003}',
-                          '(0, 0, 1) = {0x00000004 0x00000005 0x00000006 0x00000007}',
-                          '(0, 0, 2) = {0x00000008 0x00000009 0x0000000a 0x0000000b}',
-                          '(0, 0, 3) = {0x0000000c 0x0000000d 0x0000000e 0x0000000f}',
-                          '(0, 0, 4) = {0x00000010 0x00000011 0x00000012 0x00000013}',
-                          '(0, 0, 5) = {0x00000014 0x00000015 0x00000016 0x00000017}'])
-
-        # ulong
-        self.try_command('language renderscript allocation dump 32',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(0, 1, 0) = 4',
-                          '(1, 1, 0) = 5',
-                          '(2, 1, 0) = 6',
-                          '(3, 1, 0) = 7',
-                          '(0, 2, 0) = 8',
-                          '(1, 2, 0) = 9',
-                          '(2, 2, 0) = 10',
-                          '(3, 2, 0) = 11',
-                          '(0, 0, 1) = 12',
-                          '(1, 0, 1) = 13',
-                          '(2, 0, 1) = 14',
-                          '(3, 0, 1) = 15',
-                          '(0, 1, 1) = 16',
-                          '(1, 1, 1) = 17',
-                          '(2, 1, 1) = 18',
-                          '(3, 1, 1) = 19',
-                          '(0, 2, 1) = 20',
-                          '(1, 2, 1) = 21',
-                          '(2, 2, 1) = 22',
-                          '(3, 2, 1) = 23'])
-
-        # ulong2
-        self.try_command('language renderscript allocation dump 33',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001}',
-                          '(1, 0, 0) = {0x0000000000000002 0x0000000000000003}',
-                          '(2, 0, 0) = {0x0000000000000004 0x0000000000000005}',
-                          '(3, 0, 0) = {0x0000000000000006 0x0000000000000007}',
-                          '(4, 0, 0) = {0x0000000000000008 0x0000000000000009}',
-                          '(5, 0, 0) = {0x000000000000000a 0x000000000000000b}',
-                          '(6, 0, 0) = {0x000000000000000c 0x000000000000000d}',
-                          '(7, 0, 0) = {0x000000000000000e 0x000000000000000f}',
-                          '(8, 0, 0) = {0x0000000000000010 0x0000000000000011}',
-                          '(9, 0, 0) = {0x0000000000000012 0x0000000000000013}',
-                          '(10, 0, 0) = {0x0000000000000014 0x0000000000000015}',
-                          '(11, 0, 0) = {0x0000000000000016 0x0000000000000017}'])
-
-        # ulong3
-        self.try_command('language renderscript allocation dump 34',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 0x0000000000000002}',
-                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 0x0000000000000006}',
-                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 0x000000000000000a}',
-                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d 0x000000000000000e}',
-                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 0x0000000000000012}',
-                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 0x0000000000000016}'])
-
-        # ulong4
-        self.try_command('language renderscript allocation dump 35',
-                         ['(0, 0, 0) = {0x0000000000000000 0x0000000000000001 '
-                                       '0x0000000000000002 0x0000000000000003}',
-                          '(1, 0, 0) = {0x0000000000000004 0x0000000000000005 '
-                                       '0x0000000000000006 0x0000000000000007}',
-                          '(2, 0, 0) = {0x0000000000000008 0x0000000000000009 '
-                                       '0x000000000000000a 0x000000000000000b}',
-                          '(3, 0, 0) = {0x000000000000000c 0x000000000000000d '
-                                       '0x000000000000000e 0x000000000000000f}',
-                          '(4, 0, 0) = {0x0000000000000010 0x0000000000000011 '
-                                       '0x0000000000000012 0x0000000000000013}',
-                          '(5, 0, 0) = {0x0000000000000014 0x0000000000000015 '
-                                       '0x0000000000000016 0x0000000000000017}'])
-
-        self.try_command('breakpoint del 2',
-                         ['1 breakpoints deleted'])
-
-        # Hit third kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Test that uint allocation has been squared by square_kernel
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 4',
-                          '(3, 0, 0) = 9',
-                          '(4, 0, 0) = 16',
-                          '(5, 0, 0) = 25',
-                          '(6, 0, 0) = 36',
-                          '(7, 0, 0) = 49',
-                          '(8, 0, 0) = 64',
-                          '(9, 0, 0) = 81',
-                          '(10, 0, 0) = 100',
-                          '(11, 0, 0) = 121',
-                          '(12, 0, 0) = 144',
-                          '(13, 0, 0) = 169',
-                          '(14, 0, 0) = 196',
-                          '(15, 0, 0) = 225',
-                          '(16, 0, 0) = 256',
-                          '(17, 0, 0) = 289',
-                          '(18, 0, 0) = 324',
-                          '(19, 0, 0) = 361',
-                          '(20, 0, 0) = 400',
-                          '(21, 0, 0) = 441',
-                          '(22, 0, 0) = 484',
-                          '(23, 0, 0) = 529'])
-
-        # half
-        self.try_command('language renderscript allocation dump 36',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333252',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.199951',
-                          '(6, 0, 0) = 0.166626',
-                          '(7, 0, 0) = 0.142822',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111084',
-                          '(10, 0, 0) = 0.0999756',
-                          '(11, 0, 0) = 0.0908813',
-                          '(12, 0, 0) = 0.083313',
-                          '(13, 0, 0) = 0.0769043',
-                          '(14, 0, 0) = 0.0714111',
-                          '(15, 0, 0) = 0.0666504',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588379',
-                          '(18, 0, 0) = 0.055542',
-                          '(19, 0, 0) = 0.0526428',
-                          '(20, 0, 0) = 0.0499878',
-                          '(21, 0, 0) = 0.0476074',
-                          '(22, 0, 0) = 0.0454407',
-                          '(23, 0, 0) = 0.0434875'])
-
-        # half2
-        self.try_command('language renderscript allocation dump 37',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333252}',
-                          '(2, 0, 0) = {0.25 0.199951}',
-                          '(3, 0, 0) = {0.166626 0.142822}',
-                          '(4, 0, 0) = {0.125 0.111084}',
-                          '(5, 0, 0) = {0.0999756 0.0908813}',
-                          '(6, 0, 0) = {0.083313 0.0769043}',
-                          '(7, 0, 0) = {0.0714111 0.0666504}',
-                          '(8, 0, 0) = {0.0625 0.0588379}',
-                          '(9, 0, 0) = {0.055542 0.0526428}',
-                          '(10, 0, 0) = {0.0499878 0.0476074}',
-                          '(11, 0, 0) = {0.0454407 0.0434875}'])
-
-        # half3
-        self.try_command('language renderscript allocation dump 38',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(0, 1, 0) = {0.25 0.199951 0.166626}',
-                          '(0, 2, 0) = {0.125 0.111084 0.0999756}',
-                          '(0, 3, 0) = {0.083313 0.0769043 0.0714111}',
-                          '(0, 4, 0) = {0.0625 0.0588379 0.055542}',
-                          '(0, 5, 0) = {0.0499878 0.0476074 0.0454407}'])
-
-        # half4
-        self.try_command('language renderscript allocation dump 39',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333252}',
-                          '(1, 0, 0) = {0.25 0.199951 0.166626 0.142822}',
-                          '(2, 0, 0) = {0.125 0.111084 0.0999756 0.0908813}',
-                          '(3, 0, 0) = {0.083313 0.0769043 0.0714111 0.0666504}',
-                          '(4, 0, 0) = {0.0625 0.0588379 0.055542 0.0526428}',
-                          '(5, 0, 0) = {0.0499878 0.0476074 0.0454407 0.0434875}'])
-
-        # float
-        self.try_command('language renderscript allocation dump 40',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333333',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.2',
-                          '(6, 0, 0) = 0.166667',
-                          '(7, 0, 0) = 0.142857',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111111',
-                          '(10, 0, 0) = 0.1',
-                          '(11, 0, 0) = 0.0909091',
-                          '(12, 0, 0) = 0.0833333',
-                          '(13, 0, 0) = 0.0769231',
-                          '(14, 0, 0) = 0.0714286',
-                          '(15, 0, 0) = 0.0666667',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588235',
-                          '(18, 0, 0) = 0.0555556',
-                          '(19, 0, 0) = 0.0526316',
-                          '(20, 0, 0) = 0.05',
-                          '(21, 0, 0) = 0.047619',
-                          '(22, 0, 0) = 0.0454545',
-                          '(23, 0, 0) = 0.0434783'])
-
-        # float2
-        self.try_command('language renderscript allocation dump 41',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333333}',
-                          '(2, 0, 0) = {0.25 0.2}',
-                          '(3, 0, 0) = {0.166667 0.142857}',
-                          '(4, 0, 0) = {0.125 0.111111}',
-                          '(5, 0, 0) = {0.1 0.0909091}',
-                          '(6, 0, 0) = {0.0833333 0.0769231}',
-                          '(7, 0, 0) = {0.0714286 0.0666667}',
-                          '(8, 0, 0) = {0.0625 0.0588235}',
-                          '(9, 0, 0) = {0.0555556 0.0526316}',
-                          '(10, 0, 0) = {0.05 0.047619}',
-                          '(11, 0, 0) = {0.0454545 0.0434783}'])
-
-        # float3
-        self.try_command('language renderscript allocation dump 42',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(1, 0, 0) = {0.25 0.2 0.166667}',
-                          '(2, 0, 0) = {0.125 0.111111 0.1}',
-                          '(3, 0, 0) = {0.0833333 0.0769231 0.0714286}',
-                          '(4, 0, 0) = {0.0625 0.0588235 0.0555556}',
-                          '(5, 0, 0) = {0.05 0.047619 0.0454545}'])
-
-        # float4
-        self.try_command('language renderscript allocation dump 43',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333333}',
-                          '(1, 0, 0) = {0.25 0.2 0.166667 0.142857}',
-                          '(2, 0, 0) = {0.125 0.111111 0.1 0.0909091}',
-                          '(0, 1, 0) = {0.0833333 0.0769231 0.0714286 0.0666667}',
-                          '(1, 1, 0) = {0.0625 0.0588235 0.0555556 0.0526316}',
-                          '(2, 1, 0) = {0.05 0.047619 0.0454545 0.0434783}'])
-
-        # double
-        self.try_command('language renderscript allocation dump 44',
-                         ['(0, 0, 0) = inf',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 0.5',
-                          '(3, 0, 0) = 0.333333333333333',
-                          '(4, 0, 0) = 0.25',
-                          '(5, 0, 0) = 0.2',
-                          '(6, 0, 0) = 0.166666666666667',
-                          '(7, 0, 0) = 0.142857142857143',
-                          '(8, 0, 0) = 0.125',
-                          '(9, 0, 0) = 0.111111111111111',
-                          '(10, 0, 0) = 0.1',
-                          '(11, 0, 0) = 0.0909090909090909',
-                          '(12, 0, 0) = 0.0833333333333333',
-                          '(13, 0, 0) = 0.0769230769230769',
-                          '(14, 0, 0) = 0.0714285714285714',
-                          '(15, 0, 0) = 0.0666666666666667',
-                          '(16, 0, 0) = 0.0625',
-                          '(17, 0, 0) = 0.0588235294117647',
-                          '(18, 0, 0) = 0.0555555555555556',
-                          '(19, 0, 0) = 0.0526315789473684',
-                          '(20, 0, 0) = 0.05',
-                          '(21, 0, 0) = 0.0476190476190476',
-                          '(22, 0, 0) = 0.0454545454545455',
-                          '(23, 0, 0) = 0.0434782608695652'])
-
-        # double2
-        self.try_command('language renderscript allocation dump 45',
-                         ['(0, 0, 0) = {inf 1}',
-                          '(1, 0, 0) = {0.5 0.333333333333333}',
-                          '(2, 0, 0) = {0.25 0.2}',
-                          '(3, 0, 0) = {0.166666666666667 0.142857142857143}',
-                          '(0, 0, 1) = {0.125 0.111111111111111}',
-                          '(1, 0, 1) = {0.1 0.0909090909090909}',
-                          '(2, 0, 1) = {0.0833333333333333 0.0769230769230769}',
-                          '(3, 0, 1) = {0.0714285714285714 0.0666666666666667}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647}',
-                          '(1, 0, 2) = {0.0555555555555556 0.0526315789473684}',
-                          '(2, 0, 2) = {0.05 0.0476190476190476}',
-                          '(3, 0, 2) = {0.0454545454545455 0.0434782608695652}'])
-
-        # double3
-        self.try_command('language renderscript allocation dump 46',
-                         ['(0, 0, 0) = {inf 1 0.5}',
-                          '(0, 1, 0) = {0.25 0.2 0.166666666666667}',
-                          '(0, 0, 1) = {0.125 0.111111111111111 0.1}',
-                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
-                                       '0.0714285714285714}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647 0.0555555555555556}',
-                          '(0, 1, 2) = {0.05 0.0476190476190476 0.0454545454545455}'])
-
-        # double4
-        self.try_command('language renderscript allocation dump 47',
-                         ['(0, 0, 0) = {inf 1 0.5 0.333333333333333}',
-                          '(0, 1, 0) = {0.25 0.2 0.166666666666667 0.142857142857143}',
-                          '(0, 0, 1) = {0.125 0.111111111111111 0.1 0.0909090909090909}',
-                          '(0, 1, 1) = {0.0833333333333333 0.0769230769230769 '
-                                       '0.0714285714285714 0.0666666666666667}',
-                          '(0, 0, 2) = {0.0625 0.0588235294117647 '
-                                       '0.0555555555555556 0.0526315789473684}',
-                          '(0, 1, 2) = {0.05 0.0476190476190476 '
-                                       '0.0454545454545455 0.0434782608695652}'])
diff --git a/tests/lldb/tests/testcases/test_allocation_dump_struct.py b/tests/lldb/tests/testcases/test_allocation_dump_struct.py
deleted file mode 100644
index f161131..0000000
--- a/tests/lldb/tests/testcases/test_allocation_dump_struct.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationDumpStruct.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-
-
-class TestAllocationDumpStruct(TestBaseRemote):
-    '''Tests printing the contents of a struct allocation.'''
-
-    bundle_target = {
-        'java': 'Allocations'
-    }
-
-    def setup(self, android):
-        '''This test requires to be run on one thread.'''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        '''Reset the number of RS threads to the previous value.'''
-        android.pop_prop('debug.rs.max-threads')
-
-    def test_dump_complex_struct_allocation(self):
-        # Hit struct_kernel on last coordinate, so almost all elements have been initalised
-        self.try_command(
-            'language renderscript kernel breakpoint set struct_kernel -c 23',
-            ['Conditional kernel breakpoint on coordinate (23, 0, 0)',
-            'Breakpoint(s) created'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # complex_struct output allocation
-        self.try_command('language renderscript allocation dump 49',
-                         ['(0, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 0, j = 0)\n'
-                          '   (0x00, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 0, [1] = 0.5)\n'
-                          '}',
-                          '(1, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 1, j = 1)\n'
-                          '   (0x01, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 1, [1] = 1.5)\n'
-                          '}',
-                          '(2, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 2, j = 2)\n'
-                          '   (0x02, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 2, [1] = 2.5)\n'
-                          '}',
-                          '(3, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 3, j = 3)\n'
-                          '   (0x03, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 3, [1] = 3.5)\n'
-                          '}',
-                          '(4, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 4, j = 4)\n'
-                          '   (0x04, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 4, [1] = 4.5)\n'
-                          '}',
-                          '(5, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 5, j = 5)\n'
-                          '   (0x05, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 5, [1] = 5.5)\n'
-                          '}',
-                          '(6, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 6, j = 6)\n'
-                          '   (0x06, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 6, [1] = 6.5)\n'
-                          '}',
-                          '(7, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 7, j = 7)\n'
-                          '   (0x07, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 7, [1] = 7.5)\n'
-                          '}',
-                          '(8, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 8, j = 8)\n'
-                          '   (0x08, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 8, [1] = 8.5)\n'
-                          '}',
-                          '(9, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 9, j = 9)\n'
-                          '   (0x09, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 9, [1] = 9.5)\n'
-                          '}',
-                          '(10, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 10, j = 10)\n'
-                          '   (0x0a, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 10, [1] = 10.5)\n'
-                          '}',
-                          '(11, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 11, j = 11)\n'
-                          '   (0x0b, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 11, [1] = 11.5)\n'
-                          '}',
-                          '(12, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 12, j = 12)\n'
-                          '   (0x0c, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 12, [1] = 12.5)\n'
-                          '}',
-                          '(13, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 13, j = 13)\n'
-                          '   (0x0d, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 13, [1] = 13.5)\n'
-                          '}',
-                          '(14, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 14, j = 14)\n'
-                          '   (0x0e, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 14, [1] = 14.5)\n'
-                          '}',
-                          '(15, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 15, j = 15)\n'
-                          '   (0x0f, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 15, [1] = 15.5)\n'
-                          '}',
-                          '(16, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 16, j = 16)\n'
-                          '   (0x10, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 16, [1] = 16.5)\n'
-                          '}',
-                          '(17, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 17, j = 17)\n'
-                          '   (0x11, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 17, [1] = 17.5)\n'
-                          '}',
-                          '(18, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 18, j = 18)\n'
-                          '   (0x12, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 18, [1] = 18.5)\n'
-                          '}',
-                          '(19, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 19, j = 19)\n'
-                          '   (0x13, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 19, [1] = 19.5)\n'
-                          '}',
-                          '(20, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 20, j = 20)\n'
-                          '   (0x14, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 20, [1] = 20.5)\n'
-                          '}',
-                          '(21, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 21, j = 21)\n'
-                          '   (0x15, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 21, [1] = 21.5)\n'
-                          '}',
-                          '(22, 0, 0) = (complexStruct)  {\n'
-                          '   (i = 22, j = 22)\n'
-                          '   (0x16, 0x41, 0x42, 0x43)\n'
-                          '   ([0] = 22, [1] = 22.5)\n'
-                          '}'])
diff --git a/tests/lldb/tests/testcases/test_allocation_file.py b/tests/lldb/tests/testcases/test_allocation_file.py
deleted file mode 100644
index 680ae4a..0000000
--- a/tests/lldb/tests/testcases/test_allocation_file.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationFile.'''
-
-from __future__ import absolute_import
-
-import os
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    cpp_only_test,
-    ordered_test
-)
-
-
-class TestAllocationFile(TestBaseRemote):
-    '''Tests saving the contents of allocations to disk and reloading them.'''
-
-    bundle_target = {
-        'java': 'Allocations',
-        'cpp': 'CppAllocations',
-        'jni': 'JNIAllocations'
-    }
-
-    @ordered_test(0)
-    def test_allocation_file_roundtrip(self):
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Binary file of int2 allocation
-        file_int2 = self.get_tmp_file_path()
-
-        self.try_command('language renderscript allocation save 12 ' +
-                         file_int2,
-                         ["Allocation written to file '%s'" % file_int2])
-
-        # Check file was created
-        self.assert_true(os.path.isfile(file_int2))
-
-        # Load the file we just created, to assert the allocation contents are
-        # the same
-        self.try_command('language renderscript allocation load 12 ' +
-                         file_int2,
-                         ["Contents of file '%s' read into allocation 12" %
-                          file_int2])
-        os.remove(file_int2)
-
-        self.try_command('language renderscript allocation dump 12',
-                         ['(0, 0, 0) = {0 1}',
-                          '(1, 0, 0) = {2 3}',
-                          '(2, 0, 0) = {4 5}',
-                          '(3, 0, 0) = {6 7}',
-                          '(4, 0, 0) = {8 9}',
-                          '(5, 0, 0) = {10 11}',
-                          '(6, 0, 0) = {12 13}',
-                          '(7, 0, 0) = {14 15}',
-                          '(8, 0, 0) = {16 17}',
-                          '(9, 0, 0) = {18 19}',
-                          '(10, 0, 0) = {20 21}',
-                          '(11, 0, 0) = {22 23}'])
-
-        self.try_command('breakpoint del 1',
-                         ['1 breakpoints deleted'])
-
-        # Hit second kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Binary file of uint allocation
-        file_uint = self.get_tmp_file_path()
-
-        self.try_command('language renderscript allocation save 28 ' +
-                         file_uint,
-                         ["Allocation written to file '%s'" % file_uint])
-
-        # Check file was created
-        self.assert_true(os.path.isfile(file_uint))
-
-        # Test loading file into allocation with an incompatible type 'short'
-        self.try_command('language renderscript allocation load 7 ' + file_uint,
-                         ["Contents of file '%s' read into allocation 7" %
-                          file_uint,
-                          "Warning: Mismatched Element sizes",
-                          "Warning: Mismatched Types",
-                          "Warning: Mismatched allocation sizes"])
-
-        # Check result of size inconsistency, mapping 4-byte unsigned to 2-byte
-        # int
-        self.try_command('language renderscript allocation dump 7',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 0',
-                          '(2, 0, 0) = 1',
-                          '(3, 0, 0) = 0',
-                          '(4, 0, 0) = 2',
-                          '(5, 0, 0) = 0',
-                          '(6, 0, 0) = 3',
-                          '(7, 0, 0) = 0',
-                          '(8, 0, 0) = 4',
-                          '(9, 0, 0) = 0',
-                          '(10, 0, 0) = 5',
-                          '(11, 0, 0) = 0',
-                          '(12, 0, 0) = 6',
-                          '(13, 0, 0) = 0',
-                          '(14, 0, 0) = 7',
-                          '(15, 0, 0) = 0',
-                          '(16, 0, 0) = 8',
-                          '(17, 0, 0) = 0',
-                          '(18, 0, 0) = 9',
-                          '(19, 0, 0) = 0',
-                          '(20, 0, 0) = 10',
-                          '(21, 0, 0) = 0',
-                          '(22, 0, 0) = 11',
-                          '(23, 0, 0) = 0'])
-
-        self.try_command('breakpoint del 2',
-                         ['1 breakpoints deleted'])
-
-        # Hit third kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Test that uint allocation has been squared by square_kernel
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 4',
-                          '(3, 0, 0) = 9',
-                          '(4, 0, 0) = 16',
-                          '(5, 0, 0) = 25',
-                          '(6, 0, 0) = 36',
-                          '(7, 0, 0) = 49',
-                          '(8, 0, 0) = 64',
-                          '(9, 0, 0) = 81',
-                          '(10, 0, 0) = 100',
-                          '(11, 0, 0) = 121',
-                          '(12, 0, 0) = 144',
-                          '(13, 0, 0) = 169',
-                          '(14, 0, 0) = 196',
-                          '(15, 0, 0) = 225',
-                          '(16, 0, 0) = 256',
-                          '(17, 0, 0) = 289',
-                          '(18, 0, 0) = 324',
-                          '(19, 0, 0) = 361',
-                          '(20, 0, 0) = 400',
-                          '(21, 0, 0) = 441',
-                          '(22, 0, 0) = 484',
-                          '(23, 0, 0) = 529'])
-
-        # Load uint allocation from save before square_kernel had been run
-        self.try_command('language renderscript allocation load 28 ' +
-                         file_uint,
-                         ["Contents of file '%s' read into allocation 28" %
-                          file_uint])
-        os.remove(file_uint)
-
-        # Check contents are back to original
-        self.try_command('language renderscript allocation dump 28',
-                         ['(0, 0, 0) = 0',
-                          '(1, 0, 0) = 1',
-                          '(2, 0, 0) = 2',
-                          '(3, 0, 0) = 3',
-                          '(4, 0, 0) = 4',
-                          '(5, 0, 0) = 5',
-                          '(6, 0, 0) = 6',
-                          '(7, 0, 0) = 7',
-                          '(8, 0, 0) = 8',
-                          '(9, 0, 0) = 9',
-                          '(10, 0, 0) = 10',
-                          '(11, 0, 0) = 11',
-                          '(12, 0, 0) = 12',
-                          '(13, 0, 0) = 13',
-                          '(14, 0, 0) = 14',
-                          '(15, 0, 0) = 15',
-                          '(16, 0, 0) = 16',
-                          '(17, 0, 0) = 17',
-                          '(18, 0, 0) = 18',
-                          '(19, 0, 0) = 19',
-                          '(20, 0, 0) = 20',
-                          '(21, 0, 0) = 21',
-                          '(22, 0, 0) = 22',
-                          '(23, 0, 0) = 23'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_allocation_list.py b/tests/lldb/tests/testcases/test_allocation_list.py
deleted file mode 100644
index ca492f2..0000000
--- a/tests/lldb/tests/testcases/test_allocation_list.py
+++ /dev/null
@@ -1,547 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestAllocationList.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    wimpy,
-    cpp_only_test,
-    java_only_test,
-)
-
-
-class TestAllocationList(TestBaseRemote):
-    '''Tests printing the details of all allocations.'''
-
-    bundle_target = {
-        'java': 'Allocations',
-        'jni': 'JNIAllocations',
-        'cpp': 'CppAllocations'
-    }
-
-    @wimpy
-    @ordered_test(0)
-    def test_allocation_list_single(self):
-        # pylint: disable=anomalous-backslash-in-string
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Test command line flag for single allocation
-        self.try_command('language renderscript allocation list -i 3',
-                         [],
-                         ['3:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 3, 8\)\n'
-                          '    Data Type: char\n'
-                          '    Data Kind: User'])
-
-    @ordered_test(1)
-    def test_allocation_list_all(self):
-        self.try_command('language renderscript allocation list',
-                         [],
-                         ['1:\n'
-                         # Regex for non zero hex number
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(64, 64, 0\)\n'
-                          '    Data Type: uchar4\n'
-                          '    Data Kind: RGBA Pixel',
-                          '2:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(64, 64, 0\)\n'
-                          '    Data Type: uchar4\n'
-                          '    Data Kind: RGBA Pixel',
-                          '3:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 3, 8\)\n'
-                          '    Data Type: char\n'
-                          '    Data Kind: User',
-                          '4:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: char2\n'
-                          '    Data Kind: User',
-                          '5:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: char3\n'
-                          '    Data Kind: User',
-                          '6:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: char4\n'
-                          '    Data Kind: User',
-                          '7:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: short\n'
-                          '    Data Kind: User',
-                          '8:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 1, 2\)\n'
-                          '    Data Type: short2\n'
-                          '    Data Kind: User',
-                          '9:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: short3\n'
-                          '    Data Kind: User',
-                          '10:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: short4\n'
-                          '    Data Kind: User',
-                          '11:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: int\n'
-                          '    Data Kind: User',
-                          '12:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: int2\n'
-                          '    Data Kind: User',
-                          '13:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(3, 2, 0\)\n'
-                          '    Data Type: int3\n'
-                          '    Data Kind: User',
-                          '14:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: int4\n'
-                          '    Data Kind: User',
-                          '15:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: long\n'
-                          '    Data Kind: User',
-                          '16:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: long2\n'
-                          '    Data Kind: User',
-                          '17:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: long3\n'
-                          '    Data Kind: User',
-                          '18:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 6, 0\)\n'
-                          '    Data Type: long4\n'
-                          '    Data Kind: User',
-                          '19:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: bool\n'
-                          '    Data Kind: User'
-                         ])
-
-    @wimpy
-    @ordered_test(2)
-    def test_continue_1(self):
-        self.try_command('breakpoint del 1',
-                          ['1 breakpoints deleted'])
-
-        # Hit second kernel
-        self.try_command('process continue',
-                          ['resuming',
-                           'stopped',
-                           'stop reason = breakpoint'])
-
-    @ordered_test(3)
-    def test_allocation_list_all2_java(self):
-        # TODO investigate why java tests show extra allocations
-        if self.app_type == 'java':
-            allocation_1_re = [
-                '1:\n'
-                # Regex for non zero hex number
-                '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                '    Dimensions: \(64, 64, 0\)\n'
-                '    Data Type: uchar4\n'
-                '    Data Kind: RGBA Pixel'
-            ]
-        else:
-            allocation_1_re = []
-
-        self.try_command('language renderscript allocation list',
-                         [],
-                         allocation_1_re +
-                         ['2:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(64, 64, 0\)\n'
-                          '    Data Type: uchar4\n'
-                          '    Data Kind: RGBA Pixel',
-                          '7:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: short\n'
-                          '    Data Kind: User',
-                          '20:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: uchar\n'
-                          '    Data Kind: User',
-                          '21:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(2, 6, 0\)\n'
-                          '    Data Type: uchar2\n'
-                          '    Data Kind: User',
-                          '22:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: uchar3\n'
-                          '    Data Kind: User',
-                          '23:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: uchar4\n'
-                          '    Data Kind: User',
-                          '24:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: ushort\n'
-                          '    Data Kind: User',
-                          '25:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: ushort2\n'
-                          '    Data Kind: User',
-                          '26:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 6, 0\)\n'
-                          '    Data Type: ushort3\n'
-                          '    Data Kind: User',
-                          '27:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: ushort4\n'
-                          '    Data Kind: User',
-                          '28:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: uint\n'
-                          '    Data Kind: User',
-                          '29:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: uint2\n'
-                          '    Data Kind: User',
-                          '30:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: uint3\n'
-                          '    Data Kind: User',
-                          '31:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 1, 6\)\n'
-                          '    Data Type: uint4\n'
-                          '    Data Kind: User',
-                          '32:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(4, 3, 2\)\n'
-                          '    Data Type: ulong\n'
-                          '    Data Kind: User',
-                          '33:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: ulong2\n'
-                          '    Data Kind: User',
-                          '34:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: ulong3\n'
-                          '    Data Kind: User',
-                          '35:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: ulong4\n'
-                          '    Data Kind: User'
-                         ])
-
-    @wimpy
-    @ordered_test(4)
-    def test_continue_2(self):
-        self.try_command('breakpoint del 2',
-                         ['1 breakpoints deleted'])
-
-        # Hit third kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @ordered_test(5)
-    def test_allocation_list_all3(self):
-        self.try_command('language renderscript allocation list',
-                         [],
-                         ['2:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(64, 64, 0\)\n'
-                          '    Data Type: uchar4\n'
-                          '    Data Kind: RGBA Pixel',
-                          '7:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: short\n'
-                          '    Data Kind: User',
-                          '28:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: uint\n'
-                          '    Data Kind: User',
-                          '36:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: half\n'
-                          '    Data Kind: User',
-                          '37:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: half2\n'
-                          '    Data Kind: User',
-                          '38:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 6, 0\)\n'
-                          '    Data Type: half3\n'
-                          '    Data Kind: User',
-                          '39:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: half4\n'
-                          '    Data Kind: User',
-                          '40:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: float\n'
-                          '    Data Kind: User',
-                          '41:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(12, 0, 0\)\n'
-                          '    Data Type: float2\n'
-                          '    Data Kind: User',
-                          '42:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(6, 0, 0\)\n'
-                          '    Data Type: float3\n'
-                          '    Data Kind: User',
-                          '43:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(3, 2, 0\)\n'
-                          '    Data Type: float4\n'
-                          '    Data Kind: User',
-                          '44:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: double\n'
-                          '    Data Kind: User',
-                          '45:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(4, 1, 3\)\n'
-                          '    Data Type: double2\n'
-                          '    Data Kind: User',
-                          '46:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 2, 3\)\n'
-                          '    Data Type: double3\n'
-                          '    Data Kind: User',
-                          '47:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 2, 3\)\n'
-                          '    Data Type: double4\n'
-                          '    Data Kind: User'])
-
-    @wimpy
-    @ordered_test(6)
-    @java_only_test()
-    def test_allocation_list_all4(self):
-        self.try_command('breakpoint del 3',
-                         ['1 breakpoints deleted'])
-
-        # Hit last kernel
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('language renderscript allocation list',
-                         [],
-                         ['2:\n'
-                          # Regex for non zero hex number
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(64, 64, 0\)\n'
-                          '    Data Type: uchar4\n'
-                          '    Data Kind: RGBA Pixel',
-                          '7:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: short\n'
-                          '    Data Kind: User',
-                          '28:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: uint\n'
-                          '    Data Kind: User',
-                          '46:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(1, 2, 3\)\n'
-                          '    Data Type: double3\n'
-                          '    Data Kind: User',
-                          '48:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: complexStruct\n'
-                          '    Data Kind: User',
-                          '49:\n'
-                          '    Context: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Address: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Data pointer: 0x0*[1-9a-fA-F][0-9a-fA-F]*\n'
-                          '    Dimensions: \(24, 0, 0\)\n'
-                          '    Data Type: complexStruct\n'
-                          '    Data Kind: User'])
-
-    @ordered_test(7)
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_backtrace.py b/tests/lldb/tests/testcases/test_backtrace.py
deleted file mode 100644
index 1e5e79f..0000000
--- a/tests/lldb/tests/testcases/test_backtrace.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBacktrace.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestBacktrace(TestBaseRemote):
-    '''Tests breaking on a kernel and a function, and viewing the call stack.'''
-
-    bundle_target = {
-        'java': 'BranchingFunCalls',
-        'jni': 'JNIBranchingFunCalls',
-        'cpp': 'CppBranchingFunCalls'
-    }
-
-    def test_kernel_backtrace(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('language renderscript kernel breakpoint set simple_kernel',
-                         ['Breakpoint(s) created',
-                          '(pending)'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('bt',
-                         ['stop reason = breakpoint',
-                          # We should be able to see three functions in bt:
-                          # libRSCpuRef, kernel.expand and the kernel
-                          'frame #2:',
-                          'librs.scalars.so',
-                          'simple_kernel'],
-                         [r'scalars\.rs:6[123]'])
-
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('b set_i',
-                         ['Breakpoint 2',
-                          'set_i'],
-                         [r'scalars\.rs:3[678]'])
-
-        self.try_command('breakpoint list',
-                         ['set_i', 'resolved'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('bt',
-                         ['stop reason = breakpoint',
-                          # We should be able to see five functions in bt:
-                          # libRSCpuRef, kernel.expand, kernel and two functions
-                          'frame #4:',
-                          'librs.scalars.so',
-                          'modify_i',
-                          'set_i'],
-                         [r'scalars\.rs:3[678]'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 2',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_coordinate.py b/tests/lldb/tests/testcases/test_breakpoint_coordinate.py
deleted file mode 100644
index 319d4f2..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_coordinate.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointCoordinate.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test,
-    cpp_only_test,
-)
-from harness.assert_mixins import CoordinateAssertionsMixin
-
-
-class TestBreakpointCoordinate(TestBaseRemote, CoordinateAssertionsMixin):
-    '''Tests breaking on a specific kernel invocation.
-
-    Uses the -c option to specify the coordinate.
-    '''
-
-    bundle_target = {
-        'java': 'Allocations',
-        'jni': 'JNIAllocations',
-        'cpp': 'CppAllocations'
-    }
-
-    def setup(self, android):
-        '''This test requires to be run on one thread.
-
-        Args:
-            android: The android_util module.
-        '''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        '''Reset the number of RS threads to the previous value.
-
-        Args:
-            android: The android_util module.
-        '''
-        android.pop_prop('debug.rs.max-threads')
-
-    @wimpy
-    @ordered_test(0)
-    def test_breakpoint_coordinate_2d_swizzle_kernel(self):
-        # pylint: disable=line-too-long
-
-        # test conditional coordinate in two dimensions
-        # breakpoint 1
-        self.assert_coord_bp_set('swizzle_kernel', 3, 7)
-
-        # we will delete this breakpoint before we hit it.
-        # breakpoint 2
-        self.assert_coord_bp_set('swizzle_kernel', 199, 190)
-
-        self.assert_coord_stop('allocs', 'swizzle_kernel', x=3, y=7)
-
-        # check breakpoints that have been hit are disabled
-        self.try_command(
-            'breakpoint list',
-            [
-                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
-                "2: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1"
-            ]
-        )
-
-        # delete breakpoint on 199,199,0
-        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
-
-        # check breakpoints that have been hit are disabled
-        self.try_command(
-            'breakpoint list',
-            ["1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled"]
-        )
-
-        # test conditional coordinate in a single dimension
-        # breakpoint 3
-        self.assert_coord_bp_set('square_kernel', 8)
-
-        # check breakpoints that have been hit are disabled
-        self.try_command(
-            'breakpoint list',
-            [
-                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
-                "3: RenderScript kernel breakpoint for 'square_kernel', locations = 1"
-            ]
-        )
-
-        self.assert_coord_stop('allocs', 'square_kernel', x=8)
-
-        # check breakpoints that have been hit are disabled
-        self.try_command(
-            'breakpoint list',
-            [
-                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
-                "3: RenderScript kernel breakpoint for 'square_kernel', locations = 1 Options: disabled"
-            ]
-        )
-
-    @wimpy
-    @ordered_test(1)
-    def test_breakpoint_coordinate_3d_add_half_kernel(self):
-        # test conditional coordinate in three dimensions
-        # breakpoint 4
-        self.assert_coord_bp_set('add_half_kernel', 0, 0, 1)
-        # test we can set more than one conditional kernel breakpoint
-        # and both will be hit;
-        # breakpoint 5
-        self.assert_coord_bp_set('add_half_kernel', 0, 1, 2)
-
-        # Now assert that the next two continue/stop cycles hit our conditionals
-        self.assert_coord_stop('allocs', 'add_half_kernel', x=0, y=0, z=1)
-        self.assert_coord_stop('allocs', 'add_half_kernel', x=0, y=1, z=2)
-
-        # check we can see the coordinate from a function invoked by the kernel
-        # breakpoint 6
-        self.try_command(
-            'break set -n half_helper',
-            ['librs.allocs.so`half_helper']
-        )
-
-        # continue till we hit breakpoint 6
-        self.assert_coord_stop('allocs', 'half_helper', x=0, y=1, z=2)
-
-        self.try_command(
-            'breakpoint list',
-            [
-                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
-                "3: RenderScript kernel breakpoint for 'square_kernel', locations = 1 Options: disabled",
-                "4: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
-                "5: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
-                "6: name = 'half_helper', locations = 1, resolved = 1, hit count = 1"
-            ]
-        )
-
-        self.try_command('breakpoint delete 3', ['1 breakpoints deleted'])
-
-        self.try_command(
-            'breakpoint list',
-            [
-                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
-                "4: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
-                "5: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
-                "6: name = 'half_helper', locations = 1, resolved = 1, hit count = 1"
-            ]
-        )
-
-        self.try_command('breakpoint delete 6', ['1 breakpoints deleted'])
-
-        self.try_command(
-            'breakpoint list',
-            [
-                "1: RenderScript kernel breakpoint for 'swizzle_kernel', locations = 1 Options: disabled",
-                "4: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled",
-                "5: RenderScript kernel breakpoint for 'add_half_kernel', locations = 1 Options: disabled"
-            ]
-        )
-
-    @cpp_only_test()
-    @ordered_test('last')
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 4', ['1 breakpoints deleted'])
-        self.try_command('breakpoint delete 5', ['1 breakpoints deleted'])
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_fileline.py b/tests/lldb/tests/testcases/test_breakpoint_fileline.py
deleted file mode 100644
index be89ca9..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_fileline.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointFileLine.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    cpp_only_test,
-    ordered_test
-)
-
-
-class TestBreakpointFileLine(TestBaseRemote):
-    '''Tests the setting of a breakpoint on a specific line of a RS file.'''
-
-    bundle_target = {
-        'java': 'JavaDebugWaitAttach',
-        'jni': 'JNIDebugWaitAttach',
-        'cpp': 'CppDebugWaitAttach'
-    }
-
-    @ordered_test(0)
-    def test_breakpoint_fileline(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('breakpoint set --file simple.rs --line 28',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         [])
-
-        self.try_command('bt',
-                         ['librs.simple.so',
-                          'simple_kernel',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint list',
-                         ['simple.rs',
-                          'resolved = 1'])
-
-        self.try_command('process status',
-                         ['stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_fileline_multiple_rs_files.py b/tests/lldb/tests/testcases/test_breakpoint_fileline_multiple_rs_files.py
deleted file mode 100644
index 9ba4283..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_fileline_multiple_rs_files.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointFileLineMultipleRSFiles.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    cpp_only_test,
-    ordered_test
-)
-
-
-class TestBreakpointFileLineMultipleRSFiles(TestBaseRemote):
-    '''Tests the setting of a breakpoint on one of multiple RS files.'''
-
-    bundle_target = {
-        'java': 'MultipleRSFiles',
-        'jni': 'JNIMultipleRSFiles',
-        'cpp': 'CppMultipleRSFiles'
-    }
-
-    def _binary_name(self):
-        return {
-            'java': 'multiplersfiles',
-            'jni': 'multiplersfiles',
-            'cpp': 'CppMultipleRSFi'
-        }[self.app_type]
-
-    @ordered_test(0)
-    def test_breakpoint_fileline_multiple_files(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('breakpoint set --file first.rs --line 28',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'librs.first.so`first_kernel',
-                          'at first.rs:28',
-                          "name = '%s'" % self._binary_name(),
-                          'stop reason = breakpoint 1'])
-
-        self.try_command('breakpoint set --file second.rs --line 23',
-                         ['Breakpoint 2',
-                          'librs.second.so`second_kernel',
-                          'second.rs:23'])
-
-        self.try_command('breakpoint list',
-                         ['first.rs',
-                          'second.rs',
-                          'resolved = 1',
-                          'first.rs:28',
-                          'second.rs:23'])
-
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'librs.second.so`second_kernel',
-                          'at second.rs:23',
-                          "name = '%s'" % self._binary_name(),
-                          'stop reason = breakpoint 2'])
-
-        self.try_command('process status',
-                         ['stopped',
-                          'stop reason = breakpoint'])
-
-    @cpp_only_test()
-    @ordered_test('last')
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_1.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_1.py
deleted file mode 100644
index a501b66..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_kernel_1.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointKernel1.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test
-)
-
-
-class TestBreakpointKernel1(TestBaseRemote):
-    '''Tests the setting of a breakpoint on a RS kernel.'''
-
-    bundle_target = {
-        'java': 'JavaDebugWaitAttach',
-        'jni': 'JNIDebugWaitAttach',
-        'cpp': 'CppDebugWaitAttach'
-    }
-
-    @ordered_test(0)
-    def test_breakpoint_set_nonexistent_kernel(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('language renderscript kernel breakpoint set simple_kernel',
-                         ['Breakpoint(s) created',
-                          '(pending)'])
-
-        # Try set a breakpoint on a kernel which doesn't exist
-        self.try_command('language renderscript kernel breakpoint set imaginary_kernel',
-                         ['Breakpoint(s) created',
-                          '(pending)'])
-
-        self.try_command('breakpoint list',
-                         ["'simple_kernel', locations = 0 (pending)",
-                          "'imaginary_kernel', locations = 0 (pending)"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('bt',
-                         ['stop reason = breakpoint',
-                          'frame #0:',
-                          'librs.simple.so',
-                          'simple_kernel'])
-
-        self.try_command('breakpoint list',
-                         ["'imaginary_kernel', locations = 0 (pending)",
-                          "'simple_kernel', locations = 1, resolved = 1"])
-
-    @ordered_test(1)
-    def test_breakpoint_delete_nonexistent_kernel(self):
-        # Delete breakpoint on kernel which doesn't exist
-        self.try_command('breakpoint delete 2',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint list',
-                         ["'simple_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint list',
-                         ["'simple_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint list',
-                         ['No breakpoints currently set'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_2.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_2.py
deleted file mode 100644
index 6dea13f..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_kernel_2.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointKernel2.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-
-
-class TestBreakpointKernel2(TestBaseRemote):
-    '''Tests the setting of a breakpoint on a RS kernel.'''
-
-    bundle_target = {
-        'java': 'JavaInfiniteLoop',
-        'jni': 'JNIInfiniteLoop',
-        'cpp': 'CppInfiniteLoop'
-    }
-
-    def test_breakpoint_resolution_simple_kernel(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript kernel breakpoint set simple_kernel',
-                         ['Breakpoint(s) created'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('bt',
-                         ['stop reason = breakpoint',
-                          'frame #0:',
-                          'librs.infiniteloop.so',
-                          'simple_kernel'],
-                         [r'infiniteloop\.rs:4[34]'])
-
-        self.try_command('breakpoint list',
-                         ['simple_kernel',
-                          'resolved = 1'])
-
-        self.try_command('process status',
-                         ['stopped',
-                          '.so`simple_kernel',
-                          'stop reason = breakpoint'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_all.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_all.py
deleted file mode 100644
index ea988e4..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_kernel_all.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointKernelAll.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestBreakpointKernelAll(TestBaseRemote):
-    '''Tests setting breakpoints on every RS kernel.'''
-
-    bundle_target = {
-        'java': 'JavaDebugWaitAttach',
-        'jni': 'JNIDebugWaitAttach',
-        'cpp': 'CppDebugWaitAttach'
-    }
-
-    @ordered_test(0)
-    def test_kernel_breakpoint_all_unloaded_kernels(self):
-        # Test command works with no kernels currently loaded
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint list',
-                         ["'simple_kernel', locations = 1, resolved = 1",
-                          "'other_kernel', locations = 1, resolved = 1"])
-
-        # Check disable doesn't delete breakpoints
-        self.try_command('language renderscript kernel breakpoint all disable',
-                         ['Breakpoints will not be set on any new kernels'])
-
-        # Delete all breakpoints manually
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint delete 2',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint list',
-                         ["No breakpoints currently set"])
-
-        # Test command works when kernels are loaded
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('breakpoint list',
-                         ["'simple_kernel', locations = 1, resolved = 1",
-                          "'other_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint delete 3',
-                         ['1 breakpoints deleted'])
-
-        # Check other_kernel breakpoint gets hit
-        self.try_command('breakpoint list',
-                         ["'other_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 4', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_all_multiple_rs_files.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_all_multiple_rs_files.py
deleted file mode 100644
index 675b5a3..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_kernel_all_multiple_rs_files.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointKernelAllMultipleRSFiles.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestBreakpointKernelAllMultipleRSFiles(TestBaseRemote):
-    '''Tests setting breakpoints on every RS kernel in multiple kernel files.'''
-
-    bundle_target = {
-        'java': 'MultipleRSFiles',
-        'jni': 'JNIMultipleRSFiles',
-        'cpp': 'CppMultipleRSFiles'
-    }
-
-    @ordered_test(0)
-    def test_deferred_breakpoint_resolution(self):
-        # Test command works with no kernels currently loaded
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint list',
-                         ["'first_kernel', locations = 1, resolved = 1",
-                          "'second_kernel', locations = 1, resolved = 1"])
-
-    @ordered_test(1)
-    def test_disable_all_kernel_breakpoint_doesnt_delete_breakpoints(self):
-        # Check disable doesn't delete breakpoints
-        self.try_command('language renderscript kernel breakpoint all disable',
-                         ['Breakpoints will not be set on any new kernels'])
-
-        # Delete all breakpoints manually
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint delete 2',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint list',
-                         ["No breakpoints currently set"])
-
-    @ordered_test(2)
-    def test_enable_breakpoint_on_loaded_kernels(self):
-        # Test command works when kernels are loaded
-        self.try_command('language renderscript kernel breakpoint all enable',
-                         ['Breakpoints will be set on all kernels'])
-
-        self.try_command('breakpoint list',
-                         ["'first_kernel', locations = 1, resolved = 1",
-                          "'second_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        self.try_command('breakpoint delete 3',
-                         ['1 breakpoints deleted'])
-
-        # Check other_kernel breakpoint gets hit
-        self.try_command('breakpoint list',
-                         ["'second_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 4', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
-
diff --git a/tests/lldb/tests/testcases/test_breakpoint_kernel_multiple_rs_files.py b/tests/lldb/tests/testcases/test_breakpoint_kernel_multiple_rs_files.py
deleted file mode 100644
index 5cb29d0..0000000
--- a/tests/lldb/tests/testcases/test_breakpoint_kernel_multiple_rs_files.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestBreakpointKernelMultipleRSFiles.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test
-)
-
-
-class TestBreakpointKernelMultipleRSFiles(TestBaseRemote):
-    '''Tests the setting of a breakpoint on RS kernels in multiple files.'''
-
-    bundle_target = {
-        'java': 'MultipleRSFiles',
-        'jni': 'JNIMultipleRSFiles',
-        'cpp': 'CppMultipleRSFiles'
-    }
-
-    def _binary_name(self):
-        return {
-             'java': 'multiplersfiles',
-             'jni': 'multiplersfiles',
-             'cpp': 'CppMultipleRSFi'
-         }[self.app_type]
-
-    def test_kernel_breakpoint_multiple_rs_files(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript kernel breakpoint set first_kernel',
-                         ['Breakpoint(s) created',
-                          '(pending)'])
-
-        self.try_command('breakpoint list',
-                         ["'first_kernel', locations = 0 (pending)"])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'librs.first.so`first_kernel',
-                          "name = '%s'" % self._binary_name(),
-                          'stop reason = breakpoint 1'],
-                          [r'at first\.rs:2[678]'])
-
-        self.try_command('breakpoint list',
-                         ["'first_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('language renderscript kernel breakpoint set second_kernel',
-                         ['Breakpoint(s) created',
-                          'Breakpoint 2',
-                          'Breakpoint(s) created'],
-                          [r"librs\.second\.so`second_kernel at second\.rs:2[012]",])
-
-        self.try_command('breakpoint list',
-                         ["'first_kernel', locations = 1, resolved = 1",
-                          "'second_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('breakpoint delete 1',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint list',
-                         ["'second_kernel', locations = 1, resolved = 1"])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint',
-                          "librs.second.so`second_kernel"],
-                          [r'second\.rs:2[012]'])
-
-        self.try_command('breakpoint delete 2',
-                         ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint list',
-                         ['No breakpoints currently set'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_call_api_funs.py b/tests/lldb/tests/testcases/test_call_api_funs.py
deleted file mode 100644
index d94df3a..0000000
--- a/tests/lldb/tests/testcases/test_call_api_funs.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestCallApiFuns.'''
-
-from __future__ import absolute_import
-
-import re
-import string
-
-from harness.test_base_remote import TestBaseRemote
-from harness import RS_funs
-from harness.decorators import (
-    wimpy,
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class _APIFunsExprTestsMeta(type):
-    """
-    Generate unique, standalone test methods from a list of lldb expressions.
-    The lldb expression evaluation engine for calling RenderScript
-    builtins need to be tested thoroughly; rather than manually
-    write the 1000s of individual test cases, we automatically generate them
-    and their variants to add to the test class. This is done from a list
-    of expressions that are all tested in the same way.
-    """
-    def __new__(self, name, bases, class_dict):
-        func_name_sub = re.compile(r'[%s\s]+' % string.punctuation)
-
-        for count, line in enumerate(RS_funs.FUNC_LIST):
-            def make_test(line):
-                """
-                We use an extra level of indirection here to properly
-                close over the *value* of the loop variable, `line`
-                """
-                @ordered_test(count)
-                def test(self):
-                    # build the expression
-                    ret, expr = RS_funs.build_expr(line)
-                    try:
-                        # evaluate the expression with expected return value
-                        self.try_command(expr, [], [RS_funs.TYPE_MAP[ret]])
-                    except KeyError:
-                        # or just check the return type if no return value
-                        # specified
-                        self.try_command(expr, '(%s)' % ret)
-                return test
-
-            # Make a pretty python method that adheres to the testcase standard
-            # Use the `count` parameter to ensure the name is unique in the class
-            test_name = 'test_%s_%s' % (re.sub(func_name_sub, '_', line), count)
-            test = make_test(line)
-            test.func_name = test_name
-            # We mark every 10th test case as runnable in wimpy mode
-            class_dict[test_name] = wimpy(test) if count % 10 == 0 else test
-
-        return type(name, bases, class_dict)
-
-
-class TestCallApiFuns(TestBaseRemote):
-    '''Tests calling of some RS API functions. This tests that JITing works.'''
-
-    __metaclass__ = _APIFunsExprTestsMeta
-
-    bundle_target = {
-        'java': "KernelVariables",
-        'jni': "JNIKernelVariables",
-        'cpp': "CppKernelVariables"
-    }
-
-    @wimpy
-    @ordered_test(-2)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f simple.rs -l 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    @ordered_test(-1)
-    def test_call_api_funs_atomic(self):
-        # Test the atomics separately because we want to check the output
-        # AtomicAdd(1234, 2)
-        self.try_command('expr rsAtomicAdd(&int_global, 2)',
-                         ['1234'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr int_global',
-                         ['(int)',
-                          '1236'])
-
-        # AtomicAnd(2345, 333)
-        self.try_command('expr rsAtomicAnd(&uint_global, 333)',
-                         ['2345'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr uint_global',
-                         ['(uint)',
-                          '265'])
-
-        # AtomicCas(1236, 1236, 2345)
-        self.try_command('expr rsAtomicCas(&int_global, 1236, 2345)',
-                         ['1236'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr int_global',
-                         ['(int)',
-                          '2345'])
-
-        # AtomicDec(265)
-        self.try_command('expr rsAtomicDec(&uint_global)',
-                         ['265'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr uint_global',
-                         ['(uint)',
-                          '264'])
-
-        # AtomicInc(2345)
-        self.try_command('expr rsAtomicInc(&int_global)',
-                         ['2345'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr int_global',
-                         ['(int)',
-                          '2346'])
-
-        # AtomicMax(264, 3456)
-        self.try_command('expr rsAtomicMax(&uint_global, 3456)',
-                         ['264'],
-                         [r'\(uint(32_t)?\)'])
-
-        self.try_command('expr uint_global',
-                         ['(uint)',
-                          '3456'])
-
-        # AtomicMin(2346, 3)
-        self.try_command('expr rsAtomicMin(&int_global, 3)',
-                         ['2346'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr int_global',
-                         ['(int)',
-                          '3'])
-
-        # AtomicOr(3, 456)
-        self.try_command('expr rsAtomicOr(&int_global, 456)',
-                         ['3'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr int_global',
-                         ['(int)',
-                          '459'])
-
-        # AtomicSub(3456, 7)
-        self.try_command('expr rsAtomicSub(&uint_global, 7)',
-                         ['3456'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr uint_global',
-                         ['(uint)',
-                          '3449'])
-
-        # AtomicXor(459, 89)
-        self.try_command('expr rsAtomicXor(&int_global, 89)',
-                         ['459'],
-                         [r'\(int(32_t)?\)'])
-
-        self.try_command('expr int_global',
-                         ['(int)',
-                          '402'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_coordinates.py b/tests/lldb/tests/testcases/test_coordinates.py
deleted file mode 100644
index 8680795..0000000
--- a/tests/lldb/tests/testcases/test_coordinates.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestCoordinates.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test
-)
-
-
-class TestCoordinates(TestBaseRemote):
-    '''Tests the inspection of coordinates.
-
-    Tests the inspection of the range and dimension of coordinates as well
-    as the current coordinates.'''
-
-    bundle_target = {
-        'java': 'JavaDebugWaitAttach',
-        'jni': 'JNIDebugWaitAttach',
-        'cpp': 'CppDebugWaitAttach'
-    }
-
-    def setup(self, android):
-        '''This test requires to be run on one thread.
-
-        Args:
-            android: The android_util module.
-        '''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        '''Reset the number of RS threads to the previous value.
-
-        Args:
-            android: The android_util module.
-        '''
-        android.pop_prop('debug.rs.max-threads')
-
-    @ordered_test(0)
-    def test_inspect_coordinates(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('language renderscript kernel breakpoint set simple_kernel',
-                         ['Breakpoint(s) created',
-                          '(pending)'])
-
-        # Check the initial conditions.
-        self._lldb_continue()
-        self._inspect_coordinates(0, 0, 0)
-
-        # Check two more steps.
-        self._lldb_continue()
-        self._inspect_coordinates(1, 0, 0)
-        self._lldb_continue()
-        self._inspect_coordinates(2, 0, 0)
-
-        # After eight more steps we should have advanced one step in the y dimension.
-        for _ in range(8):
-            self._lldb_continue()
-        self._inspect_coordinates(2, 1, 0)
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
-
-    def _lldb_continue(self):
-        '''Try 'continue' lldb command. Expect to hit a breakpoint.'''
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    def _inspect_coordinates(self, x_coord, y_coord, z_coord):
-        '''Run lldb commands to inspect kernel size and coordinates
-        and match against expected values.
-
-        Args:
-            (x_coord, y_coord, z_coord): The expected coordinates (int triple)
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        self.try_command('language renderscript kernel coordinate',
-                         ['Coordinate: (%d, %d, %d)'
-                          % (x_coord, y_coord, z_coord)])
-
-        self.try_command('frame select 1',
-                         ['librs.simple.so`simple_kernel.expand',
-                         'at generated.rs:1'])
-
-        # Inspect the invocation length, should be the same every time.
-        self.try_command('expr p->dim',
-                         ['x = 8',
-                          'y = 8',
-                          'z = 0'])
-
-        # The X coordinate is in the rsIndex variable.
-        self.try_command('expr rsIndex',
-                          ['= ' + str(x_coord)])
-
-        # Inspect the Y and Z coordinates.
-        self.try_command('expr p->current',
-                         ['x = ' + str(0),
-                          'y = ' + str(y_coord),
-                          'z = ' + str(z_coord)])
diff --git a/tests/lldb/tests/testcases/test_dwarf_lang.py b/tests/lldb/tests/testcases/test_dwarf_lang.py
deleted file mode 100644
index 08cf859..0000000
--- a/tests/lldb/tests/testcases/test_dwarf_lang.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test DWARF language attribute test.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-
-
-class TestDWARFLang(TestBaseRemote):
-    '''Tests the DWARF language attribute is present in RenderScript kernels.'''
-
-    bundle_target = {
-        'java': 'JavaDebugWaitAttach',
-        'jni': 'JNIDebugWaitAttach',
-        'cpp': 'CppDebugWaitAttach'
-    }
-
-    def test_renderscript_kernel_frame_dwarf_language(self):
-        self.try_command('language renderscript status', [])
-        self.try_command('b simple_kernel', [])
-        self.try_command('process continue', [])
-
-        self.assert_lang_renderscript()
diff --git a/tests/lldb/tests/testcases/test_invoke_fun.py b/tests/lldb/tests/testcases/test_invoke_fun.py
deleted file mode 100644
index 2648558..0000000
--- a/tests/lldb/tests/testcases/test_invoke_fun.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestInvokeFun.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test
-)
-
-
-class TestInvokeFun(TestBaseRemote):
-    '''Tests debugging a function executed from Java using invoke_*.'''
-
-    bundle_target = {
-        'java': 'BranchingFunCalls',
-        'jni': 'JNIBranchingFunCalls',
-        'cpp': 'CppBranchingFunCalls'
-    }
-
-    def test_invoke_fun(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('breakpoint set --name addToGlobal',
-                         ['Breakpoint 1', '(pending)'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'stop reason = breakpoint'],
-                         [r'scalars\.rs:7[345]'])
-
-        self.try_command('language renderscript kernel breakpoint set simple_kernel',
-                         ['Breakpoint 2', 'Breakpoint(s) created'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'stop reason = breakpoint',
-                          'simple_kernel'],
-                         [r'scalars\.rs:6[123]'])
-
-        self.try_command('expr glob',
-                         ['(int)',
-                          '357'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_language.py b/tests/lldb/tests/testcases/test_language.py
deleted file mode 100644
index 8432781..0000000
--- a/tests/lldb/tests/testcases/test_language.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestLanguage.'''
-
-from __future__ import absolute_import
-
-from harness.test_base import TestBaseNoTargetProcess
-
-
-class TestLanguage(TestBaseNoTargetProcess):
-    '''
-    Tests the "language" command and "language renderscript" subcommand.
-    '''
-
-    def test_lldb_has_language_commands(self):
-        ci = self._ci
-        self.assert_true(
-            ci.HasCommands() and
-            ci.CommandExists('language')
-        )
-
-        self.try_command('language', ['renderscript'])
-        self.try_command('language renderscript', ['kernel',
-                                                   'context',
-                                                   'module',
-                                                   'status'])
-
diff --git a/tests/lldb/tests/testcases/test_language_subcmds.py b/tests/lldb/tests/testcases/test_language_subcmds.py
deleted file mode 100644
index 67c8bd1..0000000
--- a/tests/lldb/tests/testcases/test_language_subcmds.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestLanguageSubcmds.'''
-
-from __future__ import absolute_import
-
-import os
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    cpp_only_test,
-    ordered_test,
-)
-
-
-class TestLanguageSubcmds(TestBaseRemote):
-    '''Tests the 'language renderscript' subcommands.'''
-
-    bundle_target = {
-        'java': 'JavaDebugWaitAttach',
-        'jni': 'JNIDebugWaitAttach',
-        'cpp': 'CppDebugWaitAttach'
-    }
-
-    def setup(self, android):
-        '''This test requires to be run on one thread.'''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        '''Reset the number of RS threads to the previous value.'''
-        android.pop_prop('debug.rs.max-threads')
-
-    def _pkg_name(self):
-        return {
-            'java': 'com.android.rs.waitattachdebug',
-            'jni': 'com.android.rs.jnidebugwaitattach',
-            'cpp': 'com.android.rs.cppwaitattach'
-        }[self.app_type]
-
-    def test_language_subcommands(self):
-        self.try_command('language',
-                         [])
-
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered',
-                          'Runtime functions hooked',
-                          'rsdAllocationInit',
-                          'rsdAllocationRead2D',
-                          'rsdScriptInit',
-                          'rsdScriptInvokeForEach',
-                          'rsdScriptInvokeForEachMulti',
-                          'rsdScriptSetGlobalVar'])
-
-        self.try_command('breakpoint set --file simple.rs --line 28',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         [])
-
-        self.try_command('language renderscript kernel',
-                         ['breakpoint',
-                          'coordinate',
-                          'list'])
-
-        self.try_command('language renderscript kernel breakpoint',
-                         ['all',
-                          'set'])
-
-        self.try_command('language renderscript kernel list',
-                         ['RenderScript Kernels',
-                          "Resource 'simple'",
-                          'root',
-                          'simple_kernel'])
-
-        self.try_command('language renderscript kernel coordinate',
-                         ['Coordinate: (0, 0, 0)'])
-
-        self.try_command('language renderscript context',
-                         ['dump'])
-
-        self.try_command('language renderscript context dump',
-                         ['Inferred RenderScript Contexts',
-                          '1 script instances'])
-
-        self.try_command('language renderscript allocation',
-                         ['list',
-                          'load',
-                          'save',
-                          'dump',
-                          'refresh'])
-
-        self.try_command('language renderscript allocation list',
-                         ['RenderScript Allocations:'])
-
-        self.try_command('language renderscript allocation list -i 0',
-                         ['RenderScript Allocations:'])
-
-        self.try_command('language renderscript allocation list --id 0',
-                         ['RenderScript Allocations:'])
-
-        self.try_command('language renderscript allocation dump 1',
-                         ['Data (X, Y, Z):'])
-
-        output_file = self.get_tmp_file_path()
-        self.try_command('language renderscript allocation dump 1 -f ' +
-                         output_file,
-                         ["Results written to '%s'" % output_file])
-
-        if os.path.isfile(output_file):
-            os.remove(output_file)
-
-        self.try_command('language renderscript allocation dump 1 --file ' +
-                         output_file,
-                         ["Results written to '%s'" % output_file])
-
-        self.try_command('language renderscript allocation save 1 ' +
-                         output_file,
-                         ["Allocation written to file '%s'" % output_file])
-
-        self.try_command('language renderscript allocation load 1 ' +
-                         output_file,
-                         ["Contents of file '%s' read into allocation 1" %
-                          output_file])
-
-        self.try_command('language renderscript allocation refresh',
-                         ['All allocations successfully recomputed'])
-
-        self.try_command('language renderscript module',
-                         ['dump'])
-
-        self.try_command('language renderscript module dump',
-                         ['RenderScript Modules:',
-                          'librs.simple.so',
-                          'Debug info loaded',
-                          'Globals: 1',
-                          'gColor - float4',
-                          'Kernels: 3',
-                          'root',
-                          'simple_kernel',
-                          'other_kernel',
-                          'java_package_name: %s' % self._pkg_name(),
-                          'version:'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_language_subcmds_no_debug.py b/tests/lldb/tests/testcases/test_language_subcmds_no_debug.py
deleted file mode 100644
index c57343d..0000000
--- a/tests/lldb/tests/testcases/test_language_subcmds_no_debug.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestLanguageSubcmdsNoDebug.'''
-
-from __future__ import absolute_import
-
-import os
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    cpp_only_test,
-    ordered_test,
-)
-
-
-class TestLanguageSubcmdsNoDebug(TestBaseRemote):
-    '''Tests the 'language renderscript' subcommands without debug info.
-
-    In particular, module dump should report missing debug info.
-    '''
-
-    bundle_target = {
-        'java': 'JavaNoDebugWaitAttach',
-        'jni': 'JNINoDebugWaitAttach',
-        'cpp': 'CppNoDebugWaitAttach'
-    }
-
-    def _pkg_name(self):
-        return {
-            'java': 'com.android.rs.waitattachnodebug',
-            'jni': 'com.android.rs.jninodebugwaitattach',
-            'cpp': 'com.android.rs.cppwaitattach'
-        }[self.app_type]
-
-    @ordered_test(0)
-    def test_language_subcommands_no_debug(self):
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('language renderscript kernel breakpoint set simple_kernel'
-                         '',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         [])
-
-        self.try_command('language renderscript kernel',
-                         ['breakpoint',
-                          'coordinate',
-                          'list'])
-
-        self.try_command('language renderscript kernel list',
-                         ['RenderScript Kernels',
-                          "Resource 'simple'",
-                          'root',
-                          'simple_kernel'])
-
-        self.try_command('language renderscript context',
-                         ['dump'])
-
-        self.try_command('language renderscript context dump',
-                         ['Inferred RenderScript Contexts',
-                          '1 script instances'])
-
-        self.try_command('language renderscript allocation',
-                         ['list',
-                          'load',
-                          'save',
-                          'dump',
-                          'refresh'])
-
-        self.try_command('language renderscript allocation list',
-                         ['RenderScript Allocations:'])
-
-        self.try_command('language renderscript allocation list -i 0',
-                         ['RenderScript Allocations:'])
-
-        self.try_command('language renderscript allocation list --id 0',
-                         ['RenderScript Allocations:'])
-
-        self.try_command('language renderscript allocation dump 1',
-                         ['Data (X, Y, Z):'])
-
-        output_file = self.get_tmp_file_path()
-        self.try_command('language renderscript allocation dump 1 -f ' +
-                         output_file,
-                         ["Results written to '%s'" % output_file])
-
-        if os.path.isfile(output_file):
-            os.remove(output_file)
-
-        self.try_command('language renderscript allocation dump 1 --file ' +
-                         output_file,
-                         ["Results written to '%s'" % output_file])
-
-        self.try_command('language renderscript allocation save 1 ' +
-                         output_file,
-                         ["Allocation written to file '%s'" % output_file])
-
-        self.try_command('language renderscript allocation load 1 ' +
-                         output_file,
-                         ["Contents of file '%s' read into allocation 1" %
-                          output_file])
-
-        self.try_command('language renderscript allocation refresh',
-                         ['All allocations successfully recomputed'])
-
-        # C++ tests have an additional kernel `other_kernel`
-        kernel_count = 3 if self.app_type == 'cpp' else 2
-        self.try_command('language renderscript module',
-                         ['dump'])
-
-        self.try_command('language renderscript module dump',
-                         ['RenderScript Modules:',
-                          'librs.simple.so',
-                          'Debug info does not exist.',
-                          'Globals: 1',
-                          'gColor - variable identified, but not found in '
-                            'binary (symbol exists)',
-                          'Kernels: %s' % kernel_count,
-                          'root',
-                          'simple_kernel',
-                          '',
-                          'java_package_name: %s' % self._pkg_name(),
-                          'version'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_multiple_rs_files.py b/tests/lldb/tests/testcases/test_multiple_rs_files.py
deleted file mode 100644
index e0497f0..0000000
--- a/tests/lldb/tests/testcases/test_multiple_rs_files.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestMultipleRSFiles.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test,
-)
-
-class TestMultipleRSFiles(TestBaseRemote):
-    '''Tests some commands on an apk which has two rs files.'''
-
-    bundle_target = {
-        'java': 'MultipleRSFiles',
-        'jni': 'JNIMultipleRSFiles',
-        'cpp': 'CppMultipleRSFiles'
-    }
-
-    def _binary_name(self):
-        return {
-            'java': 'multiplersfiles',
-            'jni': 'multiplersfiles',
-            'cpp': 'CppMultipleRSFi'
-        }[self.app_type]
-
-    def _pkg_name(self):
-        return {
-            'java': 'com.android.rs.multiplersfiles',
-            'jni': 'com.android.rs.jnimultiplersfiles',
-            'cpp': 'com.android.rs.cppmultiplersfiles'
-        }[self.app_type]
-
-    def test_multiple_rs_files(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered',
-                          'Runtime functions hooked'])
-
-        self.try_command('breakpoint set --file first.rs --line 28',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'librs.first.so`first_kernel',
-                          'at first.rs:28',
-                          "name = '%s'" % self._binary_name(),
-                          'stop reason = breakpoint 1'])
-
-        self.try_command('language renderscript kernel list',
-                         ['RenderScript Kernels',
-                          "Resource 'first'",
-                          "Resource 'second'",
-                          'root',
-                          'first_kernel',
-                          'second_kernel'])
-
-        self.try_command('language renderscript context dump',
-                         ['Inferred RenderScript Contexts',
-                          '2 script instances'])
-
-        self.try_command('language renderscript module dump',
-                         ['RenderScript Modules:',
-                          'librs.first.so',
-                          'librs.second.so',
-                          'Debug info loaded',
-                          'Globals: 1',
-                          'gColor - float4',
-                          'Kernels: 2',
-                          'root',
-                          'first_kernel',
-                          'second_kernel',
-                          'java_package_name: %s' % self._pkg_name(),
-                          'version:'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_read_global.py b/tests/lldb/tests/testcases/test_read_global.py
deleted file mode 100644
index 0a337e0..0000000
--- a/tests/lldb/tests/testcases/test_read_global.py
+++ /dev/null
@@ -1,344 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestReadGlobal.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestReadGlobal(TestBaseRemote):
-    '''Tests inspecting global variables of all types.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def _try_inspecting_global(self, global_name, expected_output,
-                              expected_regex=None):
-        '''Inspect a global and check for the output.
-
-        Run the "expr" and "target variable" commands on a given global and
-        with a given output. (The commands should be equivalent.)
-
-        Args:
-            global_name: String which is the name of the global to inspect.
-            expected_output: List of strings that should be found in the output.
-            expected_regex: List of regular expressions that should match lldb's
-                            output.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        self.try_command('expr ' + global_name,
-                         expected_output,
-                         expected_regex)
-
-        self.try_command('target variable ' + global_name,
-                         expected_output,
-                         expected_regex)
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f simple.rs -l 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_list_script_globals(self):
-        # pylint: disable=line-too-long
-
-        self.try_command('target variable',
-                         ['Global variables for',
-                          'librs.simple.so',
-                          "(uchar) uchar_global = '\\xea'",
-                          '(short) short_global = -321',
-                          '(ushort) ushort_global = 432',
-                          '(int) int_global = 1234',
-                          '(uint) uint_global = 2345',
-                          '(float) float_global = 4.5',
-                          '(ulong) ulong_global = 8888',
-                          '(double) double_global = -456.5',
-                          '(char2) char2_global = (11, -22)',
-                          '(uchar2) uchar2_global = (0x21, 0x2c)',
-                          '(short2) short2_global = (-555, 666)',
-                          '(ushort2) ushort2_global = (777, 888)',
-                          '(int2) int2_global = (999, -1111)',
-                          '(uint2) uint2_global = (2222, 3333)',
-                          '(float2) float2_global = (4.5, -5)',
-                          '(long2) long2_global = (-4444, 5555)',
-                          '(ulong2) ulong2_global = (6666, 7777)',
-                          '(double2) double2_global = (88.5, -99)',
-                          '(char3) char3_global = (11, -22, -33,',
-                          '(uchar3) uchar3_global = (0x21, 0x2c, 0x37,',
-                          '(short3) short3_global = (-555, 666, 777,',
-                          '(ushort3) ushort3_global = (777, 888, 999,',
-                          '(int3) int3_global = (999, -1111, 2222,',
-                          '(uint3) uint3_global = (2222, 3333, 4444,',
-                          '(float3) float3_global = (4.5, -5, -6.5,',
-                          '(long3) long3_global = (-4444, 5555, 6666,',
-                          '(ulong3) ulong3_global = (6666, 7777, 8888,',
-                          '(double3) double3_global = (88.5, -99, 111.5,',
-                          '(char4) char4_global = (55, 11, -22, -33)',
-                          '(uchar4) uchar4_global = (0xde, 0x21, 0x2c, 0x37)',
-                          '(short4) short4_global = (-444, -555, 666, 777)',
-                          '(ushort4) ushort4_global = (666, 777, 888, 999)',
-                          '(int4) int4_global = (888, 999, -1111, 2222)',
-                          '(uint4) uint4_global = (1111, 2222, 3333, 4444)',
-                          '(float4) float4_global = (3, 4.5, -5, -6.5)',
-                          '(long4) long4_global = (-3333, -4444, 5555, 6666)',
-                          '(ulong4) ulong4_global = (5555, 6666, 7777, 8888)',
-                          '(double4) double4_global = (-77, 88.5, -99, 111.5)',
-                          '(rs_matrix2x2) matrix2x2_global = (m = (1, 2.5, 3, 4.5))',
-                          '(rs_matrix3x3) matrix3x3_global = {\n'
-                          '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)',
-                          '(rs_matrix4x4) matrix4x4_global = {\n'
-                          '  m = {\n'
-                          '    [0] = 5.5\n'
-                          '    [1] = 6\n'
-                          '    [2] = 7.5\n'
-                          '    [3] = 8\n'
-                          '    [4] = 9\n'
-                          '    [5] = 1.5\n'
-                          '    [6] = 2\n'
-                          '    [7] = 3.5\n'
-                          '    [8] = 4.5\n'
-                          '    [9] = 5.5\n'
-                          '    [10] = 6.5\n'
-                          '    [11] = 7\n'
-                          '    [12] = 8\n'
-                          '    [13] = 9.5\n'
-                          '    [14] = 1.5\n'
-                          '    [15] = 2.5\n'
-                          '  }\n',
-                          '(rs_quaternion) quaternion_global = (4.5, 5.5, 6, 3)'],
-                         [r"\((signed )?char\) char_global = '\\f'",
-                          r'\((long )?long\) long_global = -77777'])
-
-    @wimpy
-    def test_read_char_global(self):
-        # Use expr to inspect locals
-        self._try_inspecting_global('char_global',
-                         ["'\\f'"],
-                         [r'\((signed )?char\)'])
-
-    def test_read_primitive_global(self):
-        self._try_inspecting_global('uchar_global',
-                         ['(uchar)', "'\\xea'"])
-
-        self._try_inspecting_global('short_global',
-                         ['(short)', '-321'])
-
-        self._try_inspecting_global('ushort_global',
-                         ['(ushort)', '432'])
-
-        self._try_inspecting_global('int_global',
-                         ['(int)', '1234'])
-
-        self._try_inspecting_global('uint_global',
-                         ['(uint)', '2345'])
-
-        self._try_inspecting_global('float_global',
-                         ['(float)', '4.5'])
-
-        self._try_inspecting_global('long_global',
-                         ['-77777'],
-                         [r'\((long )?long\)'])
-
-        self._try_inspecting_global('ulong_global',
-                         ['(ulong)', '8888'])
-
-        self._try_inspecting_global('double_global',
-                         ['(double)', '-456.5'])
-
-        self._try_inspecting_global('char2_global',
-                                   ['(char2)', '(11, -22)'])
-
-    @wimpy
-    def test_write_global2(self):
-        self._try_inspecting_global('uchar2_global',
-                               ['(uchar2)', '(0x21, 0x2c)'])
-
-    def test_write_global3(self):
-        self._try_inspecting_global('short2_global',
-                                   ['(short2)', '(-555, 666)'])
-
-        self._try_inspecting_global('ushort2_global',
-                                   ['(ushort2)', '(777, 888)'])
-
-        self._try_inspecting_global('int2_global',
-                                   ['(int2)', '(999, -1111)'])
-
-        self._try_inspecting_global('uint2_global',
-                                   ['(uint2)', '(2222, 3333)'])
-
-        self._try_inspecting_global('float2_global',
-                                   ['(float2)', '(4.5, -5)'])
-
-        self._try_inspecting_global('long2_global',
-                                   ['(long2)', '(-4444, 5555)'])
-
-        self._try_inspecting_global('ulong2_global',
-                                   ['(ulong2)', '(6666, 7777)'])
-
-        self._try_inspecting_global('double2_global',
-                                   ['(double2)', '(88.5, -99)'])
-
-        self._try_inspecting_global('char3_global',
-                                   ['(char3)',
-                                    '(11, -22, -33,'])
-
-        self._try_inspecting_global('uchar3_global',
-                                   ['(uchar3)',
-                                    '(0x21, 0x2c, 0x37,'])
-
-    @wimpy
-    def test_global_write_short3(self):
-        self._try_inspecting_global('short3_global',
-                                   ['(short3)',
-                                   '(-555, 666, 777,'])
-
-    def test_read_vec3(self):
-        self._try_inspecting_global('ushort3_global',
-                                   ['(ushort3)',
-                                    '(777, 888, 999,'])
-
-        self._try_inspecting_global('int3_global',
-                                   ['(int3)',
-                                    '(999, -1111, 2222,'])
-
-        self._try_inspecting_global('uint3_global',
-                                   ['(uint3)',
-                                    '(2222, 3333, 4444,'])
-
-        self._try_inspecting_global('float3_global',
-                                   ['(float3)',
-                                    '(4.5, -5, -6.5,'])
-
-        self._try_inspecting_global('long3_global',
-                                   ['(long3)',
-                                    '(-4444, 5555, 6666,'])
-
-        self._try_inspecting_global('ulong3_global',
-                                   ['(ulong3)',
-                                    '(6666, 7777, 8888,'])
-
-        self._try_inspecting_global('double3_global',
-                                   ['(double3)',
-                                    '(88.5, -99, 111.5,'])
-
-        self._try_inspecting_global('char4_global',
-                                   ['(char4)',
-                                    '(55, 11, -22, -33)'])
-
-        self._try_inspecting_global('uchar4_global',
-                                   ['(uchar4)',
-                                    '(0xde, 0x21, 0x2c, 0x37)'])
-
-        self._try_inspecting_global('short4_global',
-                                   ['(short4)',
-                                    '(-444, -555, 666, 777)'])
-
-    @wimpy
-    def test_read_ushort4(self):
-        self._try_inspecting_global('ushort4_global',
-                                   ['(ushort4)',
-                                    '(666, 777, 888, 999)'])
-
-    def test_read_vec4(self):
-        self._try_inspecting_global('int4_global',
-                                   ['(int4)',
-                                    '(888, 999, -1111, 2222)'])
-
-        self._try_inspecting_global('uint4_global',
-                                   ['(uint4)',
-                                    '(1111, 2222, 3333, 4444)'])
-
-        self._try_inspecting_global('float4_global',
-                                   ['(float4)',
-                                    '(3, 4.5, -5, -6.5)'])
-
-        self._try_inspecting_global('long4_global',
-                                   ['(long4)',
-                                    '(-3333, -4444, 5555, 6666)'])
-
-        self._try_inspecting_global('ulong4_global',
-                                   ['(ulong4)',
-                                    '(5555, 6666, 7777, 8888)'])
-
-        self._try_inspecting_global('double4_global',
-                                   ['(double4)',
-                                    '(-77, 88.5, -99, 111.5)'])
-
-        self._try_inspecting_global('matrix2x2_global',
-                                   ['(rs_matrix2x2)',
-                                    '= (m = (1, 2.5, 3, 4.5))'])
-
-        self._try_inspecting_global('matrix3x3_global',
-                                       ['(rs_matrix3x3)',
-                                        '= {\n'
-                                        '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)'])
-    @wimpy
-    def test_read_matrix(self):
-        self._try_inspecting_global('matrix4x4_global',
-                                   ['(rs_matrix4x4)',
-                                    '= {\n'
-                                    '  m = {\n'
-                                    '    [0] = 5.5\n'
-                                    '    [1] = 6\n'
-                                    '    [2] = 7.5\n'
-                                    '    [3] = 8\n'
-                                    '    [4] = 9\n'
-                                    '    [5] = 1.5\n'
-                                    '    [6] = 2\n'
-                                    '    [7] = 3.5\n'
-                                    '    [8] = 4.5\n'
-                                    '    [9] = 5.5\n'
-                                    '    [10] = 6.5\n'
-                                    '    [11] = 7\n'
-                                    '    [12] = 8\n'
-                                    '    [13] = 9.5\n'
-                                    '    [14] = 1.5\n'
-                                    '    [15] = 2.5\n'
-                                    '  }\n'])
-
-    @wimpy
-    def test_read_quaternion(self):
-        self._try_inspecting_global('quaternion_global',
-                                   ['(rs_quaternion)',
-                                    '(4.5, 5.5, 6, 3)'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_read_local.py b/tests/lldb/tests/testcases/test_read_local.py
deleted file mode 100644
index 9a6a80f..0000000
--- a/tests/lldb/tests/testcases/test_read_local.py
+++ /dev/null
@@ -1,344 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestReadLocal.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test,
-    skip_conditional,
-    cpp_only_test
-)
-
-
-class TestReadLocal(TestBaseRemote):
-    '''Tests inspecting local variables of all types.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def _try_inspecting_local(self, local_name, expected_output,
-                             expected_regex=None):
-        '''Inspect a local and check for the output.
-
-        Run the "expr" and "frame variable" commands on a given local and
-        with a given output. (The commands should be equivalent.)
-
-        Args:
-            local_name: String which is the name of the global to inspect.
-            expected_output: List of strings that should be found in the output.
-            expected_regex: List of regular expressions that should match lldb's
-                            output.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        self.try_command('expr ' + local_name,
-                         expected_output,
-                         expected_regex)
-
-        self.try_command('frame variable ' + local_name,
-                         expected_output,
-                         expected_regex)
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('breakpoint set --file simple.rs --line 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_list_rs_kernel_frame_variables(self):
-        # pylint: disable=line-too-long
-
-        self.try_command('frame variable',
-                         ["(uchar) uchar_local = 'b'",
-                          '(short) short_local = -321',
-                          '(ushort) ushort_local = 432',
-                          '(int) int_local = 1234',
-                          '(uint) uint_local = 2345',
-                          '(float) float_local = 4.5',
-                          '(ulong) ulong_local = 8888',
-                          '(double) double_local = -456.5',
-                          '(char2) char2_local = (-11, -22)',
-                          '(uchar2) uchar2_local = (0x21, 0x2c)',
-                          '(short2) short2_local = (-555, 666)',
-                          '(ushort2) ushort2_local = (777, 888)',
-                          '(int2) int2_local = (999, -1111)',
-                          '(uint2) uint2_local = (2222, 3333)',
-                          '(float2) float2_local = (4.5, -5)',
-                          '(long2) long2_local = (-4444, 5555)',
-                          '(ulong2) ulong2_local = (6666, 7777)',
-                          '(double2) double2_local = (88.5, -99)',
-                          '(char3) char3_local = (11, -22, -33,',
-                          '(uchar3) uchar3_local = (0x21, 0x2c, 0x37,',
-                          '(short3) short3_local = (-555, 666, 777,',
-                          '(ushort3) ushort3_local = (777, 888, 999,',
-                          '(int3) int3_local = (999, -1111, 2222,',
-                          '(uint3) uint3_local = (2222, 3333, 4444,',
-                          '(float3) float3_local = (4.5, -5, -6.5,',
-                          '(long3) long3_local = (-4444, 5555, 6666,',
-                          '(ulong3) ulong3_local = (6666, 7777, 8888,',
-                          '(double3) double3_local = (88.5, -99, 111.5,',
-                          '(char4) char4_local = (55, 11, -22, -33)',
-                          '(uchar4) uchar4_local = (0x16, 0x21, 0x2c, 0x37)',
-                          '(short4) short4_local = (-444, -555, 666, 777)',
-                          '(ushort4) ushort4_local = (666, 777, 888, 999)',
-                          '(int4) int4_local = (888, 999, -1111, 2222)',
-                          '(uint4) uint4_local = (1111, 2222, 3333, 4444)',
-                          '(float4) float4_local = (3, 4.5, -5, -6.5)',
-                          '(long4) long4_local = (-3333, -4444, 5555, 6666)',
-                          '(ulong4) ulong4_local = (5555, 6666, 7777, 8888)',
-                          '(double4) double4_local = (-77, 88.5, -99, 111.5)',
-                          '(rs_matrix2x2) matrix2x2_local = (m = (1, 2.5, 3, 4.5))',
-                          '(rs_matrix3x3) matrix3x3_local = {\n'
-                          '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)',
-                          '(rs_matrix4x4) matrix4x4_local = {\n'
-                          '  m = {\n'
-                          '    [0] = 5.5\n'
-                          '    [1] = 6\n'
-                          '    [2] = 7.5\n'
-                          '    [3] = 8\n'
-                          '    [4] = 9\n'
-                          '    [5] = 1.5\n'
-                          '    [6] = 2\n'
-                          '    [7] = 3.5\n'
-                          '    [8] = 4.5\n'
-                          '    [9] = 5.5\n'
-                          '    [10] = 6.5\n'
-                          '    [11] = 7\n'
-                          '    [12] = 8\n'
-                          '    [13] = 9.5\n'
-                          '    [14] = 1.5\n'
-                          '    [15] = 2.5\n'
-                          '  }\n',
-                          '(rs_quaternion) quaternion_local = (8, 9, 0.5, 7.5)'],
-                         [r"\((signed )?char\) char_local = 'a'",
-                          r'\((long )?long\) long_local = -77777'])
-
-
-    @wimpy
-    def test_inspect_primitive_types(self):
-        # Use expr to inspect locals
-        self._try_inspecting_local('char_local',
-                                  ["'a'"],
-                                  [r'\((signed )?char\)'])
-
-        self._try_inspecting_local('uchar_local',
-                                  ['(uchar)', "'b'"])
-
-        self._try_inspecting_local('short_local',
-                                  ['(short)', '-321'])
-
-        self._try_inspecting_local('ushort_local',
-                                  ['(ushort)', '432'])
-
-        self._try_inspecting_local('int_local',
-                                  ['(int)', '1234'])
-
-        self._try_inspecting_local('uint_local',
-                                  ['(uint)', '2345'])
-
-        self._try_inspecting_local('float_local',
-                                  ['(float)', '4.5'])
-
-        self._try_inspecting_local('long_local',
-                                  ['-77777'], [r'\((long )?long\)'])
-
-        self._try_inspecting_local('ulong_local',
-                                  ['(ulong)', '8888'])
-
-        self._try_inspecting_local('double_local',
-                                  ['(double)', '-456.5'])
-
-
-    @wimpy
-    def test_inspect_uchar2(self):
-        self._try_inspecting_local('uchar2_local',
-                                  ['(uchar2)', '(0x21, 0x2c)'])
-
-    def test_inspect_vec2_types(self):
-        self._try_inspecting_local('char2_local',
-                                  ['(char2)', '(-11, -22)'])
-
-        self._try_inspecting_local('short2_local',
-                                  ['(short2)', '(-555, 666)'])
-
-        self._try_inspecting_local('ushort2_local',
-                                  ['(ushort2)', '(777, 888)'])
-
-        self._try_inspecting_local('int2_local',
-                                  ['(int2)', '(999, -1111)'])
-
-        self._try_inspecting_local('uint2_local',
-                                  ['(uint2)', '(2222, 3333)'])
-
-        self._try_inspecting_local('float2_local',
-                                  ['(float2)', '(4.5, -5)'])
-
-        self._try_inspecting_local('long2_local',
-                                  ['(long2)', '(-4444, 5555)'])
-
-        self._try_inspecting_local('ulong2_local',
-                                  ['(ulong2)', '(6666, 7777)'])
-
-        self._try_inspecting_local('double2_local',
-                                  ['(double2)', '(88.5, -99)'])
-
-        self._try_inspecting_local('char3_local',
-                                  ['(char3)',
-                                   '(11, -22, -33,'])
-
-        self._try_inspecting_local('uchar3_local',
-                                  ['(uchar3)',
-                                   '(0x21, 0x2c, 0x37,'])
-
-    @wimpy
-    def test_inspect_short3(self):
-        self._try_inspecting_local('short3_local',
-                                  ['(short3)',
-                                   '(-555, 666, 777,'])
-
-    def test_inspect_vec3_types(self):
-        self._try_inspecting_local('ushort3_local',
-                                  ['(ushort3)',
-                                   '(777, 888, 999,'])
-
-        self._try_inspecting_local('int3_local',
-                                  ['(int3)',
-                                   '(999, -1111, 2222,'])
-
-        self._try_inspecting_local('uint3_local',
-                                  ['(uint3)',
-                                   '(2222, 3333, 4444,'])
-
-        self._try_inspecting_local('float3_local',
-                                  ['(float3)',
-                                   '(4.5, -5, -6.5,'])
-
-        self._try_inspecting_local('long3_local',
-                                  ['(long3)',
-                                   '(-4444, 5555, 6666,'])
-
-        self._try_inspecting_local('ulong3_local',
-                                  ['(ulong3)',
-                                   '(6666, 7777, 8888,'])
-
-        self._try_inspecting_local('double3_local',
-                                  ['(double3)',
-                                   '(88.5, -99, 111.5,'])
-
-        self._try_inspecting_local('char4_local',
-                                  ['(char4)',
-                                   '(55, 11, -22, -33)'])
-
-        self._try_inspecting_local('uchar4_local',
-                                  ['(uchar4)',
-                                   '(0x16, 0x21, 0x2c, 0x37)'])
-
-        self._try_inspecting_local('short4_local',
-                                  ['(short4)',
-                                   '(-444, -555, 666, 777)'])
-
-    @wimpy
-    def test_inspect_ushort4(self):
-        self._try_inspecting_local('ushort4_local',
-                                  ['(ushort4)',
-                                   '(666, 777, 888, 999)'])
-
-    def test_inspect_vec4_types(self):
-        self._try_inspecting_local('int4_local',
-                                  ['(int4)',
-                                   '(888, 999, -1111, 2222)'])
-
-        self._try_inspecting_local('uint4_local',
-                                  ['(uint4)',
-                                   '(1111, 2222, 3333, 4444)'])
-
-        self._try_inspecting_local('float4_local',
-                                  ['(float4)',
-                                   '(3, 4.5, -5, -6.5)'])
-
-        self._try_inspecting_local('long4_local',
-                                  ['(long4)',
-                                   '(-3333, -4444, 5555, 6666)'])
-
-        self._try_inspecting_local('ulong4_local',
-                                  ['(ulong4)',
-                                   '(5555, 6666, 7777, 8888)'])
-
-        self._try_inspecting_local('double4_local',
-                                  ['(double4)',
-                                   '(-77, 88.5, -99, 111.5)'])
-    def test_inspect_matrix_types(self):
-        self._try_inspecting_local('matrix2x2_local',
-                                   ['(rs_matrix2x2)',
-                                    '= (m = (1, 2.5, 3, 4.5))'])
-
-        self._try_inspecting_local('matrix3x3_local',
-                                   ['(rs_matrix3x3)',
-                                    '= {\n'
-                                    '  m = ([0] = 5, [1] = 6.5, [2] = 7, [3] = 8.5, [4] = 9, [5] = 1.5, [6] = 2, [7] = 3.5, [8] = 4)'])
-
-    @wimpy
-    def test_inspect_matrix_4x4_local(self):
-        self._try_inspecting_local('matrix4x4_local',
-                                   ['(rs_matrix4x4)',
-                                    '= {\n'
-                                    '  m = {\n'
-                                    '    [0] = 5.5\n'
-                                    '    [1] = 6\n'
-                                    '    [2] = 7.5\n'
-                                    '    [3] = 8\n'
-                                    '    [4] = 9\n'
-                                    '    [5] = 1.5\n'
-                                    '    [6] = 2\n'
-                                    '    [7] = 3.5\n'
-                                    '    [8] = 4.5\n'
-                                    '    [9] = 5.5\n'
-                                    '    [10] = 6.5\n'
-                                    '    [11] = 7\n'
-                                    '    [12] = 8\n'
-                                    '    [13] = 9.5\n'
-                                    '    [14] = 1.5\n'
-                                    '    [15] = 2.5\n'
-                                    '  }\n'])
-
-    @wimpy
-    def test_inspect_quaternion_local(self):
-        self._try_inspecting_local('quaternion_local',
-                                  ['(rs_quaternion)',
-                                   '(8, 9, 0.5, 7.5)'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_reduction.py b/tests/lldb/tests/testcases/test_reduction.py
deleted file mode 100644
index 9653c02..0000000
--- a/tests/lldb/tests/testcases/test_reduction.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-import itertools
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    wimpy,
-)
-from harness.assert_mixins import CoordinateAssertionsMixin
-
-from reduce_common import (
-    REDUCE_ITERATIONS,
-    REDUCE_STARTVAL,
-    REDUCE_SCRIPT,
-    X_TESTS,
-    Y_TESTS,
-    Z_TESTS,
-    ReductionMixin,
-)
-
-
-def coords_range_3d(x_range, y_range, z_range):
-    count = max((x_range, y_range, z_range))
-    x = itertools.cycle(range(x_range))
-    y = itertools.cycle(range(y_range))
-    z = itertools.cycle(range(z_range))
-    return itertools.islice(
-        itertools.izip(x, y, z),
-        count
-    )
-
-
-class TestReduce1DSingleThreaded(
-        TestBaseRemote, CoordinateAssertionsMixin, ReductionMixin):
-    """
-    Reduction kernels for RenderScript are launched using
-    a different `.expand` function than regular `ForEach` kernels and reflect a
-    different API to the invoking program
-
-    Although the debugger implementation for accessing these features tracks
-    this slightly differently for reduction kernels, the user interface should
-    still offer the basic functionality:
-        - breakpoints on a coordinate
-        - tracking, viewing and dumping allocations
-        - listing modules and constituent kernels and types
-    """
-
-    bundle_target = {
-        'java': 'Reduction',
-    }
-
-    def _delete_breakpoints(self):
-        try:
-            self.do_command('breakpoint delete -f')
-        except self.TestFail:
-            pass
-
-    def setup(self, android):
-        """This test requires to be run on one thread."""
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        """Reset the number of RS threads to the previous value."""
-        android.pop_prop('debug.rs.max-threads')
-
-    @ordered_test(0)
-    @wimpy
-    def test_setup(self):
-        self.try_command('language renderscript status', [])
-        self.try_command('b find_min_user_type_accum', [])
-        self.try_command('c', [])
-
-    @ordered_test(1)
-    @wimpy
-    def test_renderscript_module_dump(self):
-        """
-        Generalised Reduction kernels for RenderScript are not tracked in the
-        same way as `ForEach` kernels, and do not have `__attribute__((kernel))`
-        so we need to make sure that when a module contains reduction kernels,
-        `language renderscript module dump` in lldb prints the correct kernels.
-        """
-        self.try_command(
-            'language renderscript module dump',
-            [
-                'Reductions: 1',
-                'find_min_user_type',
-                'accumulator: find_min_user_type_accum',
-                'combiner: find_min_user_type_comb',
-                'outconverter: find_min_user_type_outc'
-            ]
-        )
-
-    @ordered_test(2)
-    @wimpy
-    def test_module_dump_with_foreach_kernel_separate(self):
-        """
-        The reduction breakpoint is separate from that of a standard kernel
-        function breakpoint, so we need to make sure that when we dump a module,
-        reductions are properly collected and displayed alongside the standard
-        __attribute__((kernel)) functions.
-        Assert that `... module dump` can correctly distinguish between `reduce`
-        kernels and `ForEach` kernels.
-        """
-        self.try_command(
-            'language renderscript module dump',
-            [
-                'Kernels: 2',
-                'Reductions: 1',
-                'accumulator: find_min_user_type_accum',
-                'initializer: find_min_user_type_init',
-                'combiner: find_min_user_type_comb',
-                'outconverter: find_min_user_type_outc'
-            ]
-        )
-
-    @wimpy
-    @ordered_test(3)
-    def test_reduction_breakpoint_set_all_roles_resolved(self):
-        """
-        Assert that a reduction breakpoint successfully resolves all the
-        functions that make up the reduction kernel
-        """
-        self.try_command(
-            'language renderscript reduction breakpoint set find_min_user_type',
-            ['Breakpoint(s) created']
-        )
-
-        self.try_command(
-            'process continue',
-            expected_regex=[
-                r'Process \d+ stopped',
-                r'librs.reduce.so`find_min_user_type',
-                r'stop reason = breakpoint'
-            ]
-        )
-        name = REDUCE_SCRIPT
-        self.try_command(
-            'breakpoint list',
-            expected_regex=[
-                "RenderScript reduce breakpoint for 'find_min_user_type', locations = 4, resolved = 4",
-                'where = librs.reduce.so`find_min_user_type_init (\+ \d+ )?at %s(.+, resolved,)' % name,
-                'where = librs.reduce.so`find_min_user_type_accum (\+ \d+ )?at %s(.+, resolved,)' % name,
-                'where = librs.reduce.so`find_min_user_type_comb (\+ \d+ )?at %s(.+, resolved,)' % name,
-                'where = librs.reduce.so`find_min_user_type_outc (\+ \d+ )?at %s(.+, resolved,)' % name,
-            ]
-        )
-
-    @ordered_test(4)
-    def test_reduce_iterations(self):
-        """
-        Given a reduction, we want to make sure that we break on
-        every accumulator invocation before seeing the outconverter called.
-        This requires the tests to be run single threaded
-        """
-        self._delete_breakpoints()
-        self.try_command(
-            'language renderscript reduction breakpoint set find_min_user_type -t initializer',
-        )
-        self.try_command(
-            'process continue',
-            expected_regex=[
-                r'Process \d+ stopped',
-                r'librs.reduce.so`find_min_user_type_init',
-                r'stop reason = breakpoint',
-            ]
-        )
-        self._delete_breakpoints()
-
-        self.try_command((
-            'language renderscript reduction breakpoint '
-            'set find_min_user_type --function-role accumulator,outconverter'),
-            ['Breakpoint(s) created']
-        )
-        for i in range(REDUCE_ITERATIONS):
-            self.try_command(
-                'process continue',
-                expected_regex=[
-                    r'Process \d+ resuming',
-                    r'Process \d+ stopped',
-                    r'librs.reduce.so`find_min_user_type_accum',
-                    r'stop reason = breakpoint'
-                ]
-            )
-            self.try_command('p val')
-            self.try_command(
-                'p val.b',
-                expected_regex=[
-                    r'^\((const )?int32_t\)\s*\$\d+ = %s\s*$' % (
-                        i + REDUCE_STARTVAL)
-                ]
-            )
-        # We should then finally break on the outconverter
-        self.try_command(
-            'process continue',
-            expected_regex=[
-                r'Process \d+ resuming',
-                r'Process \d+ stopped',
-                r'librs.reduce.so`find_min_user_type_outc',
-                r'stop reason = breakpoint'
-            ]
-        )
-
-    @ordered_test(5)
-    def test_function_role_breakpoints_combinations(self):
-        func_role_combinations = itertools.combinations(
-            ('accumulator', 'initializer'),
-            r=2
-        )
-        self._test_func_role_combinations(func_role_combinations)
-
-    @wimpy
-    @ordered_test(6)
-    def test_resolve_function_role_all_reduce_functions(self):
-        """
-        Assert that a reduction breakpoint successfully resolves all the
-        functions that make up the reduction kernel when the parameter `all` is
-        passed to `--function-role` for the breakpoint command
-        """
-        self._delete_breakpoints()
-        self.try_command(
-            'language renderscript reduction breakpoint set find_min_user_type -t all',
-            [r'Breakpoint(s) created']
-        )
-        self.try_command('c', [])
-        breakpoints_match = [
-            r"where = librs.reduce.so`%s (\+ \d+ )?at %s:\d+, address = 0x[0-9a-fA-F]+, resolved" % (
-                 'find_min_user_type_%s' % func_match,
-                 REDUCE_SCRIPT
-            )
-            for func_match in ('accum', 'init', 'comb', 'outc')
-        ]
-        self.try_command(
-            'breakpoint list',
-            expected_regex=[
-                r"Current breakpoints:",
-                r"RenderScript reduce breakpoint for 'find_min_user_type', locations = 4, resolved = 4",
-                r"Names:",
-                r"RenderScriptReduction",
-            ] + breakpoints_match
-        )
-
-    @ordered_test(8)
-    def test_reduce_breakpoint_conditional_1d_coordinate(self):
-        """
-        Assert that breakpoints conditional on an allocation coordinate
-        are only triggered on that coordinate
-        """
-        for x, _, __ in sorted(coords_range_3d(X_TESTS, Y_TESTS, Z_TESTS)):
-            self._delete_breakpoints()
-            self.assert_coord_bp_set(
-                'find_min_user_type -t accumulator',
-                x,
-                kernel_type='reduction'
-            )
-            self.assert_coord_stop('reduce', 'find_min_user_type', x)
-            # Step *into* the function so locals are available
-            # FIXME remove the need for `next` here; skip the function prologue
-            self.try_command('n')
-            self.try_command('p accum->a')
-            self.try_command('p accum->b')
-
-    @ordered_test('last')
-    def test_exit(self):
-        self.try_command('process kill', [])
diff --git a/tests/lldb/tests/testcases/test_reduction_combiner.py b/tests/lldb/tests/testcases/test_reduction_combiner.py
deleted file mode 100644
index c070f3b..0000000
--- a/tests/lldb/tests/testcases/test_reduction_combiner.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-import re
-import itertools
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    wimpy,
-    skip_conditional,
-)
-from harness.assert_mixins import CoordinateAssertionsMixin
-
-from reduce_common import (
-    REDUCE_SCRIPT,
-    REDUCE_AUTO_COMB_SCRIPT,
-    ReductionMixin,
-)
-
-
-multithreaded = lambda: skip_conditional(
-    lambda self: self.cpu_count == 1,
-    "skipping multithreaded test (1 CPU present)"
-)
-
-
-class TestReduceCombinerMultithreaded(
-        TestBaseRemote, CoordinateAssertionsMixin, ReductionMixin):
-    """
-    RenderScript reduction combiners are currently run only on the output of a
-    parallel reduction step for the CPU reference implementation.  These
-    testcases test LLDB's handling of breakpoints for the combiner function.
-    """
-
-    bundle_target = {
-        'java': 'Reduction',
-    }
-
-    def _delete_breakpoints(self):
-        try:
-            self.do_command('breakpoint delete -f')
-        except self.TestFail:
-            pass
-
-    def setup(self, android):
-        """
-        This test *must* be run on multiple threads, and is skipped if the
-        device does not support multiple threads
-        """
-        cpu_spec = android.shell("cat /sys/devices/system/cpu/online").strip()
-        match = re.search(r'(^0(-\d+)?(,\d+([-]\d*)?)*)$', cpu_spec)
-        if not match or not match.groups():
-            raise self.TestFail(
-                "unable to parse number of available CPUs in %r" % cpu_spec)
-
-        def parse_range(s):
-            r = s.split('-')
-            if len(r) == 1:
-                return 1
-            return int(r[1]) - int(r[0])
-
-        self.cpu_count = sum(map(parse_range, cpu_spec.split(',')))
-        android.push_prop('debug.rs.max-threads', self.cpu_count + 1)
-
-    def teardown(self, android):
-        """Reset the number of RS threads to the previous value."""
-        android.pop_prop('debug.rs.max-threads')
-
-    @multithreaded()
-    @ordered_test(0)
-    @wimpy
-    def test_setup(self):
-        self.try_command('language renderscript status', [])
-        # first point of order: make sure the compiled script is properly
-        # loaded and that we can set a breakpoint on the named reduction
-        self.try_command(
-            'language renderscript reduction breakpoint set find_min_user_type_auto_comb')
-        self.try_command(
-            'process continue',
-            expected_regex=[
-                r'Process \d+ stopped',
-                r'frame #0: (0x[0-9a-fA-F]+ )?librs.reduce_auto_comb.so`'
-            ]
-        )
-
-    @multithreaded()
-    def test_function_role_breakpoints_combinations(self):
-        func_role_combinations = itertools.combinations(
-            ('accumulator', 'outconverter', 'initializer', 'combiner'),
-            r=2
-        )
-        self._test_func_role_combinations(func_role_combinations)
-
-    @multithreaded()
-    def test_reduction_breakpoint_set_single_type_user_comb(self):
-        return self._reduction_breakpoint_set_single_type(
-            'reduce',
-            REDUCE_SCRIPT,
-            'find_min_user_type',
-            (
-                ('find_min_user_type_init', 'initializer'),
-                ('find_min_user_type_accum', 'accumulator'),
-                ('find_min_user_type_comb', 'combiner'),
-                ('find_min_user_type_outc', 'outconverter')
-            )
-        )
-
-    @multithreaded()
-    def test_reduction_breakpoint_set_single_type_auto_comb(self):
-        return self._reduction_breakpoint_set_single_type(
-            'reduce_auto_comb',
-            REDUCE_AUTO_COMB_SCRIPT,
-            'find_min_user_type_auto_comb',
-            (
-                ('find_min_user_type_init', 'initializer'),
-                ('find_min_user_type_accum', 'accumulator'),
-                ('find_min_user_type_accum.combiner', 'combiner'),
-                ('find_min_user_type_outc', 'outconverter')
-            )
-        )
diff --git a/tests/lldb/tests/testcases/test_rs_consts.py b/tests/lldb/tests/testcases/test_rs_consts.py
deleted file mode 100644
index 9a7ba70..0000000
--- a/tests/lldb/tests/testcases/test_rs_consts.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestRSConsts.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestRSConsts(TestBaseRemote):
-    '''Tests examining the RenderScript constants.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def test_rs_consts(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('language renderscript kernel breakpoint set kernel',
-                         [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-        # Constants
-        self.try_command('expr M_1_PI',
-                         ['0.318309'])
-
-        self.try_command('expr M_2_PI',
-                         ['0.636619'])
-
-        self.try_command('expr M_2_SQRTPI',
-                         ['1.128379'])
-
-        self.try_command('expr M_E',
-                         ['2.718281'])
-
-        self.try_command('expr M_LN10',
-                         ['2.302585'])
-
-        self.try_command('expr M_LN2',
-                         ['0.693147'])
-
-        self.try_command('expr M_LOG10E',
-                         ['0.434294'])
-
-        self.try_command('expr M_LOG2E',
-                         ['1.442695'])
-
-        self.try_command('expr M_PI',
-                         ['3.141592'])
-
-        self.try_command('expr M_PI_2',
-                         ['1.570796'])
-
-        self.try_command('expr M_PI_4',
-                         ['0.785398'])
-
-        self.try_command('expr M_SQRT1_2',
-                         ['0.707106'])
-
-        self.try_command('expr M_SQRT2',
-                         ['1.414213'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
-
diff --git a/tests/lldb/tests/testcases/test_script_group.py b/tests/lldb/tests/testcases/test_script_group.py
deleted file mode 100644
index b3b3186..0000000
--- a/tests/lldb/tests/testcases/test_script_group.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestScriptGroup.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import wimpy
-
-
-class TestScriptGroup(TestBaseRemote):
-    bundle_target = {
-        'java': 'ScriptGroup'
-    }
-
-    def setup(self, android):
-        '''This test requires to be run on one thread.'''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        '''Reset the number of RS threads to the previous value.'''
-        android.pop_prop('debug.rs.max-threads')
-
-    @wimpy
-    def test_kernel_backtrace(self):
-        # number of allocation elements
-        array_size = 8
-
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered',
-                          'rsdDebugHintScriptGroup2'])
-
-        self.try_command('language renderscript scriptgroup breakpoint set scriptgroup_test',
-                         ['Breakpoint 1: no locations (pending)'])
-
-        self.try_command('language renderscript scriptgroup list',
-                         ['0 script groups'])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint',
-                          'librs.scriptgroup.so`foo',
-                          'scriptgroup.rs'])
-
-        self.try_command('breakpoint list',
-                         ['scriptgroup_test',
-                          'locations = 1'])
-
-        self.try_command('language renderscript scriptgroup list',
-                         ['1 script group',
-                          'scriptgroup_test',
-                          'foo',
-                          'goo'])
-
-        self.try_command('language renderscript scriptgroup breakpoint set --stop-on-all scriptgroup_test',
-                         ['Breakpoint 2: 2 locations'])
-
-        self.try_command('breakpoint list',
-                         ['scriptgroup_test',
-                          'librs.scriptgroup.so`foo',
-                          'librs.scriptgroup.so`goo'])
-
-        # iterate over foo kernels
-        self.try_command('bt',
-                         ['scriptgroup.rs:',
-                          'frame #0', 'librs.scriptgroup.so`foo',
-                          'frame #1', 'librs.scriptgroup.so`foo.expand'])
-
-        for x in range(array_size):
-            self.try_command('frame var',
-                             ['(int) a = {0}'.format(x)])
-            self.try_command('process continue',
-                             ['resuming',
-                              'stopped',
-                              'stop reason = breakpoint',
-                              'librs.scriptgroup.so`{0}'.format(
-                                  'foo' if x < 7 else 'goo')])
-
-        # iterate over goo kernels
-        self.try_command('bt',
-                         ['stop reason = breakpoint',
-                          'scriptgroup.rs:',
-                          'frame #0', 'librs.scriptgroup.so`goo',
-                          'frame #1', 'librs.scriptgroup.so`goo.expand'])
-
-        for x in range(array_size):
-            self.try_command('frame var',
-                             ['(int) a = {0}'.format(x * x)])
-
-            if x < 7:
-                self.try_command('process continue',
-                                 ['resuming',
-                                  'stopped',
-                                  'stop reason = breakpoint',
-                                  'librs.scriptgroup.so`goo'])
diff --git a/tests/lldb/tests/testcases/test_single_source.py b/tests/lldb/tests/testcases/test_single_source.py
deleted file mode 100644
index 5da1d97..0000000
--- a/tests/lldb/tests/testcases/test_single_source.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestInvokeFun.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (ordered_test, wimpy)
-from harness.exception import TestSuiteException
-
-
-class TestSingleSource(TestBaseRemote):
-    '''Tests debugging a function executed from Java using invoke_*.'''
-
-    bundle_target = {
-        'java': "SingleSource"
-    }
-
-    def setup(self, android):
-
-        '''This test requires to be run on one thread.'''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-
-        '''Reset the number of RS threads to the previous value.'''
-        android.pop_prop('debug.rs.max-threads')
-
-    @ordered_test(-1)
-    @wimpy
-    def test_startup(self):
-
-        # pylint: disable=line-too-long
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('breakpoint set --name check_in',
-                         ['(pending)'])
-
-    @ordered_test(0)
-    @wimpy
-    def test_invoke_1(self):
-
-        # enter script_invoke_1
-        self.try_command('breakpoint set --name script_invoke_1',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'stop reason = breakpoint'],
-                         [r'librs.rs_single_source.so`script_invoke_1'])
-
-        self.try_command(
-            'language renderscript allocation dump 1',
-            ['(0, 0, 0) = 1',
-             '(1, 0, 0) = 2',
-             '(2, 0, 0) = 3',
-             '(3, 0, 0) = 4'])
-
-        self.try_command(
-            'language renderscript allocation dump 2',
-            ['(0, 0, 0) = 5',
-             '(1, 0, 0) = 6',
-             '(2, 0, 0) = 7',
-             '(3, 0, 0) = 8'])
-
-        self.try_command('breakpoint set --name `kernel_1',
-                         ['address'])
-
-        self.try_command('breakpoint set --name `kernel_2',
-                         ['address'])
-
-        # check our global allocation is visible
-        self.try_command('p global_alloc',
-                         ['(rs_allocation)',
-                          'p = 0x'])
-
-        # test kernel_1
-        for _ in range(10):
-            # continue as long as there are threads hitting kernel_1
-            out = self.do_command('process continue')
-            if 'librs.rs_single_source.so`kernel_1' in out:
-                continue
-            # if we hit check_in we have finished with kernel_1
-            if 'librs.rs_single_source.so`check_in' in out:
-                self.try_command(
-                    'language renderscript allocation dump 1',
-                    ['(0, 0, 0) = 25',
-                     '(1, 0, 0) = 36',
-                     '(2, 0, 0) = 49',
-                     '(3, 0, 0) = 64'])
-                break
-            TestSuiteException('unexpected breakpoint')
-        else:
-            TestSuiteException('loop quota exceeded')
-
-        # test kernel_2
-        for _ in range(10):
-            # continue as long as there are threads hitting kernel_2
-            out = self.do_command('process continue')
-            if 'librs.rs_single_source.so`kernel_2' in out:
-                continue
-            # if we hit check_in we have finished with kernel_2
-            if 'librs.rs_single_source.so`check_in' in out:
-                self.try_command(
-                    'language renderscript allocation dump 2',
-                    ['(0, 0, 0) = 125',
-                     '(1, 0, 0) = 216',
-                     '(2, 0, 0) = 343',
-                     '(3, 0, 0) = 512'])
-                break
-            TestSuiteException('unexpected breakpoint')
-        else:
-            TestSuiteException('loop quota exceeded')
-
-    @ordered_test(1)
-    @wimpy
-    def test_invoke_2(self):
-
-        # enter script_invoke_2
-        self.try_command('breakpoint set --name script_invoke_2',
-                         ['address'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'stop reason = breakpoint'],
-                         [r'librs.rs_single_source.so`script_invoke_2'])
-
-        # test void_kernel_1
-        self.try_command('breakpoint set --name void_kernel_1',
-                         ['address'])
-
-        for _ in range(10):
-            out = self.do_command('process continue')
-
-            # continue as long as there are threads hitting void_kernel_1
-            if 'librs.rs_single_source.so`void_kernel_1' in out:
-                continue
-
-            # if we hit check_in we have finished with void_kernel_1
-            if 'librs.rs_single_source.so`check_in' in out:
-                self.try_command(
-                    'language renderscript allocation dump 4',
-                    ['(0, 0, 0) = 0',
-                     '(1, 0, 0) = 1',
-                     '(2, 0, 0) = 2',
-                     '(3, 0, 0) = 3'])
-                break
-
-            TestSuiteException('unexpected breakpoint')
-        else:
-            TestSuiteException('loop quota exceeded')
diff --git a/tests/lldb/tests/testcases/test_source_step.py b/tests/lldb/tests/testcases/test_source_step.py
deleted file mode 100644
index c780b8a..0000000
--- a/tests/lldb/tests/testcases/test_source_step.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestSourceStep.'''
-
-from __future__ import absolute_import
-
-import os
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestSourceStep(TestBaseRemote):
-    '''Test stepping through the source using step-in, -over and -out.'''
-
-    bundle_target = {
-        'java': 'BranchingFunCalls',
-        'jni': 'JNIBranchingFunCalls',
-        'cpp': 'CppBranchingFunCalls'
-
-    }
-
-    def script_dir(self):
-        file_dir = os.path.dirname(os.path.realpath(__file__))
-        app_root = os.path.join(file_dir, '..', '..')
-
-        return {
-            'java': os.path.join(app_root, 'java', 'BranchingFunCalls', 'src', 'rs'),
-            'cpp': os.path.join(app_root, 'cpp', 'BranchingFunCalls'),
-            'jni': os.path.join(app_root, 'jni', 'BranchingFunCalls', 'jnibranchingfuncalls')
-        }[self.app_type]
-
-    def setup(self, android):
-        '''This test requires to be run on one thread.'''
-        android.push_prop('debug.rs.max-threads', 1)
-
-    def teardown(self, android):
-        '''Reset the number of RS threads to the previous value.'''
-        android.pop_prop('debug.rs.max-threads')
-
-    def test_source_thread_step_in_out(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f scalars.rs -l 63',
-                         ['(pending)'])
-
-        self.try_command('process continue',
-                         ['stopped',
-                          'stop reason = breakpoint',
-                          'scalars.rs:63'])
-
-        # set the source mapping
-        self.set_src_map('scalars.rs', self.script_dir())
-
-        self.try_command('process status',
-                         ['-> 63',
-                          'int i = in;'])
-
-        #63     int i = in;
-        self.try_command('thread step-in',
-                         ['-> 64'])
-        #64     float f = (float) i;
-        self.try_command('thread step-in',
-                         ['-> 65'])
-        #49     modify_f(&f);
-        self.try_command('thread step-over',
-                         ['-> 66'])
-        #50  	modify_i(&i);
-        self.try_command('thread step-in',
-                         ['-> 49'])
-        #49         int j = *i;
-        self.try_command('b -f scalars.rs -l 54',
-                         ['librs.scalars.so`modify_i',
-                          'scalars.rs:54'])
-        self.try_command('c',
-                         ['stop reason = breakpoint',
-                          'scalars.rs:54',
-                          '-> 54'])
-        #54    set_i(i, 0);
-        # For the line number anything between #37 and #38 is fine
-        self.try_command('thread step-in',
-                         [],
-                         [r'-> 3[678]'])
-        #38    int tmp = b;
-        self.try_command('thread step-out',
-                         ['-> 54'])
-
-    @cpp_only_test()
-    @ordered_test('last')
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('breakpoint delete 2', ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
-
diff --git a/tests/lldb/tests/testcases/test_write_global.py b/tests/lldb/tests/testcases/test_write_global.py
deleted file mode 100644
index 1d8d301..0000000
--- a/tests/lldb/tests/testcases/test_write_global.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestWriteGlobal.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test,
-    cpp_only_test,
-)
-
-
-class TestWriteGlobal(TestBaseRemote):
-    '''Tests modifying global variables of all types.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def _try_modifying_global(self, global_name, new_value, data_type_in,
-                             expected_output, expected_output_regex=None):
-        '''Modify and then inspect a global and check for the output.
-
-        Run the "expr" command to set a given global to a new value and
-        check that it is set afterwards by running the "target variable"
-        command.
-
-        Args:
-            global_name: String which is the name of the global to modify.
-            new_value: A string that is the new value of the global.
-            data_type_in: A string containing a c-style parenthesised data type
-                          representing the type of the global.
-            expected_output: List of strings that should be found in the output
-                             of both commands.
-            expected_output_regex: List of regular expressions that should be
-                                   found in the output of the target variable
-                                   command.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        self.try_command('expr %s = %s%s' %
-                         (global_name, data_type_in, new_value),
-                         expected_output,
-                         expected_output_regex)
-        self.try_command('target variable ' + global_name,
-                         expected_output,
-                         expected_output_regex)
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f simple.rs -l 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_char_global(self):
-        self._try_modifying_global('char_global', '-2',
-                                  '(signed char)', ['\'\\xfe\''],
-                                  [r'\((signed )?char\)'])
-
-    def test_write_primitive_types(self):
-        self._try_modifying_global('uchar_global', '22',
-                                  '(uchar)', ['(uchar)', '\'\\x16\''])
-
-        self._try_modifying_global('short_global', '-33',
-                                  '(short)', ['(short)', '-33'])
-
-        self._try_modifying_global('ushort_global', '44',
-                                  '(ushort)', ['(ushort)', '44'])
-
-        self._try_modifying_global('int_global', '-55',
-                                  '(int)', ['(int)', '-55'])
-
-        self._try_modifying_global('uint_global', '66',
-                                  '(uint)', ['(uint)', '66'])
-
-        self._try_modifying_global('float_global', '-7.5',
-                                  '(float)', ['(float)', '-7.5'])
-
-        self._try_modifying_global('long_global', '-888888',
-                                  '(long long)', ['-888888'],
-                                  [r'\((long )?long\)'])
-
-        self._try_modifying_global('ulong_global', '99999999',
-                                  '(ulong)', ['(ulong)', '99999999'])
-
-        self._try_modifying_global('double_global', '-10101.5',
-                                  '(double)', ['(double)', '-10101.5'])
-
-        self._try_modifying_global('char2_global', '{22, 4}',
-                                  '(char2)', ['(char2)', '(22, 4)'])
-
-    @wimpy
-    def test_write_uchar2(self):
-        self._try_modifying_global('uchar2_global', '{44, 55}',
-                                  '(uchar2)', ['(uchar2)', '(0x2c, 0x37)'])
-
-    def test_write_vec2(self):
-        self._try_modifying_global('short2_global', '{-66, 77}',
-                                  '(short2)', ['(short2)', '(-66, 77)'])
-
-        self._try_modifying_global('ushort2_global', '{88, 99}',
-                                  '(ushort2)', ['(ushort2)', '(88, 99)'])
-
-        self._try_modifying_global('int2_global', '{111, -222}',
-                                  '(int2)', ['(int2)', '(111, -222)'])
-
-        self._try_modifying_global('uint2_global', '{333, 444}',
-                                  '(uint2)', ['(uint2)', '(333, 444)'])
-
-        self._try_modifying_global('float2_global', '{-55.5f, 6.0}',
-                                  '(float2)', ['(float2)', '(-55.5, 6)'])
-
-        self._try_modifying_global('long2_global', '{666666, -777777}',
-                                  '(long2)', ['(long2)', '(666666, -777777)'])
-
-        self._try_modifying_global('ulong2_global', '{888888, 999999}',
-                                  '(ulong2)', ['(ulong2)', '(888888, 999999)'])
-
-        self._try_modifying_global('double2_global', '{11.0000000, -0.0l}',
-                                  '(double2)', ['(double2)', '(11, -0)'])
-
-        self._try_modifying_global('char3_global', '{2, -3, 4}',
-                                  '(char3)', ['(char3)', '(2, -3, 4,'])
-
-        self._try_modifying_global('uchar3_global', '{\'a\', \'b\', \'c\'}',
-                                  '(uchar3)', ['(uchar3)', '(0x61, 0x62, 0x63,'])
-
-    @wimpy
-    def test_write_short3(self):
-        self._try_modifying_global('short3_global', '{44, -55, 66}',
-                                  '(short3)', ['(short3)', '(44, -55, 66,'])
-
-    def test_write_vec3(self):
-        self._try_modifying_global('ushort3_global', '{88, 99, 111}',
-                                  '(ushort3)', ['(ushort3)', '(88, 99, 111,'])
-
-        self._try_modifying_global('int3_global', '{-111, 222, -333}',
-                                  '(int3)', ['(int3)', '(-111, 222, -333,'])
-
-        self._try_modifying_global('uint3_global', '{444, 555, 666}',
-                                  '(uint3)', ['(uint3)', '(444, 555, 666,'])
-
-        self._try_modifying_global('float3_global', '{7.5F, 0008.000, 9}',
-                                  '(float3)', ['(float3)', '(7.5, 8, 9,'])
-
-        self._try_modifying_global('long3_global', '{111111, -22222222, 3333333}',
-                                  '(long3)', ['(long3)', '(111111, -22222222, 3333333,'])
-
-        self._try_modifying_global('ulong3_global', '{4444444, 5555555, 66666666}',
-                                  '(ulong3)', ['(ulong3)', '(4444444, 5555555, 66666666,'])
-
-        self._try_modifying_global('double3_global', '{7.5L, -0, 8.9e1}',
-                                  '(double3)', ['(double3)', '(7.5, 0, 89,'])
-
-        self._try_modifying_global('char4_global', '{0x1, 0x2, 0x3, 0x4}',
-                                  '(char4)',
-                                  ['(char4)', '(1, 2, 3, 4)'])
-
-        self._try_modifying_global('uchar4_global', '{0x5, 0x6, 0x7, 0x8}',
-                                  '(uchar4)',
-                                  ['(uchar4)', '(0x05, 0x06, 0x07, 0x08)'])
-
-        self._try_modifying_global('short4_global', '{0x9, 0xa, 0xb, 0xc}',
-                                  '(short4)',
-                                  ['(short4)', '(9, 10, 11, 12)'])
-
-    @wimpy
-    def test_write_ushort4(self):
-        self._try_modifying_global('ushort4_global', '{0xd, 0xe, 0xf, 0x10}',
-                                  '(ushort4)',
-                                  ['(ushort4)', '(13, 14, 15, 16)'])
-
-    def test_write_vec4_global(self):
-        self._try_modifying_global('int4_global', '{0x11, 0x12, 0x13, 0x14}',
-                                  '(int4)',
-                                  ['(int4)', '(17, 18, 19, 20)'])
-
-        self._try_modifying_global('uint4_global', '{0x15, 0x16, 0x17, 0x18}',
-                                  '(uint4)',
-                                  ['(uint4)', '(21, 22, 23, 24)'])
-
-        self._try_modifying_global('float4_global', '{19.0, 20.5, -21, -22.5}',
-                                  '(float4)',
-                                  ['(float4)', '(19, 20.5, -21, -22.5)'])
-
-        self._try_modifying_global('long4_global', '{0x1d, 0x1e, 0x1f, 0x20}',
-                                  '(long4)',
-                                  ['(long4)', '(29, 30, 31, 32)'])
-
-        self._try_modifying_global('ulong4_global', '{0x21, 0x22, 0x23, 0x24}',
-                                  '(ulong4)',
-                                  ['(ulong4)', '(33, 34, 35, 36)'])
-
-        self._try_modifying_global('double4_global', '{25.000, -26, -27.5, 28.0}',
-                                  '(double4)',
-                                  ['(double4)', '(25, -26, -27.5, 28)'])
-
-    @ordered_test('last')
-    @cpp_only_test()
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue', ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_write_global_element.py b/tests/lldb/tests/testcases/test_write_global_element.py
deleted file mode 100644
index 28882bd..0000000
--- a/tests/lldb/tests/testcases/test_write_global_element.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestWriteGlobalElement.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    wimpy,
-    cpp_only_test
-)
-
-
-class TestWriteGlobalElement(TestBaseRemote):
-    '''Tests modifying elements of global variables of all types.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def _try_inspecting_global(self, global_name, expected_output):
-        '''Run the "expr" command on a given global and with a given output.
-
-        Args:
-            global_name: String which is the name of the global to inspect.
-            expected_output: List of strings that should be found in the output.
-
-        Raises:
-            TestFail: The lldb command did not provide the expected output.
-        '''
-        self.try_command('expr ' + global_name, expected_output)
-
-    def _try_modifying_global(self, global_name, new_value, expected_output,
-                             expected_output_regex=None):
-        '''Modify and then inspect a global and check for the output.
-
-        Run the "expr" command to set a given global to a new value and
-        check that it is set afterwards by running the "target variable"
-        command.
-
-        Args:
-            global_name: String which is the name of the global to modify.
-            new_value: A string that is the new value of the global.
-            expected_output: List of strings that should be found in the output
-                             of both commands.
-            expected_output_regex: List of regular expressions that should be
-                                   found in the output of the target variable
-                                   command.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        self.try_command('expr %s = %s' % (global_name, new_value),
-                         expected_output,
-                         expected_output_regex)
-        self.try_command('target variable ' + global_name,
-                         expected_output,
-                         expected_output_regex)
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f simple.rs -l 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_modify_global(self):
-        self._try_modifying_global('char2_global[0]', '2',
-                                  ['\'\\x02\''],
-                                  [r'\((signed )?char\)'])
-        self._try_inspecting_global('char2_global', ['(char2)', '(2, -22)'])
-
-    def test_vec2(self):
-        self._try_modifying_global('uchar2_global[1]', '3',
-                                  ['\'\\x03\''],
-                                  [r'\(u(nsigned )?char\)'])
-        self._try_inspecting_global('uchar2_global',
-                                   ['(uchar2)', '(0x21, 0x03)'])
-
-        self._try_modifying_global('short2_global[0]', '-44',
-                                  ['(short)', '-44'])
-        self._try_inspecting_global('short2_global',
-                                   ['(short2)', '(-44, 666)'])
-
-        self._try_modifying_global('ushort2_global[1]', '55',
-                                  ['55'],
-                                  [r'\(u(nsigned )?short\)'])
-        self._try_inspecting_global('ushort2_global',
-                                   ['(ushort2)', '(777, 55)'])
-
-        self._try_modifying_global('int2_global[0]', '666',
-                                  ['(int)', '666'])
-        self._try_inspecting_global('int2_global',
-                                   ['(int2)', '(666, -1111)'])
-
-        self._try_modifying_global('uint2_global[1]', '777',
-                                  ['777'],
-                                  [r'\(u(nsigned )?int\)'])
-        self._try_inspecting_global('uint2_global',
-                                   ['(uint2)', '(2222, 777)'])
-
-        self._try_modifying_global('float2_global[0]', '-8.5',
-                                  ['(float)', '-8.5'])
-        self._try_inspecting_global('float2_global',
-                                   ['(float2)', '(-8.5, -5)'])
-
-        self._try_modifying_global('long2_global[1]', '999999',
-                                  ['999999'],
-                                  [r'\((long )?long\)'])
-        self._try_inspecting_global('long2_global',
-                                   ['(long2)', '(-4444, 999999)'])
-
-        self._try_modifying_global('ulong2_global[0]', '10101010101',
-                                  ['10101010101'],
-                                  [r'\(u(nsigned )?(long )?long\)'])
-        self._try_inspecting_global('ulong2_global',
-                                   ['(ulong2)', '(10101010101, 7777)'])
-
-        self._try_modifying_global('double2_global[1]', '-11.000',
-                                  ['(double)', '-11'])
-        self._try_inspecting_global('double2_global',
-                                   ['(double2)', '(88.5, -11)'])
-
-        self._try_modifying_global('char3_global[0]', '12',
-                                  ['\'\\f\''],
-                                  [r'\((signed )?char\)'])
-        self._try_inspecting_global('char3_global',
-                                   ['(char3)',
-                                    '(12, -22, -33,'])
-
-    @wimpy
-    def test_uchar3(self):
-        self._try_modifying_global('uchar3_global[1]', '\'d\'',
-                                  ['\'d\''],
-                                  [r'\(u(nsigned )?char\)'])
-        self._try_inspecting_global('uchar3_global',
-                                   ['(uchar3)',
-                                    '(0x21, 0x64, 0x37,'])
-
-    def test_vec3(self):
-        self._try_modifying_global('short3_global[2]', '-131',
-                                  ['(short)', '-131'])
-        self._try_inspecting_global('short3_global',
-                                   ['(short3)',
-                                    '(-555, 666, -131,'])
-
-        self._try_modifying_global('ushort3_global[0]', '1414',
-                                  ['1414'],
-                                  [r'\(u(nsigned )?short\)'])
-        self._try_inspecting_global('ushort3_global',
-                                   ['(ushort3)',
-                                    '(1414, 888, 999,'])
-
-        self._try_modifying_global('int3_global[0]', '151515',
-                                  ['(int)', '151515'])
-        self._try_inspecting_global('int3_global',
-                                   ['(int3)',
-                                    '(151515, -1111, 2222,'])
-
-        self._try_modifying_global('uint3_global[1]', '161616',
-                                  ['161616'],
-                                  [r'\(u(nsigned )?int\)'])
-        self._try_inspecting_global('uint3_global',
-                                   ['(uint3)',
-                                    '(2222, 161616, 4444,'])
-
-        self._try_modifying_global('float3_global[2]', '17.5',
-                                  ['(float)', '17.5'])
-        self._try_inspecting_global('float3_global',
-                                   ['(float3)',
-                                    '(4.5, -5, 17.5,'])
-
-        self._try_modifying_global('long3_global[0]', '-181818181818',
-                                  ['-181818181818'],
-                                  [r'\((long )?long\)'])
-        self._try_inspecting_global('long3_global',
-                                   ['(long3)',
-                                    '(-181818181818, 5555, 6666,'])
-
-        self._try_modifying_global('ulong3_global[1]', '191919191919',
-                                  ['191919191919'],
-                                  [r'\(u(nsigned )?(long )?long\)'])
-        self._try_inspecting_global('ulong3_global',
-                                   ['(ulong3)',
-                                    '(6666, 191919191919, 8888,'])
-
-        self._try_modifying_global('double3_global[2]', '20.5',
-                                  ['(double)', '20.5'])
-        self._try_inspecting_global('double3_global',
-                                   ['(double3)',
-                                    '(88.5, -99, 20.5,'])
-
-        self._try_modifying_global('char4_global[0]', '-21',
-                                  ['\'\\xeb\''],
-                                  [r'\((signed )?char\)'])
-        self._try_inspecting_global('char4_global',
-                                   ['(char4)',
-                                    '(-21, 11, -22, -33)'])
-
-        self._try_modifying_global('uchar4_global[1]', '22',
-                                  ['\'\\x16\''],
-                                  [r'\(u(nsigned )?char\)'])
-        self._try_inspecting_global('uchar4_global',
-                                   ['(uchar4)',
-                                    '(0xde, 0x16, 0x2c, 0x37)'])
-
-    @wimpy
-    def test_short4(self):
-        self._try_modifying_global('short4_global[2]', '23',
-                                  ['(short)', '23'])
-        self._try_inspecting_global('short4_global',
-                                   ['(short4)',
-                                    '(-444, -555, 23, 777)'])
-
-    def test_vec4(self):
-        self._try_modifying_global('ushort4_global[3]', '24',
-                                  ['24'],
-                                  [r'\(u(nsigned )?short\)'])
-        self._try_inspecting_global('ushort4_global',
-                                   ['(ushort4)',
-                                    '(666, 777, 888, 24)'])
-
-        self._try_modifying_global('int4_global[0]', '-2525',
-                                  ['(int)', '-2525'])
-        self._try_inspecting_global('int4_global',
-                                   ['(int4)',
-                                    '(-2525, 999, -1111, 2222)'])
-
-        self._try_modifying_global('uint4_global[1]', '26262',
-                                  ['26262'],
-                                  [r'\(u(nsigned )?int\)'])
-        self._try_inspecting_global('uint4_global',
-                                   ['(uint4)',
-                                    '(1111, 26262, 3333, 4444)'])
-
-        self._try_modifying_global('float4_global[2]', '27.0f',
-                                  ['(float)', '27'])
-        self._try_inspecting_global('float4_global',
-                                   ['(float4)',
-                                    '(3, 4.5, 27, -6.5)'])
-
-        self._try_modifying_global('long4_global[3]', '-28282828282',
-                                  ['-28282828282'],
-                                  [r'\((long )?long\)'])
-        self._try_inspecting_global('long4_global',
-                                   ['(long4)',
-                                    '(-3333, -4444, 5555, -28282828282)'])
-
-        self._try_modifying_global('ulong4_global[0]', '2929292929',
-                                  ['2929292929'],
-                                  [r'\(u(nsigned )?(long )?long\)'])
-        self._try_inspecting_global('ulong4_global',
-                                   ['(ulong4)',
-                                    '(2929292929, 6666, 7777, 8888)'])
-
-        self._try_modifying_global('double4_global[1]', '30.5',
-                                  ['(double)', '30.5'])
-        self._try_inspecting_global('double4_global',
-                                   ['(double4)',
-                                    '(-77, 30.5, -99, 111.5)'])
-
-    @cpp_only_test()
-    @ordered_test('last')
-    def test_cpp_cleanup(self):
-        self.try_command('breakpoint delete 1', ['1 breakpoints deleted'])
-
-        self.try_command('process continue',
-                         ['exited with status = 0'])
diff --git a/tests/lldb/tests/testcases/test_write_local.py b/tests/lldb/tests/testcases/test_write_local.py
deleted file mode 100644
index cd32ecc..0000000
--- a/tests/lldb/tests/testcases/test_write_local.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestWriteLocal.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    ordered_test,
-    wimpy
-)
-
-
-class TestWriteLocal(TestBaseRemote):
-    '''Tests modifying local variables of all types.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def _try_modifying_local(self, local_name, new_value, data_type_in,
-                             expected_output, expected_output_regex=None):
-        '''Try getting lldb to modify a local and check the output.
-
-        Run the "expr" command to set a given local to a new value and
-        check that it is set afterwards by running the "target variable"
-        command.
-
-        Args:
-            local_name: String which is the name of the local to modify.
-            new_value: A string that is the new value of the local.
-            data_type_in: A string containing a c-style parenthesised data type
-                          representing the type of the local.
-            expected_output: List of strings that should be found in the output
-                             of both commands.
-            expected_output_regex: List of regular expressions that should be
-                                   found in the output of the target variable
-                                   command.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        # pylint: disable=too-many-arguments
-        self.try_command('expr %s = %s%s'
-                         % (local_name, data_type_in, new_value),
-                         expected_output,
-                         expected_output_regex)
-        self.try_command('frame variable ' + local_name,
-                         expected_output,
-                         expected_output_regex)
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f simple.rs -l 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_modify_char(self):
-        self._try_modifying_local('char_local', '-2',
-                                 '(signed char)', ['\'\\xfe\''],
-                                 [r'\((signed )?char\)'])
-
-    def test_modify_primitive(self):
-        self._try_modifying_local('uchar_local', '22',
-                                 '(uchar)', ['(uchar)', '\'\\x16\''])
-
-        self._try_modifying_local('short_local', '-33',
-                                 '(short)', ['(short)', '-33'])
-
-        self._try_modifying_local('ushort_local', '44',
-                                 '(ushort)', ['(ushort)', '44'])
-
-        self._try_modifying_local('int_local', '-55',
-                                 '(int)', ['(int)', '-55'])
-
-        self._try_modifying_local('uint_local', '66',
-                                 '(uint)', ['(uint)', '66'])
-
-        self._try_modifying_local('float_local', '-7.5',
-                                 '(float)', ['(float)', '-7.5'])
-
-        self._try_modifying_local('long_local', '-888888',
-                                 '(long long)', ['-888888'],
-                                 [r'\((long )?long\)'])
-
-        self._try_modifying_local('ulong_local', '99999999',
-                                 '(ulong)', ['(ulong)', '99999999'])
-
-        self._try_modifying_local('double_local', '-10101.5',
-                                 '(double)', ['(double)', '-10101.5'])
-
-        self._try_modifying_local('char2_local', '{22, 4}',
-                                 '(char2)', ['(char2)', '(22, 4)'])
-
-    @wimpy
-    def test_modify_uchar2(self):
-        self._try_modifying_local('uchar2_local', '{44, 55}',
-                             '(uchar2)', ['(uchar2)', '(0x2c, 0x37)'])
-
-    def test_modify_vec2(self):
-        self._try_modifying_local('short2_local', '{-66, 77}',
-                                 '(short2)', ['(short2)', '(-66, 77)'])
-
-        self._try_modifying_local('ushort2_local', '{88, 99}',
-                                 '(ushort2)', ['(ushort2)', '(88, 99)'])
-
-        self._try_modifying_local('int2_local', '{111, -222}',
-                                 '(int2)', ['(int2)', '(111, -222)'])
-
-        self._try_modifying_local('uint2_local', '{333, 444}',
-                                 '(uint2)', ['(uint2)', '(333, 444)'])
-
-        self._try_modifying_local('float2_local', '{-55.5f, 6.0}',
-                                 '(float2)', ['(float2)', '(-55.5, 6)'])
-
-        self._try_modifying_local('long2_local', '{666666, -777777}',
-                                 '(long2)', ['(long2)', '(666666, -777777)'])
-
-        self._try_modifying_local('ulong2_local', '{888888, 999999}',
-                                 '(ulong2)', ['(ulong2)', '(888888, 999999)'])
-
-        self._try_modifying_local('double2_local', '{11.0000000, -0.0l}',
-                                 '(double2)', ['(double2)', '(11, -0)'])
-
-        self._try_modifying_local('char3_local', '{2, -3, 4}',
-                                 '(char3)', ['(char3)', '(2, -3, 4,'])
-
-        self._try_modifying_local('uchar3_local', '{\'a\', \'b\', \'c\'}',
-                                 '(uchar3)', ['(uchar3)', '(0x61, 0x62, 0x63,'])
-
-    @wimpy
-    def test_modify_short3(self):
-        self._try_modifying_local('short3_local', '{44, -55, 66}',
-                             '(short3)', ['(short3)', '(44, -55, 66,'])
-
-    def test_modify_vec3(self):
-        self._try_modifying_local('ushort3_local', '{88, 99, 111}',
-                                 '(ushort3)', ['(ushort3)', '(88, 99, 111,'])
-
-        self._try_modifying_local('int3_local', '{-111, 222, -333}',
-                                 '(int3)', ['(int3)', '(-111, 222, -333,'])
-
-        self._try_modifying_local('uint3_local', '{444, 555, 666}',
-                                 '(uint3)', ['(uint3)', '(444, 555, 666,'])
-
-        self._try_modifying_local('float3_local', '{7.5F, 0008.000, 9}',
-                                 '(float3)', ['(float3)', '(7.5, 8, 9,'])
-
-        self._try_modifying_local('long3_local', '{111111, -22222222, 3333333}',
-                                 '(long3)', ['(long3)', '(111111, -22222222, 3333333,'])
-
-        self._try_modifying_local('ulong3_local', '{4444444, 5555555, 66666666}',
-                                 '(ulong3)', ['(ulong3)', '(4444444, 5555555, 66666666,'])
-
-        self._try_modifying_local('double3_local', '{7.5L, -0, 8.9e1}',
-                                 '(double3)', ['(double3)', '(7.5, 0, 89,'])
-
-        self._try_modifying_local('char4_local', '{0x1, 0x2, 0x3, 0x4}',
-                                 '(char4)',
-                                 ['(char4)', '(1, 2, 3, 4)'])
-
-        self._try_modifying_local('uchar4_local', '{0x5, 0x6, 0x7, 0x8}',
-                                 '(uchar4)',
-                                 ['(uchar4)', '(0x05, 0x06, 0x07, 0x08)'])
-
-        self._try_modifying_local('short4_local', '{0x9, 0xa, 0xb, 0xc}',
-                                 '(short4)',
-                                 ['(short4)', '(9, 10, 11, 12)'])
-
-    @wimpy
-    def test_modify_ushort4(self):
-        self._try_modifying_local('ushort4_local', '{0xd, 0xe, 0xf, 0x10}',
-                             '(ushort4)',
-                             ['(ushort4)', '(13, 14, 15, 16)'])
-
-    def test_modify_vec4(self):
-        self._try_modifying_local('int4_local', '{0x11, 0x12, 0x13, 0x14}',
-                                 '(int4)',
-                                 ['(int4)', '(17, 18, 19, 20)'])
-
-        self._try_modifying_local('uint4_local', '{0x15, 0x16, 0x17, 0x18}',
-                                 '(uint4)',
-                                 ['(uint4)', '(21, 22, 23, 24)'])
-
-        self._try_modifying_local('float4_local', '{19.0, 20.5, -21, -22.5}',
-                                 '(float4)',
-                                 ['(float4)', '(19, 20.5, -21, -22.5)'])
-
-        self._try_modifying_local('long4_local', '{0x1d, 0x1e, 0x1f, 0x20}',
-                                 '(long4)',
-                                 ['(long4)', '(29, 30, 31, 32)'])
-
-        self._try_modifying_local('ulong4_local', '{0x21, 0x22, 0x23, 0x24}',
-                                 '(ulong4)',
-                                 ['(ulong4)', '(33, 34, 35, 36)'])
-
-        self._try_modifying_local('double4_local', '{25.000, -26, -27.5, 28.0}',
-                                 '(double4)',
-                                 ['(double4)', '(25, -26, -27.5, 28)'])
diff --git a/tests/lldb/tests/testcases/test_write_local_element.py b/tests/lldb/tests/testcases/test_write_local_element.py
deleted file mode 100644
index 17e6121..0000000
--- a/tests/lldb/tests/testcases/test_write_local_element.py
+++ /dev/null
@@ -1,295 +0,0 @@
-# Copyright (C) 2016 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''Module that contains the test TestWriteLocalElement.'''
-
-from __future__ import absolute_import
-
-from harness.test_base_remote import TestBaseRemote
-from harness.decorators import (
-    wimpy,
-    ordered_test
-)
-
-
-class TestWriteLocalElement(TestBaseRemote):
-    '''Tests modifying elements of local variables of all types.'''
-
-    bundle_target = {
-        'java': 'KernelVariables',
-        'jni': 'JNIKernelVariables',
-        'cpp': 'CppKernelVariables'
-    }
-
-    def _try_inspecting_local(self, local_name, expected_output):
-        '''Run the "expr" command on a given local and with a given output.
-
-        Args:
-            local_name: String which is the name of the local to inspect.
-            expected_output: List of strings that should be found in the output.
-
-        Raises:
-            TestFail: The lldb command did not provide the expected output.
-        '''
-        self.try_command('expr ' + local_name, expected_output)
-
-    def _try_modifying_local(self, local_name, new_value, expected_output,
-                             expected_output_regex=None):
-        '''Modify and then inspect a local and check for the output.
-
-        Run the "expr" command to set a given local to a new value and
-        check that it is set afterwards by running the "target variable"
-        command.
-
-        Args:
-            local_name: String which is the name of the local to modify.
-            new_value: A string that is the new value of the local.
-            expected_output: List of strings that should be found in the output
-                             of both commands.
-            expected_output_regex: List of regular expressions that should be
-                                   found in the output of the target variable
-                                   command.
-
-        Raises:
-            TestFail: One of the lldb commands did not provide the expected
-                      output.
-        '''
-        self.try_command('expr %s = %s' % (local_name, new_value),
-                         expected_output,
-                         expected_output_regex)
-        self.try_command('frame variable ' + local_name,
-                         expected_output,
-                         expected_output_regex)
-
-    @wimpy
-    @ordered_test(0)
-    def test_setup(self):
-        self.try_command('language renderscript status',
-                         ['Runtime Library discovered',
-                          'Runtime Driver discovered'])
-
-        self.try_command('b -f simple.rs -l 145', [])
-
-        self.try_command('process continue',
-                         ['resuming',
-                          'stopped',
-                          'stop reason = breakpoint'])
-
-    @wimpy
-    def test_modify_char2(self):
-        self._try_modifying_local('char2_local[0]', '2',
-                                 ['2'], [r'\((signed )?char\)'])
-        self._try_inspecting_local('char2_local',
-                                 ['(char2)', '(2, -22)'])
-
-    def test_modify_vec2(self):
-        self._try_modifying_local('uchar2_local[1]', '3',
-                                 ['3'], [r'\(u(nsigned )?char\)'])
-        self._try_inspecting_local('uchar2_local',
-                                  ['(uchar2)', '(0x21, 0x03)'])
-
-        self._try_modifying_local('short2_local[0]', '-44',
-                                 ['(short)', '-44'])
-        self._try_inspecting_local('short2_local',
-                                  ['(short2)', '(-44, 666)'])
-
-        self._try_modifying_local('ushort2_local[1]', '55',
-                                 ['55'], [r'\(u(nsigned )?short\)'])
-        self._try_inspecting_local('ushort2_local',
-                                  ['(ushort2)', '(777, 55)'])
-
-        self._try_modifying_local('int2_local[0]', '666',
-                                 ['(int)', '666'])
-        self._try_inspecting_local('int2_local',
-                                  ['(int2)', '(666, -1111)'])
-
-        self._try_modifying_local('uint2_local[1]', '777',
-                                 ['777'], [r'\(u(nsigned )?int\)'])
-        self._try_inspecting_local('uint2_local',
-                                  ['(uint2)', '(2222, 777)'])
-
-        self._try_modifying_local('float2_local[0]', '-8.5',
-                                 ['(float)', '-8.5'])
-        self._try_inspecting_local('float2_local',
-                                  ['(float2)', '(-8.5, -5)'])
-
-        self._try_modifying_local('long2_local[1]', '999999',
-                                 ['999999'],
-                                 [r'\((long )?long\)'])
-        self._try_inspecting_local('long2_local',
-                                  ['(long2)', '(-4444, 999999)'])
-
-        self._try_modifying_local('ulong2_local[0]', '10101010101',
-                                 ['10101010101'],
-                                 [r'\(u(nsigned )?(long )?long\)'])
-        self._try_inspecting_local('ulong2_local',
-                                  ['(ulong2)', '(10101010101, 7777)'])
-
-        self._try_modifying_local('double2_local[1]', '-11.000',
-                                 ['(double)', '-11'])
-        self._try_inspecting_local('double2_local',
-                                  ['(double2)', '(88.5, -11)'])
-
-        # For some reason the result of some char and uchar expr is in hex
-        # and that of frame variable in decimal, so calling
-        # try_modifying_local doesn't work, because it reuses the expected
-        # output for both commands.
-        self.try_command('expr char3_local[0] = 12',
-                         ['\'\\f\''],
-                         [r'\((signed )?char\)'])
-        self.try_command('frame variable char3_local[0]',
-                         ['12'],
-                         [r'\((signed )?char\)'])
-
-        self._try_inspecting_local('char3_local',
-                                  ['(char3)',
-                                   '(12, -22, -33,'])
-
-    @wimpy
-    def test_modify_uchar3(self):
-        self.try_command('expr uchar3_local[1] = \'d\'',
-                         ['\'d\''],
-                         [r'\(u(nsigned )?char\)'])
-        self.try_command('frame variable uchar3_local[1]',
-                         ['0x64'],
-                         [r'\(u(nsigned )?char\)'])
-
-
-    def test_modify_vec3(self):
-        self._try_inspecting_local('uchar3_local',
-                                  ['(uchar3)',
-                                   '(0x21, 0x64, 0x37,'])
-
-        self._try_modifying_local('short3_local[2]', '-131',
-                                 ['(short)', '-131'])
-        self._try_inspecting_local('short3_local',
-                                  ['(short3)',
-                                   '(-555, 666, -131,'])
-
-        self._try_modifying_local('ushort3_local[0]', '1414',
-                                 ['1414'], [r'\(u(nsigned )?short\)'])
-        self._try_inspecting_local('ushort3_local',
-                                  ['(ushort3)',
-                                   '(1414, 888, 999,'])
-
-        self._try_modifying_local('int3_local[0]', '151515',
-                                 ['(int)', '151515'])
-        self._try_inspecting_local('int3_local',
-                                  ['(int3)',
-                                   '(151515, -1111, 2222,'])
-
-        self._try_modifying_local('uint3_local[1]', '161616',
-                                 ['161616'], [r'\(u(nsigned )?int\)'])
-        self._try_inspecting_local('uint3_local',
-                                  ['(uint3)',
-                                   '(2222, 161616, 4444,'])
-
-        self._try_modifying_local('float3_local[2]', '17.5',
-                                 ['(float)', '17.5'])
-        self._try_inspecting_local('float3_local',
-                                  ['(float3)',
-                                   '(4.5, -5, 17.5,'])
-
-        self._try_modifying_local('long3_local[0]', '-181818181818',
-                                 ['-181818181818'], [r'\((long )?long\)'])
-        self._try_inspecting_local('long3_local',
-                                  ['(long3)',
-                                   '(-181818181818, 5555, 6666,'])
-
-        self._try_modifying_local('ulong3_local[1]', '191919191919',
-                                 ['191919191919'],
-                                 [r'\(u(nsigned )?(long )?long\)'])
-        self._try_inspecting_local('ulong3_local',
-                                  ['(ulong3)',
-                                   '(6666, 191919191919, 8888,'])
-
-        self._try_modifying_local('double3_local[2]', '20.5',
-                                 ['(double)', '20.5'])
-        self._try_inspecting_local('double3_local',
-                                  ['(double3)',
-                                   '(88.5, -99, 20.5,'])
-
-        self.try_command('expr char4_local[0] = -21',
-                         ['\'\\xeb\''],
-                         [r'\((signed )?char\)'])
-        self.try_command('frame variable char4_local[0]',
-                         ['-21'],
-                         [r'\((signed )?char\)'])
-
-        self._try_inspecting_local('char4_local',
-                                  ['(char4)',
-                                   '(-21, 11, -22, -33)'])
-
-        self.try_command('expr uchar4_local[1] = 22',
-                         ['\'\\x16\''],
-                         [r'\(u(nsigned )?char\)'])
-        self.try_command('frame variable uchar4_local[1]',
-                         ['0x16'],
-                         [r'\(u(nsigned )?char\)'])
-
-        self._try_inspecting_local('uchar4_local',
-                                  ['(uchar4)',
-                                   '(0x16, 0x16, 0x2c, 0x37)'])
-
-    @wimpy
-    def test_modify_short4(self):
-        self._try_modifying_local('short4_local[2]', '23',
-                                 ['(short)', '23'])
-        self._try_inspecting_local('short4_local',
-                                  ['(short4)',
-                                   '(-444, -555, 23, 777)'])
-
-    def test_modify_vec4(self):
-        self._try_modifying_local('ushort4_local[3]', '24',
-                                 ['24'], [r'\(u(nsigned )?short\)'])
-        self._try_inspecting_local('ushort4_local',
-                                  ['(ushort4)',
-                                   '(666, 777, 888, 24)'])
-
-        self._try_modifying_local('int4_local[0]', '-2525',
-                                 ['(int)', '-2525'])
-        self._try_inspecting_local('int4_local',
-                                  ['(int4)',
-                                   '(-2525, 999, -1111, 2222)'])
-
-        self._try_modifying_local('uint4_local[1]', '26262',
-                                 ['26262'], [r'\(u(nsigned )?int\)'])
-        self._try_inspecting_local('uint4_local',
-                                  ['(uint4)',
-                                   '(1111, 26262, 3333, 4444)'])
-
-        self._try_modifying_local('float4_local[2]', '27.0f',
-                                 ['(float)', '27'])
-        self._try_inspecting_local('float4_local',
-                                  ['(float4)',
-                                   '(3, 4.5, 27, -6.5)'])
-
-        self._try_modifying_local('long4_local[3]', '-28282828282',
-                                 ['-28282828282'], [r'\((long )?long\)'])
-        self._try_inspecting_local('long4_local',
-                                  ['(long4)',
-                                   '(-3333, -4444, 5555, -28282828282)'])
-
-        self._try_modifying_local('ulong4_local[0]', '2929292929',
-                                 ['2929292929'],
-                                 [r'\(u(nsigned )?(long )?long\)'])
-        self._try_inspecting_local('ulong4_local',
-                                  ['(ulong4)',
-                                   '(2929292929, 6666, 7777, 8888)'])
-
-        self._try_modifying_local('double4_local[1]', '30.5',
-                                 ['(double)', '30.5'])
-        self._try_inspecting_local('double4_local',
-                                  ['(double4)',
-                                   '(-77, 30.5, -99, 111.5)'])
diff --git a/toolkit/Android.bp b/toolkit/Android.bp
new file mode 100644
index 0000000..d3fa21e
--- /dev/null
+++ b/toolkit/Android.bp
@@ -0,0 +1,135 @@
+package {
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+// TODO: In later CLs, this build file will be replaced by a standalone build that's not part of Android.
+
+cc_binary {
+    name: "renderscripttoolkittest",
+    srcs: [
+        "TestTaskProcessor.cpp",
+    ],
+    shared_libs: [
+        "libbase",
+        "librenderscripttoolkit",
+    ],
+}
+
+cc_library_shared {
+    name: "librenderscripttoolkit",
+    defaults: [],
+    vendor_available: false,
+    native_bridge_supported: false,
+    vndk: {
+        enabled: false,
+        support_system_process: false,
+    },
+
+    srcs: [
+        "Blend.cpp",
+        "Blur.cpp",
+        "ColorMatrix.cpp",
+        "Convolve3x3.cpp",
+        "Convolve5x5.cpp",
+        "Histogram.cpp",
+        "Lut.cpp",
+        "Lut3d.cpp",
+        "RenderScriptToolkit.cpp",
+        "Resize.cpp",
+        "TaskProcessor.cpp",
+        "Utils.cpp",
+        "YuvToRgb.cpp",
+    ],
+
+    static_libs: ["cpufeatures"],
+
+    arch: {
+        arm64: {
+            cflags: [
+                "-DARCH_ARM_USE_INTRINSICS",
+                "-DARCH_ARM64_USE_INTRINSICS",
+                "-DARCH_ARM64_HAVE_NEON",
+            ],
+
+            srcs: [
+                "Blend_advsimd.S",
+                "Blur_advsimd.S",
+                "ColorMatrix_advsimd.S",
+                "Convolve_advsimd.S",
+                "Lut3d_advsimd.S",
+                "Resize_advsimd.S",
+                "YuvToRgb_advsimd.S",
+            ],
+        },
+
+        arm: {
+            cflags: [
+                "-DARCH_ARM_HAVE_VFP",
+                "-DARCH_ARM_USE_INTRINSICS",
+            ],
+
+            srcs: [
+                "Blend_neon.S",
+                "Blur_neon.S",
+                "ColorMatrix_neon.S",
+                "Convolve_neon.S",
+                "Lut3d_neon.S",
+                "Resize_neon.S",
+                "YuvToRgb_neon.S",
+            ],
+
+            asflags: ["-mfpu=neon"],
+
+            neon: {
+                cflags: [
+                    "-DARCH_ARM_HAVE_NEON",
+                ],
+            },
+        },
+
+        x86: {
+            cflags: ["-DARCH_X86_HAVE_SSSE3"],
+            srcs: ["x86.cpp"],
+        },
+        x86_64: {
+            cflags: ["-DARCH_X86_HAVE_SSSE3"],
+            srcs: ["x86.cpp"],
+            avx2: {
+                cflags: ["-DARCH_X86_HAVE_AVX2", "-mavx2", "-mfma"],
+            },
+        },
+    },
+
+    shared_libs: [
+        "libbase",
+        "liblog",
+        "libnativehelper",
+        "libjnigraphics",
+    ],
+    header_libs: [
+        // TODO: Once we compile in the .cpp files, check if any of these libraries are needed.
+        //"libutils_headers",
+        //"libhardware_headers",
+    ],
+
+    include_dirs: [
+    ],
+
+    cflags: [
+        "-Wthread-safety",
+        "-Werror",
+        "-Wall",
+        "-Wextra",
+        "-Wno-unused-parameter",
+        "-Wno-unused-variable",
+    ],
+
+    // TODO: Is this needed?
+    product_variables: {
+        pdk: {
+            // Not building RenderScript modules in PDK builds, as libmediandk
+            // is not available in PDK.
+            enabled: false,
+        },
+    },
+}
diff --git a/toolkit/Blend.cpp b/toolkit/Blend.cpp
new file mode 100644
index 0000000..1f6319e
--- /dev/null
+++ b/toolkit/Blend.cpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Blend"
+
+/**
+ * Task that blends a source buffer of RGBA pixels into a destination buffer using one mode.
+ */
+class BlendTask : public Task {
+    // The type of blending to do.
+    RenderScriptToolkit::BlendingMode mMode;
+    // The source pixels to blend in. Never written to.
+    const uchar4* mIn;
+    // The destination, used both for input and output.
+    uchar4* mOut;
+
+    void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
+               uint32_t length);
+    // Blend one 2D tile of the overall work, row by row. threadIndex is not used by this task.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
+              size_t sizeY, const Restriction* restriction)
+        : Task{sizeX, sizeY, 4, true, restriction},
+          mMode{mode},
+          mIn{reinterpret_cast<const uchar4*>(in)},
+          mOut{reinterpret_cast<uchar4*>(out)} {}
+};
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot,
+                    uint32_t xstart, uint32_t xend);
+#endif
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
+#endif
+
+// Convert a 4-component vector to uchar4, clamping each component to at most 255 (no lower clamp).
+template <typename TI>
+static inline uchar4 convertClipped(TI amount) {
+    return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x),
+                    static_cast<uchar>(amount.y > 255 ? 255 : amount.y),
+                    static_cast<uchar>(amount.z > 255 ? 255 : amount.z),
+                    static_cast<uchar>(amount.w > 255 ? 255 : amount.w)};
+}
+
+void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out,
+                      uint32_t length) {
+    uint32_t x1 = 0;       // Index of the next pixel to blend.
+    uint32_t x2 = length;  // One past the last pixel to blend.
+    // On ARM builds, try the SIMD kernel first; a negative result falls through to the C++ code.
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd) {
+        if (rsdIntrinsicBlend_K(out, in, static_cast<int>(mode), x1, x2) >= 0) {
+            return;
+        } else {
+            ALOGW("Intrinsic Blend failed to use SIMD for %d", static_cast<int>(mode));
+        }
+    }
+#endif
+    switch (mode) {
+    case RenderScriptToolkit::BlendingMode::CLEAR:
+        for (; x1 < x2; x1++, out++) {
+            *out = 0;
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::SRC:
+        for (; x1 < x2; x1++, out++, in++) {
+            *out = *in;
+        }
+        break;
+    // BlendingMode::DST leaves the destination untouched, so it is a no-op.
+    case RenderScriptToolkit::BlendingMode::DST:
+        break;
+    case RenderScriptToolkit::BlendingMode::SRC_OVER:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcOver_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            ushort4 in_s = convert<ushort4>(*in);
+            ushort4 out_s = convert<ushort4>(*out);
+            in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8);
+            *out = convertClipped(in_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::DST_OVER:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstOver_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            ushort4 in_s = convert<ushort4>(*in);
+            ushort4 out_s = convert<ushort4>(*out);
+            in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8);
+            *out = convertClipped(in_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::SRC_IN:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcIn_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            ushort4 in_s = convert<ushort4>(*in);
+            in_s = (in_s * out->w) >> (ushort4)8;
+            *out = convert<uchar4>(in_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::DST_IN:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstIn_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            ushort4 out_s = convert<ushort4>(*out);
+            out_s = (out_s * in->w) >> (ushort4)8;
+            *out = convert<uchar4>(out_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::SRC_OUT:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcOut_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            ushort4 in_s = convert<ushort4>(*in);
+            in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8;
+            *out = convert<uchar4>(in_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::DST_OUT:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstOut_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            ushort4 out_s = convert<ushort4>(*out);
+            out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8;
+            *out = convert<uchar4>(out_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::SRC_ATOP:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSrcAtop_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            // The max value the operation could produce before the shift
+            // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02.
+            // That value does not fit in a ushort, so we use uint.
+            uint4 in_s = convert<uint4>(*in);
+            uint4 out_s = convert<uint4>(*out);
+            out_s.xyz = ((in_s.xyz * out_s.w) +
+              (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8;
+            *out = convertClipped(out_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::DST_ATOP:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendDstAtop_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            uint4 in_s = convert<uint4>(*in);
+            uint4 out_s = convert<uint4>(*out);
+            out_s.xyz = ((out_s.xyz * in_s.w) +
+              (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8;
+            out_s.w = in_s.w;
+            *out = convertClipped(out_s);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::XOR:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendXor_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            *out = *in ^ *out;
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::MULTIPLY:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendMultiply_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out))
+                                   >> (ushort4)8);
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::ADD:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendAdd_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
+                oR = out->x, oG = out->y, oB = out->z, oA = out->w;
+            out->x = (oR + iR) > 255 ? 255 : oR + iR;
+            out->y = (oG + iG) > 255 ? 255 : oG + iG;
+            out->z = (oB + iB) > 255 ? 255 : oB + iB;
+            out->w = (oA + iA) > 255 ? 255 : oA + iA;
+        }
+        break;
+    case RenderScriptToolkit::BlendingMode::SUBTRACT:
+    #if defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            if ((x1 + 8) < x2) {
+                uint32_t len = (x2 - x1) >> 3;
+                rsdIntrinsicBlendSub_K(out, in, len);
+                x1 += len << 3;
+                out += len << 3;
+                in += len << 3;
+            }
+        }
+    #endif
+        for (; x1 < x2; x1++, out++, in++) {
+            int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w,
+                oR = out->x, oG = out->y, oB = out->z, oA = out->w;
+            out->x = (oR - iR) < 0 ? 0 : oR - iR;
+            out->y = (oG - iG) < 0 ? 0 : oG - iG;
+            out->z = (oB - iB) < 0 ? 0 : oB - iB;
+            out->w = (oA - iA) < 0 ? 0 : oA - iA;
+        }
+        break;
+
+    default:
+        ALOGE("Called unimplemented value %d", static_cast<int>(mode));
+        assert(false);
+    }
+}
+
+void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                            size_t endY) {
+    for (size_t row = startY, width = endX - startX; row < endY; ++row) {
+        const size_t rowStart = row * mSizeX + startX;  // First pixel of this tile row.
+        blend(mMode, &mIn[rowStart], &mOut[rowStart], width);
+    }
+}
+
+void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX,
+                                size_t sizeY, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    const bool restrictionOk = validRestriction(LOG_TAG, sizeX, sizeY, restriction);
+    if (!restrictionOk) return;
+#endif
+
+    // Wrap the request in a task and hand it to the toolkit's processor.
+    BlendTask blendTask(mode, in, out, sizeX, sizeY, restriction);
+    processor->doTask(&blendTask);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Blend_advsimd.S b/toolkit/Blend_advsimd.S
new file mode 100644
index 0000000..e5cb29b
--- /dev/null
+++ b/toolkit/Blend_advsimd.S
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+#define BLEND_LIST(X) \
+    X(0, CLEAR) \
+    X(1, SRC) \
+    X(2, DST) \
+    X(3, SRC_OVER) \
+    X(4, DST_OVER) \
+    X(5, SRC_IN) \
+    X(6, DST_IN) \
+    X(7, SRC_OUT) \
+    X(8, DST_OUT) \
+    X(9, SRC_ATOP) \
+    X(10, DST_ATOP) \
+    X(11, XOR) \
+    X(12, MULTIPLY) \
+    X(13, ADD) \
+    X(14, SUBTRACT)
+
+/* This operation was not enabled in the original RenderScript. We could
+ * enable it.
+ *
+ *  X(15, DIFFERENCE) \
+ */
+
+/* For every blend operation supported, define a macro with just the arithmetic
+ * component.  The rest can be handled later on.
+ *
+ * At entry v0-v3 contain the RGBA data from the destination buffer, and v8-v11
+ * contain the data from the source buffer.  Both have already been split out
+ * into one colour component per register (if necessary).  v3 and v11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ *    zipped=0 -- The macro does not require the RGBA components to be
+ *                separated.
+ *    lddst=0  -- The macro does not require data from the destination buffer.
+ *    ldsrc=0  -- The macro does not require data from the source buffer.
+ *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ *                inserted without any surrounding load/store or loop code.
+ */
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR
+        movi    v0.16b, #0      /* zero all four output registers v0-v3 */
+        movi    v1.16b, #0
+        movi    v2.16b, #0
+        movi    v3.16b, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC
+        mov     v0.16b, v8.16b      /* copy v8-v11 (source) over v0-v3 (output) */
+        mov     v1.16b, v9.16b
+        mov     v2.16b, v10.16b
+        mov     v3.16b, v11.16b
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+        /* nop: no instructions are emitted for DST */
+.endm
+
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER
+        mvn         v7.16b, v11.16b     /* v7 = ~src alpha = 255 - src alpha */
+        /* 16-bit products dst * v7: low halves in v0-v3, high halves in v12-v15 */
+        umull2      v12.8h, v7.16b, v0.16b
+        umull       v0.8h,  v7.8b,  v0.8b
+        umull2      v13.8h, v7.16b, v1.16b
+        umull       v1.8h,  v7.8b,  v1.8b
+        umull2      v14.8h, v7.16b, v2.16b
+        umull       v2.8h,  v7.8b,  v2.8b
+        umull2      v15.8h, v7.16b, v3.16b
+        umull       v3.8h,  v7.8b,  v3.8b
+        /* v4-v7 = rounded (product >> 8), narrowed to bytes */
+        rshrn       v4.8b,  v0.8h,  #8
+        rshrn2      v4.16b, v12.8h, #8
+        rshrn       v5.8b,  v1.8h,  #8
+        rshrn2      v5.16b, v13.8h, #8
+        rshrn       v6.8b,  v2.8h,  #8
+        rshrn2      v6.16b, v14.8h, #8
+        rshrn       v7.8b,  v3.8h,  #8
+        rshrn2      v7.16b, v15.8h, #8
+        /* product += rounded (product >> 8) */
+        uaddw       v0.8h,  v0.8h,  v4.8b
+        uaddw2      v12.8h, v12.8h, v4.16b
+        uaddw       v1.8h,  v1.8h,  v5.8b
+        uaddw2      v13.8h, v13.8h, v5.16b
+        uaddw       v2.8h,  v2.8h,  v6.8b
+        uaddw2      v14.8h, v14.8h, v6.16b
+        uaddw       v3.8h,  v3.8h,  v7.8b
+        uaddw2      v15.8h, v15.8h, v7.16b
+        /* rounded >> 8 again; the two steps together approximate product / 255 */
+        rshrn       v0.8b,  v0.8h,  #8
+        rshrn2      v0.16b, v12.8h, #8
+        rshrn       v1.8b,  v1.8h,  #8
+        rshrn2      v1.16b, v13.8h, #8
+        rshrn       v2.8b,  v2.8h,  #8
+        rshrn2      v2.16b, v14.8h, #8
+        rshrn       v3.8b,  v3.8h,  #8
+        rshrn2      v3.16b, v15.8h, #8
+        /* saturating add of the source channels to the scaled destination */
+        uqadd       v0.16b, v0.16b, v8.16b
+        uqadd       v1.16b, v1.16b, v9.16b
+        uqadd       v2.16b, v2.16b, v10.16b
+        uqadd       v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER
+        mvn         v7.16b, v3.16b      /* v7 = ~dst alpha = 255 - dst alpha */
+        /* 16-bit products src * v7: low halves in v8-v11, high halves in v12-v15 */
+        umull2      v12.8h, v7.16b, v8.16b
+        umull       v8.8h,  v7.8b,  v8.8b
+        umull2      v13.8h, v7.16b, v9.16b
+        umull       v9.8h,  v7.8b,  v9.8b
+        umull2      v14.8h, v7.16b, v10.16b
+        umull       v10.8h, v7.8b,  v10.8b
+        umull2      v15.8h, v7.16b, v11.16b
+        umull       v11.8h, v7.8b,  v11.8b
+        /* v4-v7 = rounded (product >> 8), narrowed to bytes */
+        rshrn       v4.8b,  v8.8h,  #8
+        rshrn2      v4.16b, v12.8h, #8
+        rshrn       v5.8b,  v9.8h,  #8
+        rshrn2      v5.16b, v13.8h, #8
+        rshrn       v6.8b,  v10.8h, #8
+        rshrn2      v6.16b, v14.8h, #8
+        rshrn       v7.8b,  v11.8h, #8
+        rshrn2      v7.16b, v15.8h, #8
+        /* product += rounded (product >> 8) */
+        uaddw       v8.8h,  v8.8h,  v4.8b
+        uaddw2      v12.8h, v12.8h, v4.16b
+        uaddw       v9.8h,  v9.8h,  v5.8b
+        uaddw2      v13.8h, v13.8h, v5.16b
+        uaddw       v10.8h, v10.8h, v6.8b
+        uaddw2      v14.8h, v14.8h, v6.16b
+        uaddw       v11.8h, v11.8h, v7.8b
+        uaddw2      v15.8h, v15.8h, v7.16b
+        /* rounded >> 8 again; the two steps together approximate product / 255 */
+        rshrn       v8.8b,  v8.8h,  #8
+        rshrn2      v8.16b, v12.8h, #8
+        rshrn       v9.8b,  v9.8h,  #8
+        rshrn2      v9.16b, v13.8h, #8
+        rshrn       v10.8b,  v10.8h, #8
+        rshrn2      v10.16b, v14.8h, #8
+        rshrn       v11.8b,  v11.8h, #8
+        rshrn2      v11.16b, v15.8h, #8
+        /* saturating add of the scaled source channels to the destination */
+        uqadd       v0.16b, v0.16b, v8.16b
+        uqadd       v1.16b, v1.16b, v9.16b
+        uqadd       v2.16b, v2.16b, v10.16b
+        uqadd       v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN
+        umull2      v12.8h, v3.16b, v8.16b      /* 16-bit products: src channel * v3 */
+        umull       v0.8h,  v3.8b,  v8.8b
+        umull2      v13.8h, v3.16b, v9.16b
+        umull       v1.8h,  v3.8b,  v9.8b
+        umull2      v14.8h, v3.16b, v10.16b
+        umull       v2.8h,  v3.8b,  v10.8b
+        umull2      v15.8h, v3.16b, v11.16b
+        umull       v3.8h,  v3.8b,  v11.8b      /* v3 is overwritten last, after its final use */
+        /* v4-v7 = rounded (product >> 8), narrowed to bytes */
+        rshrn       v4.8b,  v0.8h,  #8
+        rshrn2      v4.16b, v12.8h, #8
+        rshrn       v5.8b,  v1.8h,  #8
+        rshrn2      v5.16b, v13.8h, #8
+        rshrn       v6.8b,  v2.8h,  #8
+        rshrn2      v6.16b, v14.8h, #8
+        rshrn       v7.8b,  v3.8h,  #8
+        rshrn2      v7.16b, v15.8h, #8
+        /* product += rounded (product >> 8) */
+        uaddw       v0.8h,  v0.8h,  v4.8b
+        uaddw2      v12.8h, v12.8h, v4.16b
+        uaddw       v1.8h,  v1.8h,  v5.8b
+        uaddw2      v13.8h, v13.8h, v5.16b
+        uaddw       v2.8h,  v2.8h,  v6.8b
+        uaddw2      v14.8h, v14.8h, v6.16b
+        uaddw       v3.8h,  v3.8h,  v7.8b
+        uaddw2      v15.8h, v15.8h, v7.16b
+        /* rounded >> 8 again; the two steps together approximate product / 255 */
+        rshrn       v0.8b,  v0.8h,  #8
+        rshrn2      v0.16b, v12.8h, #8
+        rshrn       v1.8b,  v1.8h,  #8
+        rshrn2      v1.16b, v13.8h, #8
+        rshrn       v2.8b,  v2.8h,  #8
+        rshrn2      v2.16b, v14.8h, #8
+        rshrn       v3.8b,  v3.8h,  #8
+        rshrn2      v3.16b, v15.8h, #8
+.endm
+
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN
+        umull2      v12.8h, v0.16b, v11.16b     /* 16-bit products: dst channel * v11 */
+        umull       v0.8h,  v0.8b,  v11.8b
+        umull2      v13.8h, v1.16b, v11.16b
+        umull       v1.8h,  v1.8b,  v11.8b
+        umull2      v14.8h, v2.16b, v11.16b
+        umull       v2.8h,  v2.8b,  v11.8b
+        umull2      v15.8h, v3.16b, v11.16b
+        umull       v3.8h,  v3.8b,  v11.8b
+        /* v4-v7 = rounded (product >> 8), narrowed to bytes */
+        rshrn       v4.8b,  v0.8h,  #8
+        rshrn2      v4.16b, v12.8h, #8
+        rshrn       v5.8b,  v1.8h,  #8
+        rshrn2      v5.16b, v13.8h, #8
+        rshrn       v6.8b,  v2.8h,  #8
+        rshrn2      v6.16b, v14.8h, #8
+        rshrn       v7.8b,  v3.8h,  #8
+        rshrn2      v7.16b, v15.8h, #8
+        /* product += rounded (product >> 8) */
+        uaddw       v0.8h,  v0.8h,  v4.8b
+        uaddw2      v12.8h, v12.8h, v4.16b
+        uaddw       v1.8h,  v1.8h,  v5.8b
+        uaddw2      v13.8h, v13.8h, v5.16b
+        uaddw       v2.8h,  v2.8h,  v6.8b
+        uaddw2      v14.8h, v14.8h, v6.16b
+        uaddw       v3.8h,  v3.8h,  v7.8b
+        uaddw2      v15.8h, v15.8h, v7.16b
+        /* rounded >> 8 again; the two steps together approximate product / 255 */
+        rshrn       v0.8b,  v0.8h,  #8
+        rshrn2      v0.16b, v12.8h, #8
+        rshrn       v1.8b,  v1.8h,  #8
+        rshrn2      v1.16b, v13.8h, #8
+        rshrn       v2.8b,  v2.8h,  #8
+        rshrn2      v2.16b, v14.8h, #8
+        rshrn       v3.8b,  v3.8h,  #8
+        rshrn2      v3.16b, v15.8h, #8
+.endm
+
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT
+        mvn         v3.16b, v3.16b      /* v3 = 255 - v3, then reuse the SRC_IN arithmetic */
+        blend_kernel_SRC_IN
+.endm
+
+
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT
+        mvn         v11.16b, v11.16b    /* v11 = 255 - v11, then reuse the DST_IN arithmetic */
+        blend_kernel_DST_IN
+.endm
+
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP
+        mvn         v11.16b, v11.16b    /* v11 = ~src alpha = 255 - src alpha */
+        /* dst colour * (255 - src alpha): low halves in v0-v2, high halves in v12-v14 */
+        umull2      v12.8h, v11.16b, v0.16b
+        umull       v0.8h,  v11.8b,  v0.8b
+        umull2      v13.8h, v11.16b, v1.16b
+        umull       v1.8h,  v11.8b,  v1.8b
+        umull2      v14.8h, v11.16b, v2.16b
+        umull       v2.8h,  v11.8b,  v2.8b
+        /* src colour * dst alpha (v3): low halves in v8-v10, high halves in v4-v6 */
+        umull2      v4.8h,  v3.16b, v8.16b
+        umull       v8.8h,  v3.8b,  v8.8b
+        umull2      v5.8h,  v3.16b, v9.16b
+        umull       v9.8h,  v3.8b,  v9.8b
+        umull2      v6.8h,  v3.16b, v10.16b
+        umull       v10.8h, v3.8b,  v10.8b
+        /* add the two products with 16-bit saturation */
+        uqadd       v12.8h, v12.8h, v4.8h
+        uqadd       v0.8h,  v0.8h,  v8.8h
+        uqadd       v13.8h, v13.8h, v5.8h
+        uqadd       v1.8h,  v1.8h,  v9.8h
+        uqadd       v14.8h, v14.8h, v6.8h
+        uqadd       v2.8h,  v2.8h,  v10.8h
+        /* rounded (sum >> 8), kept in 16-bit lanes */
+        urshr       v8.8h,  v0.8h,  #8
+        urshr       v4.8h,  v12.8h, #8
+        urshr       v9.8h,  v1.8h,  #8
+        urshr       v5.8h,  v13.8h, #8
+        urshr       v10.8h, v2.8h,  #8
+        urshr       v6.8h,  v14.8h, #8
+        /* sum += rounded (sum >> 8), saturating */
+        uqadd       v0.8h,  v0.8h,  v8.8h
+        uqadd       v12.8h, v12.8h, v4.8h
+        uqadd       v1.8h,  v1.8h,  v9.8h
+        uqadd       v13.8h, v13.8h, v5.8h
+        uqadd       v2.8h,  v2.8h,  v10.8h
+        uqadd       v14.8h, v14.8h, v6.8h
+        /* saturating, rounded narrow by 8 bits back to bytes; v3 is not written */
+        uqrshrn     v0.8b,  v0.8h,  #8
+        uqrshrn2    v0.16b, v12.8h, #8
+        uqrshrn     v1.8b,  v1.8h,  #8
+        uqrshrn2    v1.16b, v13.8h, #8
+        uqrshrn     v2.8b,  v2.8h,  #8
+        uqrshrn2    v2.16b, v14.8h, #8
+.endm
+
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP                /* v0-v2 = (dst*srcA + src*(255-dstA)) / 255; v3 = source alpha */
+        mvn         v3.16b, v3.16b          /* v3 = 255 - destination alpha */
+
+        umull2      v12.8h, v11.16b, v0.16b /* dst * srcA: high halves go to v12-v14 ... */
+        umull       v0.8h,  v11.8b,  v0.8b  /* ... low halves replace v0-v2 (now 16-bit lanes) */
+        umull2      v13.8h, v11.16b, v1.16b
+        umull       v1.8h,  v11.8b,  v1.8b
+        umull2      v14.8h, v11.16b, v2.16b
+        umull       v2.8h,  v11.8b,  v2.8b
+
+        umull2      v4.8h,  v3.16b, v8.16b  /* src * (255 - dstA): high halves go to v4-v6 ... */
+        umull       v8.8h,  v3.8b,  v8.8b   /* ... low halves replace v8-v10 */
+        umull2      v5.8h,  v3.16b, v9.16b
+        umull       v9.8h,  v3.8b,  v9.8b
+        umull2      v6.8h,  v3.16b, v10.16b
+        umull       v10.8h, v3.8b,  v10.8b
+
+        uqadd       v12.8h, v12.8h, v4.8h   /* saturating sum of the two 16-bit products */
+        uqadd       v0.8h,  v0.8h,  v8.8h
+        uqadd       v13.8h, v13.8h, v5.8h
+        uqadd       v1.8h,  v1.8h,  v9.8h
+        uqadd       v14.8h, v14.8h, v6.8h
+        uqadd       v2.8h,  v2.8h,  v10.8h
+
+        urshr       v8.8h,  v0.8h,  #8      /* correction term: t >> 8 with rounding */
+        urshr       v4.8h,  v12.8h, #8
+        urshr       v9.8h,  v1.8h,  #8
+        urshr       v5.8h,  v13.8h, #8
+        urshr       v10.8h, v2.8h,  #8
+        urshr       v6.8h,  v14.8h, #8
+
+        uqadd       v0.8h,  v0.8h,  v8.8h   /* t + (t >> 8), saturating */
+        uqadd       v12.8h, v12.8h, v4.8h
+        uqadd       v1.8h,  v1.8h,  v9.8h
+        uqadd       v13.8h, v13.8h, v5.8h
+        uqadd       v2.8h,  v2.8h,  v10.8h
+        uqadd       v14.8h, v14.8h, v6.8h
+
+        uqrshrn     v0.8b,  v0.8h,  #8      /* rounding, saturating narrow back to bytes: approximates t / 255 */
+        uqrshrn2    v0.16b, v12.8h, #8
+        uqrshrn     v1.8b,  v1.8h,  #8
+        uqrshrn2    v1.16b, v13.8h, #8
+        uqrshrn     v2.8b,  v2.8h,  #8
+        uqrshrn2    v2.16b, v14.8h, #8
+
+        mov         v3.16b, v11.16b         /* output alpha is the (unmodified) source alpha */
+.endm
+
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY                /* every byte: out = dst * src / 255 (channels need not be separated) */
+        umull2      v12.8h, v0.16b, v8.16b  /* t = dst * src: high halves go to v12-v15 ... */
+        umull       v0.8h,  v0.8b,  v8.8b   /* ... low halves replace v0-v3 (now 16-bit lanes) */
+        umull2      v13.8h, v1.16b, v9.16b
+        umull       v1.8h,  v1.8b,  v9.8b
+        umull2      v14.8h, v2.16b, v10.16b
+        umull       v2.8h,  v2.8b,  v10.8b
+        umull2      v15.8h, v3.16b, v11.16b
+        umull       v3.8h,  v3.8b,  v11.8b
+
+        rshrn       v4.8b,  v0.8h,  #8      /* correction term: (t + 128) >> 8, narrowed into v4-v7 */
+        rshrn2      v4.16b, v12.8h, #8
+        rshrn       v5.8b,  v1.8h,  #8
+        rshrn2      v5.16b, v13.8h, #8
+        rshrn       v6.8b,  v2.8h,  #8
+        rshrn2      v6.16b, v14.8h, #8
+        rshrn       v7.8b,  v3.8h,  #8
+        rshrn2      v7.16b, v15.8h, #8
+
+        uaddw       v0.8h,  v0.8h,  v4.8b   /* t += correction term (widened back to 16 bits) */
+        uaddw2      v12.8h, v12.8h, v4.16b
+        uaddw       v1.8h,  v1.8h,  v5.8b
+        uaddw2      v13.8h, v13.8h, v5.16b
+        uaddw       v2.8h,  v2.8h,  v6.8b
+        uaddw2      v14.8h, v14.8h, v6.16b
+        uaddw       v3.8h,  v3.8h,  v7.8b
+        uaddw2      v15.8h, v15.8h, v7.16b
+
+        rshrn       v0.8b,  v0.8h,  #8      /* final rounding narrow: approximates t / 255 */
+        rshrn2      v0.16b, v12.8h, #8
+        rshrn       v1.8b,  v1.8h,  #8
+        rshrn2      v1.16b, v13.8h, #8
+        rshrn       v2.8b,  v2.8h,  #8
+        rshrn2      v2.16b, v14.8h, #8
+        rshrn       v3.8b,  v3.8h,  #8
+        rshrn2      v3.16b, v15.8h, #8
+.endm
+
+#define params_ADD zipped=0
+.macro blend_kernel_ADD                     /* every byte: out = min(dst + src, 255) */
+        uqadd    v0.16b, v0.16b, v8.16b     /* unsigned saturating add, 16 bytes per register */
+        uqadd    v1.16b, v1.16b, v9.16b
+        uqadd    v2.16b, v2.16b, v10.16b
+        uqadd    v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT                /* every byte: out = max(dst - src, 0) */
+        uqsub    v0.16b, v0.16b, v8.16b     /* unsigned saturating subtract, 16 bytes per register */
+        uqsub    v1.16b, v1.16b, v9.16b
+        uqsub    v2.16b, v2.16b, v10.16b
+        uqsub    v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE              /* every byte: out = |dst - src| */
+        uabd    v0.16b, v0.16b, v8.16b      /* unsigned absolute difference, 16 bytes per register */
+        uabd    v1.16b, v1.16b, v9.16b
+        uabd    v2.16b, v2.16b, v10.16b
+        uabd    v3.16b, v3.16b, v11.16b
+.endm
+
+#define params_XOR zipped=0
+.macro blend_kernel_XOR                     /* every byte: out = dst ^ src (plain bitwise exclusive-or) */
+        eor     v0.16b, v0.16b, v8.16b
+        eor     v1.16b, v1.16b, v9.16b
+        eor     v2.16b, v2.16b, v10.16b
+        eor     v3.16b, v3.16b, v11.16b
+.endm
+
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+        \kernel                             /* no load/store/loop code at all; falls through to "return 0" */
+.else
+        sub     x3, sp, #32                 /* x3 = address of the upper half of the 64-byte save area */
+        sub     sp, sp, #64
+        st1     {v8.1d - v11.1d}, [sp]      /* save the low 64 bits of v8-v15; the kernels overwrite them */
+        st1     {v12.1d - v15.1d}, [x3]
+        subs    x2, x2, #64                 /* x2 = byte count; pre-decrement by one 64-byte block */
+        b       2f                          /* enter the loop at its condition check */
+.align 4
+1:                                          /* main loop: one 64-byte block (16 pixels) per iteration */
+  .if \lddst
+    .if \zipped
+        ld4     {v0.16b - v3.16b}, [x0]     /* dst, de-interleaved: one channel per register; x0 not advanced */
+    .else
+        ld1     {v0.16b - v3.16b}, [x0]     /* dst, in memory order; x0 not advanced */
+    .endif
+  .endif
+  .if \ldsrc
+    .if \zipped
+        ld4     {v8.16b - v11.16b}, [x1], #64   /* src, de-interleaved; x1 advances */
+    .else
+        ld1     {v8.16b - v11.16b}, [x1], #64   /* src, in memory order; x1 advances */
+    .endif
+  .endif
+  .if \pld
+#if 0 /* TODO: test this on real hardware */
+    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
+    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
+#endif
+  .endif
+
+        \kernel
+
+        subs    x2, x2, #64                 /* flags set here are consumed by the bge below (stores leave them alone) */
+  .if \zipped
+        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64    /* re-interleave and store; x0 advances */
+  .else
+        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64    /* store in memory order; x0 advances */
+  .endif
+
+2:      bge     1b                          /* loop while at least 64 bytes remained before the last subs */
+        adds    x2, x2, #64                 /* undo the pre-decrement: x2 = leftover byte count (0..63) */
+        beq     2f                          /* nothing left over: skip to the epilogue */
+
+        /* To handle the tail portion of the data (something less than 64
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the operations
+         * don't require data to interact with its neighbours.
+         */
+        movi    v0.16b, #0                  /* clear the working registers so unused lanes hold zero */
+        movi    v1.16b, #0
+        movi    v2.16b, #0
+        movi    v3.16b, #0
+
+        movi    v8.16b, #0
+        movi    v9.16b, #0
+        movi    v10.16b, #0
+        movi    v11.16b, #0
+
+        tbz     x2, #5, 1f                  /* each set bit of x2 selects one chunk: 32 bytes ... */
+  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
+  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
+1:      tbz     x2, #4, 1f                  /* ... 16 bytes ... */
+  .if \lddst ; ld1     {v1.16b}, [x0], #16  ; .endif
+  .if \ldsrc ; ld1     {v9.16b}, [x1], #16  ; .endif
+1:      tbz     x2, #3, 1f                  /* ... 8 bytes (upper half of v0/v8) ... */
+  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
+  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
+1:      tbz     x2, #2, 1f                  /* ... 4 bytes (bytes 4-7) ... */
+  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
+  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
+1:      tbz     x2, #1, 1f                  /* ... 2 bytes (bytes 2-3) ... */
+  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
+  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
+1:      tbz     x2, #0, 1f                  /* ... 1 byte (byte 1) */
+  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
+  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
+1:
+  .if \lddst ; sub     x0, x0, x2           ; .endif    /* rewind x0 over the tail bytes just loaded, ready for the stores */
+
+.if \zipped
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point.
+         */
+        uzp1    v4.16b, v0.16b, v1.16b      /* two unzip stages on dst: v0-v3 end up holding bytes 0,1,2,3 (mod 4) */
+        uzp2    v5.16b, v0.16b, v1.16b
+        uzp1    v6.16b, v2.16b, v3.16b
+        uzp2    v7.16b, v2.16b, v3.16b
+        uzp1    v0.16b, v4.16b, v6.16b
+        uzp2    v2.16b, v4.16b, v6.16b
+        uzp1    v1.16b, v5.16b, v7.16b
+        uzp2    v3.16b, v5.16b, v7.16b
+
+        uzp1    v4.16b, v8.16b, v9.16b      /* same unpacking for src in v8-v11 */
+        uzp2    v5.16b, v8.16b, v9.16b
+        uzp1    v6.16b, v10.16b, v11.16b
+        uzp2    v7.16b, v10.16b, v11.16b
+        uzp1    v8.16b, v4.16b, v6.16b
+        uzp2    v10.16b, v4.16b, v6.16b
+        uzp1    v9.16b, v5.16b, v7.16b
+        uzp2    v11.16b, v5.16b, v7.16b
+
+        \kernel
+
+        zip1    v4.16b, v0.16b, v2.16b      /* two zip stages: put the result back into linear memory order */
+        zip2    v6.16b, v0.16b, v2.16b
+        zip1    v5.16b, v1.16b, v3.16b
+        zip2    v7.16b, v1.16b, v3.16b
+        zip1    v0.16b, v4.16b, v5.16b
+        zip2    v1.16b, v4.16b, v5.16b
+        zip1    v2.16b, v6.16b, v7.16b
+        zip2    v3.16b, v6.16b, v7.16b
+  .else
+        \kernel
+  .endif
+
+        tbz     x2, #5, 1f                  /* store the tail using the same chunk/lane layout as the loads */
+        st1     {v2.16b,v3.16b}, [x0], #32
+1:      tbz     x2, #4, 1f
+        st1     {v1.16b}, [x0], #16
+1:      tbz     x2, #3, 1f
+        st1     {v0.d}[1], [x0], #8
+1:      tbz     x2, #2, 1f
+        st1     {v0.s}[1], [x0], #4
+1:      tbz     x2, #1, 1f
+        st1     {v0.h}[1], [x0], #2
+1:      tbz     x2, #0, 2f
+        st1     {v0.b}[1], [x0], #1
+2:      ld1     {v8.1d - v11.1d}, [sp], #32     /* epilogue: restore the saved registers and release the stack */
+        ld1     {v12.1d - v15.1d}, [sp], #32
+.endif
+        mov     x0, #0                      /* return 0 */
+        ret
+.endm
+
+
+/* produce list of blend_line_XX() functions; each function uses the wrap_line
+ * macro, passing it the name of the operation macro it wants along with
+ * optional parameters to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+#define BLEND_X(d, n) .set tablesize, d+1 ; /* the last list entry wins, so tablesize ends up as last slot number + 1 */
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+/*  int rsdIntrinsicBlend_K(
+ *          uchar4 *out,        // x0
+ *          uchar4 const *in,   // x1
+ *          int slot,           // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicBlend_K)
+    adrp    x5, blendtable                  /* x5 = address of the 16-bit offset table */
+    add     x5, x5, :lo12:blendtable
+    cmp     w2, tablesize
+    bhs     1f                              /* slot >= tablesize (unsigned compare): return -1 */
+    ldrsh   x6, [x5, w2, uxtw #1]           /* x6 = signed 16-bit offset for this slot; 0 marks an unused slot */
+    add     x0, x0, w3, uxtw #2             /* out += xstart * 4 (low 32 bits of xstart only) */
+    add     x1, x1, w3, uxtw #2             /* in  += xstart * 4 */
+    sub     w2, w4, w3                      /* 32-bit pixel count = xend - xstart */
+    ubfiz   x2, x2, #2, #32 /* x2 = count * 4 bytes, zero-extended.  TODO: fix (presumably the 32-bit truncation above -- confirm) */
+    cbz     x6, 1f                          /* no kernel for this slot: return -1 */
+    adr     x5, 2f                          /* table offsets are relative to label 2 below */
+    add     x6, x5, x6
+2:  br      x6                              /* tail-jump into blend_line_XX, which returns 0 to our caller */
+1:  mov     x0, #-1
+    ret
+
+END(rsdIntrinsicBlend_K)
+
+.rodata
+.set off,0                                  /* off = next table slot still to be emitted */
+blendtable:                                 /* one .hword per slot: zero for gaps, else blend_line_XX minus label 2 of the dispatcher */
+#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
+        BLEND_LIST(BLEND_X)
+#undef BLEND_X
diff --git a/toolkit/Blend_neon.S b/toolkit/Blend_neon.S
new file mode 100644
index 0000000..a1fa1b5
--- /dev/null
+++ b/toolkit/Blend_neon.S
@@ -0,0 +1,617 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart /* open a global function f in .text */
+#define END(f) .fnend; .size f, .-f; /* close function f and record its size */
+
+#define BLEND_LIST(X) /* applies X(slot, NAME) to every blend mode implemented in this file */ \
+    X(0, CLEAR) \
+    X(1, SRC) \
+    X(2, DST) \
+    X(3, SRC_OVER) \
+    X(4, DST_OVER) \
+    X(5, SRC_IN) \
+    X(6, DST_IN) \
+    X(7, SRC_OUT) \
+    X(8, DST_OUT) \
+    X(9, SRC_ATOP) \
+    X(10, DST_ATOP) \
+    X(11, XOR) \
+    X(14, MULTIPLY) \
+    X(21, DIFFERENCE) \
+    X(34, ADD) \
+    X(35, SUBTRACT)
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* For every blend operation supported, define a macro with just the arithmetic
+ * component.  The rest can be handled later on.
+ *
+ * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
+ * contain the data from the source buffer.  Both have already been split out
+ * into one colour component per register (if necessary).  q3 and q11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ *    zipped=0 -- The macro does not require the RGBA components to be
+ *                separated.
+ *    lddst=0  -- The macro does not require data from the destination buffer.
+ *    ldsrc=0  -- The macro does not require data from the source buffer.
+ *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ *                inserted without any surrounding load/store or loop code.
+ */
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR                   /* out = 0 for every byte; neither buffer needs to be loaded */
+        vmov.i8 q0, #0
+        vmov.i8 q1, #0
+        vmov.i8 q2, #0
+        vmov.i8 q3, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC                     /* out = src: copy the source registers into the output registers */
+        vmov    q0, q8
+        vmov    q1, q9
+        vmov    q2, q10
+        vmov    q3, q11
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+        /* nop: the destination is left untouched; with nowrap=1 no load/store code is emitted either */
+.endm
+
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER                /* out = src + dst * (255 - srcA) / 255, saturating */
+        vmvn        q7, q11                 /* q7 = 255 - source alpha */
+
+        vmull.u8    q12, d15, d1            /* t = dst * (255 - srcA): high halves go to q12-q15 ... */
+        vmull.u8    q0,  d14, d0            /* ... low halves replace q0-q3 (now 16-bit lanes) */
+        vmull.u8    q13, d15, d3
+        vmull.u8    q1,  d14, d2
+        vmull.u8    q14, d15, d5
+        vmull.u8    q2,  d14, d4
+        vmull.u8    q15, d15, d7
+        vmull.u8    q3,  d14, d6
+
+        vrshrn.u16  d8,  q0,  #8            /* correction term: (t + 128) >> 8, narrowed into d8-d15 */
+        vrshrn.u16  d9,  q12, #8
+        vrshrn.u16  d10, q1,  #8
+        vrshrn.u16  d11, q13, #8
+        vrshrn.u16  d12, q2,  #8
+        vrshrn.u16  d13, q14, #8
+        vrshrn.u16  d14, q3,  #8            /* q7 (d14/d15) is reused here; the inverted alpha is no longer needed */
+        vrshrn.u16  d15, q15, #8
+
+        vaddw.u8    q0,  d8                 /* t += correction term (widened back to 16 bits) */
+        vaddw.u8    q12, d9
+        vaddw.u8    q1,  d10
+        vaddw.u8    q13, d11
+        vaddw.u8    q2,  d12
+        vaddw.u8    q14, d13
+        vaddw.u8    q3,  d14
+        vaddw.u8    q15, d15
+
+        vrshrn.u16  d0, q0,  #8             /* final rounding narrow: approximates t / 255 */
+        vrshrn.u16  d1, q12, #8
+        vrshrn.u16  d2, q1,  #8
+        vrshrn.u16  d3, q13, #8
+        vrshrn.u16  d4, q2,  #8
+        vrshrn.u16  d5, q14, #8
+        vrshrn.u16  d6, q3,  #8
+        vrshrn.u16  d7, q15, #8
+
+        vqadd.u8    q0, q8                  /* add the source on top, saturating at 255 */
+        vqadd.u8    q1, q9
+        vqadd.u8    q2, q10
+        vqadd.u8    q3, q11
+.endm
+
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER                /* out = dst + src * (255 - dstA) / 255, saturating */
+        vmvn        q7, q3                  /* q7 = 255 - destination alpha */
+
+        vmull.u8    q12, d15, d17           /* t = src * (255 - dstA): high halves go to q12-q15 ... */
+        vmull.u8    q8,  d14, d16           /* ... low halves replace q8-q11 (now 16-bit lanes) */
+        vmull.u8    q13, d15, d19
+        vmull.u8    q9,  d14, d18
+        vmull.u8    q14, d15, d21
+        vmull.u8    q10, d14, d20
+        vmull.u8    q15, d15, d23
+        vmull.u8    q11, d14, d22
+
+        vrshrn.u16  d8,  q8,  #8            /* correction term: (t + 128) >> 8; low halves must come from q8-q11 */
+        vrshrn.u16  d9,  q12, #8            /* (the products), not from q0-q3, which still hold raw dst pixels */
+        vrshrn.u16  d10, q9,  #8
+        vrshrn.u16  d11, q13, #8
+        vrshrn.u16  d12, q10, #8
+        vrshrn.u16  d13, q14, #8
+        vrshrn.u16  d14, q11, #8            /* q7 (d14/d15) is reused here; the inverted alpha is no longer needed */
+        vrshrn.u16  d15, q15, #8
+
+        vaddw.u8    q8,  d8                 /* t += correction term (widened back to 16 bits) */
+        vaddw.u8    q12, d9
+        vaddw.u8    q9,  d10
+        vaddw.u8    q13, d11
+        vaddw.u8    q10, d12
+        vaddw.u8    q14, d13
+        vaddw.u8    q11, d14
+        vaddw.u8    q15, d15
+
+        vrshrn.u16  d16, q8,  #8            /* final rounding narrow: approximates t / 255 */
+        vrshrn.u16  d17, q12, #8
+        vrshrn.u16  d18, q9,  #8
+        vrshrn.u16  d19, q13, #8
+        vrshrn.u16  d20, q10, #8
+        vrshrn.u16  d21, q14, #8
+        vrshrn.u16  d22, q11, #8
+        vrshrn.u16  d23, q15, #8
+
+        vqadd.u8    q0, q8                  /* add the scaled source to the destination, saturating at 255 */
+        vqadd.u8    q1, q9
+        vqadd.u8    q2, q10
+        vqadd.u8    q3, q11
+.endm
+
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN                  /* out = src * dstA / 255 for all four channels */
+        vmull.u8    q12, d7, d17            /* t = dstA (q3 = d6/d7) * src: high halves go to q12-q15 ... */
+        vmull.u8    q0,  d6, d16            /* ... low halves go to q0-q3 (16-bit lanes) */
+        vmull.u8    q13, d7, d19
+        vmull.u8    q1,  d6, d18
+        vmull.u8    q14, d7, d21
+        vmull.u8    q2,  d6, d20
+        vmull.u8    q15, d7, d23
+        vmull.u8    q3,  d6, d22            /* q3 is overwritten last, after every other use of d6/d7 */
+
+        vrshrn.u16  d8,  q0,  #8            /* correction term: (t + 128) >> 8, narrowed into d8-d15 */
+        vrshrn.u16  d9,  q12, #8
+        vrshrn.u16  d10, q1,  #8
+        vrshrn.u16  d11, q13, #8
+        vrshrn.u16  d12, q2,  #8
+        vrshrn.u16  d13, q14, #8
+        vrshrn.u16  d14, q3,  #8
+        vrshrn.u16  d15, q15, #8
+
+        vaddw.u8    q0,  d8                 /* t += correction term (widened back to 16 bits) */
+        vaddw.u8    q12, d9
+        vaddw.u8    q1,  d10
+        vaddw.u8    q13, d11
+        vaddw.u8    q2,  d12
+        vaddw.u8    q14, d13
+        vaddw.u8    q3,  d14
+        vaddw.u8    q15, d15
+
+        vrshrn.u16  d0, q0,  #8             /* final rounding narrow: approximates t / 255 */
+        vrshrn.u16  d1, q12, #8
+        vrshrn.u16  d2, q1,  #8
+        vrshrn.u16  d3, q13, #8
+        vrshrn.u16  d4, q2,  #8
+        vrshrn.u16  d5, q14, #8
+        vrshrn.u16  d6, q3,  #8
+        vrshrn.u16  d7, q15, #8
+.endm
+
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN                  /* out = dst * srcA / 255 for all four channels */
+        vmull.u8    q12, d1, d23            /* t = dst * srcA (q11 = d22/d23): high halves go to q12-q15 ... */
+        vmull.u8    q0,  d0, d22            /* ... low halves replace q0-q3 (now 16-bit lanes) */
+        vmull.u8    q13, d3, d23
+        vmull.u8    q1,  d2, d22
+        vmull.u8    q14, d5, d23
+        vmull.u8    q2,  d4, d22
+        vmull.u8    q15, d7, d23
+        vmull.u8    q3,  d6, d22
+
+        vrshrn.u16  d8,  q0,  #8            /* correction term: (t + 128) >> 8, narrowed into d8-d15 */
+        vrshrn.u16  d9,  q12, #8
+        vrshrn.u16  d10, q1,  #8
+        vrshrn.u16  d11, q13, #8
+        vrshrn.u16  d12, q2,  #8
+        vrshrn.u16  d13, q14, #8
+        vrshrn.u16  d14, q3,  #8
+        vrshrn.u16  d15, q15, #8
+
+        vaddw.u8    q0,  d8                 /* t += correction term (widened back to 16 bits) */
+        vaddw.u8    q12, d9
+        vaddw.u8    q1,  d10
+        vaddw.u8    q13, d11
+        vaddw.u8    q2,  d12
+        vaddw.u8    q14, d13
+        vaddw.u8    q3,  d14
+        vaddw.u8    q15, d15
+
+        vrshrn.u16  d0, q0,  #8             /* final rounding narrow: approximates t / 255 */
+        vrshrn.u16  d1, q12, #8
+        vrshrn.u16  d2, q1,  #8
+        vrshrn.u16  d3, q13, #8
+        vrshrn.u16  d4, q2,  #8
+        vrshrn.u16  d5, q14, #8
+        vrshrn.u16  d6, q3,  #8
+        vrshrn.u16  d7, q15, #8
+.endm
+
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT                 /* out = src * (255 - dstA) / 255 */
+        vmvn        q3, q3                  /* q3 = 255 - destination alpha */
+        blend_kernel_SRC_IN                 /* reuse the SRC_IN arithmetic on the inverted alpha */
+.endm
+
+
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT                 /* out = dst * (255 - srcA) / 255 */
+        vmvn        q11, q11                /* q11 = 255 - source alpha */
+        blend_kernel_DST_IN                 /* reuse the DST_IN arithmetic on the inverted alpha */
+.endm
+
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP                /* q0-q2 = (dst*(255-srcA) + src*dstA) / 255; q3 (dst alpha) is left as is */
+        vmvn        q11, q11                /* q11 = 255 - source alpha */
+
+        vmull.u8    q12, d23, d1            /* dst * (255 - srcA): high halves go to q12-q14 ... */
+        vmull.u8    q0,  d22, d0            /* ... low halves replace q0-q2 (now 16-bit lanes) */
+        vmull.u8    q13, d23, d3
+        vmull.u8    q1,  d22, d2
+        vmull.u8    q14, d23, d5
+        vmull.u8    q2,  d22, d4
+
+        vmull.u8    q4,  d7, d17            /* src * dstA: high halves go to q4-q6 ... */
+        vmull.u8    q8,  d6, d16            /* ... low halves replace q8-q10 */
+        vmull.u8    q5,  d7, d19
+        vmull.u8    q9,  d6, d18
+        vmull.u8    q6,  d7, d21
+        vmull.u8    q10, d6, d20
+
+        vqadd.u16   q12, q4                 /* saturating sum of the two 16-bit products */
+        vqadd.u16   q0,  q8
+        vqadd.u16   q13, q5
+        vqadd.u16   q1,  q9
+        vqadd.u16   q14, q6
+        vqadd.u16   q2,  q10
+
+        vrshr.u16   q8,  q0,  #8            /* correction term: t >> 8 with rounding */
+        vrshr.u16   q4,  q12, #8
+        vrshr.u16   q9,  q1,  #8
+        vrshr.u16   q5,  q13, #8
+        vrshr.u16   q10, q2,  #8
+        vrshr.u16   q6,  q14, #8
+
+        vqadd.u16   q0,  q8                 /* t + (t >> 8), saturating */
+        vqadd.u16   q12, q4
+        vqadd.u16   q1,  q9
+        vqadd.u16   q13, q5
+        vqadd.u16   q2,  q10
+        vqadd.u16   q14, q6
+
+        vqrshrn.u16 d0, q0,  #8             /* rounding, saturating narrow back to bytes: approximates t / 255 */
+        vqrshrn.u16 d1, q12, #8
+        vqrshrn.u16 d2, q1,  #8
+        vqrshrn.u16 d3, q13, #8
+        vqrshrn.u16 d4, q2,  #8
+        vqrshrn.u16 d5, q14, #8
+.endm
+
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP                /* q0-q2 = (dst*srcA + src*(255-dstA)) / 255; q3 = source alpha */
+        vmvn        q3, q3                  /* q3 = 255 - destination alpha */
+
+        vmull.u8    q12, d23, d1            /* dst * srcA: high halves go to q12-q14 ... */
+        vmull.u8    q0,  d22, d0            /* ... low halves replace q0-q2 (now 16-bit lanes) */
+        vmull.u8    q13, d23, d3
+        vmull.u8    q1,  d22, d2
+        vmull.u8    q14, d23, d5
+        vmull.u8    q2,  d22, d4
+
+        vmull.u8    q4,  d7, d17            /* src * (255 - dstA): high halves go to q4-q6 ... */
+        vmull.u8    q8,  d6, d16            /* ... low halves replace q8-q10 */
+        vmull.u8    q5,  d7, d19
+        vmull.u8    q9,  d6, d18
+        vmull.u8    q6,  d7, d21
+        vmull.u8    q10, d6, d20
+
+        vqadd.u16   q12, q4                 /* saturating sum of the two 16-bit products */
+        vqadd.u16   q0,  q8
+        vqadd.u16   q13, q5
+        vqadd.u16   q1,  q9
+        vqadd.u16   q14, q6
+        vqadd.u16   q2,  q10
+
+        vrshr.u16   q8,  q0,  #8            /* correction term: t >> 8 with rounding */
+        vrshr.u16   q4,  q12, #8
+        vrshr.u16   q9,  q1,  #8
+        vrshr.u16   q5,  q13, #8
+        vrshr.u16   q10, q2,  #8
+        vrshr.u16   q6,  q14, #8
+
+        vqadd.u16   q0,  q8                 /* t + (t >> 8), saturating */
+        vqadd.u16   q12, q4
+        vqadd.u16   q1,  q9
+        vqadd.u16   q13, q5
+        vqadd.u16   q2,  q10
+        vqadd.u16   q14, q6
+
+        vqrshrn.u16 d0, q0,  #8             /* rounding, saturating narrow back to bytes: approximates t / 255 */
+        vqrshrn.u16 d1, q12, #8
+        vqrshrn.u16 d2, q1,  #8
+        vqrshrn.u16 d3, q13, #8
+        vqrshrn.u16 d4, q2,  #8
+        vqrshrn.u16 d5, q14, #8
+
+        vmov        q3, q11                 /* output alpha is the (unmodified) source alpha */
+.endm
+
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY                /* every byte: out = dst * src / 255 (channels need not be separated) */
+        vmull.u8    q12, d1, d17            /* t = dst * src: odd d-registers go to q12-q15 ... */
+        vmull.u8    q0,  d0, d16            /* ... even d-registers replace q0-q3 (now 16-bit lanes) */
+        vmull.u8    q13, d3, d19
+        vmull.u8    q1,  d2, d18
+        vmull.u8    q14, d5, d21
+        vmull.u8    q2,  d4, d20
+        vmull.u8    q15, d7, d23
+        vmull.u8    q3,  d6, d22
+
+        vrshrn.u16  d8,  q0,  #8            /* correction term: (t + 128) >> 8, narrowed into d8-d15 */
+        vrshrn.u16  d9,  q12, #8
+        vrshrn.u16  d10, q1,  #8
+        vrshrn.u16  d11, q13, #8
+        vrshrn.u16  d12, q2,  #8
+        vrshrn.u16  d13, q14, #8
+        vrshrn.u16  d14, q3,  #8
+        vrshrn.u16  d15, q15, #8
+
+        vaddw.u8    q0,  d8                 /* t += correction term (widened back to 16 bits) */
+        vaddw.u8    q12, d9
+        vaddw.u8    q1,  d10
+        vaddw.u8    q13, d11
+        vaddw.u8    q2,  d12
+        vaddw.u8    q14, d13
+        vaddw.u8    q3,  d14
+        vaddw.u8    q15, d15
+
+        vrshrn.u16  d0, q0,  #8             /* final rounding narrow: approximates t / 255 */
+        vrshrn.u16  d1, q12, #8
+        vrshrn.u16  d2, q1,  #8
+        vrshrn.u16  d3, q13, #8
+        vrshrn.u16  d4, q2,  #8
+        vrshrn.u16  d5, q14, #8
+        vrshrn.u16  d6, q3,  #8
+        vrshrn.u16  d7, q15, #8
+.endm
+
+#define params_ADD zipped=0
+.macro blend_kernel_ADD                     /* every byte: out = min(dst + src, 255) */
+        vqadd.u8 q0, q0, q8                 /* unsigned saturating add, 16 bytes per register */
+        vqadd.u8 q1, q1, q9
+        vqadd.u8 q2, q2, q10
+        vqadd.u8 q3, q3, q11
+.endm
+
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT                /* every byte: out = max(dst - src, 0) */
+        vqsub.u8 q0, q0, q8                 /* unsigned saturating subtract, 16 bytes per register */
+        vqsub.u8 q1, q1, q9
+        vqsub.u8 q2, q2, q10
+        vqsub.u8 q3, q3, q11
+.endm
+
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE              /* every byte: out = |dst - src| */
+        vabd.u8 q0, q0, q8                  /* unsigned absolute difference, 16 bytes per register */
+        vabd.u8 q1, q1, q9
+        vabd.u8 q2, q2, q10
+        vabd.u8 q3, q3, q11
+.endm
+
+#define params_XOR zipped=0
+.macro blend_kernel_XOR                     /* every byte: out = dst ^ src (plain bitwise exclusive-or) */
+        veor    q0, q0, q8
+        veor    q1, q1, q9
+        veor    q2, q2, q10
+        veor    q3, q3, q11
+.endm
+
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+        \kernel                             /* no load/store/loop code at all; falls through to "return 0" */
+.else
+        vpush   {d8-d15}                    /* save d8-d15 (q4-q7); the kernels use them as scratch */
+        subs    r2, #64                     /* r2 = byte count; pre-decrement by one 64-byte block */
+        b       2f                          /* enter the loop at its condition check */
+        .align 4
+1:                                          /* main loop: one 64-byte block (16 pixels) per iteration */
+  .if \lddst
+    .if \zipped
+        vld4.8  {d0,d2,d4,d6}, [r0]!        /* dst pixels 0-7, de-interleaved into the low halves of q0-q3 */
+        vld4.8  {d1,d3,d5,d7}, [r0]!        /* dst pixels 8-15 into the high halves */
+    .else
+        vld1.8  {d0-d3}, [r0]!              /* dst, in memory order */
+        vld1.8  {d4-d7}, [r0]!
+    .endif
+        sub     r0, #64                     /* rewind r0 so the stores below hit the same block */
+  .endif
+  .if \ldsrc
+    .if \zipped
+        vld4.8  {d16,d18,d20,d22}, [r1]!    /* src pixels 0-7, de-interleaved into the low halves of q8-q11 */
+        vld4.8  {d17,d19,d21,d23}, [r1]!    /* src pixels 8-15 into the high halves */
+    .else
+        vld1.8  {d16-d19}, [r1]!            /* src, in memory order */
+        vld1.8  {d20-d23}, [r1]!
+    .endif
+  .endif
+  .if \pld
+    .if \lddst ; pld [r0, #192] ; .endif    /* prefetch 192 bytes ahead in each buffer that is read */
+    .if \ldsrc ; pld [r1, #192] ; .endif
+  .endif
+
+        \kernel
+
+        subs    r2, #64                     /* flags set here are consumed by the bge below (stores leave them alone) */
+  .if \zipped
+        vst4.8  {d0,d2,d4,d6}, [r0]!        /* re-interleave and store; r0 advances */
+        vst4.8  {d1,d3,d5,d7}, [r0]!
+  .else
+        vst1.8  {d0-d3}, [r0]!              /* store in memory order; r0 advances */
+        vst1.8  {d4-d7}, [r0]!
+  .endif
+
+2:      bge     1b                          /* loop while at least 64 bytes remained before the last subs */
+        adds    r2, #64                     /* undo the pre-decrement: r2 = leftover byte count (0..63) */
+        beq     2f                          /* nothing left over: skip to the epilogue */
+
+        /* To handle the tail portion of the data (something less than 64
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the operations
+         * don't require data to interact with its neighbours.
+         */
+        vmov.i8 q0, #0                      /* clear the working registers so unused lanes hold zero */
+        vmov.i8 q1, #0
+        vmov.i8 q2, #0
+        vmov.i8 q3, #0
+
+        vmov.i8 q8, #0
+        vmov.i8 q9, #0
+        vmov.i8 q10, #0
+        vmov.i8 q11, #0
+
+        tst     r2, #32                     /* each set bit of r2 selects one chunk: 32 bytes ... */
+        beq     1f
+  .if \lddst ; vld1.64 {d4-d7}, [r0]!   ; .endif
+  .if \ldsrc ; vld1.64 {d20-d23}, [r1]! ; .endif
+1:      tst     r2, #16                     /* ... 16 bytes ... */
+        beq     1f
+  .if \lddst ; vld1.64 {d2-d3}, [r0]!   ; .endif
+  .if \ldsrc ; vld1.64 {d18-d19}, [r1]! ; .endif
+1:      tst     r2, #8                      /* ... 8 bytes ... */
+        beq     1f
+  .if \lddst ; vld1.64 {d1}, [r0]!      ; .endif
+  .if \ldsrc ; vld1.64 {d17}, [r1]!     ; .endif
+1:      tst     r2, #4                      /* ... 4 bytes (bytes 4-7 of d0/d16) ... */
+        beq     1f
+  .if \lddst ; vld1.32 {d0[1]}, [r0]!   ; .endif
+  .if \ldsrc ; vld1.32 {d16[1]}, [r1]!  ; .endif
+1:      tst     r2, #2                      /* ... 2 bytes (bytes 2-3) ... */
+        beq     1f
+  .if \lddst ; vld1.16 {d0[1]}, [r0]!   ; .endif
+  .if \ldsrc ; vld1.16 {d16[1]}, [r1]!  ; .endif
+1:      tst     r2, #1                      /* ... 1 byte (byte 1) */
+        beq     1f
+  .if \lddst ; vld1.8  {d0[1]}, [r0]!   ; .endif
+  .if \ldsrc ; vld1.8  {d16[1]}, [r1]!  ; .endif
+1:
+  .if \lddst ; sub     r0, r2           ; .endif    /* rewind r0 over the tail bytes just loaded, ready for the stores */
+
+  .if \zipped
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point.
+         */
+        vuzp.8  q0, q1                      /* two in-place unzip stages on dst (q0-q3) */
+        vuzp.8  q2, q3
+        vuzp.8  q0, q2
+        vuzp.8  q1, q3
+
+        vuzp.8  q8, q9                      /* same unpacking for src (q8-q11) */
+        vuzp.8  q10, q11
+        vuzp.8  q8, q10
+        vuzp.8  q9, q11
+
+        \kernel
+
+        vzip.8  q0, q2                      /* two zip stages: put the result back into linear memory order */
+        vzip.8  q1, q3
+        vzip.8  q0, q1
+        vzip.8  q2, q3
+  .else
+        \kernel
+  .endif
+
+        tst     r2, #32                     /* store the tail using the same chunk/lane layout as the loads */
+        beq     1f
+        vst1.64 {d4-d7}, [r0]!
+1:      tst     r2, #16
+        beq     1f
+        vst1.64 {d2-d3}, [r0]!
+1:      tst     r2, #8
+        beq     1f
+        vst1.64 {d1}, [r0]!
+1:      tst     r2, #4
+        beq     1f
+        vst1.32 {d0[1]}, [r0]!
+1:      tst     r2, #2
+        beq     1f
+        vst1.16 {d0[1]}, [r0]!
+1:      tst     r2, #1
+        beq     2f
+        vst1.8  {d0[1]}, [r0]!
+2:      vpop    {d8-d15}                    /* epilogue: restore the saved registers */
+.endif
+        mov     r0, #0                      /* return 0 */
+        bx      lr
+.endm
+
+
+/* produce list of blend_line_XX() functions; each function uses the wrap_line
+ * macro, passing it the name of the operation macro it wants along with
+ * optional parameters (the params_XX defines) to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+    BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+
+/*  int rsdIntrinsicBlend_K(
+ *          uchar4 *out,        // r0
+ *          uchar4 const *in,   // r1
+ *          int slot,           // r2
+ *          size_t xstart,      // r3
+ *          size_t xend);       // [sp]
+ */
+ENTRY(rsdIntrinsicBlend_K)
+    adr     ip, blend_functions             /* ip = address of the offset table below */
+    cmp     r2, #(blend_functions_end - blend_functions) >> 2   /* slot vs. number of 4-byte table entries */
+    ldrlo   ip, [ip, r2, LSL #2]            /* in range (unsigned): ip = table entry for this slot */
+    movhs   ip, #0                          /* out of range: treat like an unused slot */
+    ldr     r2, [sp]                        /* r2 = xend (fifth argument, passed on the stack) */
+    add     r0, r3, LSL #2                  /* out += xstart * 4 */
+    add     r1, r3, LSL #2                  /* in  += xstart * 4 */
+    sub     r2, r3                          /* pixel count = xend - xstart */
+    mov     r2, r2, LSL #2                  /* byte count = pixel count * 4 */
+    cmp     ip, #0
+    addne   ip, ip, pc                      /* pc reads as this instruction + 8, i.e. label 1, which the table entries are relative to */
+    bxne    ip                              /* tail-jump into blend_line_XX, which returns 0 to our caller */
+1:  mov     r0, #-1                         /* unknown or unsupported slot: return -1 */
+    bx      lr
+
+blend_functions:                            /* one .word per slot: zero for gaps, else blend_line_XX minus label 1 above */
+.set off,0
+#define BLEND_X(d, n) .rept d-off ; .word 0 ; .endr ; .word blend_line_##n-1b ; .set off, d+1 ;
+        BLEND_LIST(BLEND_X)
+#undef BLEND_X
+blend_functions_end:
+
+END(rsdIntrinsicBlend_K)
diff --git a/toolkit/Blur.cpp b/toolkit/Blur.cpp
new file mode 100644
index 0000000..a95ff43
--- /dev/null
+++ b/toolkit/Blur.cpp
@@ -0,0 +1,545 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <math.h>
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Blur"
+
+/**
+ * Blurs an image or a section of an image.
+ *
+ * Our algorithm does two passes: a vertical blur followed by a horizontal blur.
+ */
+class BlurTask : public Task {
+    // The image we're blurring.
+    const uchar* mIn;
+    // Where we store the blurred image.
+    uchar* outArray;
+    // Gaussian weights as floats (mFp) and as 16-bit fixed point (mIp). The radius is capped at
+    // 25 by the constructor, so the kernel has at most 51 (= 2 * 25 + 1) entries, or 52 once
+    // rounded up to the multiple of 4 that the SSSE3 case requires. Values outside of the
+    // kernel should be 0.
+    float mFp[104];
+    uint16_t mIp[104];
+
+    // Working area to store the result of the vertical blur, to be used by the horizontal pass.
+    // There's one area per thread. Since the needed working area may be too large to put on the
+    // stack, we are allocating it from the heap. To avoid paying the allocation cost for each
+    // tile, we cache the scratch area here.
+    std::vector<void*> mScratch;       // Pointers to the scratch areas, one per thread.
+    std::vector<size_t> mScratchSize;  // Row width each area was sized for, one per thread.
+
+    // The radius of the blur, in floating point and integer format.
+    float mRadius;
+    int mIradius;  // Derived from mRadius by ComputeGaussianWeights().
+
+    void kernelU4(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
+                  uint32_t threadIndex);
+    void kernelU1(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+    void ComputeGaussianWeights();
+
+    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    BlurTask(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, size_t vectorSize,
+             uint32_t threadCount, float radius, const Restriction* restriction)
+        : Task{sizeX, sizeY, vectorSize, false, restriction},
+          mIn{in},
+          outArray{out},
+          mScratch(threadCount),      // Parentheses on purpose: one null entry per thread.
+          mScratchSize(threadCount),  // Braces would build a single-element list here.
+          mRadius{std::min(25.0f, radius)} {
+        ComputeGaussianWeights();
+    }
+
+    ~BlurTask() {
+        for (size_t i = 0; i < mScratch.size(); i++) {
+            if (mScratch[i]) {
+                free(mScratch[i]);  // Scratch areas are obtained with realloc() in kernelU4().
+            }
+        }
+    }
+};
+
+void BlurTask::ComputeGaussianWeights() {
+    memset(mFp, 0, sizeof(mFp));
+    memset(mIp, 0, sizeof(mIp));
+
+    // Compute gaussian weights for the blur
+    // e is Euler's number
+    float e = 2.718281828459045f;
+    float pi = 3.1415926535897932f;
+    // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
+    // x is of the form [-radius .. 0 .. radius]
+    // and sigma varies with the radius.
+    // Based on some experimental radius values and sigmas,
+    // we approximately fit sigma = f(radius) as
+    // sigma = radius * 0.4  + 0.6
+    // The larger the radius gets, the more our gaussian blur
+    // will resemble a box blur since with large sigma
+    // the gaussian curve begins to lose its shape
+    float sigma = 0.4f * mRadius + 0.6f;
+
+    // Now compute the coefficients. The two factors below do not depend on x, so they are
+    // computed once here instead of once per tap.
+    float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
+    float coeff2 = - 1.0f / (2.0f * sigma * sigma);
+
+    float normalizeFactor = 0.0f;
+    float floatR = 0.0f;
+    int r;
+    mIradius = (float)ceil(mRadius) + 0.5f;  // The radius rounded up to a whole number of cells.
+    for (r = -mIradius; r <= mIradius; r ++) {
+        floatR = (float)r;
+        mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
+        normalizeFactor += mFp[r + mIradius];
+    }
+
+    // Now we need to normalize the weights because all our coefficients need to add up to one
+    normalizeFactor = 1.0f / normalizeFactor;
+    for (r = -mIradius; r <= mIradius; r ++) {
+        mFp[r + mIradius] *= normalizeFactor;
+        mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);  // 16 fraction bits.
+    }
+}
+
+/**
+ * Vertical blur of a uchar4 line.
+ *
+ * @param sizeY Number of cells of the input array in the vertical direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param y Coordinate of the point we're blurring.
+ * @param ptrIn Start of the input array.
+ * @param iStride The size in byte of a row of the input array.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneVU4(uint32_t sizeY, float4* out, int32_t x, int32_t y, const uchar* ptrIn,
+                   int iStride, const float* gPtr, int iradius) {
+    // First byte of column x (4 bytes per cell); rows are iStride bytes apart.
+    const uchar* column = ptrIn + x * 4;
+    const int lastRow = (int)(sizeY - 1);
+
+    float4 sum = 0;
+    for (int tap = 0; tap <= 2 * iradius; tap++) {
+        // Rows above or below the image reuse the nearest edge row.
+        const int row = std::min(std::max(y + (tap - iradius), 0), lastRow);
+        const uchar4* cell = (const uchar4*)&column[row * iStride];
+        sum += convert<float4>(*cell) * gPtr[tap];
+    }
+
+    *out = sum;
+}
+
+/**
+ * Vertical blur of a uchar1 line.
+ *
+ * @param sizeY Number of cells of the input array in the vertical direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param y Coordinate of the point we're blurring.
+ * @param ptrIn Start of the input array.
+ * @param iStride The size in byte of a row of the input array.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneVU1(uint32_t sizeY, float *out, int32_t x, int32_t y,
+                   const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
+    // First byte of column x (1 byte per cell); rows are iStride bytes apart.
+    const uchar* column = ptrIn + x;
+    const int lastRow = (int)(sizeY - 1);
+
+    float sum = 0;
+    for (int tap = 0; tap <= 2 * iradius; tap++) {
+        // Rows above or below the image reuse the nearest edge row.
+        const int row = std::min(std::max(y + (tap - iradius), 0), lastRow);
+        const float cell = (float)column[row * iStride];
+        sum += cell * gPtr[tap];
+    }
+
+    *out = sum;
+}
+
+
+extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
+                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
+extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
+                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr,
+                                   int rct, int x1, int ct);
+extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
+                                   int ct);
+extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
+                                   int ct);
+#endif
+
+/**
+ * Vertical blur of a line of RGBA, knowing that there's enough rows above and below us to avoid
+ * dealing with boundary conditions.
+ *
+ * @param out Where to store the results. This is the input to the horizontal blur.
+ * @param ptrIn The first cell of the topmost input row covered by the blur window.
+ * @param iStride The distance in bytes between consecutive input rows.
+ * @param gPtr The gaussian coefficients.
+ * @param ct The diameter of the blur, i.e. the number of rows accumulated per cell.
+ * @param x2 How many cells to blur.
+ * @param usesSimd Whether to use the SSSE3 kernel when it is compiled in.
+ */
+static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct,
+                    int x2, bool usesSimd) {
+    int x1 = 0;
+#if defined(ARCH_X86_HAVE_SSSE3)
+    if (usesSimd) {
+        int t = (x2 - x1);
+        t &= ~1;  // The kernel is given an even number of cells; an odd last cell is done below.
+        if (t) {
+            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
+        }
+        x1 += t;
+        out += t;
+        ptrIn += t << 2;  // 4 bytes per cell.
+    }
+#else
+    (void) usesSimd; // Avoid unused parameter warning.
+#endif
+    while(x2 > x1) {
+        const uchar *pi = ptrIn;
+        float4 blurredPixel = 0;
+        const float* gp = gPtr;
+
+        for (int r = 0; r < ct; r++) {
+            float4 pf = convert<float4>(((const uchar4 *)pi)[0]);
+            blurredPixel += pf * gp[0];
+            pi += iStride;  // Same column, next row down.
+            gp++;
+        }
+        out->xyzw = blurredPixel;
+        x1++;
+        out++;
+        ptrIn+=4;
+    }
+}
+
+/**
+ * Vertical blur of a line of U_8, knowing that there's enough rows above and below us to avoid
+ * dealing with boundary conditions.
+ *
+ * @param out Where to store the results. This is the input to the horizontal blur.
+ * @param ptrIn The first cell of the topmost input row covered by the blur window.
+ * @param iStride The distance in bytes between consecutive input rows.
+ * @param gPtr The gaussian coefficients.
+ * @param ct The diameter of the blur, i.e. the number of rows accumulated per cell.
+ * @param len How many cells to blur.
+ * @param usesSimd Whether to use the SSSE3 kernel when it is compiled in.
+ */
+static void OneVFU1(float* out, const uchar* ptrIn, int iStride, const float* gPtr, int ct, int len,
+                    bool usesSimd) {
+    int x1 = 0;
+
+    while((len > x1) && (((uintptr_t)ptrIn) & 0x3)) {  // Scalar cells until ptrIn is 4-aligned.
+        const uchar *pi = ptrIn;
+        float blurredPixel = 0;
+        const float* gp = gPtr;
+
+        for (int r = 0; r < ct; r++) {
+            float pf = (float)pi[0];
+            blurredPixel += pf * gp[0];
+            pi += iStride;
+            gp++;
+        }
+        out[0] = blurredPixel;
+        x1++;     // NOTE(review): x1 and len both track progress in this loop, so the
+        out++;    // `len > x1` tests here and in the SSSE3 branch are conservative;
+        ptrIn++;  // the final `len > 0` loop still processes every remaining cell.
+        len--;
+    }
+#if defined(ARCH_X86_HAVE_SSSE3)
+    if (usesSimd && (len > x1)) {
+        int t = (len - x1) >> 2;  // Groups of four U_8 cells, passed to the U4 kernel as one cell.
+        t &= ~1;
+        if (t) {
+            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
+            len -= t << 2;
+            ptrIn += t << 2;
+            out += t << 2;
+        }
+    }
+#else
+    (void) usesSimd; // Avoid unused parameter warning.
+#endif
+    while(len > 0) {
+        const uchar *pi = ptrIn;
+        float blurredPixel = 0;
+        const float* gp = gPtr;
+
+        for (int r = 0; r < ct; r++) {
+            float pf = (float)pi[0];
+            blurredPixel += pf * gp[0];
+            pi += iStride;  // Same column, next row down.
+            gp++;
+        }
+        out[0] = blurredPixel;
+        len--;
+        out++;
+        ptrIn++;
+    }
+}
+
+/**
+ * Horizontal blur of a uchar4 line.
+ *
+ * @param sizeX Number of cells of the input array in the horizontal direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param ptrIn The start of the input row from which we're indexing x.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneHU4(uint32_t sizeX, uchar4* out, int32_t x, const float4* ptrIn, const float* gPtr,
+                   int iradius) {
+    const int lastColumn = (int)(sizeX - 1);
+    float4 sum = 0;
+    for (int tap = 0; tap <= 2 * iradius; tap++) {
+        // Columns left or right of the row reuse the nearest edge column.
+        const int column = std::min(std::max(x + (tap - iradius), 0), lastColumn);
+        sum += ptrIn[column] * gPtr[tap];
+    }
+
+    // Narrow the float accumulator back to four 8-bit channels.
+    out->xyzw = convert<uchar4>(sum);
+}
+
+/**
+ * Horizontal blur of a uchar line.
+ *
+ * @param sizeX Number of cells of the input array in the horizontal direction.
+ * @param out Where to place the computed value.
+ * @param x Coordinate of the point we're blurring.
+ * @param ptrIn The start of the input row from which we're indexing x.
+ * @param gPtr The gaussian coefficients.
+ * @param iradius The radius of the blur.
+ */
+static void OneHU1(uint32_t sizeX, uchar* out, int32_t x, const float* ptrIn, const float* gPtr,
+                   int iradius) {
+    const int lastColumn = (int)(sizeX - 1);
+    float sum = 0;
+    for (int tap = 0; tap <= 2 * iradius; tap++) {
+        // Columns left or right of the row reuse the nearest edge column.
+        const int column = std::min(std::max(x + (tap - iradius), 0), lastColumn);
+        sum += ptrIn[column] * gPtr[tap];
+    }
+
+    // Plain cast: the fractional part of the sum is dropped.
+    *out = (uchar)sum;
+}
+
+/**
+ * Full blur of a line of RGBA data.
+ *
+ * @param outPtr Where to store the results
+ * @param xstart The index of the section we're starting to blur.
+ * @param xend  The end index of the section.
+ * @param currentY The index of the line we're blurring.
+ * @param threadIndex Selects which cached scratch area (mScratch entry) this call may use.
+ */
+void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
+                        uint32_t threadIndex) {
+    float4 stackbuf[2048];  // Row buffer used when the image is at most 2048 cells wide.
+    float4 *buf = &stackbuf[0];
+    const uint32_t stride = mSizeX * mVectorSize;
+
+    uchar4 *out = (uchar4 *)outPtr;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd && mSizeX >= 4) {
+      rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
+                 mSizeX, mSizeY,
+                 stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
+        return;
+    }
+#endif
+
+    if (mSizeX > 2048) {
+        if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
+            // Pad the side of the allocation by one unit to allow alignment later
+            mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
+            mScratchSize[threadIndex] = mSizeX;  // NOTE(review): realloc() failure is not checked.
+        }
+        // realloc only aligns to 8 bytes so we manually align to 16.
+        buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
+    }
+    float4 *fout = (float4 *)buf;  // Vertical pass output: one float4 per column of the row.
+    int y = currentY;
+    if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
+        const uchar *pi = mIn + (y - mIradius) * stride;
+        OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
+    } else {
+        x1 = 0;
+        while(mSizeX > x1) {
+            OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
+            fout++;
+            x1++;
+        }
+    }
+
+    x1 = xstart;  // Horizontal pass: only cells in [xstart, xend) are written to out.
+    while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
+        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
+        out++;
+        x1++;
+    }
+#if defined(ARCH_X86_HAVE_SSSE3)
+    if (mUsesSimd) {
+        if ((x1 + mIradius) < x2) {
+            rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
+                                   mIradius * 2 + 1, x1, x2 - mIradius);
+            out += (x2 - mIradius) - x1;
+            x1 = x2 - mIradius;
+        }
+    }
+#endif
+    while(x2 > x1) {
+        OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
+        out++;
+        x1++;
+    }
+}
+
+/**
+ * Full blur of a line of U_8 data.
+ *
+ * @param outPtr Where to store the results
+ * @param xstart The index of the section we're starting to blur.
+ * @param xend  The end index of the section.
+ * @param currentY The index of the line we're blurring.
+ */
+void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uint32_t stride = mSizeX * mVectorSize;
+    uchar *out = (uchar *)outPtr;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd && mSizeX >= 16) {
+        // The specialisation for r<=8 has an awkward prefill case, which is
+        // fiddly to resolve, where starting close to the right edge can cause
+        // a read beyond the end of input.  So avoid that case here.
+        if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
+            rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
+                     stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
+            return;
+        }
+    }
+#endif
+
+    // The vertical pass writes one float per column of the full row (mSizeX values), so rows
+    // wider than the stack buffer are given a heap buffer instead of overrunning the stack.
+    float stackbuf[4 * 2048];
+    std::vector<float> heapbuf(mSizeX > sizeof(stackbuf) / sizeof(stackbuf[0]) ? mSizeX : 0);
+    float *buf = heapbuf.empty() ? stackbuf : heapbuf.data();
+
+    int y = currentY;
+    if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) {
+        const uchar *pi = mIn + (y - mIradius) * stride;
+        OneVFU1(buf, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
+    } else {
+        for (x1 = 0; x1 < mSizeX; x1++) {
+            OneVU1(mSizeY, buf + x1, x1, y, mIn, stride, mFp, mIradius);
+        }
+    }
+
+    x1 = xstart;
+    while ((x1 < x2) &&
+           ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
+        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
+        out++;
+        x1++;
+    }
+#if defined(ARCH_X86_HAVE_SSSE3)
+    if (mUsesSimd) {
+        if ((x1 + mIradius) < x2) {
+            uint32_t len = x2 - (x1 + mIradius);
+            len &= ~3;
+
+            // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
+            // needs to ensure four more values can be accessed in order to avoid accessing
+            // uninitialized buffer.
+            if (len > 4) {
+                len -= 4;
+                rsdIntrinsicBlurHFU1_K(out, buf - mIradius, mFp,
+                                       mIradius * 2 + 1, x1, x1 + len);
+                out += len;
+                x1 += len;
+            }
+        }
+    }
+#endif
+    while(x2 > x1) {
+        OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
+        out++;
+        x1++;
+    }
+}
+
+void BlurTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                           size_t endY) {
+    for (size_t row = startY; row < endY; row++) {  // Blur the tile one image row at a time.
+        uchar* rowOut = outArray + (mSizeX * row + startX) * mVectorSize;
+        if (mVectorSize != 4) {
+            kernelU1(rowOut, startX, endX, row);
+        } else {
+            kernelU4(rowOut, startX, endX, row, threadIndex);
+        }
+    }
+}
+
+void RenderScriptToolkit::blur(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY,
+                               size_t vectorSize, int radius, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+        return;
+    }
+    if (radius <= 0 || radius > 25) {
+        ALOGE("The radius should be between 1 and 25. %d provided.", radius);
+        return;  // Never run the blur with a radius outside the supported range.
+    } else if (vectorSize != 1 && vectorSize != 4) {
+        ALOGE("The vectorSize should be 1 or 4. %zu provided.", vectorSize);
+        return;  // Only the U_8 and RGBA kernels exist; anything else would be misread.
+    }
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    BlurTask task(in, out, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), radius,
+                  restriction);
+    processor->doTask(&task);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Blur_advsimd.S b/toolkit/Blur_advsimd.S
new file mode 100644
index 0000000..6d3cb8d
--- /dev/null
+++ b/toolkit/Blur_advsimd.S
@@ -0,0 +1,1868 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define PRIVATE(f) .text; .align 4; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+//#define ARCH_ARM64_USE_BLUR_PRELOAD
+
+/* Number of fractional bits to preserve in intermediate results.  The
+ * intermediate storage is 16-bit, and we started with 8 bit data (the integer
+ * part), so this should be between 0 and 8.
+ */
+.set FRACTION_BITS, 7
+.set MAX_R, 25
+
+
+/* A quick way of making a single line of code conditional on the assembler
+ * symbol `cc`.  Use `.set cc, 1` or `.set cc, 0` to enable or disable lines
+ * prefixed with `ifcc`; the rest of the line is emitted only when cc is set:
+ */
+.macro ifcc zzz:vararg
+.if cc
+            \zzz
+.endif
+.endm
+
+/* It's not always clear that prefetching is beneficial and this needs further
+ * testing on different cores, so it's made switchable here.
+ */
+#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
+#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
+#else
+#define VERTPLD(...) nop
+#endif
+
+/* Fetch 16 columns of bytes (regardless of image format), convolve these
+ * vertically, and leave them in the register file.  If working near the top or
+ * bottom of an image then clamp the addressing while loading the data in.
+ *
+ * The convolution is fully unrolled for windows up to max_r, with the
+ * outermost edges calculated first.  This way it's possible to branch directly
+ * into the relevant part of the code for an arbitrary convolution radius.  Two
+ * variants of the loop are produced; one eliminates the clamping code for a
+ * slight speed advantage.
+ *
+ * Where the macro is called with reg=x, the specified register is taken to
+ * contain a pre-calculated pointer into one of the two loops.
+ *
+ * Input:
+ *      x1 -- src
+ *      x2 -- pitch
+ *      x5 -- r
+ *      x6 -- rup (r, unless clipped to top of source image)
+ *      x7 -- rdn (r, unless clipped to bottom of source image)
+ *      x12 -- switch index
+ *      v0-v3 -- coefficient table
+ *      x13 = -pitch
+ *      x15 = top-row in
+ *      x19 = bottom-row in
+ * Output:
+ *      x1 += 16
+ *      v10,v11 -- 16 convolved columns
+ * Modifies:
+ *      x10 = upper row pointer
+ *      x11 = lower row pointer
+ *      v12-v15 = temporary sums
+ */
+.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
+  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
+
+            ld1         {v15.16b}, [x1], #16
+            mov         x10, x15
+
+            uxtl        v14.8h, v15.8b
+            VERTPLD(x1, #16)
+            uxtl2       v15.8h, v15.16b
+  .if \max_r < 16 // approximate
+    ifcc    adr         \reg, 1f
+  .else
+    ifcc    adrp        \reg, 1f
+    ifcc    add         \reg, \reg, #:lo12:1f
+  .endif
+
+            umull       v12.4s, v14.4h, v0.h[0]
+    ifcc    sub         \reg, \reg, x5, LSL #6
+            umull2      v13.4s, v14.8h, v0.h[0]
+            mov         x11, x19
+            umull       v14.4s, v15.4h, v0.h[0]
+    ifcc    add         \reg, \reg, x5, LSL #3
+            umull2      v15.4s, v15.8h, v0.h[0]
+            br          \reg
+
+  /* This version of the vertical fetch loop body is used away from the edges
+   * of the source image.  The pointers start at the top and bottom source rows
+   * and work their way towards the centre on each iteration.  This way the
+   * number of taps used can be controlled by jumping directly into the middle
+   * of the loop and running to completion.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_noclamp i, dreg
+    .if 0 < \i && \i <= \max_r
+            ld1         {v10.16b}, [x10], x2    // upper row, then step down by the pitch
+            ld1         {v11.16b}, [x11], x13   // lower row, then step up (x13 = -pitch)
+            uaddl       v16.8h, v10.8b, v11.8b  // both rows share one coefficient, so add first
+            uaddl2      v11.8h, v10.16b, v11.16b
+            umlal       v12.4s, v16.4h, \dreg
+            umlal2      v13.4s, v16.8h, \dreg
+            VERTPLD(x10, #32)
+            umlal       v14.4s, v11.4h, \dreg
+            VERTPLD(x11, #32)
+            umlal2      v15.4s, v11.8h, \dreg
+    .endif
+  .endm
+
+  /* This version of the vertical fetch loop body is used near the edges of the
+   * source image, where one or both of the accesses may start with a clamped
+   * value, and the row addresses only begin to change after some number of
+   * iterations before the end.
+   * Each expansion is 14 instructions (56 bytes), which is what the entry-address
+   * arithmetic in fetch (x5 * 64 - x5 * 8) assumes; keep the two in step.
+   */
+  .macro vertfetch_clamped i, dreg
+    .if 0 < \i && \i <= \max_r
+            ld1         {v10.16b}, [x10], x2
+            cmp         x6, #\i
+            ld1         {v11.16b}, [x11], x13
+            csel        x10, x15, x10, lo       // rup below this tap: hold the upper pointer at x15
+            uaddl       v16.8h, v10.8b, v11.8b
+            cmp         x7, #\i
+            uaddl2      v11.8h, v10.16b, v11.16b
+            csel        x11, x19, x11, lo       // rdn below this tap: hold the lower pointer at x19
+            umlal       v12.4s, v16.4h, \dreg
+            umlal2      v13.4s, v16.8h, \dreg
+            VERTPLD(x10, #32)
+            umlal       v14.4s, v11.4h, \dreg
+            VERTPLD(x11, #32)
+            umlal2      v15.4s, v11.8h, \dreg
+    .endif
+  .endm
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelc at the end of the block.
+   */
+  .align 4
+  vertfetch_clamped 27, v3.h[3]
+  vertfetch_clamped 26, v3.h[2]
+  vertfetch_clamped 25, v3.h[1]
+  vertfetch_clamped 24, v3.h[0]
+  vertfetch_clamped 23, v2.h[7]
+  vertfetch_clamped 22, v2.h[6]
+  vertfetch_clamped 21, v2.h[5]
+  vertfetch_clamped 20, v2.h[4]
+  vertfetch_clamped 19, v2.h[3]
+  vertfetch_clamped 18, v2.h[2]
+  vertfetch_clamped 17, v2.h[1]
+  vertfetch_clamped 16, v2.h[0]
+  vertfetch_clamped 15, v1.h[7]
+  vertfetch_clamped 14, v1.h[6]
+  vertfetch_clamped 13, v1.h[5]
+  vertfetch_clamped 12, v1.h[4]
+  vertfetch_clamped 11, v1.h[3]
+  vertfetch_clamped 10, v1.h[2]
+  vertfetch_clamped  9, v1.h[1]
+  vertfetch_clamped  8, v1.h[0]
+  vertfetch_clamped  7, v0.h[7]
+  vertfetch_clamped  6, v0.h[6]
+  vertfetch_clamped  5, v0.h[5]
+  vertfetch_clamped  4, v0.h[4]
+  vertfetch_clamped  3, v0.h[3]
+  vertfetch_clamped  2, v0.h[2]
+  vertfetch_clamped  1, v0.h[1]
+  vertfetch_clamped  0, v0.h[0]
+  1:
+  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelnc at the end of the block.
+   */
+  .align 4
+  vertfetch_noclamp 27, v3.h[3]
+  vertfetch_noclamp 26, v3.h[2]
+  vertfetch_noclamp 25, v3.h[1]
+  vertfetch_noclamp 24, v3.h[0]
+  vertfetch_noclamp 23, v2.h[7]
+  vertfetch_noclamp 22, v2.h[6]
+  vertfetch_noclamp 21, v2.h[5]
+  vertfetch_noclamp 20, v2.h[4]
+  vertfetch_noclamp 19, v2.h[3]
+  vertfetch_noclamp 18, v2.h[2]
+  vertfetch_noclamp 17, v2.h[1]
+  vertfetch_noclamp 16, v2.h[0]
+  vertfetch_noclamp 15, v1.h[7]
+  vertfetch_noclamp 14, v1.h[6]
+  vertfetch_noclamp 13, v1.h[5]
+  vertfetch_noclamp 12, v1.h[4]
+  vertfetch_noclamp 11, v1.h[3]
+  vertfetch_noclamp 10, v1.h[2]
+  vertfetch_noclamp  9, v1.h[1]
+  vertfetch_noclamp  8, v1.h[0]
+  vertfetch_noclamp  7, v0.h[7]
+  vertfetch_noclamp  6, v0.h[6]
+  vertfetch_noclamp  5, v0.h[5]
+  vertfetch_noclamp  4, v0.h[4]
+  vertfetch_noclamp  3, v0.h[3]
+  vertfetch_noclamp  2, v0.h[2]
+  vertfetch_noclamp  1, v0.h[1]
+  vertfetch_noclamp  0, v0.h[0]
+  \labelnc :
+
+  .purgem vertfetch_clamped
+  .purgem vertfetch_noclamp
+
+  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
+            add         x15, x15, #16
+            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
+            add         x19, x19, #16
+            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
+            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
+.endm /*}}}*/
+
+/* Some portion of the convolution window (as much as will fit, and all of it
+ * for the uchar1 cases) is kept in the register file to avoid unnecessary
+ * memory accesses.  This forces the horizontal loops to be unrolled because
+ * there's no indexed addressing into the register file.
+ *
+ * As in the fetch macro, the operations are ordered from outside to inside, so
+ * that jumping into the middle of the block bypasses the unwanted window taps.
+ *
+ * There are several variants of the macro because of the fixed offsets of the
+ * taps -- the wider the maximum radius the further the centre tap is from the
+ * most recently fetched data.  This means that pre-filling the window requires
+ * more data that won't be used and it means that rotating the window involves
+ * more mov operations.
+ *
+ * When the buffer gets too big the buffer at [x9] is used.
+ *
+ * Input:
+ *      v16-v31,v4-v11 -- convolution window
+ *      x9 -- pointer to additional convolution window data
+ * Output:
+ *      x9 -- updated buffer pointer (if used)
+ *      v15.8b -- result to be stored (as left by hconv1_8; NOTE(review): confirm for wider variants)
+ * Modifies:
+ *      x12 -- temp buffer pointer
+ *      v12-v13 -- temporaries for load and vext operations.
+ *      v14-v15 -- intermediate sums
+ */
+#define TUNED_LIST1 8, 16
+.macro hconv1_8/*{{{*/
+
+.rodata
+    200:    .hword -4                   // table of offsets from label 100, indexed by x5
+            .hword 101f-100f
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .hword 107f-100f
+            .hword 108f-100f
+            .align      4
+.text
+            umull       v14.4s, v9.4h, v0.h[0]  // start both sums with coefficient 0 applied to v9
+            umull2      v15.4s, v9.8h, v0.h[0]
+
+            adrp        x16, 200b
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]  // x12 = table[x5], a signed halfword offset
+            adr         x16, 100f
+            add         x12, x12, x16
+    100:    br          x12                     // enter the unrolled taps at the label chosen by x5
+    108:    umlal       v14.4s, v8.4h, v1.h[0]
+            umlal2      v15.4s, v8.8h, v1.h[0]
+            umlal       v14.4s, v10.4h, v1.h[0]
+            umlal2      v15.4s, v10.8h, v1.h[0]
+    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
+            ext         v13.16b, v9.16b, v10.16b, #7*2
+            umlal       v14.4s, v12.4h, v0.h[7]
+            umlal2      v15.4s, v12.8h, v0.h[7]
+            umlal       v14.4s, v13.4h, v0.h[7]
+            umlal2      v15.4s, v13.8h, v0.h[7]
+    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
+            ext         v13.16b, v9.16b, v10.16b, #6*2
+            umlal       v14.4s, v12.4h, v0.h[6]
+            umlal2      v15.4s, v12.8h, v0.h[6]
+            umlal       v14.4s, v13.4h, v0.h[6]
+            umlal2      v15.4s, v13.8h, v0.h[6]
+    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
+            ext         v13.16b, v9.16b, v10.16b, #5*2
+            umlal       v14.4s, v12.4h, v0.h[5]
+            umlal2      v15.4s, v12.8h, v0.h[5]
+            umlal       v14.4s, v13.4h, v0.h[5]
+            umlal2      v15.4s, v13.8h, v0.h[5]
+    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
+            //ext         v13.16b, v9.16b, v10.16b, #4*2
+            umlal2      v14.4s, v8.8h, v0.h[4]  // a shift of four lanes needs no ext: use the register halves
+            umlal       v15.4s, v9.4h, v0.h[4]
+            umlal2      v14.4s, v9.8h, v0.h[4]
+            umlal       v15.4s, v10.4h, v0.h[4]
+    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
+            ext         v13.16b, v9.16b, v10.16b, #3*2
+            umlal       v14.4s, v12.4h, v0.h[3]
+            umlal2      v15.4s, v12.8h, v0.h[3]
+            umlal       v14.4s, v13.4h, v0.h[3]
+            umlal2      v15.4s, v13.8h, v0.h[3]
+    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
+            ext         v13.16b, v9.16b, v10.16b, #2*2
+            umlal       v14.4s, v12.4h, v0.h[2]
+            umlal2      v15.4s, v12.8h, v0.h[2]
+            umlal       v14.4s, v13.4h, v0.h[2]
+            umlal2      v15.4s, v13.8h, v0.h[2]
+    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
+            ext         v13.16b, v9.16b, v10.16b, #1*2
+            umlal       v14.4s, v12.4h, v0.h[1]
+            umlal2      v15.4s, v12.8h, v0.h[1]
+            umlal       v14.4s, v13.4h, v0.h[1]
+            umlal2      v15.4s, v13.8h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16     // narrow the 32-bit sums to 16 bits
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS  // then to 8-bit results in v15
+
+            mov         v8.16b, v9.16b          // rotate the window: v8 <- v9 <- v10 <- v11
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b
+.endm/*}}}*/
+
+.macro hconv1_16/*{{{*/ /* Weighted sum of v8 and of the vectors 1..x5 halfwords before and after it (x5 up to 16); the 8 narrowed results are left in v15.8b. Presumably the 1-channel horizontal pass for mid-sized radii -- TODO confirm against the callers. */
+.rodata  // jump table is emitted into read-only data, separate from the code below
+   200:     .hword -4  // entry 0: 4 bytes before label 100; NOTE(review): presumably x5 is never 0 here -- confirm
+            .hword 101f-100f  // entry k (k = 1..16): byte offset of label 100+k from label 100
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .hword 107f-100f
+            .hword 108f-100f
+            .hword 109f-100f
+            .hword 110f-100f
+            .hword 111f-100f
+            .hword 112f-100f
+            .hword 113f-100f
+            .hword 114f-100f
+            .hword 115f-100f
+            .hword 116f-100f
+            .align 4
+
+.text
+            umull       v14.4s, v8.4h, v0.h[0]  // v14 (elements 0..3) and v15 (elements 4..7) are the 32-bit accumulators,
+            umull2      v15.4s, v8.8h, v0.h[0]  // seeded with the centre vector v8 times coefficient v0.h[0]
+
+            adrp        x16, 200b  // x16 = address of the jump table (page part, then low 12 bits)
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]  // x12 = sign-extended halfword entry number x5
+            adr         x16, 100f
+            add         x12, x12, x16  // x12 = absolute address of label 100+x5
+    100:    br          x12  // enter the unrolled chain at tap x5; execution falls through down to tap 1
+    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
+            //ext         v13.16b, v10.16b, v11.16b, #0*2
+            umlal       v14.4s, v6.4h, v2.h[0]  // tap 16: whole registers v6 and v10 sit exactly 16 halfwords before/after v8, so no ext is needed
+            umlal2      v15.4s, v6.8h, v2.h[0]
+            umlal       v14.4s, v10.4h, v2.h[0]
+            umlal2      v15.4s, v10.8h, v2.h[0]
+    115:    ext         v12.16b, v6.16b, v7.16b, #1*2  // tap 15: v12 = the 8 halfwords starting 15 before v8
+            ext         v13.16b, v9.16b, v10.16b, #7*2  // v13 = the 8 halfwords starting 15 after v8
+            umlal       v14.4s, v12.4h, v1.h[7]  // both sides share one coefficient: tap k uses v0.h[k], v1.h[k-8] or v2.h[k-16]
+            umlal2      v15.4s, v12.8h, v1.h[7]
+            umlal       v14.4s, v13.4h, v1.h[7]
+            umlal2      v15.4s, v13.8h, v1.h[7]
+    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
+            ext         v13.16b, v9.16b, v10.16b, #6*2
+            umlal       v14.4s, v12.4h, v1.h[6]
+            umlal2      v15.4s, v12.8h, v1.h[6]
+            umlal       v14.4s, v13.4h, v1.h[6]
+            umlal2      v15.4s, v13.8h, v1.h[6]
+    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
+            ext         v13.16b, v9.16b, v10.16b, #5*2
+            umlal       v14.4s, v12.4h, v1.h[5]
+            umlal2      v15.4s, v12.8h, v1.h[5]
+            umlal       v14.4s, v13.4h, v1.h[5]
+            umlal2      v15.4s, v13.8h, v1.h[5]
+    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
+            //ext         v13.16b, v9.16b, v10.16b, #4*2
+            umlal2      v14.4s, v6.8h, v1.h[4]  // tap 12: a 4-halfword shift is done by pairing a high half (umlal2) with the next low half (umlal) instead of ext
+            umlal       v15.4s, v7.4h, v1.h[4]
+            umlal2      v14.4s, v9.8h, v1.h[4]
+            umlal       v15.4s, v10.4h, v1.h[4]
+    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
+            ext         v13.16b, v9.16b, v10.16b, #3*2
+            umlal       v14.4s, v12.4h, v1.h[3]
+            umlal2      v15.4s, v12.8h, v1.h[3]
+            umlal       v14.4s, v13.4h, v1.h[3]
+            umlal2      v15.4s, v13.8h, v1.h[3]
+    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
+            ext         v13.16b, v9.16b, v10.16b, #2*2
+            umlal       v14.4s, v12.4h, v1.h[2]
+            umlal2      v15.4s, v12.8h, v1.h[2]
+            umlal       v14.4s, v13.4h, v1.h[2]
+            umlal2      v15.4s, v13.8h, v1.h[2]
+    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
+            ext         v13.16b, v9.16b, v10.16b, #1*2
+            umlal       v14.4s, v12.4h, v1.h[1]
+            umlal2      v15.4s, v12.8h, v1.h[1]
+            umlal       v14.4s, v13.4h, v1.h[1]
+            umlal2      v15.4s, v13.8h, v1.h[1]
+    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
+            //ext         v13.16b, v9.16b, v10.16b, #0*2
+            umlal       v14.4s, v7.4h, v1.h[0]  // tap 8: whole registers v7 and v9 are the neighbours 8 halfwords away
+            umlal2      v15.4s, v7.8h, v1.h[0]
+            umlal       v14.4s, v9.4h, v1.h[0]
+            umlal2      v15.4s, v9.8h, v1.h[0]
+    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
+            ext         v13.16b, v8.16b, v9.16b, #7*2
+            umlal       v14.4s, v12.4h, v0.h[7]
+            umlal2      v15.4s, v12.8h, v0.h[7]
+            umlal       v14.4s, v13.4h, v0.h[7]
+            umlal2      v15.4s, v13.8h, v0.h[7]
+    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
+            ext         v13.16b, v8.16b, v9.16b, #6*2
+            umlal       v14.4s, v12.4h, v0.h[6]
+            umlal2      v15.4s, v12.8h, v0.h[6]
+            umlal       v14.4s, v13.4h, v0.h[6]
+            umlal2      v15.4s, v13.8h, v0.h[6]
+    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
+            ext         v13.16b, v8.16b, v9.16b, #5*2
+            umlal       v14.4s, v12.4h, v0.h[5]
+            umlal2      v15.4s, v12.8h, v0.h[5]
+            umlal       v14.4s, v13.4h, v0.h[5]
+            umlal2      v15.4s, v13.8h, v0.h[5]
+    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
+            //ext         v13.16b, v8.16b, v9.16b, #4*2
+            umlal2      v14.4s, v7.8h, v0.h[4]
+            umlal       v15.4s, v8.4h, v0.h[4]
+            umlal2      v14.4s, v8.8h, v0.h[4]
+            umlal       v15.4s, v9.4h, v0.h[4]
+    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
+            ext         v13.16b, v8.16b, v9.16b, #3*2
+            umlal       v14.4s, v12.4h, v0.h[3]
+            umlal2      v15.4s, v12.8h, v0.h[3]
+            umlal       v14.4s, v13.4h, v0.h[3]
+            umlal2      v15.4s, v13.8h, v0.h[3]
+    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
+            ext         v13.16b, v8.16b, v9.16b, #2*2
+            umlal       v14.4s, v12.4h, v0.h[2]
+            umlal2      v15.4s, v12.8h, v0.h[2]
+            umlal       v14.4s, v13.4h, v0.h[2]
+            umlal2      v15.4s, v13.8h, v0.h[2]
+    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
+            ext         v13.16b, v8.16b, v9.16b, #1*2
+            umlal       v14.4s, v12.4h, v0.h[1]
+            umlal2      v15.4s, v12.8h, v0.h[1]
+            umlal       v14.4s, v13.4h, v0.h[1]
+            umlal2      v15.4s, v13.8h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16  // round, saturate and narrow the sums from 32 to 16 bits (shift right by 16)
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS  // narrow 16 to 8 bits, shifting out FRACTION_BITS more; result in v15.8b
+
+            mov         v6.16b, v7.16b  // slide the v6..v10 window down one register
+            mov         v7.16b, v8.16b
+            mov         v8.16b, v9.16b
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b  // v11 is not written here; presumably the caller refills it with the next 8 inputs -- TODO confirm
+.endm/*}}}*/
+
+.macro hconv1_25/*{{{*/ /* Weighted sum of a centre vector (v6.h[7], v7.h[0..6]) and of the vectors 1..x5 halfwords before and after it (x5 up to 25); the 8 narrowed results are left in v15.8b. Presumably the 1-channel horizontal pass for the largest radii -- TODO confirm against the callers. */
+.rodata  // jump table is emitted into read-only data, separate from the code below
+   200:     .hword -4  // entry 0: 4 bytes before label 100; NOTE(review): presumably x5 is never 0 here -- confirm
+            .hword 101f-100f  // entry k (k = 1..25): byte offset of label 100+k from label 100
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .hword 107f-100f
+            .hword 108f-100f
+            .hword 109f-100f
+            .hword 110f-100f
+            .hword 111f-100f
+            .hword 112f-100f
+            .hword 113f-100f
+            .hword 114f-100f
+            .hword 115f-100f
+            .hword 116f-100f
+            .hword 117f-100f
+            .hword 118f-100f
+            .hword 119f-100f
+            .hword 120f-100f
+            .hword 121f-100f
+            .hword 122f-100f
+            .hword 123f-100f
+            .hword 124f-100f
+            .hword 125f-100f
+            .align 4
+.text
+            ext         v12.16b, v6.16b, v7.16b, #7*2  // centre vector straddles two registers: v6.h[7] followed by v7.h[0..6]
+            umull       v14.4s, v12.4h, v0.h[0]  // v14 (elements 0..3) and v15 (elements 4..7) are the 32-bit accumulators,
+            umull2      v15.4s, v12.8h, v0.h[0]  // seeded with the centre vector times coefficient v0.h[0]
+
+            adrp        x16, 200b  // x16 = address of the jump table (page part, then low 12 bits)
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]  // x12 = sign-extended halfword entry number x5
+            adr         x16, 100f
+            add         x12, x12, x16  // x12 = absolute address of label 100+x5
+    100:    br          x12  // enter the unrolled chain at tap x5; execution falls through down to tap 1
+    125:    ext         v12.16b, v31.16b, v4.16b, #6*2  // tap 25: v12 = the 8 halfwords starting 25 before the centre (window order is v31, v4, v5, ... v10)
+            ext         v13.16b, v10.16b, v11.16b, #0*2  // v13 = the 8 halfwords starting 25 after the centre (a plain copy of v10)
+            umlal       v14.4s, v12.4h, v3.h[1]  // both sides share one coefficient: tap k uses v0.h[k], v1.h[k-8], v2.h[k-16] or v3.h[k-24]
+            umlal2      v15.4s, v12.8h, v3.h[1]
+            umlal       v14.4s, v13.4h, v3.h[1]
+            umlal2      v15.4s, v13.8h, v3.h[1]
+    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
+            ext         v13.16b, v9.16b, v10.16b, #7*2
+            umlal       v14.4s, v12.4h, v3.h[0]
+            umlal2      v15.4s, v12.8h, v3.h[0]
+            umlal       v14.4s, v13.4h, v3.h[0]
+            umlal2      v15.4s, v13.8h, v3.h[0]
+    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
+            ext         v13.16b, v9.16b, v10.16b, #6*2
+            umlal       v14.4s, v12.4h, v2.h[7]
+            umlal2      v15.4s, v12.8h, v2.h[7]
+            umlal       v14.4s, v13.4h, v2.h[7]
+            umlal2      v15.4s, v13.8h, v2.h[7]
+    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
+            ext         v13.16b, v9.16b, v10.16b, #5*2
+            umlal       v14.4s, v12.4h, v2.h[6]
+            umlal2      v15.4s, v12.8h, v2.h[6]
+            umlal       v14.4s, v13.4h, v2.h[6]
+            umlal2      v15.4s, v13.8h, v2.h[6]
+    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
+            ext         v13.16b, v9.16b, v10.16b, #4*2
+            umlal       v14.4s, v12.4h, v2.h[5]
+            umlal2      v15.4s, v12.8h, v2.h[5]
+            umlal       v14.4s, v13.4h, v2.h[5]
+            umlal2      v15.4s, v13.8h, v2.h[5]
+    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
+            ext         v13.16b, v9.16b, v10.16b, #3*2
+            umlal       v14.4s, v12.4h, v2.h[4]
+            umlal2      v15.4s, v12.8h, v2.h[4]
+            umlal       v14.4s, v13.4h, v2.h[4]
+            umlal2      v15.4s, v13.8h, v2.h[4]
+    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
+            ext         v13.16b, v9.16b, v10.16b, #2*2
+            umlal       v14.4s, v12.4h, v2.h[3]
+            umlal2      v15.4s, v12.8h, v2.h[3]
+            umlal       v14.4s, v13.4h, v2.h[3]
+            umlal2      v15.4s, v13.8h, v2.h[3]
+    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
+            ext         v13.16b, v9.16b, v10.16b, #1*2
+            umlal       v14.4s, v12.4h, v2.h[2]
+            umlal2      v15.4s, v12.8h, v2.h[2]
+            umlal       v14.4s, v13.4h, v2.h[2]
+            umlal2      v15.4s, v13.8h, v2.h[2]
+    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
+            ext         v13.16b, v9.16b, v10.16b, #0*2
+            umlal       v14.4s, v12.4h, v2.h[1]
+            umlal2      v15.4s, v12.8h, v2.h[1]
+            umlal       v14.4s, v13.4h, v2.h[1]
+            umlal2      v15.4s, v13.8h, v2.h[1]
+    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
+            ext         v13.16b, v8.16b, v9.16b, #7*2
+            umlal       v14.4s, v12.4h, v2.h[0]
+            umlal2      v15.4s, v12.8h, v2.h[0]
+            umlal       v14.4s, v13.4h, v2.h[0]
+            umlal2      v15.4s, v13.8h, v2.h[0]
+    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
+            ext         v13.16b, v8.16b, v9.16b, #6*2
+            umlal       v14.4s, v12.4h, v1.h[7]
+            umlal2      v15.4s, v12.8h, v1.h[7]
+            umlal       v14.4s, v13.4h, v1.h[7]
+            umlal2      v15.4s, v13.8h, v1.h[7]
+    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
+            ext         v13.16b, v8.16b, v9.16b, #5*2
+            umlal       v14.4s, v12.4h, v1.h[6]
+            umlal2      v15.4s, v12.8h, v1.h[6]
+            umlal       v14.4s, v13.4h, v1.h[6]
+            umlal2      v15.4s, v13.8h, v1.h[6]
+    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
+            ext         v13.16b, v8.16b, v9.16b, #4*2
+            umlal       v14.4s, v12.4h, v1.h[5]
+            umlal2      v15.4s, v12.8h, v1.h[5]
+            umlal       v14.4s, v13.4h, v1.h[5]
+            umlal2      v15.4s, v13.8h, v1.h[5]
+    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
+            ext         v13.16b, v8.16b, v9.16b, #3*2
+            umlal       v14.4s, v12.4h, v1.h[4]
+            umlal2      v15.4s, v12.8h, v1.h[4]
+            umlal       v14.4s, v13.4h, v1.h[4]
+            umlal2      v15.4s, v13.8h, v1.h[4]
+    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
+            ext         v13.16b, v8.16b, v9.16b, #2*2
+            umlal       v14.4s, v12.4h, v1.h[3]
+            umlal2      v15.4s, v12.8h, v1.h[3]
+            umlal       v14.4s, v13.4h, v1.h[3]
+            umlal2      v15.4s, v13.8h, v1.h[3]
+    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
+            ext         v13.16b, v8.16b, v9.16b, #1*2
+            umlal       v14.4s, v12.4h, v1.h[2]
+            umlal2      v15.4s, v12.8h, v1.h[2]
+            umlal       v14.4s, v13.4h, v1.h[2]
+            umlal2      v15.4s, v13.8h, v1.h[2]
+    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
+            ext         v13.16b, v8.16b, v9.16b, #0*2
+            umlal       v14.4s, v12.4h, v1.h[1]
+            umlal2      v15.4s, v12.8h, v1.h[1]
+            umlal       v14.4s, v13.4h, v1.h[1]
+            umlal2      v15.4s, v13.8h, v1.h[1]
+    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
+            ext         v13.16b, v7.16b, v8.16b, #7*2
+            umlal       v14.4s, v12.4h, v1.h[0]
+            umlal2      v15.4s, v12.8h, v1.h[0]
+            umlal       v14.4s, v13.4h, v1.h[0]
+            umlal2      v15.4s, v13.8h, v1.h[0]
+    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
+            ext         v13.16b, v7.16b, v8.16b, #6*2
+            umlal       v14.4s, v12.4h, v0.h[7]
+            umlal2      v15.4s, v12.8h, v0.h[7]
+            umlal       v14.4s, v13.4h, v0.h[7]
+            umlal2      v15.4s, v13.8h, v0.h[7]
+    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
+            ext         v13.16b, v7.16b, v8.16b, #5*2
+            umlal       v14.4s, v12.4h, v0.h[6]
+            umlal2      v15.4s, v12.8h, v0.h[6]
+            umlal       v14.4s, v13.4h, v0.h[6]
+            umlal2      v15.4s, v13.8h, v0.h[6]
+    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
+            ext         v13.16b, v7.16b, v8.16b, #4*2
+            umlal       v14.4s, v12.4h, v0.h[5]
+            umlal2      v15.4s, v12.8h, v0.h[5]
+            umlal       v14.4s, v13.4h, v0.h[5]
+            umlal2      v15.4s, v13.8h, v0.h[5]
+    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
+            ext         v13.16b, v7.16b, v8.16b, #3*2
+            umlal       v14.4s, v12.4h, v0.h[4]
+            umlal2      v15.4s, v12.8h, v0.h[4]
+            umlal       v14.4s, v13.4h, v0.h[4]
+            umlal2      v15.4s, v13.8h, v0.h[4]
+    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
+            ext         v13.16b, v7.16b, v8.16b, #2*2
+            umlal       v14.4s, v12.4h, v0.h[3]
+            umlal2      v15.4s, v12.8h, v0.h[3]
+            umlal       v14.4s, v13.4h, v0.h[3]
+            umlal2      v15.4s, v13.8h, v0.h[3]
+    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
+            ext         v13.16b, v7.16b, v8.16b, #1*2
+            umlal       v14.4s, v12.4h, v0.h[2]
+            umlal2      v15.4s, v12.8h, v0.h[2]
+            umlal       v14.4s, v13.4h, v0.h[2]
+            umlal2      v15.4s, v13.8h, v0.h[2]
+    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
+            ext         v13.16b, v7.16b, v8.16b, #0*2
+            umlal       v14.4s, v12.4h, v0.h[1]
+            umlal2      v15.4s, v12.8h, v0.h[1]
+            umlal       v14.4s, v13.4h, v0.h[1]
+            umlal2      v15.4s, v13.8h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16  // round, saturate and narrow the sums from 32 to 16 bits (shift right by 16)
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS  // narrow 16 to 8 bits, shifting out FRACTION_BITS more; result in v15.8b
+
+            mov         v31.16b, v4.16b  // slide the v31, v4..v10 window down one register
+            mov         v4.16b, v5.16b
+            mov         v5.16b, v6.16b
+            mov         v6.16b, v7.16b
+            mov         v7.16b, v8.16b
+            mov         v8.16b, v9.16b
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b  // v11 is not written here; presumably the caller refills it with the next 8 inputs -- TODO confirm
+.endm/*}}}*/
+
+#define TUNED_LIST4 6, 12, 20 /* The values match the hconv4_6, hconv4_12 and hconv4_20 macros defined below; NOTE(review): presumably the list of size-specialised 4-lane variants consumed elsewhere in the file -- confirm at the use site. */
+.macro hconv4_6/*{{{*/ /* Weighted sum of v7 and of the vectors 1..x5 steps before and after it (x5 up to 6), one step being 4 halfwords (half a register); the 8 narrowed results are left in v15.8b. Presumably the 4-channel horizontal pass for small radii -- TODO confirm against the callers. */
+.rodata  // jump table is emitted into read-only data, separate from the code below
+   200:     .hword -4  // entry 0: 4 bytes before label 100; NOTE(review): presumably x5 is never 0 here -- confirm
+            .hword 101f-100f  // entry k (k = 1..6): byte offset of label 100+k from label 100
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .align      4
+.text
+            umull       v14.4s, v7.4h, v0.h[0]  // v14 (elements 0..3) and v15 (elements 4..7) are the 32-bit accumulators,
+            umull2      v15.4s, v7.8h, v0.h[0]  // seeded with the centre vector v7 times coefficient v0.h[0]
+
+            adrp        x16, 200b  // x16 = address of the jump table (page part, then low 12 bits)
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]  // x12 = sign-extended halfword entry number x5
+            adr         x16, 100f
+            add         x12, x12, x16  // x12 = absolute address of label 100+x5
+    100:    br          x12  // enter the unrolled chain at tap x5; execution falls through down to tap 1
+    106:    umlal       v14.4s, v4.4h,  v0.h[6]  // even taps: a whole register on each side (v4 / v10 are 6*4 halfwords before / after v7)
+            umlal2      v15.4s, v4.8h,  v0.h[6]
+            umlal       v14.4s, v10.4h, v0.h[6]
+            umlal2      v15.4s, v10.8h, v0.h[6]
+    105:    umlal2      v14.4s, v4.8h,  v0.h[5]  // odd taps straddle two registers: high half of one feeds v14, low half of the next feeds v15
+            umlal       v15.4s, v5.4h, v0.h[5]
+            umlal2      v14.4s, v9.8h, v0.h[5]
+            umlal       v15.4s, v10.4h, v0.h[5]
+    104:    umlal       v14.4s, v5.4h, v0.h[4]
+            umlal2      v15.4s, v5.8h, v0.h[4]
+            umlal       v14.4s, v9.4h, v0.h[4]
+            umlal2      v15.4s, v9.8h, v0.h[4]
+    103:    umlal2      v14.4s, v5.8h, v0.h[3]
+            umlal       v15.4s, v6.4h, v0.h[3]
+            umlal2      v14.4s, v8.8h, v0.h[3]
+            umlal       v15.4s, v9.4h, v0.h[3]
+    102:    umlal       v14.4s, v6.4h, v0.h[2]
+            umlal2      v15.4s, v6.8h, v0.h[2]
+            umlal       v14.4s, v8.4h, v0.h[2]
+            umlal2      v15.4s, v8.8h, v0.h[2]
+    101:    umlal2      v14.4s, v6.8h, v0.h[1]
+            umlal       v15.4s, v7.4h, v0.h[1]
+            umlal2      v14.4s, v7.8h, v0.h[1]
+            umlal       v15.4s, v8.4h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16  // round, saturate and narrow the sums from 32 to 16 bits (shift right by 16)
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS  // narrow 16 to 8 bits, shifting out FRACTION_BITS more; result in v15.8b
+
+            mov         v4.16b, v5.16b  // slide the v4..v10 window down one register
+            mov         v5.16b, v6.16b
+            mov         v6.16b, v7.16b
+            mov         v7.16b, v8.16b
+            mov         v8.16b, v9.16b
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b  // v11 is not written here; presumably the caller refills it with the next 8 inputs -- TODO confirm
+.endm/*}}}*/
+
+.macro hconv4_12/*{{{*/ /* Weighted sum of v4 and of the vectors 1..x5 steps before and after it (x5 up to 12), one step being 4 halfwords (half a register); the 8 narrowed results are left in v15.8b. Presumably the 4-channel horizontal pass for mid-sized radii -- TODO confirm against the callers. */
+.rodata  // jump table is emitted into read-only data, separate from the code below
+   200:     .hword -4 //Might need to remove these...
+            .hword 101f-100f  // entry k (k = 1..12): byte offset of label 100+k from label 100
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .hword 107f-100f
+            .hword 108f-100f
+            .hword 109f-100f
+            .hword 110f-100f
+            .hword 111f-100f
+            .hword 112f-100f
+            .align 4
+.text
+            umull       v14.4s, v4.4h, v0.h[0]  // v14 (elements 0..3) and v15 (elements 4..7) are the 32-bit accumulators,
+            umull2      v15.4s, v4.8h, v0.h[0]  // seeded with the centre vector v4 times coefficient v0.h[0]
+
+            adrp        x16, 200b  // x16 = address of the jump table (page part, then low 12 bits)
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]  // x12 = sign-extended halfword entry number x5
+            adr         x16, 100f
+            add         x12, x12, x16  // x12 = absolute address of label 100+x5
+    100:    br          x12  // enter the unrolled chain at tap x5; execution falls through down to tap 1
+    112:    umlal       v14.4s, v26.4h, v1.h[4]  // even taps: a whole register on each side (v26 / v10 are 12*4 halfwords before / after v4; window order is v26..v31, v4..v10)
+            umlal2      v15.4s, v26.8h, v1.h[4]
+            umlal       v14.4s, v10.4h, v1.h[4]
+            umlal2      v15.4s, v10.8h, v1.h[4]
+    111:    umlal2      v14.4s, v26.8h, v1.h[3]  // odd taps straddle two registers: high half of one feeds v14, low half of the next feeds v15
+            umlal       v15.4s, v27.4h, v1.h[3]
+            umlal2      v14.4s, v9.8h, v1.h[3]
+            umlal       v15.4s, v10.4h, v1.h[3]
+    110:    umlal       v14.4s, v27.4h, v1.h[2]
+            umlal2      v15.4s, v27.8h, v1.h[2]
+            umlal       v14.4s, v9.4h, v1.h[2]
+            umlal2      v15.4s, v9.8h, v1.h[2]
+    109:    umlal2      v14.4s, v27.8h, v1.h[1]
+            umlal       v15.4s, v28.4h, v1.h[1]
+            umlal2      v14.4s, v8.8h, v1.h[1]
+            umlal       v15.4s, v9.4h, v1.h[1]
+    108:    umlal       v14.4s, v28.4h, v1.h[0]
+            umlal2      v15.4s, v28.8h, v1.h[0]
+            umlal       v14.4s, v8.4h, v1.h[0]
+            umlal2      v15.4s, v8.8h, v1.h[0]
+    107:    umlal2      v14.4s, v28.8h, v0.h[7]
+            umlal       v15.4s, v29.4h, v0.h[7]
+            umlal2      v14.4s, v7.8h, v0.h[7]
+            umlal       v15.4s, v8.4h, v0.h[7]
+    106:    umlal       v14.4s, v29.4h, v0.h[6]
+            umlal2      v15.4s, v29.8h, v0.h[6]
+            umlal       v14.4s, v7.4h, v0.h[6]
+            umlal2      v15.4s, v7.8h, v0.h[6]
+    105:    umlal2      v14.4s, v29.8h, v0.h[5]
+            umlal       v15.4s, v30.4h, v0.h[5]
+            umlal2      v14.4s, v6.8h, v0.h[5]
+            umlal       v15.4s, v7.4h, v0.h[5]
+    104:    umlal       v14.4s, v30.4h, v0.h[4]
+            umlal2      v15.4s, v30.8h, v0.h[4]
+            umlal       v14.4s, v6.4h, v0.h[4]
+            umlal2      v15.4s, v6.8h, v0.h[4]
+    103:    umlal2      v14.4s, v30.8h, v0.h[3]
+            umlal       v15.4s, v31.4h, v0.h[3]
+            umlal2      v14.4s, v5.8h, v0.h[3]
+            umlal       v15.4s, v6.4h, v0.h[3]
+    102:    umlal       v14.4s, v31.4h, v0.h[2]
+            umlal2      v15.4s, v31.8h, v0.h[2]
+            umlal       v14.4s, v5.4h, v0.h[2]
+            umlal2      v15.4s, v5.8h, v0.h[2]
+    101:    umlal2      v14.4s, v31.8h, v0.h[1]
+            umlal       v15.4s, v4.4h,  v0.h[1]
+            umlal2      v14.4s, v4.8h,  v0.h[1]
+            umlal       v15.4s, v5.4h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16  // round, saturate and narrow the sums from 32 to 16 bits (shift right by 16)
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS  // narrow 16 to 8 bits, shifting out FRACTION_BITS more; result in v15.8b
+
+            mov         v26.16b, v27.16b  // slide the v26..v31, v4..v10 window down one register
+            mov         v27.16b, v28.16b
+            mov         v28.16b, v29.16b
+            mov         v29.16b, v30.16b
+            mov         v30.16b, v31.16b
+            mov         v31.16b, v4.16b
+            mov         v4.16b, v5.16b
+            mov         v5.16b, v6.16b
+            mov         v6.16b, v7.16b
+            mov         v7.16b, v8.16b
+            mov         v8.16b, v9.16b
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b  // v11 is not written here; presumably the caller refills it with the next 8 inputs -- TODO confirm
+.endm/*}}}*/
+
+.macro hconv4_20/*{{{*/ /* Weighted sum of v28 and of the vectors 1..x5 steps before and after it (x5 up to 20), one step being 4 halfwords (half a register); the 8 narrowed results are left in v15.8b. Presumably the 4-channel horizontal pass for larger radii -- TODO confirm against the callers. */
+.rodata  // jump table is emitted into read-only data, separate from the code below
+   200:     .hword -4  // entry 0: 4 bytes before label 100; NOTE(review): presumably x5 is never 0 here -- confirm
+            .hword 101f-100f  // entry k (k = 1..20): byte offset of label 100+k from label 100
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .hword 107f-100f
+            .hword 108f-100f
+            .hword 109f-100f
+            .hword 110f-100f
+            .hword 111f-100f
+            .hword 112f-100f
+            .hword 113f-100f
+            .hword 114f-100f
+            .hword 115f-100f
+            .hword 116f-100f
+            .hword 117f-100f
+            .hword 118f-100f
+            .hword 119f-100f
+            .hword 120f-100f
+            .align 4
+.text
+            umull       v14.4s, v28.4h, v0.h[0]  // v14 (elements 0..3) and v15 (elements 4..7) are the 32-bit accumulators,
+            umull2      v15.4s, v28.8h, v0.h[0]  // seeded with the centre vector v28 times coefficient v0.h[0]
+
+            adrp        x16, 200b  // x16 = address of the jump table (page part, then low 12 bits)
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]  // x12 = sign-extended halfword entry number x5
+            adr         x16, 100f
+            add         x12, x12, x16  // x12 = absolute address of label 100+x5
+    100:    br          x12  // enter the unrolled chain at tap x5; execution falls through down to tap 1
+    120:    umlal       v14.4s, v18.4h, v2.h[4]  // even taps: a whole register on each side (v18 / v10 are 20*4 halfwords before / after v28; window order is v18..v31, v4..v10)
+            umlal2      v15.4s, v18.8h, v2.h[4]
+            umlal       v14.4s, v10.4h, v2.h[4]
+            umlal2      v15.4s, v10.8h, v2.h[4]
+    119:    umlal2      v14.4s, v18.8h, v2.h[3]  // odd taps straddle two registers: high half of one feeds v14, low half of the next feeds v15
+            umlal       v15.4s, v19.4h, v2.h[3]
+            umlal2      v14.4s, v9.8h,  v2.h[3]
+            umlal       v15.4s, v10.4h, v2.h[3]
+    118:    umlal       v14.4s, v19.4h, v2.h[2]
+            umlal2      v15.4s, v19.8h, v2.h[2]
+            umlal       v14.4s, v9.4h,  v2.h[2]
+            umlal2      v15.4s, v9.8h,  v2.h[2]
+    117:    umlal2      v14.4s, v19.8h, v2.h[1]
+            umlal       v15.4s, v20.4h, v2.h[1]
+            umlal2      v14.4s, v8.8h,  v2.h[1]
+            umlal       v15.4s, v9.4h,  v2.h[1]
+    116:    umlal       v14.4s, v20.4h, v2.h[0]
+            umlal2      v15.4s, v20.8h, v2.h[0]
+            umlal       v14.4s, v8.4h,  v2.h[0]
+            umlal2      v15.4s, v8.8h,  v2.h[0]
+    115:    umlal2      v14.4s, v20.8h, v1.h[7]
+            umlal       v15.4s, v21.4h, v1.h[7]
+            umlal2      v14.4s, v7.8h,  v1.h[7]
+            umlal       v15.4s, v8.4h,  v1.h[7]
+    114:    umlal       v14.4s, v21.4h, v1.h[6]
+            umlal2      v15.4s, v21.8h, v1.h[6]
+            umlal       v14.4s, v7.4h,  v1.h[6]
+            umlal2      v15.4s, v7.8h,  v1.h[6]
+    113:    umlal2      v14.4s, v21.8h, v1.h[5]
+            umlal       v15.4s, v22.4h, v1.h[5]
+            umlal2      v14.4s, v6.8h,  v1.h[5]
+            umlal       v15.4s, v7.4h,  v1.h[5]
+    112:    umlal       v14.4s, v22.4h, v1.h[4]
+            umlal2      v15.4s, v22.8h, v1.h[4]
+            umlal       v14.4s, v6.4h,  v1.h[4]
+            umlal2      v15.4s, v6.8h,  v1.h[4]
+    111:    umlal2      v14.4s, v22.8h, v1.h[3]
+            umlal       v15.4s, v23.4h, v1.h[3]
+            umlal2      v14.4s, v5.8h,  v1.h[3]
+            umlal       v15.4s, v6.4h,  v1.h[3]
+    110:    umlal       v14.4s, v23.4h, v1.h[2]
+            umlal2      v15.4s, v23.8h, v1.h[2]
+            umlal       v14.4s, v5.4h,  v1.h[2]
+            umlal2      v15.4s, v5.8h,  v1.h[2]
+    109:    umlal2      v14.4s, v23.8h, v1.h[1]
+            umlal       v15.4s, v24.4h, v1.h[1]
+            umlal2      v14.4s, v4.8h,  v1.h[1]
+            umlal       v15.4s, v5.4h,  v1.h[1]
+    108:    umlal       v14.4s, v24.4h, v1.h[0]
+            umlal2      v15.4s, v24.8h, v1.h[0]
+            umlal       v14.4s, v4.4h,  v1.h[0]
+            umlal2      v15.4s, v4.8h,  v1.h[0]
+    107:    umlal2      v14.4s, v24.8h, v0.h[7]
+            umlal       v15.4s, v25.4h, v0.h[7]
+            umlal2      v14.4s, v31.8h, v0.h[7]
+            umlal       v15.4s, v4.4h,  v0.h[7]
+    106:    umlal       v14.4s, v25.4h, v0.h[6]
+            umlal2      v15.4s, v25.8h, v0.h[6]
+            umlal       v14.4s, v31.4h, v0.h[6]
+            umlal2      v15.4s, v31.8h, v0.h[6]
+    105:    umlal2      v14.4s, v25.8h, v0.h[5]
+            umlal       v15.4s, v26.4h, v0.h[5]
+            umlal2      v14.4s, v30.8h, v0.h[5]
+            umlal       v15.4s, v31.4h, v0.h[5]
+    104:    umlal       v14.4s, v26.4h, v0.h[4]
+            umlal2      v15.4s, v26.8h, v0.h[4]
+            umlal       v14.4s, v30.4h, v0.h[4]
+            umlal2      v15.4s, v30.8h, v0.h[4]
+    103:    umlal2      v14.4s, v26.8h, v0.h[3]
+            umlal       v15.4s, v27.4h, v0.h[3]
+            umlal2      v14.4s, v29.8h, v0.h[3]
+            umlal       v15.4s, v30.4h, v0.h[3]
+    102:    umlal       v14.4s, v27.4h, v0.h[2]
+            umlal2      v15.4s, v27.8h, v0.h[2]
+            umlal       v14.4s, v29.4h, v0.h[2]
+            umlal2      v15.4s, v29.8h, v0.h[2]
+    101:    umlal2      v14.4s, v27.8h, v0.h[1]
+            umlal       v15.4s, v28.4h, v0.h[1]
+            umlal2      v14.4s, v28.8h, v0.h[1]
+            umlal       v15.4s, v29.4h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16  // round, saturate and narrow the sums from 32 to 16 bits (shift right by 16)
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS  // narrow 16 to 8 bits, shifting out FRACTION_BITS more; result in v15.8b
+
+            mov         v18.16b, v19.16b  // slide the v18..v31, v4..v10 window down one register
+            mov         v19.16b, v20.16b
+            mov         v20.16b, v21.16b
+            mov         v21.16b, v22.16b
+            mov         v22.16b, v23.16b
+            mov         v23.16b, v24.16b
+            mov         v24.16b, v25.16b
+            mov         v25.16b, v26.16b
+            mov         v26.16b, v27.16b
+            mov         v27.16b, v28.16b
+            mov         v28.16b, v29.16b
+            mov         v29.16b, v30.16b
+            mov         v30.16b, v31.16b
+            mov         v31.16b, v4.16b
+            mov         v4.16b, v5.16b
+            mov         v5.16b, v6.16b
+            mov         v6.16b, v7.16b
+            mov         v7.16b, v8.16b
+            mov         v8.16b, v9.16b
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b  // v11 is not written here; presumably the caller refills it with the next 8 inputs -- TODO confirm
+.endm/*}}}*/
+
+.macro hconv4_25/*{{{*/
+.rodata
+   200:     .hword -4
+            .hword 101f-100f
+            .hword 102f-100f
+            .hword 103f-100f
+            .hword 104f-100f
+            .hword 105f-100f
+            .hword 106f-100f
+            .hword 107f-100f
+            .hword 108f-100f
+            .hword 109f-100f
+            .hword 110f-100f
+            .hword 111f-100f
+            .hword 112f-100f
+            .hword 113f-100f
+            .hword 114f-100f
+            .hword 115f-100f
+            .hword 116f-100f
+            .hword 117f-100f
+            .hword 118f-100f
+            .hword 119f-100f
+            .hword 120f-100f
+            .hword 121f-100f
+            .hword 122f-100f
+            .hword 123f-100f
+            .hword 124f-100f
+            .hword 125f-100f
+            .align 4
+.text
+            umull2      v14.4s, v25.8h, v0.h[0]
+            umull       v15.4s, v26.4h, v0.h[0]
+
+            adrp        x16, 200b
+            add         x16, x16, :lo12:200b
+            ldrsh       x12, [x16, x5, LSL #1]
+            adr         x16, 100f
+            add         x12, x12, x16
+    100:    br          x12
+    125:    ld1         {v12.8h}, [x9]
+            umlal       v14.4s, v12.4h, v3.h[1]
+            umlal2      v15.4s, v12.8h, v3.h[1]
+            umlal       v14.4s, v10.4h, v3.h[1]
+            umlal2      v15.4s, v10.8h, v3.h[1]
+    124:    add         x12, x9, #0x08
+            bic         x12, x12, #0x40
+            ld1         {v12.4h}, [x12], #8
+            bic         x12, x12, #0x40
+            ld1         {v13.4h}, [x12]
+            umlal       v14.4s, v12.4h, v3.h[0]
+            umlal       v15.4s, v13.4h, v3.h[0]
+            umlal2      v14.4s, v9.8h,  v3.h[0]
+            umlal       v15.4s, v10.4h, v3.h[0]
+    123:    add         x12, x9, #0x10
+            bic         x12, x12, #0x40
+            ld1         {v12.8h}, [x12]
+            umlal       v14.4s, v12.4h, v2.h[7]
+            umlal2      v15.4s, v12.8h, v2.h[7]
+            umlal       v14.4s, v9.4h,  v2.h[7]
+            umlal2      v15.4s, v9.8h,  v2.h[7]
+    122:    add         x12, x9, #0x18
+            bic         x12, x12, #0x40
+            ld1         {v12.4h}, [x12], #8
+            bic         x12, x12, #0x40
+            ld1         {v13.4h}, [x12]
+            umlal       v14.4s, v12.4h, v2.h[6]
+            umlal       v15.4s, v13.4h, v2.h[6]
+            umlal2      v14.4s, v8.8h,  v2.h[6]
+            umlal       v15.4s, v9.4h,  v2.h[6]
+    121:    add         x12, x9, #0x20
+            bic         x12, x12, #0x40
+            ld1         {v12.8h}, [x12]
+            umlal       v14.4s, v12.4h, v2.h[5]
+            umlal2      v15.4s, v12.8h, v2.h[5]
+            umlal       v14.4s, v8.4h,  v2.h[5]
+            umlal2      v15.4s, v8.8h,  v2.h[5]
+    120:    add         x12, x9, #0x28
+            bic         x12, x12, #0x40
+            ld1         {v12.4h}, [x12], #8
+            bic         x12, x12, #0x40
+            ld1         {v13.4h}, [x12]
+            umlal       v14.4s, v12.4h, v2.h[4]
+            umlal       v15.4s, v13.4h, v2.h[4]
+            umlal2      v14.4s, v7.8h,  v2.h[4]
+            umlal       v15.4s, v8.4h,  v2.h[4]
+    119:    add         x12, x9, #0x30
+            bic         x12, x12, #0x40
+            ld1         {v12.8h}, [x12]
+            umlal       v14.4s, v12.4h, v2.h[3]
+            umlal2      v15.4s, v12.8h, v2.h[3]
+            umlal       v14.4s, v7.4h,  v2.h[3]
+            umlal2      v15.4s, v7.8h,  v2.h[3]
+    118:    add         x12, x9, #0x38
+            bic         x12, x12, #0x40
+            ld1         {v12.4h}, [x12]
+            umlal       v14.4s, v12.4h, v2.h[2]
+            umlal       v15.4s, v17.4h, v2.h[2]
+            umlal2      v14.4s, v6.8h,  v2.h[2]
+            umlal       v15.4s, v7.4h,  v2.h[2]
+    117:    umlal       v14.4s, v17.4h, v2.h[1]
+            umlal2      v15.4s, v17.8h, v2.h[1]
+            umlal       v14.4s, v6.4h,  v2.h[1]
+            umlal2      v15.4s, v6.8h,  v2.h[1]
+    116:    umlal2      v14.4s, v17.8h, v2.h[0]
+            umlal       v15.4s, v18.4h, v2.h[0]
+            umlal2      v14.4s, v5.8h,  v2.h[0]
+            umlal       v15.4s, v6.4h,  v2.h[0]
+    115:    umlal       v14.4s, v18.4h, v1.h[7]
+            umlal2      v15.4s, v18.8h, v1.h[7]
+            umlal       v14.4s, v5.4h,  v1.h[7]
+            umlal2      v15.4s, v5.8h,  v1.h[7]
+    114:    umlal2      v14.4s, v18.8h, v1.h[6]
+            umlal       v15.4s, v19.4h, v1.h[6]
+            umlal2      v14.4s, v4.8h,  v1.h[6]
+            umlal       v15.4s, v5.4h,  v1.h[6]
+    113:    umlal       v14.4s, v19.4h, v1.h[5]
+            umlal2      v15.4s, v19.8h, v1.h[5]
+            umlal       v14.4s, v4.4h,  v1.h[5]
+            umlal2      v15.4s, v4.8h,  v1.h[5]
+    112:    umlal2      v14.4s, v19.8h, v1.h[4]
+            umlal       v15.4s, v20.4h, v1.h[4]
+            umlal2      v14.4s, v31.8h, v1.h[4]
+            umlal       v15.4s, v4.4h,  v1.h[4]
+    111:    umlal       v14.4s, v20.4h, v1.h[3]
+            umlal2      v15.4s, v20.8h, v1.h[3]
+            umlal       v14.4s, v31.4h, v1.h[3]
+            umlal2      v15.4s, v31.8h, v1.h[3]
+    110:    umlal2      v14.4s, v20.8h, v1.h[2]
+            umlal       v15.4s, v21.4h, v1.h[2]
+            umlal2      v14.4s, v30.8h, v1.h[2]
+            umlal       v15.4s, v31.4h, v1.h[2]
+    109:    umlal       v14.4s, v21.4h, v1.h[1]
+            umlal2      v15.4s, v21.8h, v1.h[1]
+            umlal       v14.4s, v30.4h, v1.h[1]
+            umlal2      v15.4s, v30.8h, v1.h[1]
+    108:    umlal2      v14.4s, v21.8h, v1.h[0]
+            umlal       v15.4s, v22.4h, v1.h[0]
+            umlal2      v14.4s, v29.8h, v1.h[0]
+            umlal       v15.4s, v30.4h, v1.h[0]
+    107:    umlal       v14.4s, v22.4h, v0.h[7]
+            umlal2      v15.4s, v22.8h, v0.h[7]
+            umlal       v14.4s, v29.4h, v0.h[7]
+            umlal2      v15.4s, v29.8h, v0.h[7]
+    106:    umlal2      v14.4s, v22.8h, v0.h[6]
+            umlal       v15.4s, v23.4h, v0.h[6]
+            umlal2      v14.4s, v28.8h, v0.h[6]
+            umlal       v15.4s, v29.4h, v0.h[6]
+    105:    umlal       v14.4s, v23.4h, v0.h[5]
+            umlal2      v15.4s, v23.8h, v0.h[5]
+            umlal       v14.4s, v28.4h, v0.h[5]
+            umlal2      v15.4s, v28.8h, v0.h[5]
+    104:    umlal2      v14.4s, v23.8h, v0.h[4]
+            umlal       v15.4s, v24.4h, v0.h[4]
+            umlal2      v14.4s, v27.8h, v0.h[4]
+            umlal       v15.4s, v28.4h, v0.h[4]
+    103:    umlal       v14.4s, v24.4h, v0.h[3]
+            umlal2      v15.4s, v24.8h, v0.h[3]
+            umlal       v14.4s, v27.4h, v0.h[3]
+            umlal2      v15.4s, v27.8h, v0.h[3]
+    102:    umlal2      v14.4s, v24.8h, v0.h[2]
+            umlal       v15.4s, v25.4h, v0.h[2]
+            umlal2      v14.4s, v26.8h, v0.h[2]
+            umlal       v15.4s, v27.4h, v0.h[2]
+    101:    umlal       v14.4s, v25.4h, v0.h[1]
+            umlal2      v15.4s, v25.8h, v0.h[1]
+            umlal       v14.4s, v26.4h, v0.h[1]
+            umlal2      v15.4s, v26.8h, v0.h[1]
+
+            uqrshrn     v14.4h, v14.4s, #16
+            uqrshrn2    v14.8h, v15.4s, #16
+            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
+
+            st1         {v17.16b}, [x9], #16
+            bic         x9, x9, #0x40
+            mov         v17.16b, v18.16b
+            mov         v18.16b, v19.16b
+            mov         v19.16b, v20.16b
+            mov         v20.16b, v21.16b
+            mov         v21.16b, v22.16b
+            mov         v22.16b, v23.16b
+            mov         v23.16b, v24.16b
+            mov         v24.16b, v25.16b
+            mov         v25.16b, v26.16b
+            mov         v26.16b, v27.16b
+            mov         v27.16b, v28.16b
+            mov         v28.16b, v29.16b
+            mov         v29.16b, v30.16b
+            mov         v30.16b, v31.16b
+            mov         v31.16b, v4.16b
+            mov         v4.16b, v5.16b
+            mov         v5.16b, v6.16b
+            mov         v6.16b, v7.16b
+            mov         v7.16b, v8.16b
+            mov         v8.16b, v9.16b
+            mov         v9.16b, v10.16b
+            mov         v10.16b, v11.16b
+.endm/*}}}*/
+
+/* Dedicated function wrapper for the fetch macro, for the cases where
+ * performance isn't that important, to keep code size down.
+ */
+PRIVATE(fetch_generic_asm)
+            stp         x10, x11, [sp, #-16]!       // keep the callers x10/x11 (window fill indices) intact across the fetch
+            fetch                                   // fetch macro (defined outside this section), expanded with default arguments
+            ldp         x10, x11, [sp], #16         // restore x10/x11
+            ret
+END(fetch_generic_asm)
+
+
+/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
+ * beyond that limit, and filling the rest of the vector with the first legal
+ * pixel.
+ * Result is in v10 and v11.  v8 and v9 are filled with the first legal pixel.
+ * Note: This function can read beyond the right edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampleft1)
+            stp         x29, x30, [sp, #-16]!       // bl below overwrites x30, so save the return address
+            bl          fetch_generic_asm           // result arrives in v10/v11
+            dup         v8.8h, v10.h[0]             // left padding: first fetched 16-bit value in every lane
+            dup         v9.8h, v10.h[0]
+            ands        x12, x10, #15               // x12 = pad count = offset of the first legal column within its chunk
+            beq         1f                          // chunk-aligned start: v10/v11 are usable as they are
+            sub         x1, x1, x12                 // rewind the three source pointers by the pad count
+            sub         x15, x15, x12
+            sub         x19, x19, x12
+            sub         x10, x10, x12               // round x10 down to the chunk boundary
+            sub         x12, sp, x12, LSL #1        // two bytes per 16-bit value; sp here is the pre-scratch value
+            sub         sp, sp, #64                 // 64-byte scratch area on the stack
+            sub         x12, x12, #32               // x12 = scratch + 32 - 2 * pad
+            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]     // scratch layout: pad, pad, data, data
+            ld1         {v10.8h,v11.8h}, [x12]      // reload shifted: pad values first, then the leading data
+            add         sp, sp, #64
+1:          ldp         x29, x30, [sp], #16
+            ret
+END(fetch_clampleft1)
+
+PRIVATE(fetch_clampleft4)
+            stp         x29, x30, [sp, #-16]!       // bl below overwrites x30, so save the return address
+            bl          fetch_generic_asm           // result arrives in v10/v11
+            dup         v8.2d, v10.d[0]             // left padding: first 64 bits (four 16-bit values) replicated
+            dup         v9.2d, v10.d[0]
+            ands        x12, x10, #15               // x12 = pad count = offset of the first legal column within its chunk
+            beq         1f                          // chunk-aligned start: v10/v11 are usable as they are
+            sub         x1, x1, x12                 // rewind the three source pointers by the pad count
+            sub         x15, x15, x12
+            sub         x19, x19, x12
+            sub         x10, x10, x12               // round x10 down to the chunk boundary
+            sub         x12, sp, x12, LSL #1        // two bytes per 16-bit value; sp here is the pre-scratch value
+            sub         sp, sp, #64                 // 64-byte scratch area on the stack
+            sub         x12, x12, #32               // x12 = scratch + 32 - 2 * pad
+            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]     // scratch layout: pad, pad, data, data
+            ld1         {v10.8h,v11.8h}, [x12]      // reload shifted: pad values first, then the leading data
+            add         sp, sp, #64
+1:          ldp         x29, x30, [sp], #16
+            ret
+END(fetch_clampleft4)
+
+/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
+ * reading memory beyond that limit, and filling the rest of the vector with
+ * the last legal pixel.
+ * Result is in v10 and v11.  v12 and v13 are filled with the last legal pixel.
+ * Note: This function can read beyond the left edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampright1)
+            stp         x29, x30, [sp, #-16]!       // bl below overwrites x30, so save the return address
+            sub         x12, xzr, x11               // x12 = -x11
+            ands        x12, x12, #15               // x12 = (16 - (x11 & 15)) & 15 = shortfall from a whole chunk
+            beq         1f                          // a whole chunk is available: plain fetch
+            sub         x1, x1, x12                 // rewind the source pointers by the shortfall before the 16-column fetch
+            sub         x15, x15, x12
+            sub         x19, x19, x12
+            bl          fetch_generic_asm
+            dup         v12.8h, v11.h[7]            // right padding = last fetched 16-bit value
+            dup         v13.8h, v11.h[7]
+            sub         x12, xzr, x11               // recompute the shortfall after the call
+            and         x12, x12, #15
+            sub         sp, sp, #64                 // 64-byte scratch area on the stack
+            add         x12, sp, x12, LSL #1        // skip the first shortfall values (two bytes each)
+            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]     // scratch layout: data, data, pad, pad
+            ld1         {v10.8h,v11.8h}, [x12]      // reload shifted: remaining data first, then padding
+            add         sp, sp, #64
+            ldp         x29, x30, [sp], #16
+            ret
+1:          bl          fetch_generic_asm           // aligned case: no shuffle, only prepare the padding
+            dup         v12.8h, v11.h[7]
+            dup         v13.8h, v11.h[7]
+            ldp         x29, x30, [sp], #16
+            ret
+END(fetch_clampright1)
+
+PRIVATE(fetch_clampright4)
+            stp         x29, x30, [sp, #-16]!       // bl below overwrites x30, so save the return address
+            sub         x12, xzr, x11               // x12 = -x11
+            ands        x12, x12, #15               // x12 = (16 - (x11 & 15)) & 15 = shortfall from a whole chunk
+            beq         1f                          // a whole chunk is available: plain fetch
+            sub         x1, x1, x12                 // rewind the source pointers by the shortfall before the 16-column fetch
+            sub         x15, x15, x12
+            sub         x19, x19, x12
+            bl          fetch_generic_asm
+            dup         v12.2d, v11.d[1]            // right padding = last 64 bits (four 16-bit values) replicated
+            dup         v13.2d, v11.d[1]
+            sub         x12, xzr, x11               // recompute the shortfall after the call
+            and         x12, x12, #15
+            sub         sp, sp, #64                 // 64-byte scratch area on the stack
+            add         x12, sp, x12, LSL #1        // skip the first shortfall values (two bytes each)
+            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]     // scratch layout: data, data, pad, pad
+            ld1         {v10.8h,v11.8h}, [x12]      // reload shifted: remaining data first, then padding
+            add         sp, sp, #64
+            ldp         x29, x30, [sp], #16
+            ret
+1:          bl          fetch_generic_asm           // aligned case: no shuffle, only prepare the padding
+            dup         v12.2d, v11.d[1]
+            dup         v13.2d, v11.d[1]
+            ldp         x29, x30, [sp], #16
+            ret
+END(fetch_clampright4)
+
+/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
+ * value across to fill the rest of the register pair.  Used for filling the
+ * right hand edge of the window when reading too close to the right hand edge
+ * of the image.
+ * Also returns a dup-ed copy of the last element in v12 and v13 for the
+ * tail-fill case (this happens incidentally in the common path, but must be
+ * done deliberately in the fast-out path).
+ */
+PRIVATE(prefill_sweepright1)
+            ands        x12, x11, #15               // x12 = x11 & 15
+            beq         1f                          // x11 is a multiple of 16: nothing to sweep inside the chunk
+            sub         x12, x12, #1                // index of the value to sweep
+            sub         sp, sp, #64                 // 64-byte scratch area on the stack
+            st1         {v10.8h,v11.8h}, [sp]       // spill the chunk so it can be addressed by index
+            add         x12, sp, x12, LSL #1        // address of that value (two bytes each)
+            ld1r        {v12.8h}, [x12]             // replicate it into every lane of v12 and v13
+            ld1r        {v13.8h}, [x12]
+            st1         {v12.8h,v13.8h}, [x12]      // overwrite 32 bytes from that value onwards
+            ld1         {v10.8h,v11.8h}, [sp]       // reload the swept chunk
+            add         sp, sp, #64
+            ret
+1:          dup         v12.8h, v11.h[7]            // fast-out path: padding = last lane of v11
+            dup         v13.8h, v11.h[7]
+            ret
+END(prefill_sweepright1)
+
+PRIVATE(prefill_sweepright4)
+            ands        x12, x11, #15               // x12 = x11 & 15
+            beq         1f                          // x11 is a multiple of 16: nothing to sweep inside the chunk
+            sub         x12, x12, #4                // index of the first value of the last 4-value group; assumes x11 & 15 >= 4 -- TODO confirm
+            sub         sp, sp, #64                 // 64-byte scratch area on the stack
+            st1         {v10.8h,v11.8h}, [sp]       // spill the chunk so it can be addressed by index
+            add         x12, sp, x12, LSL #1        // address of that group (two bytes per value)
+            ld1r        {v12.2d}, [x12]             // replicate the 64-bit group into both halves of v12
+            ld1r        {v13.2d}, [x12]             // and of v13: it is stored below and read later as right padding
+            st1         {v12.8h,v13.8h}, [x12]      // overwrite 32 bytes from that group onwards, as prefill_sweepright1 does
+            ld1         {v10.8h,v11.8h}, [sp]       // reload the swept chunk
+            add         sp, sp, #64
+            ret
+1:          dup         v12.2d, v11.d[1]            // fast-out path: padding = last 64 bits of v11
+            dup         v13.2d, v11.d[1]
+            ret
+END(prefill_sweepright4)
+
+/* The main loop keeps a sliding window of data that has already been convolved
+ * in the vertical axis for the current line.  This usually stays in the
+ * register file, but spills to memory for large windows.  The first thing that
+ * needs to be done at start-up is to fill this window with image data, taking
+ * into account the padding needed if the left or right edges of the image fall
+ * within this window.
+ */
+
+/* Because the window is in the register file writes to it cannot be indexed
+ * by another register.  Consequently the fill loops are unrolled to address
+ * the registers directly.  This macro distinguishes between writes to the
+ * register file and writes to the spill buffer (indicated by a destination
+ * register named xx).
+ */
+.macro prefill_out ra, rb, sra, srb
+  .ifc \ra,xx
+    .ifc \rb,xx
+            st1         {\sra,\srb}, [x9], #32      // both destinations spill: store the pair and advance x9 by 32
+    .else
+            bic         x9, x9, #0x40               // clear bit 6 of the spill pointer (presumably the buffer wrap-around)
+            st1         {\sra}, [x9], #16           // first half goes to the spill buffer
+            mov         \rb, \srb                   // second half goes to a window register
+    .endif
+  .else                                             // both destinations are registers
+    .ifnc \ra,\sra
+            mov         \ra, \sra                   // omitted when source and destination are the same register
+    .endif
+    .ifnc \rb,\srb
+            mov         \rb, \srb
+    .endif
+  .endif
+.endm
+
+/* This macro provides the list of registers representing the window, and the
+ * cases where the register file is too small and a spill buffer is used
+ * instead.
+ * Since several specialisations of each function are generated, this also
+ * culls superfluous iterations, and sets the variable `i` for subsequent
+ * macros indicating the current index into the window.
+ */
+.macro prefill_list, macro, nextmacro, max_r, step, label
+  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
+    .if windowsize >= (\line * 16)                  // cull chunks that lie outside this window size
+      .set i, windowsize - (\line * 16)             // i = window index at which this chunk starts
+\label\macro\line:
+            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
+    .endif
+  .endm
+            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label     // spill buffer only
+            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label     // spill buffer only
+            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label     // half spill, half register
+            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
+            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
+            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
+            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
+            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
+            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
+            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
+            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
+            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
+            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label     // last chunk of the window
+\label\macro\()0:                                   // line 0 is the target used once the window is full
+            b           \label\()_end
+  .purgem ifneeded                                  // remove the helper so the next expansion can define it again
+.endm                                               // note: max_r is accepted but not referenced in this macro
+
+/* These macros represent the possible stages of filling the window.
+ * Each macro is unrolled enough times that it can fill the entire window
+ * itself, but normally it will have to hand control to subsequent macros
+ * part-way through and this is done using labels named \next and \after, where
+ * \next is the next macro starting at the same window position and \after is
+ * the next macro starting after the current window position.
+ */
+
+/* leftfill: v8 and v9 contain the left padding value.  While the window
+ * extends outside of the image on the left-hand side, and at least 16 more
+ * padding values are needed in the window, store v8 and v9 into the window.
+ * Otherwise skip forward to storing image data.
+ */
+.macro prefill_leftfill, next, after, ra, rb, step
+            cmp         x10, #i+16                  // does legal data begin at or beyond the end of this chunk?
+            blo         \next                       // no (x10 < i+16): hand over to the next stage at this position
+            prefill_out \ra, \rb, v8.16b, v9.16b    // yes: the whole chunk is left padding; fall through to the next chunk
+.endm
+
+/* leftedge: The very first non-fill or partial-fill chunk from the image is
+ * already loaded (as it was used to calculate the left padding value), so
+ * store it here, and then drop into the regular load/store cycle in the next
+ * macro.
+ */
+.macro prefill_leftedge, next, after, ra, rb, step
+1:          prefill_out \ra, \rb, v10.16b, v11.16b  // store the chunk that is already loaded in v10/v11
+            b           \after                      // always continue with the next stage at the following chunk
+.endm
+
+/* dofetch: Copy chunks of the image into the window without any complications
+ * from edge conditions.
+ */
+.macro prefill_dofetch, next, after, ra, rb, step
+            cmp         x11, #i+16                  // does legal data extend beyond the end of this chunk?
+            bls         \next                       // no (x11 <= i+16): hand over to the next stage at this position
+            bl          fetch_generic_asm           // yes: fetch a full chunk into v10/v11
+            prefill_out \ra, \rb, v10.16b, v11.16b  // store it and fall through to the next chunk
+.endm
+
+/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
+ * the right-hand edge of the image.  In that case sweep the last valid pixel
+ * across the rest of the chunk, and in either case prepare padding data in v12
+ * and v13 for the next macro.  This is done in fetch_clampright.
+ * This only happens once before going on to the next macro.
+ * Sometimes leftedge also covers the rightedge case, in which case this has
+ * to be skipped altogether.
+ */
+.macro prefill_rightedge, next, after, ra, rb, step
+            cmp         x11, #i                     // is there any legal data left at this chunk?
+            bls         \next                       // no (x11 <= i): hand over to the next stage at this position
+            bl          fetch_clampright\step       // yes: clamped fetch, which also prepares padding in v12/v13
+            prefill_out \ra, \rb, v10.16b, v11.16b
+            b           \after                      // done once; continue with the next stage at the following chunk
+.endm
+
+/* rightfill: The rest of the window is simply filled with right padding from
+ * v12 and v13.
+ */
+.macro prefill_rightfill, next, after, ra, rb, step
+            prefill_out \ra, \rb, v12.16b, v13.16b  // unconditional: next and after are not used by this stage
+.endm
+
+/* Here all of the macros above are unrolled and laid out in the proper order.
+ */
+.macro prefill_body, max_r, step, label
+            prefill_list leftfill,  leftedge,   \max_r, \step, \label
+            prefill_list leftedge,  dofetch,    \max_r, \step, \label
+            prefill_list dofetch,   rightedge,  \max_r, \step, \label
+            prefill_list rightedge, rightfill,  \max_r, \step, \label
+            prefill_list rightfill, oops,       \max_r, \step, \label   // oops is a placeholder: rightfill never branches onwards
+\label\()_end:                                      // every stage list ends with a branch to this label
+.endm
+
+
+/* Fill the convolution window with context data.  The aim here is to load
+ * exactly 2*r columns, and in the main loop to read as many columns as will be
+ * written.  This is complicated by the window being divided into chunks at
+ * register boundaries, and the need to handle cases when the input starts very
+ * close to the left or right (or both) edges of the image and the need to fill
+ * the spaces that leaves with left and right edge padding values.
+ *
+ * Input:
+ *      x1 -- src
+ *      x2 -- pitch
+ *      x3 -- count
+ *      x4 -- available image data right of src pointer
+ *      x5 -- r
+ *      x6 -- rup
+ *      x7 -- rdn
+ *      x8 -- available image data left of src pointer
+ *      x9 -- buffer (if needed)
+ *      x13 = -pitch
+ *      x15 = top-row in
+ *      x19 = bottom-row in
+ * Output:
+ *      x4 -= min(inlen, count + windowsize - centertap)
+ *      x1 += min(inlen, count + windowsize - centertap)
+ *      x15 += min(inlen, count + windowsize - centertap)
+ *      x19 += min(inlen, count + windowsize - centertap)
+ * Modifies:
+ *      x10 -- fill start index in the window
+ *      x11 -- fill stop index in the window
+ *      x12 -- scratch
+ */
+.macro prefill step=1, max_r=25, label=xx
+.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)      // 2 * max_r * step, rounded up to a multiple of 16
+.set centertap, (windowsize - \max_r * \step)
+            mov         x10, #centertap
+            subs        x10, x10, x8                // centertap - (data available to the left)
+            csel        x10, xzr, x10, lo           // x10 = max(0, centertap - x8)
+
+            subs        x11, x4, #windowsize - centertap
+            csel        x11, xzr, x11, hs           // zero when x4 covers the right part of the window
+            add         x11, x11, #windowsize       // x11 = min(windowsize, x4 + centertap)
+
+            /* x10 indicates where in the window legal image data begins.
+             * x11 indicates where in the window legal image data ends.
+             * When starting near the centre of a large image these would be
+             * zero and windowsize respectively, but when starting near the
+             * edges this can change.
+             * When starting on the leftmost pixel, x10 will be centertap.
+             * When starting on the rightmost pixel, x11 will be centertap+1.
+             */
+
+            /* x4 indicates how much data there is between the current pointers
+             * and the right edge of the image.  The pointers currently point
+             * to the data needed at centertap.  The subsequent code will
+             * consume (windowsize - x10) data, but only the data from
+             * centertap to windowsize comes out of x4's budget.
+             */
+1:          subs        x4, x4, #windowsize - centertap
+            csel        x4, xzr, x4, lo             // clamp at zero if the budget is exhausted
+
+            /* And the pointers need to rewind to the start of the window.
+             */
+            sub         x1, x1, #centertap
+            sub         x15, x15, #centertap
+            sub         x19, x19, #centertap
+
+            /* Unless x8 indicated that there wasn't that much data available.
+             */
+            add         x1, x1, x10
+            add         x15, x15, x10
+            add         x19, x19, x10
+
+            /* Get the first chunk, and add padding to align it to the window
+             * if necessary.
+             */
+            bl          fetch_clampleft\step
+
+            /* Sometimes the start and the end of the window are in the same
+             * chunk.  In that case both ends need filler at the outset.
+             */
+            sub         x12, x11, #1                // index of the last legal value
+            eor         x12,  x10, x12              // bits in which start and last index differ
+            cmp         x12, #16
+            bhs         1f                          // they differ at bit 4 or above: different chunks, skip the sweep
+            bl          prefill_sweepright\step
+
+            /* Iterate through all the points in the window and fill them in
+             * with padding or image data as needed.
+             */
+1:          prefill_body \max_r, \step, \label
+.endm
+
+/* The main body of the convolve functions.  Having already pre-filled the
+ * convolution window with 2*r input values, the logic settles into a regular
+ * pattern of reading and writing at a 1:1 rate until either input or output
+ * expires.  The input leads the output by r values, so when processing all the
+ * way to the right-hand edge, or within r pixels of that edge, the input will
+ * run out first.  In the case of very narrow images, or sub-windows starting
+ * near the right edge, the input may already have run out while the
+ * convolution window was being filled and this loop will start with a
+ * zero-length input.
+ *
+ * Once the input runs out, the rest of the output must be processed by padding
+ * the remainder of the window with pad value from the last valid pixel from
+ * the source.
+ *
+ * Input:
+ *      x0 = dst
+ *      x1 = src
+ *      x2 = pitch
+ *      x3 = count
+ *      x4 = inlen
+ *      x5 = r
+ *      x6 = rup
+ *      x7 = rdn
+ *      x9 = buffer
+ *      x13 = -pitch
+ *      x15 = top-row in
+ *      x19 = bottom-row in
+ * Modifies
+ *      x8 = fetch code pointer
+ */
+.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
+
+            /* If x4 >= x3 then there's no need for clipping.  The main loop
+             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
+             * no greater than x3 and use x4 for the loop.
+             * However, if x4 comes out of the loop with less than 16 bytes
+             * left, a partial read would be necessary to avoid reading beyond
+             * the end of the image.  To avoid this, clamp x4 to the next
+             * multiple of 16, which is still sufficient to force it out of the
+             * loop but doesn't imply a rewind.
+             */
+            add         x12, x3, #15
+            bic         x12, x12, #15               // x12 = x3 rounded up to a multiple of 16
+            cmp         x4, x12
+            csel        x4, x12, x4, hi             // x4 = min(x4, x12)
+
+            /* First calculate the entry-point into the internal fetch logic.
+             * This is done so the same function can service several kernel
+             * sizes.
+             */
+            adrp        x8, \labelnc
+            add         x8, x8, #:lo12:\labelnc
+            sub         x8, x8, x5, LSL #5
+            sub         x8, x8, x5, LSL #3          // x8 = labelnc - 40 * r
+            cmp         x5, x6
+            ccmp        x5, x7, #0, eq              // eq only when r == rup and r == rdn
+            beq         5f                          // no clamping needed: enter the loop at its test
+
+            /* if (r != rup || r != rdn) then the address-clamping table should
+             * be used rather than the short-cut version.
+             */
+            adrp        x8, \labelc
+            add         x8, x8, #:lo12:\labelc
+            sub         x8, x8, x5, LSL #6
+            add         x8, x8, x5, LSL #3          // x8 = labelc - 56 * r
+            b           5f                          // enter the loop at its test
+
+            /* Main loop: ... */
+            .align  4
+3:          /* first perform a vertical convolution from memory to get the next
+             * 16 taps of the horizontal window into the register file...
+             */
+            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
+
+            /* ...then perform a horizontal convolution on that window to
+             * produce eight output bytes, and slide the window along.
+             * This has to be done twice to match the 16-way vertical pass.
+             * It would be preferable to have twice the work done in \core, but
+             * that would demand yet another variant on those macros and would
+             * perturb the register allocation severely.
+             */
+            \core
+            st1         {v15.8b}, [x0], #8
+            \core
+            st1         {v15.8b}, [x0], #8
+
+            sub         x3, x3, #16                 // 16 output bytes were written in this iteration
+5:          subs        x4, x4, #16                 // loop test; also the entry point from the setup code above
+            bhi         3b
+            /* Here there's 16 or fewer bytes available before the edge of the
+             * source image.  x4 holds that count minus 16 (because it was
+             * decremented before the first iteration ran).  The last read may
+             * not be a whole chunk, and beyond that a fill value must be used.
+             *
+             * Of course, none of that matters if there's no more output to
+             * produce...
+             */
+            cbz         x3, 5f                      // forward reference: the exit label at the end of the macro
+
+            /* Oh well. */
+            adds        x4, x4, #16                 // undo the decrement: x4 = input bytes remaining
+            bne         1f                          // some input remains: take the clamped fetch
+  .if \step==1
+            dup         v10.8h, v9.h[7]             // no input left: build padding from the last lanes of v9
+            dup         v11.8h, v9.h[7]
+  .else
+            dup         v10.2d, v9.d[1]
+            dup         v11.2d, v9.d[1]
+  .endif
+            b           3f
+
+            /* To avoid reading past end of input, rewind pointers by (16-x4)
+             * to ensure that they're exactly 16 bytes from the edge.
+             */
+1:          mov         x11, x4                     // fetch_clampright reads the remaining count from x11
+            bl          fetch_clampright\step
+            /* Now to put this padding to use, perform any remaining
+             * iterations.  This is done at half the rate of the main loop,
+             * because there's no longer pressure from a 16-lane window filler.
+             */
+3:          \core
+  .if \step==1
+            dup         v11.8h, v11.h[7]            // refill v11 with its own last value for the next pass
+  .else
+            dup         v11.2d, v11.d[1]
+  .endif
+            subs        x3, x3, #8
+            blo         4f                          // fewer than 8 outputs were left: piecewise store
+            st1         {v15.8b}, [x0], #8
+            bne         3b                          // flags are still those of the subs; st1 does not change them
+            b           5f
+
+            /* If the final iteration contained 0 < l < 8 values, then perform
+             * a piecewise store of the final vector.
+             */
+4:          tbz         x3, #2, 1f                  // x3 = l - 8 here, so its low three bits equal l
+            st1         {v15.s}[0], [x0], #4
+            ext         v15.8b, v15.8b, v15.8b, #4  // rotate the stored bytes out of the way
+1:          tbz         x3, #1, 1f
+            st1         {v15.h}[0], [x0], #2
+            ext         v15.8b, v15.8b, v15.8b, #2
+1:          tbz         x3, #0, 5f
+            st1         {v15.b}[0], [x0], #1
+            ext         v15.8b, v15.8b, v15.8b, #1
+5:          mov         x0, #0                      // exit label: leave zero in x0
+.endm
+
+
+.irp r, TUNED_LIST1, 25                             // one specialisation per listed radius, plus one for 25
+PRIVATE(convolve1_\r)
+            stp         x29,x30, [sp, #-16]!       // the macros below use bl, which overwrites x30
+
+            prefill     step=1, max_r=\r, label=.Lcnv1_\r
+
+            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
+
+            ldp         x29,x30, [sp], #16
+            ret
+END(convolve1_\r)
+.endr
+
+.irp r, TUNED_LIST4, 25                             // one specialisation per listed radius, plus one for 25
+PRIVATE(convolve4_\r)
+            sub         x9, sp, #0x40               // candidate buffer address just below the incoming sp
+            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!   // frame = saved pair + buffer + alignment slack
+            bic         x9, x9, #0x7f               // round the buffer address down to a 128-byte boundary
+
+            /* x9 now points to a 0x40 byte buffer on the stack whose address
+             * has the low 7 bits clear.  This allows easy address calculation
+             * in the wrap-around cases.
+             */
+
+            prefill     step=4, max_r=\r, label=.Lcnv4_\r
+
+            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
+
+            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)
+            ret
+END(convolve4_\r)
+.endr
+
+/* void rsdIntrinsicBlurU1_K(
+ *                  void *out,      // x0
+ *                  void *in,       // x1
+ *                  size_t w,       // x2
+ *                  size_t h,       // x3
+ *                  size_t p,       // x4
+ *                  size_t x,       // x5
+ *                  size_t y,       // x6
+ *                  size_t count,   // x7
+ *                  size_t r,       // [sp]
+ *                  uint16_t *tab); // [sp,#8]
+ */
+ENTRY(rsdIntrinsicBlurU1_K)
+            stp         x19,x30, [sp, #-16]!       // x19 is used below as the bottom-row pointer
+            sub         x8, sp, #32                 // x8 = upper half of the 64-byte save area reserved next
+            sub         sp, sp, #64
+            st1         {v8.1d - v11.1d}, [sp]      // save the low 64 bits of v8-v15
+            st1         {v12.1d - v15.1d}, [x8]
+            mov         x8, x5          // x
+            ldr         w5, [sp,#80]    // r: first stack argument, 64 + 16 bytes above sp; low 32 bits only
+            sub         x9, x2, x8      // w - x
+            sub         x10, x3, x6     // h - y
+            mov         x2, x4          // pitch
+            mov         x3, x7          // count
+            sub         x7, x10, #1     // h - y - 1
+            mov         x4, x9          // inlen = (w - x)
+
+            ldr         x12, [sp, #88] // tab
+
+            add         x1, x1, x8      // src += x
+
+            cmp         x6, x5
+            csel        x6, x5, x6, hs  // rup = min(r, y)
+            cmp         x7, x5
+            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)
+
+            sub         x13, xzr, x2    // -pitch
+            msub        x15, x2, x6, x1             // top-row pointer = src - pitch * rup
+            madd        x19, x2, x7, x1             // bottom-row pointer = src + pitch * rdn
+
+            ld1         {v0.8h,v1.8h}, [x12], #32   // load 32 coefficients from tab into v0-v3
+            ld1         {v2.8h,v3.8h}, [x12], #32
+
+            adr         x30, 1f                     // set the return address by hand so the branches below act as calls
+  .irp r, TUNED_LIST1
+            cmp         x5, #\r
+            bls         convolve1_\r                // first listed radius that is >= r
+  .endr
+            b           convolve1_25                // fallback for any larger r
+
+1:          ld1         {v8.1d - v11.1d}, [sp], #32 // convolve routines return here; restore v8-v15
+            ld1         {v12.1d - v15.1d}, [sp], #32
+            ldp         x19,x30, [sp], #16
+            ret
+END(rsdIntrinsicBlurU1_K)
+
+/* void rsdIntrinsicBlurU4_K(
+ *                  void *out,      // x0
+ *                  void *in,       // x1
+ *                  size_t w,       // x2
+ *                  size_t h,       // x3
+ *                  size_t p,       // x4
+ *                  size_t x,       // x5
+ *                  size_t y,       // x6
+ *                  size_t count,   // x7
+ *                  size_t r,       // [sp]
+ *                  uint16_t *tab); // [sp,#8]
+ */
+ENTRY(rsdIntrinsicBlurU4_K)
+            stp         x19,x30, [sp, #-16]!       // x19 is used below as the bottom-row pointer
+            sub         x8, sp, #32                 // x8 = upper half of the 64-byte save area reserved next
+            sub         sp, sp, #64
+            st1         {v8.1d - v11.1d}, [sp]      // save the low 64 bits of v8-v15
+            st1         {v12.1d - v15.1d}, [x8]
+            lsl         x8, x5, #2      // x * 4 (x scaled to bytes)
+            lsl         x2, x2, #2                  // w * 4 (w scaled to bytes)
+            ldr         w5, [sp,#80]    // r: first stack argument, 64 + 16 bytes above sp; low 32 bits only
+            sub         x9, x2, x8      // w - x (both already scaled by 4)
+            sub         x10, x3, x6     // h - y
+            mov         x2, x4          // pitch
+            lsl         x3, x7, #2      // count * 4
+            sub         x7, x10, #1     // h - y - 1
+            mov         x4, x9          // inlen = (w - x)
+
+            ldr         x12, [sp, #88]              // tab (second stack argument)
+
+            add         x1, x1, x8      // in += x
+
+            cmp         x6, x5
+            csel        x6, x5, x6, hs  // rup = min(r, y)
+            cmp         x7, x5
+            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)
+
+
+            sub         x13, xzr, x2                // -pitch
+            msub        x15, x2, x6, x1             // top-row pointer = in - pitch * rup
+            madd        x19, x2, x7, x1             // bottom-row pointer = in + pitch * rdn
+
+            ld1         {v0.8h,v1.8h}, [x12], #32   // load 32 coefficients from tab into v0-v3
+            ld1         {v2.8h,v3.8h}, [x12], #32
+
+            adr         x30, 1f                     // set the return address by hand so the branches below act as calls
+  .irp r, TUNED_LIST4
+            cmp         x5, #\r
+            bls         convolve4_\r                // first listed radius that is >= r
+  .endr
+            b           convolve4_25                // fallback for any larger r
+
+1:          ld1         {v8.1d - v11.1d}, [sp], #32 // convolve routines return here; restore v8-v15
+            ld1         {v12.1d - v15.1d}, [sp], #32
+            ldp         x19,x30, [sp], #16
+            ret
+END(rsdIntrinsicBlurU4_K)
diff --git a/toolkit/Blur_neon.S b/toolkit/Blur_neon.S
new file mode 100644
index 0000000..241af5f
--- /dev/null
+++ b/toolkit/Blur_neon.S
@@ -0,0 +1,1824 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart /* open an exported (.globl) function f in .text */
+#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart /* as ENTRY but without .globl */
+#define END(f) .fnend; .size f, .-f; /* close the .fnstart region and record the size of f */
+
+#define ARCH_ARM_USE_BLUR_PRELOAD /* selects the pld form of VERTPLD below */
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm // ARM (not Thumb) encoding; the pc-relative jump tables below depend on it
+
+/* Number of fractional bits to preserve in intermediate results.  The
+ * intermediate storage is 16-bit, and we started with 8 bit data (the integer
+ * part), so this should be between 0 and 8.
+ */
+.set FRACTION_BITS, 7
+
+.set MAX_R, 25 // default max_r of the fetch macro; the largest hconv variants handle 25 taps per side
+
+
+/* ifcc: assemble the rest of the line only while the symbol `cc` is non-zero.
+ * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
+ * `ifcc`; the whole remainder of the line is passed through as one argument.
+ */
+.macro ifcc zzz:vararg
+.if cc
+            \zzz
+.endif
+.endm
+
+/* VERTPLD(addr): pld [addr] if ARCH_ARM_USE_BLUR_PRELOAD is defined, else a nop.
+ * It's not always clear that prefetching is beneficial and this needs further
+ * testing on different cores, so it's made switchable here. */
+#if defined(ARCH_ARM_USE_BLUR_PRELOAD)
+#define VERTPLD(...) pld [__VA_ARGS__]
+#else
+#define VERTPLD(...) nop
+#endif
+
+/* Fetch 16 columns of bytes (regardless of image format), convolve these
+ * vertically, and leave them in the register file.  If working near the top or
+ * bottom of an image then clamp the addressing while loading the data in.
+ *
+ * The convolution is fully unrolled for windows up to max_r, with the
+ * outermost edges calculated first.  This way it's possible to branch directly
+ * into the relevant part of the code for an arbitrary convolution radius.  Two
+ * variants of the loop are produced; one eliminates the clamping code for a
+ * slight speed advantage.
+ *
+ * Where the macro is called with reg=x, the specified register is taken to
+ * contain a pre-calculated pointer into one of the two loops.
+ *
+ * Input:
+ *      r1 -- src
+ *      r2 -- pitch
+ *      r5 -- r
+ *      r6 -- rup (r, unless clipped to top of source image)
+ *      r7 -- rdn (r, unless clipped to bottom of source image)
+ *      r12 -- switch index
+ *      q0-q3 -- coefficient table
+ * Output:
+ *      r1 += 16
+ *      q10,q11 -- 16 convolved columns
+ * Modifies:
+ *      r10 = upper row pointer
+ *      r11 = lower row pointer
+ *      q12-q15 = temporary sums
+ */
+.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/
+  .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif // cc = 1 only for the default reg: the ifcc lines then compute the entry address here
+
+            vld1.8      {d30,d31}, [r1]         // 16 bytes of the centre row
+            mls         r10, r2, r6, r1         // r10 = src - pitch * rup (upper row pointer)
+
+            vmovl.u8    q14, d30                // widen the centre row to 16 bits per column
+            VERTPLD(r1, #32)
+            vmovl.u8    q15, d31
+  .if \max_r < 16 // approximate
+    ifcc    adr         \reg, 1f
+  .else
+    ifcc    ldr         \reg, 2f
+1:  ifcc    add         \reg, \reg, pc
+  .endif
+
+            vmull.u16   q12, d28, d0[0]         // centre tap: centre row * coefficient 0
+    ifcc    sub         \reg, r5, LSL #6        // step back r clamped-loop bodies of 64 bytes each
+            vmull.u16   q13, d29, d0[0]
+            mla         r11, r2, r7, r1         // r11 = src + pitch * rdn (lower row pointer)
+            vmull.u16   q14, d30, d0[0]
+            add         r1, r1, #16             // advance src past these 16 columns
+            vmull.u16   q15, d31, d0[0]
+            bx          \reg                    // enter one of the unrolled loops at the selected tap
+
+     ifcc   .align 2
+  2: ifcc   .word       1f-1b-8                 // distance from the pc value read by the add at 1b (its address + 8) to the clamped loop end
+
+  /* This version of the vertical fetch loop body is used away from the edges
+   * of the source image.  The pointers start at the top and bottom source rows
+   * and work their way towards the centre on each iteration.  This way the
+   * number of taps used can be controlled by jumping directly into the middle
+   * of the loop and running to completion.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_noclamp i, dreg
+    .if 0 < \i && \i <= \max_r
+            vld1.8      {d20,d21}, [r10], r2    // 16 bytes of the upper row, then r10 += pitch
+            vld1.8      {d22,d23}, [r11]        // 16 bytes of the lower row
+            sub         r11, r11, r2            // r11 -= pitch
+            vswp        d21, d22                // q10 = upper/lower columns 0-7, q11 = upper/lower columns 8-15
+            VERTPLD(r10, #32)
+            vaddl.u8    q10, d20, d21           // upper + lower, columns 0-7, widened to 16 bits
+            vaddl.u8    q11, d22, d23           // upper + lower, columns 8-15, widened to 16 bits
+            vmlal.u16   q12, d20, \dreg         // accumulate (upper + lower) * coefficient for this tap
+            VERTPLD(r11, #32)
+            vmlal.u16   q13, d21, \dreg
+            vmlal.u16   q14, d22, \dreg
+            vmlal.u16   q15, d23, \dreg
+    .endif
+  .endm
+
+  /* This version of the vertical fetch loop body is used near the edges of the
+   * source image, where one or both of the accesses may start with a clamped
+   * value, and the row addresses only begin to change after some number of
+   * iterations before the end.
+   * If the loop body changes size then the code which calculates the address of
+   * the initial iteration must be updated accordingly.
+   */
+  .macro vertfetch_clamped i, dreg
+    .if 0 < \i && \i <= \max_r
+            vld1.8      {d20,d21}, [r10]        // 16 bytes of the (possibly clamped) upper row
+            vld1.8      {d22,d23}, [r11]        // 16 bytes of the (possibly clamped) lower row
+            cmp         r6, #\i                 // rup against this tap index
+            vswp        d21, d22                // q10 = upper/lower columns 0-7, q11 = upper/lower columns 8-15
+            VERTPLD(r10, #32)
+            vaddl.u8    q10, d20, d21
+            addhs       r10, r10, r2            // move the upper pointer only once rup >= tap index
+            vaddl.u8    q11, d22, d23
+            cmp         r7, #\i                 // rdn against this tap index
+            vmlal.u16   q12, d20, \dreg
+            VERTPLD(r11, #32)
+            vmlal.u16   q13, d21, \dreg
+            subhs       r11, r11, r2            // move the lower pointer only once rdn >= tap index
+            vmlal.u16   q14, d22, \dreg
+            nop                                 // pads the body to 16 instructions (64 bytes), matching the LSL #6 above
+            vmlal.u16   q15, d23, \dreg
+    .endif
+  .endm
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelc at the end of the block.
+   */
+  .align 4
+  vertfetch_clamped 27, d6[3]
+  vertfetch_clamped 26, d6[2]
+  vertfetch_clamped 25, d6[1]
+  vertfetch_clamped 24, d6[0]
+  vertfetch_clamped 23, d5[3]
+  vertfetch_clamped 22, d5[2]
+  vertfetch_clamped 21, d5[1]
+  vertfetch_clamped 20, d5[0]
+  vertfetch_clamped 19, d4[3]
+  vertfetch_clamped 18, d4[2]
+  vertfetch_clamped 17, d4[1]
+  vertfetch_clamped 16, d4[0]
+  vertfetch_clamped 15, d3[3]
+  vertfetch_clamped 14, d3[2]
+  vertfetch_clamped 13, d3[1]
+  vertfetch_clamped 12, d3[0]
+  vertfetch_clamped 11, d2[3]
+  vertfetch_clamped 10, d2[2]
+  vertfetch_clamped  9, d2[1]
+  vertfetch_clamped  8, d2[0]
+  vertfetch_clamped  7, d1[3]
+  vertfetch_clamped  6, d1[2]
+  vertfetch_clamped  5, d1[1]
+  vertfetch_clamped  4, d1[0]
+  vertfetch_clamped  3, d0[3]
+  vertfetch_clamped  2, d0[2]
+  vertfetch_clamped  1, d0[1]
+  vertfetch_clamped  0, d0[0]
+  1:
+  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
+
+  /* Entry into this unrolled loop is computed as a negative index from
+   * \labelnc at the end of the block.
+   */
+  .align 4
+  vertfetch_noclamp 27, d6[3]
+  vertfetch_noclamp 26, d6[2]
+  vertfetch_noclamp 25, d6[1]
+  vertfetch_noclamp 24, d6[0]
+  vertfetch_noclamp 23, d5[3]
+  vertfetch_noclamp 22, d5[2]
+  vertfetch_noclamp 21, d5[1]
+  vertfetch_noclamp 20, d5[0]
+  vertfetch_noclamp 19, d4[3]
+  vertfetch_noclamp 18, d4[2]
+  vertfetch_noclamp 17, d4[1]
+  vertfetch_noclamp 16, d4[0]
+  vertfetch_noclamp 15, d3[3]
+  vertfetch_noclamp 14, d3[2]
+  vertfetch_noclamp 13, d3[1]
+  vertfetch_noclamp 12, d3[0]
+  vertfetch_noclamp 11, d2[3]
+  vertfetch_noclamp 10, d2[2]
+  vertfetch_noclamp  9, d2[1]
+  vertfetch_noclamp  8, d2[0]
+  vertfetch_noclamp  7, d1[3]
+  vertfetch_noclamp  6, d1[2]
+  vertfetch_noclamp  5, d1[1]
+  vertfetch_noclamp  4, d1[0]
+  vertfetch_noclamp  3, d0[3]
+  vertfetch_noclamp  2, d0[2]
+  vertfetch_noclamp  1, d0[1]
+  vertfetch_noclamp  0, d0[0]
+  \labelnc :
+
+  .purgem vertfetch_clamped
+  .purgem vertfetch_noclamp
+
+  2:        vqrshrn.u32 d20, q12, #16 - FRACTION_BITS   // rounding, saturating narrow of the 32-bit sums to 16 bits
+            vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
+            vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
+            vqrshrn.u32 d23, q15, #16 - FRACTION_BITS   // q10,q11 now hold the 16 convolved columns
+.endm /*}}}*/
+
+/* Some portion of the convolution window (as much as will fit, and all of it
+ * for the uchar1 cases) is kept in the register file to avoid unnecessary
+ * memory accesses.  This forces the horizontal loops to be unrolled because
+ * there's no indexed addressing into the register file.
+ *
+ * As in the fetch macro, the operations are ordered from outside to inside, so
+ * that jumping into the middle of the block bypasses the unwanted window taps.
+ *
+ * There are several variants of the macro because of the fixed offsets of the
+ * taps -- the wider the maximum radius the further the centre tap is from the
+ * most recently fetched data.  This means that pre-filling the window requires
+ * more data that won't be used and it means that rotating the window involves
+ * more mov operations.
+ *
+ * When the buffer gets too big the buffer at [r9] is used.
+ *
+ * Input:
+ *      q4-q11 -- convolution window
+ *      r9 -- pointer to additional convolution window data
+ * Output:
+ *      r9 -- updated buffer pointer (if used)
+ *      d31 -- result to be stored
+ * Modifies:
+ *      r12 -- temp buffer pointer
+ *      q12-q13 -- temporaries for load and vext operations.
+ *      q14-q15 -- intermediate sums
+ */
+#define TUNED_LIST1 8, 16 /* radii with a dedicated hconv1_* variant below */
+.macro hconv1_8/*{{{*/
+            vmull.u16   q14, d18, d0[0]         // centre tap: q9 is the centre of the q8-q10 window
+            vmull.u16   q15, d19, d0[0]
+
+            ldr         r12, [pc, r5, LSL #2]   // pc reads as the bkpt address, so r5 = 1 selects the first table word
+            add         pc, pc, r12             // pc reads as label 100 here: jump to 100 + offset
+            bkpt                                // not reached; fills the slot that r5 = 0 would index
+    100:    .word 101f-100b
+            .word 102f-100b
+            .word 103f-100b
+            .word 104f-100b
+            .word 105f-100b
+            .word 106f-100b
+            .word 107f-100b
+            .word 108f-100b
+    108:    vmlal.u16   q14, d16, d2[0]         // tap 8: q8 is 8 elements left of the centre
+            vmlal.u16   q15, d17, d2[0]
+            vmlal.u16   q14, d20, d2[0]         // and q10 is 8 elements right of it
+            vmlal.u16   q15, d21, d2[0]
+    107:    vext.u16    q12, q8, q9, #1         // tap 7: the 8 elements starting 7 left of the centre
+            vext.u16    q13, q9, q10, #7        // and the 8 elements starting 7 right of it
+            vmlal.u16   q14, d24, d1[3]
+            vmlal.u16   q15, d25, d1[3]
+            vmlal.u16   q14, d26, d1[3]
+            vmlal.u16   q15, d27, d1[3]
+    106:    vext.u16    q12, q8, q9, #2
+            vext.u16    q13, q9, q10, #6
+            vmlal.u16   q14, d24, d1[2]
+            vmlal.u16   q15, d25, d1[2]
+            vmlal.u16   q14, d26, d1[2]
+            vmlal.u16   q15, d27, d1[2]
+    105:    vext.u16    q12, q8, q9, #3
+            vext.u16    q13, q9, q10, #5
+            vmlal.u16   q14, d24, d1[1]
+            vmlal.u16   q15, d25, d1[1]
+            vmlal.u16   q14, d26, d1[1]
+            vmlal.u16   q15, d27, d1[1]
+    104:    //vext.u16    q12, q8, q9, #4
+            //vext.u16    q13, q9, q10, #4
+            vmlal.u16   q14, d17, d1[0]         // tap 4 falls on d register boundaries, so no vext is needed
+            vmlal.u16   q15, d18, d1[0]
+            vmlal.u16   q14, d19, d1[0]
+            vmlal.u16   q15, d20, d1[0]
+    103:    vext.u16    q12, q8, q9, #5
+            vext.u16    q13, q9, q10, #3
+            vmlal.u16   q14, d24, d0[3]
+            vmlal.u16   q15, d25, d0[3]
+            vmlal.u16   q14, d26, d0[3]
+            vmlal.u16   q15, d27, d0[3]
+    102:    vext.u16    q12, q8, q9, #6
+            vext.u16    q13, q9, q10, #2
+            vmlal.u16   q14, d24, d0[2]
+            vmlal.u16   q15, d25, d0[2]
+            vmlal.u16   q14, d26, d0[2]
+            vmlal.u16   q15, d27, d0[2]
+    101:    vext.u16    q12, q8, q9, #7
+            vext.u16    q13, q9, q10, #1
+            vmlal.u16   q14, d24, d0[1]
+            vmlal.u16   q15, d25, d0[1]
+            vmlal.u16   q14, d26, d0[1]
+            vmlal.u16   q15, d27, d0[1]
+
+            vqrshrn.u32 d28, q14, #16           // rounding, saturating narrow of the sums to 16 bits
+            vqrshrn.u32 d29, q15, #16
+            vqrshrn.u16 d31, q14, #FRACTION_BITS        // shift out the fractional bits: 8 result bytes in d31
+
+            vmov        q8, q9                  // slide the window along by one q register (8 elements)
+            vmov        q9, q10
+            vmov        q10, q11
+.endm/*}}}*/
+
+.macro hconv1_16/*{{{*/
+            vmull.u16   q14, d16, d0[0]         // centre tap: q8 is the centre of the q6-q10 window
+            vmull.u16   q15, d17, d0[0]
+
+            ldr         r12, [pc, r5, LSL #2]   // pc reads as the bkpt address, so r5 = 1 selects the first table word
+            add         pc, pc, r12             // pc reads as label 100 here: jump to 100 + offset
+            bkpt                                // not reached; fills the slot that r5 = 0 would index
+    100:    .word 101f-100b
+            .word 102f-100b
+            .word 103f-100b
+            .word 104f-100b
+            .word 105f-100b
+            .word 106f-100b
+            .word 107f-100b
+            .word 108f-100b
+            .word 109f-100b
+            .word 110f-100b
+            .word 111f-100b
+            .word 112f-100b
+            .word 113f-100b
+            .word 114f-100b
+            .word 115f-100b
+            .word 116f-100b
+    116:    //vext.u16    q12, q6, q7, #0
+            //vext.u16    q13, q10, q11, #0
+            vmlal.u16   q14, d12, d4[0]         // tap 16: q6 is 16 elements left of the centre
+            vmlal.u16   q15, d13, d4[0]
+            vmlal.u16   q14, d20, d4[0]         // and q10 is 16 elements right of it
+            vmlal.u16   q15, d21, d4[0]
+    115:    vext.u16    q12, q6, q7, #1
+            vext.u16    q13, q9, q10, #7
+            vmlal.u16   q14, d24, d3[3]
+            vmlal.u16   q15, d25, d3[3]
+            vmlal.u16   q14, d26, d3[3]
+            vmlal.u16   q15, d27, d3[3]
+    114:    vext.u16    q12, q6, q7, #2
+            vext.u16    q13, q9, q10, #6
+            vmlal.u16   q14, d24, d3[2]
+            vmlal.u16   q15, d25, d3[2]
+            vmlal.u16   q14, d26, d3[2]
+            vmlal.u16   q15, d27, d3[2]
+    113:    vext.u16    q12, q6, q7, #3
+            vext.u16    q13, q9, q10, #5
+            vmlal.u16   q14, d24, d3[1]
+            vmlal.u16   q15, d25, d3[1]
+            vmlal.u16   q14, d26, d3[1]
+            vmlal.u16   q15, d27, d3[1]
+    112:    //vext.u16    q12, q6, q7, #4
+            //vext.u16    q13, q9, q10, #4
+            vmlal.u16   q14, d13, d3[0]         // offsets that are multiples of 4 land on d registers, so no vext is needed
+            vmlal.u16   q15, d14, d3[0]
+            vmlal.u16   q14, d19, d3[0]
+            vmlal.u16   q15, d20, d3[0]
+    111:    vext.u16    q12, q6, q7, #5
+            vext.u16    q13, q9, q10, #3
+            vmlal.u16   q14, d24, d2[3]
+            vmlal.u16   q15, d25, d2[3]
+            vmlal.u16   q14, d26, d2[3]
+            vmlal.u16   q15, d27, d2[3]
+    110:    vext.u16    q12, q6, q7, #6
+            vext.u16    q13, q9, q10, #2
+            vmlal.u16   q14, d24, d2[2]
+            vmlal.u16   q15, d25, d2[2]
+            vmlal.u16   q14, d26, d2[2]
+            vmlal.u16   q15, d27, d2[2]
+    109:    vext.u16    q12, q6, q7, #7
+            vext.u16    q13, q9, q10, #1
+            vmlal.u16   q14, d24, d2[1]
+            vmlal.u16   q15, d25, d2[1]
+            vmlal.u16   q14, d26, d2[1]
+            vmlal.u16   q15, d27, d2[1]
+    108:    //vext.u16    q12, q7, q8, #0
+            //vext.u16    q13, q9, q10, #0
+            vmlal.u16   q14, d14, d2[0]
+            vmlal.u16   q15, d15, d2[0]
+            vmlal.u16   q14, d18, d2[0]
+            vmlal.u16   q15, d19, d2[0]
+    107:    vext.u16    q12, q7, q8, #1
+            vext.u16    q13, q8, q9, #7
+            vmlal.u16   q14, d24, d1[3]
+            vmlal.u16   q15, d25, d1[3]
+            vmlal.u16   q14, d26, d1[3]
+            vmlal.u16   q15, d27, d1[3]
+    106:    vext.u16    q12, q7, q8, #2
+            vext.u16    q13, q8, q9, #6
+            vmlal.u16   q14, d24, d1[2]
+            vmlal.u16   q15, d25, d1[2]
+            vmlal.u16   q14, d26, d1[2]
+            vmlal.u16   q15, d27, d1[2]
+    105:    vext.u16    q12, q7, q8, #3
+            vext.u16    q13, q8, q9, #5
+            vmlal.u16   q14, d24, d1[1]
+            vmlal.u16   q15, d25, d1[1]
+            vmlal.u16   q14, d26, d1[1]
+            vmlal.u16   q15, d27, d1[1]
+    104:    //vext.u16    q12, q7, q8, #4
+            //vext.u16    q13, q8, q9, #4
+            vmlal.u16   q14, d15, d1[0]
+            vmlal.u16   q15, d16, d1[0]
+            vmlal.u16   q14, d17, d1[0]
+            vmlal.u16   q15, d18, d1[0]
+    103:    vext.u16    q12, q7, q8, #5
+            vext.u16    q13, q8, q9, #3
+            vmlal.u16   q14, d24, d0[3]
+            vmlal.u16   q15, d25, d0[3]
+            vmlal.u16   q14, d26, d0[3]
+            vmlal.u16   q15, d27, d0[3]
+    102:    vext.u16    q12, q7, q8, #6
+            vext.u16    q13, q8, q9, #2
+            vmlal.u16   q14, d24, d0[2]
+            vmlal.u16   q15, d25, d0[2]
+            vmlal.u16   q14, d26, d0[2]
+            vmlal.u16   q15, d27, d0[2]
+    101:    vext.u16    q12, q7, q8, #7
+            vext.u16    q13, q8, q9, #1
+            vmlal.u16   q14, d24, d0[1]
+            vmlal.u16   q15, d25, d0[1]
+            vmlal.u16   q14, d26, d0[1]
+            vmlal.u16   q15, d27, d0[1]
+
+            vqrshrn.u32 d28, q14, #16           // rounding, saturating narrow of the sums to 16 bits
+            vqrshrn.u32 d29, q15, #16
+            vqrshrn.u16 d31, q14, #FRACTION_BITS        // shift out the fractional bits: 8 result bytes in d31
+
+            vmov        q6, q7                  // slide the window along by one q register (8 elements)
+            vmov        q7, q8
+            vmov        q8, q9
+            vmov        q9, q10
+            vmov        q10, q11
+.endm/*}}}*/
+
+.macro hconv1_25/*{{{*/
+            vext.u16    q12, q6, q7, #7         // the 8 centre elements start at element 7 of q6
+            vmull.u16   q14, d24, d0[0]         // centre tap
+            vmull.u16   q15, d25, d0[0]
+
+            ldr         r12, [pc, r5, LSL #2]   // pc reads as the bkpt address, so r5 = 1 selects the first table word
+            add         pc, pc, r12             // pc reads as label 100 here: jump to 100 + offset
+            bkpt                                // not reached; fills the slot that r5 = 0 would index
+    100:    .word 101f-100b
+            .word 102f-100b
+            .word 103f-100b
+            .word 104f-100b
+            .word 105f-100b
+            .word 106f-100b
+            .word 107f-100b
+            .word 108f-100b
+            .word 109f-100b
+            .word 110f-100b
+            .word 111f-100b
+            .word 112f-100b
+            .word 113f-100b
+            .word 114f-100b
+            .word 115f-100b
+            .word 116f-100b
+            .word 117f-100b
+            .word 118f-100b
+            .word 119f-100b
+            .word 120f-100b
+            .word 121f-100b
+            .word 122f-100b
+            .word 123f-100b
+            .word 124f-100b
+            .word 125f-100b
+    125:    vext.u16    q12, q3, q4, #6         // tap 25: only elements 6-7 of q3 (the top of d7) are read
+            vext.u16    q13, q10, q11, #0
+            vmlal.u16   q14, d24, d6[1]         // d6 supplies coefficients 24 and 25, not window data
+            vmlal.u16   q15, d25, d6[1]
+            vmlal.u16   q14, d26, d6[1]
+            vmlal.u16   q15, d27, d6[1]
+    124:    vext.u16    q12, q3, q4, #7
+            vext.u16    q13, q9, q10, #7
+            vmlal.u16   q14, d24, d6[0]
+            vmlal.u16   q15, d25, d6[0]
+            vmlal.u16   q14, d26, d6[0]
+            vmlal.u16   q15, d27, d6[0]
+    123:    vext.u16    q12, q4, q5, #0
+            vext.u16    q13, q9, q10, #6
+            vmlal.u16   q14, d24, d5[3]
+            vmlal.u16   q15, d25, d5[3]
+            vmlal.u16   q14, d26, d5[3]
+            vmlal.u16   q15, d27, d5[3]
+    122:    vext.u16    q12, q4, q5, #1
+            vext.u16    q13, q9, q10, #5
+            vmlal.u16   q14, d24, d5[2]
+            vmlal.u16   q15, d25, d5[2]
+            vmlal.u16   q14, d26, d5[2]
+            vmlal.u16   q15, d27, d5[2]
+    121:    vext.u16    q12, q4, q5, #2
+            vext.u16    q13, q9, q10, #4
+            vmlal.u16   q14, d24, d5[1]
+            vmlal.u16   q15, d25, d5[1]
+            vmlal.u16   q14, d26, d5[1]
+            vmlal.u16   q15, d27, d5[1]
+    120:    vext.u16    q12, q4, q5, #3
+            vext.u16    q13, q9, q10, #3
+            vmlal.u16   q14, d24, d5[0]
+            vmlal.u16   q15, d25, d5[0]
+            vmlal.u16   q14, d26, d5[0]
+            vmlal.u16   q15, d27, d5[0]
+    119:    vext.u16    q12, q4, q5, #4
+            vext.u16    q13, q9, q10, #2
+            vmlal.u16   q14, d24, d4[3]
+            vmlal.u16   q15, d25, d4[3]
+            vmlal.u16   q14, d26, d4[3]
+            vmlal.u16   q15, d27, d4[3]
+    118:    vext.u16    q12, q4, q5, #5
+            vext.u16    q13, q9, q10, #1
+            vmlal.u16   q14, d24, d4[2]
+            vmlal.u16   q15, d25, d4[2]
+            vmlal.u16   q14, d26, d4[2]
+            vmlal.u16   q15, d27, d4[2]
+    117:    vext.u16    q12, q4, q5, #6
+            vext.u16    q13, q9, q10, #0
+            vmlal.u16   q14, d24, d4[1]
+            vmlal.u16   q15, d25, d4[1]
+            vmlal.u16   q14, d26, d4[1]
+            vmlal.u16   q15, d27, d4[1]
+    116:    vext.u16    q12, q4, q5, #7
+            vext.u16    q13, q8, q9, #7
+            vmlal.u16   q14, d24, d4[0]
+            vmlal.u16   q15, d25, d4[0]
+            vmlal.u16   q14, d26, d4[0]
+            vmlal.u16   q15, d27, d4[0]
+    115:    vext.u16    q12, q5, q6, #0
+            vext.u16    q13, q8, q9, #6
+            vmlal.u16   q14, d24, d3[3]
+            vmlal.u16   q15, d25, d3[3]
+            vmlal.u16   q14, d26, d3[3]
+            vmlal.u16   q15, d27, d3[3]
+    114:    vext.u16    q12, q5, q6, #1
+            vext.u16    q13, q8, q9, #5
+            vmlal.u16   q14, d24, d3[2]
+            vmlal.u16   q15, d25, d3[2]
+            vmlal.u16   q14, d26, d3[2]
+            vmlal.u16   q15, d27, d3[2]
+    113:    vext.u16    q12, q5, q6, #2
+            vext.u16    q13, q8, q9, #4
+            vmlal.u16   q14, d24, d3[1]
+            vmlal.u16   q15, d25, d3[1]
+            vmlal.u16   q14, d26, d3[1]
+            vmlal.u16   q15, d27, d3[1]
+    112:    vext.u16    q12, q5, q6, #3
+            vext.u16    q13, q8, q9, #3
+            vmlal.u16   q14, d24, d3[0]
+            vmlal.u16   q15, d25, d3[0]
+            vmlal.u16   q14, d26, d3[0]
+            vmlal.u16   q15, d27, d3[0]
+    111:    vext.u16    q12, q5, q6, #4
+            vext.u16    q13, q8, q9, #2
+            vmlal.u16   q14, d24, d2[3]
+            vmlal.u16   q15, d25, d2[3]
+            vmlal.u16   q14, d26, d2[3]
+            vmlal.u16   q15, d27, d2[3]
+    110:    vext.u16    q12, q5, q6, #5
+            vext.u16    q13, q8, q9, #1
+            vmlal.u16   q14, d24, d2[2]
+            vmlal.u16   q15, d25, d2[2]
+            vmlal.u16   q14, d26, d2[2]
+            vmlal.u16   q15, d27, d2[2]
+    109:    vext.u16    q12, q5, q6, #6
+            vext.u16    q13, q8, q9, #0
+            vmlal.u16   q14, d24, d2[1]
+            vmlal.u16   q15, d25, d2[1]
+            vmlal.u16   q14, d26, d2[1]
+            vmlal.u16   q15, d27, d2[1]
+    108:    vext.u16    q12, q5, q6, #7
+            vext.u16    q13, q7, q8, #7
+            vmlal.u16   q14, d24, d2[0]
+            vmlal.u16   q15, d25, d2[0]
+            vmlal.u16   q14, d26, d2[0]
+            vmlal.u16   q15, d27, d2[0]
+    107:    vext.u16    q12, q6, q7, #0
+            vext.u16    q13, q7, q8, #6
+            vmlal.u16   q14, d24, d1[3]
+            vmlal.u16   q15, d25, d1[3]
+            vmlal.u16   q14, d26, d1[3]
+            vmlal.u16   q15, d27, d1[3]
+    106:    vext.u16    q12, q6, q7, #1
+            vext.u16    q13, q7, q8, #5
+            vmlal.u16   q14, d24, d1[2]
+            vmlal.u16   q15, d25, d1[2]
+            vmlal.u16   q14, d26, d1[2]
+            vmlal.u16   q15, d27, d1[2]
+    105:    vext.u16    q12, q6, q7, #2
+            vext.u16    q13, q7, q8, #4
+            vmlal.u16   q14, d24, d1[1]
+            vmlal.u16   q15, d25, d1[1]
+            vmlal.u16   q14, d26, d1[1]
+            vmlal.u16   q15, d27, d1[1]
+    104:    vext.u16    q12, q6, q7, #3
+            vext.u16    q13, q7, q8, #3
+            vmlal.u16   q14, d24, d1[0]
+            vmlal.u16   q15, d25, d1[0]
+            vmlal.u16   q14, d26, d1[0]
+            vmlal.u16   q15, d27, d1[0]
+    103:    vext.u16    q12, q6, q7, #4
+            vext.u16    q13, q7, q8, #2
+            vmlal.u16   q14, d24, d0[3]
+            vmlal.u16   q15, d25, d0[3]
+            vmlal.u16   q14, d26, d0[3]
+            vmlal.u16   q15, d27, d0[3]
+    102:    vext.u16    q12, q6, q7, #5
+            vext.u16    q13, q7, q8, #1
+            vmlal.u16   q14, d24, d0[2]
+            vmlal.u16   q15, d25, d0[2]
+            vmlal.u16   q14, d26, d0[2]
+            vmlal.u16   q15, d27, d0[2]
+    101:    vext.u16    q12, q6, q7, #6
+            vext.u16    q13, q7, q8, #0
+            vmlal.u16   q14, d24, d0[1]
+            vmlal.u16   q15, d25, d0[1]
+            vmlal.u16   q14, d26, d0[1]
+            vmlal.u16   q15, d27, d0[1]
+
+            vqrshrn.u32 d28, q14, #16           // rounding, saturating narrow of the sums to 16 bits
+            vqrshrn.u32 d29, q15, #16
+            vqrshrn.u16 d31, q14, #FRACTION_BITS        // shift out the fractional bits: 8 result bytes in d31
+
+            vmov        d7, d9                  // only the d7 half of q3 is window data, so only it is rotated (d6 keeps coefficients)
+            vmov        q4, q5                  // slide the rest of the window along by one q register
+            vmov        q5, q6
+            vmov        q6, q7
+            vmov        q7, q8
+            vmov        q8, q9
+            vmov        q9, q10
+            vmov        q10, q11
+.endm/*}}}*/
+
+#define TUNED_LIST4 6, 12 /* radii with a dedicated hconv4_* variant below */
+.macro hconv4_6/*{{{*/
+            vmull.u16   q14, d14, d0[0]         // centre tap: d14 and d15 (q7) are the two centre d registers
+            vmull.u16   q15, d15, d0[0]
+
+            ldr         r12, [pc, r5, LSL #2]   // pc reads as the bkpt address, so r5 = 1 selects the first table word
+            add         pc, pc, r12             // pc reads as label 100 here: jump to 100 + offset
+            bkpt                                // not reached; fills the slot that r5 = 0 would index
+    100:    .word 101f-100b
+            .word 102f-100b
+            .word 103f-100b
+            .word 104f-100b
+            .word 105f-100b
+            .word 106f-100b
+    106:    vmlal.u16   q14, d8,  d1[2]         // tap 6: six d registers either side of the centre pair
+            vmlal.u16   q15, d9,  d1[2]
+            vmlal.u16   q14, d20, d1[2]
+            vmlal.u16   q15, d21, d1[2]
+    105:    vmlal.u16   q14, d9,  d1[1]         // each tap steps one d register (four 16-bit lanes), so no vext is needed
+            vmlal.u16   q15, d10, d1[1]
+            vmlal.u16   q14, d19, d1[1]
+            vmlal.u16   q15, d20, d1[1]
+    104:    vmlal.u16   q14, d10, d1[0]
+            vmlal.u16   q15, d11, d1[0]
+            vmlal.u16   q14, d18, d1[0]
+            vmlal.u16   q15, d19, d1[0]
+    103:    vmlal.u16   q14, d11, d0[3]
+            vmlal.u16   q15, d12, d0[3]
+            vmlal.u16   q14, d17, d0[3]
+            vmlal.u16   q15, d18, d0[3]
+    102:    vmlal.u16   q14, d12, d0[2]
+            vmlal.u16   q15, d13, d0[2]
+            vmlal.u16   q14, d16, d0[2]
+            vmlal.u16   q15, d17, d0[2]
+    101:    vmlal.u16   q14, d13, d0[1]
+            vmlal.u16   q15, d14, d0[1]
+            vmlal.u16   q14, d15, d0[1]
+            vmlal.u16   q15, d16, d0[1]
+
+            vqrshrn.u32 d28, q14, #16           // rounding, saturating narrow of the sums to 16 bits
+            vqrshrn.u32 d29, q15, #16
+            vqrshrn.u16 d31, q14, #FRACTION_BITS        // shift out the fractional bits: 8 result bytes in d31
+
+            vmov        q4, q5                  // slide the window along by one q register
+            vmov        q5, q6
+            vmov        q6, q7
+            vmov        q7, q8
+            vmov        q8, q9
+            vmov        q9, q10
+            vmov        q10, q11
+.endm/*}}}*/
+
+.macro hconv4_12/*{{{*/
+            vmull.u16   q14, d8, d0[0]          // centre tap: d8 and d9 (q4) are the two centre d registers
+            vmull.u16   q15, d9, d0[0]
+
+            ldr         r12, [pc, r5, LSL #2]   // pc reads as the bkpt address, so r5 = 1 selects the first table word
+            add         pc, pc, r12             // pc reads as label 100 here: jump to 100 + offset
+            bkpt                                // not reached; fills the slot that r5 = 0 would index
+    100:    .word 101f-100b
+            .word 102f-100b
+            .word 103f-100b
+            .word 104f-100b
+            .word 105f-100b
+            .word 106f-100b
+            .word 107f-100b
+            .word 108f-100b
+            .word 109f-100b
+            .word 110f-100b
+            .word 111f-100b
+            .word 112f-100b
+    112:    add         r12, r9, #0x1a0         // 0x1a0 = 0x200 - 12 * 8: twelve 8-byte entries behind r9
+            bic         r12, r12, #0x200        // clear bit 9 to wrap the address (assumes a 0x200-byte ring placed with bit 9 clear)
+            vld1.u16    {d24,d25}, [r12:128]    // left-hand operands come from the buffer at r9
+            vmlal.u16   q14, d24, d3[0]
+            vmlal.u16   q15, d25, d3[0]
+            vmlal.u16   q14, d20, d3[0]         // right-hand operands come from the register window
+            vmlal.u16   q15, d21, d3[0]
+    111:    add         r12, r9, #0x1a8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!        // only 8-byte aligned here, so load the two halves separately
+            bic         r12, r12, #0x200        // and wrap again in case the second half crossed the end
+            vld1.u16    {d25}, [r12:64]
+            vmlal.u16   q14, d24, d2[3]
+            vmlal.u16   q15, d25, d2[3]
+            vmlal.u16   q14, d19, d2[3]
+            vmlal.u16   q15, d20, d2[3]
+    110:    add         r12, r9, #0x1b0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d2[2]
+            vmlal.u16   q15, d25, d2[2]
+            vmlal.u16   q14, d18, d2[2]
+            vmlal.u16   q15, d19, d2[2]
+    109:    add         r12, r9, #0x1b8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12:64]
+            vmlal.u16   q14, d24, d2[1]
+            vmlal.u16   q15, d25, d2[1]
+            vmlal.u16   q14, d17, d2[1]
+            vmlal.u16   q15, d18, d2[1]
+    108:    add         r12, r9, #0x1c0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d2[0]
+            vmlal.u16   q15, d25, d2[0]
+            vmlal.u16   q14, d16, d2[0]
+            vmlal.u16   q15, d17, d2[0]
+    107:    add         r12, r9, #0x1c8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12:64]
+            vmlal.u16   q14, d24, d1[3]
+            vmlal.u16   q15, d25, d1[3]
+            vmlal.u16   q14, d15, d1[3]
+            vmlal.u16   q15, d16, d1[3]
+    106:    add         r12, r9, #0x1d0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d1[2]
+            vmlal.u16   q15, d25, d1[2]
+            vmlal.u16   q14, d14, d1[2]
+            vmlal.u16   q15, d15, d1[2]
+    105:    add         r12, r9, #0x1d8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12:64]
+            vmlal.u16   q14, d24, d1[1]
+            vmlal.u16   q15, d25, d1[1]
+            vmlal.u16   q14, d13, d1[1]
+            vmlal.u16   q15, d14, d1[1]
+    104:    add         r12, r9, #0x1e0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d1[0]
+            vmlal.u16   q15, d25, d1[0]
+            vmlal.u16   q14, d12, d1[0]
+            vmlal.u16   q15, d13, d1[0]
+    103:    add         r12, r9, #0x1e8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12:64]
+            vmlal.u16   q14, d24, d0[3]
+            vmlal.u16   q15, d25, d0[3]
+            vmlal.u16   q14, d11, d0[3]
+            vmlal.u16   q15, d12, d0[3]
+    102:    add         r12, r9, #0x1f0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d0[2]
+            vmlal.u16   q15, d25, d0[2]
+            vmlal.u16   q14, d10, d0[2]
+            vmlal.u16   q15, d11, d0[2]
+    101:    add         r12, r9, #0x1f8         // one 8-byte entry behind r9
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]         // only one load: the second left-hand operand is d8, already in registers
+            vmlal.u16   q14, d24, d0[1]
+            vmlal.u16   q15, d8,  d0[1]
+            vmlal.u16   q14, d9,  d0[1]
+            vmlal.u16   q15, d10, d0[1]
+
+            vqrshrn.u32 d28, q14, #16           // rounding, saturating narrow of the sums to 16 bits
+            vqrshrn.u32 d29, q15, #16
+            vqrshrn.u16 d31, q14, #FRACTION_BITS        // shift out the fractional bits: 8 result bytes in d31
+
+            vst1.u8     {q4}, [r9:128]!         // append the outgoing centre register to the buffer, r9 += 16
+            bic         r9, r9, #0x200          // wrap r9 the same way as the loads above
+            vmov        q4, q5                  // slide the register window along by one q register
+            vmov        q5, q6
+            vmov        q6, q7
+            vmov        q7, q8
+            vmov        q8, q9
+            vmov        q9, q10
+            vmov        q10, q11
+.endm/*}}}*/
+
+.macro hconv4_25/*{{{*/ /* two 4-channel output pixels per expansion; r5 (1..25) selects how many taps run */
+            add         r12, r9, #0x198 /* the centre-tap pixel pair sits 0x198 bytes beyond r9 in the spill buffer */
+            bic         r12, r12, #0x200 /* clear bit 9 to wrap the address; presumably the buffer is a 0x200-byte ring -- confirm against its allocation */
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200 /* re-wrap after the post-increment */
+            vld1.u16    {d25}, [r12:64]
+            vmull.u16   q14, d24, d0[0] /* q14 accumulates the first output pixel, seeded with centre * d0[0] */
+            vmull.u16   q15, d25, d0[0] /* q15 accumulates the second output pixel */
+
+            ldr         r12, [pc, r5, LSL #2] /* pc reads as this address + 8, so r5 == 1 selects the first table word */
+            add         pc, pc, r12 /* pc reads as label 100 here; the table holds offsets relative to 100 */
+            bkpt /* never executed (the add above always branches); it sits where an r5 == 0 entry would be */
+    100:    .word 101f-100b
+            .word 102f-100b
+            .word 103f-100b
+            .word 104f-100b
+            .word 105f-100b
+            .word 106f-100b
+            .word 107f-100b
+            .word 108f-100b
+            .word 109f-100b
+            .word 110f-100b
+            .word 111f-100b
+            .word 112f-100b
+            .word 113f-100b
+            .word 114f-100b
+            .word 115f-100b
+            .word 116f-100b
+            .word 117f-100b
+            .word 118f-100b
+            .word 119f-100b
+            .word 120f-100b
+            .word 121f-100b
+            .word 122f-100b
+            .word 123f-100b
+            .word 124f-100b
+            .word 125f-100b
+    125:    add         r12, r9, #0x0d0 /* tap 25 (coefficient d6[1]): left pair from the buffer at 0x198 - 8*25 */
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d6[1]
+            vmlal.u16   q15, d25, d6[1]
+            vmlal.u16   q14, d20, d6[1] /* right pair comes straight from the register window */
+            vmlal.u16   q15, d21, d6[1]
+    124:    add         r12, r9, #0x0d8 /* each tap falls through to the next smaller one; offsets step by 8 bytes (one 4 x u16 pixel) */
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200 /* odd taps are only 8-byte aligned, so the pair is loaded in two wrapped halves */
+            vld1.u16    {d25}, [r12]
+            vmlal.u16   q14, d24, d6[0]
+            vmlal.u16   q15, d25, d6[0]
+            vmlal.u16   q14, d19, d6[0]
+            vmlal.u16   q15, d20, d6[0]
+    123:    add         r12, r9, #0x0e0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d5[3]
+            vmlal.u16   q15, d25, d5[3]
+            vmlal.u16   q14, d18, d5[3]
+            vmlal.u16   q15, d19, d5[3]
+    122:    add         r12, r9, #0x0e8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+            vmlal.u16   q14, d24, d5[2]
+            vmlal.u16   q15, d25, d5[2]
+            vmlal.u16   q14, d17, d5[2]
+            vmlal.u16   q15, d18, d5[2]
+    121:    add         r12, r9, #0x0f0
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d5[1]
+            vmlal.u16   q15, d25, d5[1]
+            vmlal.u16   q14, d16, d5[1]
+            vmlal.u16   q15, d17, d5[1]
+    120:    add         r12, r9, #0x0f8
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+            vmlal.u16   q14, d24, d5[0]
+            vmlal.u16   q15, d25, d5[0]
+            vmlal.u16   q14, d15, d5[0]
+            vmlal.u16   q15, d16, d5[0]
+    119:    add         r12, r9, #0x100
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d4[3]
+            vmlal.u16   q15, d25, d4[3]
+            vmlal.u16   q14, d14, d4[3]
+            vmlal.u16   q15, d15, d4[3]
+    118:    add         r12, r9, #0x108
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+            vmlal.u16   q14, d24, d4[2]
+            vmlal.u16   q15, d25, d4[2]
+            vmlal.u16   q14, d13, d4[2]
+            vmlal.u16   q15, d14, d4[2]
+    117:    add         r12, r9, #0x110
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d4[1]
+            vmlal.u16   q15, d25, d4[1]
+            vmlal.u16   q14, d12, d4[1]
+            vmlal.u16   q15, d13, d4[1]
+    116:    add         r12, r9, #0x118
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+            vmlal.u16   q14, d24, d4[0]
+            vmlal.u16   q15, d25, d4[0]
+            vmlal.u16   q14, d11, d4[0]
+            vmlal.u16   q15, d12, d4[0]
+    115:    add         r12, r9, #0x120
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d3[3]
+            vmlal.u16   q15, d25, d3[3]
+            vmlal.u16   q14, d10, d3[3]
+            vmlal.u16   q15, d11, d3[3]
+    114:    add         r12, r9, #0x128
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+            vmlal.u16   q14, d24, d3[2]
+            vmlal.u16   q15, d25, d3[2]
+            vmlal.u16   q14, d9,  d3[2]
+            vmlal.u16   q15, d10, d3[2]
+    113:    add         r12, r9, #0x130 /* last tap whose right pair (d8, d9) is entirely in registers */
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+            vmlal.u16   q14, d24, d3[1]
+            vmlal.u16   q15, d25, d3[1]
+            vmlal.u16   q14, d8,  d3[1]
+            vmlal.u16   q15, d9,  d3[1]
+    112:    add         r12, r9, #0x138
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+                                            add         r12, r9, #0x1f8 /* from tap 12 down, right-hand pixels are reloaded from the buffer (indented column) */
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26}, [r12:64]
+            vmlal.u16   q14, d24, d3[0]
+            vmlal.u16   q15, d25, d3[0]
+            vmlal.u16   q14, d26, d3[0]   @ Could be d7, without the load, right?
+            vmlal.u16   q15, d8,  d3[0]
+    111:    add         r12, r9, #0x140
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+                                            add         r12, r9, #0x1f0
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26,d27}, [r12:128]
+            vmlal.u16   q14, d24, d2[3]
+            vmlal.u16   q15, d25, d2[3]
+            vmlal.u16   q14, d26, d2[3]
+            vmlal.u16   q15, d27, d2[3]
+    110:    add         r12, r9, #0x148
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+                                            add         r12, r9, #0x1e8
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26}, [r12:64]!
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d27}, [r12:64]
+            vmlal.u16   q14, d24, d2[2]
+            vmlal.u16   q15, d25, d2[2]
+            vmlal.u16   q14, d26, d2[2]
+            vmlal.u16   q15, d27, d2[2]
+    109:    add         r12, r9, #0x150
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+                                            add         r12, r9, #0x1e0
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26,d27}, [r12:128]
+            vmlal.u16   q14, d24, d2[1]
+            vmlal.u16   q15, d25, d2[1]
+            vmlal.u16   q14, d26, d2[1]
+            vmlal.u16   q15, d27, d2[1]
+    108:    add         r12, r9, #0x158
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+                                            add         r12, r9, #0x1d8
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26}, [r12:64]!
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d27}, [r12:64]
+            vmlal.u16   q14, d24, d2[0]
+            vmlal.u16   q15, d25, d2[0]
+            vmlal.u16   q14, d26, d2[0]
+            vmlal.u16   q15, d27, d2[0]
+    107:    add         r12, r9, #0x160
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+                                            add         r12, r9, #0x1d0
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26,d27}, [r12:128]
+            vmlal.u16   q14, d24, d1[3]
+            vmlal.u16   q15, d25, d1[3]
+            vmlal.u16   q14, d26, d1[3]
+            vmlal.u16   q15, d27, d1[3]
+    106:    add         r12, r9, #0x168
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+                                            add         r12, r9, #0x1c8
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26}, [r12:64]!
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d27}, [r12:64]
+            vmlal.u16   q14, d24, d1[2]
+            vmlal.u16   q15, d25, d1[2]
+            vmlal.u16   q14, d26, d1[2]
+            vmlal.u16   q15, d27, d1[2]
+    105:    add         r12, r9, #0x170
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+                                            add         r12, r9, #0x1c0
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26,d27}, [r12:128]
+            vmlal.u16   q14, d24, d1[1]
+            vmlal.u16   q15, d25, d1[1]
+            vmlal.u16   q14, d26, d1[1]
+            vmlal.u16   q15, d27, d1[1]
+    104:    add         r12, r9, #0x178
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+                                            add         r12, r9, #0x1b8
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26}, [r12:64]!
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d27}, [r12:64]
+            vmlal.u16   q14, d24, d1[0]
+            vmlal.u16   q15, d25, d1[0]
+            vmlal.u16   q14, d26, d1[0]
+            vmlal.u16   q15, d27, d1[0]
+    103:    add         r12, r9, #0x180
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]
+                                            add         r12, r9, #0x1b0
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26,d27}, [r12:128]
+            vmlal.u16   q14, d24, d0[3]
+            vmlal.u16   q15, d25, d0[3]
+            vmlal.u16   q14, d26, d0[3]
+            vmlal.u16   q15, d27, d0[3]
+    102:    add         r12, r9, #0x188
+            bic         r12, r12, #0x200
+            vld1.u16    {d24}, [r12:64]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d25}, [r12]
+                                            add         r12, r9, #0x1a8
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d26}, [r12:64]!
+                                            bic         r12, r12, #0x200
+                                            vld1.u16    {d27}, [r12:64]
+            vmlal.u16   q14, d24, d0[2]
+            vmlal.u16   q15, d25, d0[2]
+            vmlal.u16   q14, d26, d0[2]
+            vmlal.u16   q15, d27, d0[2]
+    101:    add         r12, r9, #0x190 /* tap 1: left pair at 0x190/0x198, right pair at 0x1a0/0x1a8 */
+            bic         r12, r12, #0x200
+            vld1.u16    {d24,d25}, [r12:128]!
+            bic         r12, r12, #0x200
+            vld1.u16    {d26,d27}, [r12:128]
+            vmlal.u16   q14, d24, d0[1]
+            vmlal.u16   q15, d25, d0[1]
+            vmlal.u16   q14, d26, d0[1]
+            vmlal.u16   q15, d27, d0[1]
+
+            vqrshrn.u32 d28, q14, #16 /* round, saturate and narrow each 32-bit sum to 16 bits */
+            vqrshrn.u32 d29, q15, #16
+            vqrshrn.u16 d31, q14, #FRACTION_BITS /* q14 is now d28:d29; narrow again to eight output bytes in d31 */
+
+            vst1.u8     {q4}, [r9:128]! /* retire the oldest two register-window pixels into the buffer and advance r9 */
+            bic         r9, r9, #0x200 /* wrap the buffer pointer the same way as the tap addresses */
+            vmov        q4, q5 /* slide the register window along by two pixels; q11 is left for the caller to refill */
+            vmov        q5, q6
+            vmov        q6, q7
+            vmov        q7, q8
+            vmov        q8, q9
+            vmov        q9, q10
+            vmov        q10, q11
+.endm/*}}}*/
+
+/* Dedicated function wrapper for the fetch macro, for the cases where
+ * performance isn't that important, to keep code size down.
+ */
+PRIVATE(fetch_generic_asm)
+            push        {r10,r11} /* preserved across the expansion; presumably the default fetch macro uses them as scratch -- confirm */
+            fetch /* default-argument expansion of the fetch macro defined elsewhere in this file */
+            pop         {r10,r11}
+            bx          lr
+END(fetch_generic_asm)
+
+
+/* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory
+ * beyond that limit, and filling the rest of the vector with the first legal
+ * pixel.
+ * Result is in q10 and q11.  q8 and q9 are filled with the first legal pixel.
+ * Note: This function can read beyond the right edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampleft1)
+            push        {r12,lr}
+            bl          fetch_generic_asm /* full 16-lane fetch into q10/q11 */
+            vdup.u16    q8, d20[0] /* left padding = first fetched lane, replicated */
+            vdup.u16    q9, d20[0]
+            ands        r12, r10, #15 /* r12 = number of padding lanes wanted at the front of the chunk */
+            beq         1f /* already chunk-aligned: q10/q11 are used as fetched */
+            sub         r1, r1, r12 /* rewind the source pointer by the padding count (presumably fetch advanced it by 16 -- confirm) */
+            sub         r10, r10, r12 /* round r10 down to the chunk boundary */
+            sub         sp, sp, #32
+            vst1.u16    {q10,q11}, [sp] /* image data goes in the upper 32 bytes of a 64-byte stack scratch area */
+            sub         r12, sp, r12, LSL #1 /* point r12 lanes (2 bytes each) below the image data */
+            sub         sp, sp, #32
+            vst1.u16    {q8,q9}, [sp] /* padding goes directly beneath the image data */
+            vld1.u16    {q10,q11}, [r12] /* reload: r12 padding lanes followed by the first 16 - r12 image lanes */
+            add         sp, sp, #64
+1:          pop         {r12,pc}
+END(fetch_clampleft1)
+
+PRIVATE(fetch_clampleft4)
+            push        {r12,lr}
+            bl          fetch_generic_asm /* full 16-lane fetch into q10/q11 */
+            vmov.u16    d16, d20 /* left padding = first four fetched lanes (one 4-channel pixel), repeated across q8/q9 */
+            vmov.u16    d17, d20
+            vmov.u16    d18, d20
+            vmov.u16    d19, d20
+            ands        r12, r10, #15 /* r12 = number of padding lanes wanted at the front of the chunk */
+            beq         1f /* already chunk-aligned: q10/q11 are used as fetched */
+            sub         r1, r1, r12 /* rewind the source pointer by the padding count (presumably fetch advanced it by 16 -- confirm) */
+            sub         r10, r10, r12 /* round r10 down to the chunk boundary */
+            sub         sp, sp, #32
+            vst1.u16    {q10-q11}, [sp] /* image data goes in the upper 32 bytes of a 64-byte stack scratch area */
+            sub         r12, sp, r12, LSL #1 /* point r12 lanes (2 bytes each) below the image data */
+            sub         sp, sp, #32
+            vst1.u16    {q8,q9}, [sp] /* padding goes directly beneath the image data */
+            vld1.u16    {q10,q11}, [r12] /* reload: r12 padding lanes followed by the first 16 - r12 image lanes */
+            add         sp, sp, #64
+1:          pop         {r12,pc}
+END(fetch_clampleft4)
+
+/* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding
+ * reading memory beyond that limit, and filling the rest of the vector with
+ * the last legal pixel.
+ * Result is in q10 and q11.  q12 and q13 are filled with the last legal pixel.
+ * Note: This function can read beyond the left edge of input if the image is
+ * narrower than 16 bytes.
+ */
+PRIVATE(fetch_clampright1)
+            push        {r12, lr}
+            rsb         r12, r11, #0
+            ands        r12, r12, #15 /* r12 = (-r11) & 15 = lanes by which a full fetch would overshoot */
+            beq         1f /* r11 is a multiple of 16: a whole chunk is legal */
+            sub         r1, r1, r12 /* back the source pointer up so the fetch ends at the last legal column */
+            bl          fetch_generic_asm
+            vdup.u16    q12, d23[3] /* right padding = last fetched lane, replicated */
+            vdup.u16    q13, d23[3]
+            rsb         r12, r11, #0 /* recompute the overshoot; r12 is not relied on to survive the call */
+            and         r12, r12, #15
+            sub         sp, sp, #32
+            vst1.u16    {q12,q13}, [sp] /* padding goes in the upper 32 bytes of a 64-byte stack scratch area */
+            sub         sp, sp, #32
+            add         r12, sp, r12, LSL #1 /* skip the r12 lanes (2 bytes each) that precede the wanted data */
+            vst1.u16    {q10,q11}, [sp] /* fetched data goes directly beneath the padding */
+            vld1.u16    {q10,q11}, [r12] /* reload: the last 16 - r12 fetched lanes followed by r12 padding lanes */
+            add         sp, sp, #64
+            pop         {r12,pc}
+1:          bl          fetch_generic_asm /* aligned case: plain fetch, padding still prepared for the caller */
+            vdup.u16    q12, d23[3]
+            vdup.u16    q13, d23[3]
+            pop         {r12,pc}
+END(fetch_clampright1)
+
+PRIVATE(fetch_clampright4)
+            push        {r12, lr}
+            rsb         r12, r11, #0
+            ands        r12, r12, #15 /* r12 = (-r11) & 15 = lanes by which a full fetch would overshoot */
+            beq         1f /* r11 is a multiple of 16: a whole chunk is legal */
+            sub         r1, r1, r12 /* back the source pointer up so the fetch ends at the last legal column */
+            bl          fetch_generic_asm
+            vmov.u16    d24, d23 /* right padding = last four fetched lanes (one 4-channel pixel), repeated across q12/q13 */
+            vmov.u16    d25, d23
+            vmov.u16    d26, d23
+            vmov.u16    d27, d23
+            rsb         r12, r11, #0 /* recompute the overshoot; r12 is not relied on to survive the call */
+            and         r12, r12, #15
+            sub         sp, sp, #32
+            vst1.u16    {q12-q13}, [sp] /* padding goes in the upper 32 bytes of a 64-byte stack scratch area */
+            sub         sp, sp, #32
+            add         r12, sp, r12, LSL #1 /* skip the r12 lanes (2 bytes each) that precede the wanted data */
+            vst1.u16    {q10,q11}, [sp] /* fetched data goes directly beneath the padding */
+            vld1.u16    {q10,q11}, [r12] /* reload: the last 16 - r12 fetched lanes followed by r12 padding lanes */
+            add         sp, sp, #64
+            pop         {r12,pc}
+1:          bl          fetch_generic_asm /* aligned case: plain fetch, padding still prepared for the caller */
+            vmov.u16    d24, d23
+            vmov.u16    d25, d23
+            vmov.u16    d26, d23
+            vmov.u16    d27, d23
+            pop         {r12,pc}
+END(fetch_clampright4)
+
+/* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th
+ * value across to fill the rest of the register pair.  Used for filling the
+ * right hand edge of the window when reading too close to the right hand edge
+ * of the image.
+ * Also returns a dup-ed copy of the last element in q12 and q13 for the
+ * tail-fill case (this happens incidentally in common path, but must be done
+ * deliberately in the fast-out path).
+ */
+PRIVATE(prefill_sweepright1)
+            ands        r12, r11, #15 /* r12 = number of legal lanes in this chunk; 0 means all 16 */
+            beq         1f
+            sub         r12, r12, #1 /* index of the last legal lane */
+            sub         sp, sp, #64 /* 64 bytes: the 32-byte fill below may start up to 30 bytes into the saved chunk */
+            vst1.u16    {q10,q11}, [sp]
+            add         r12, sp, r12, LSL #1 /* address of the last legal lane (2 bytes per lane) */
+            vld1.u16    {d24[],d25[]}, [r12] /* replicate that lane across q12 ... */
+            vld1.u16    {d26[],d27[]}, [r12] /* ... and q13 */
+            vst1.u16    {q12,q13}, [r12] /* overwrite everything from that lane onward with the replicated value */
+            vld1.u16    {q10,q11}, [sp] /* reload the chunk with its tail now filled */
+            add         sp, sp, #64
+            bx          lr
+1:          vdup.u16    q12, d23[3] /* whole chunk legal: only prepare the padding from the last lane */
+            vdup.u16    q13, d23[3]
+            bx          lr
+END(prefill_sweepright1)
+
+PRIVATE(prefill_sweepright4)
+            ands        r12, r11, #15 /* r12 = number of legal lanes in this chunk; 0 means all 16 */
+            beq         1f
+            sub         r12, r12, #4 /* index of the first lane of the last legal 4-lane pixel */
+            sub         sp, sp, #64 /* 64 bytes: the 32-byte fill below may start part-way into the saved chunk */
+            vst1.u16    {q10,q11}, [sp]
+            add         r12, sp, r12, LSL #1 /* address of the last legal pixel (2 bytes per lane) */
+            vld1.u64    {d24}, [r12] /* the same 8-byte pixel is loaded into all four halves of q12/q13 */
+            vld1.u64    {d25}, [r12]
+            vld1.u64    {d26}, [r12]
+            vld1.u64    {d27}, [r12]
+            vst1.u16    {q12,q13}, [r12] /* overwrite everything from that pixel onward with copies of it */
+            vld1.u16    {q10,q11}, [sp] /* reload the chunk with its tail now filled */
+            add         sp, sp, #64
+            bx          lr
+1:          vmov.u16    d24, d23 /* whole chunk legal: only prepare the padding from the last pixel */
+            vmov.u16    d25, d23
+            vmov.u16    d26, d23
+            vmov.u16    d27, d23
+            bx          lr
+END(prefill_sweepright4)
+
+/* The main loop keeps a sliding window of data that has already been convolved
+ * in the vertical axis for the current line.  This usually stays in the
+ * register file, but spills to memory for large windows.  The first thing that
+ * needs to be done at start-up is to fill this window with image data, taking
+ * into account the padding needed if the left or right edges of the image fall
+ * within this window.
+ */
+
+/* Because the window is in the register file writes to it cannot be indexed
+ * by another register.  Consequently the fill loops are unrolled to address
+ * the registers directly.  This macro distinguishes between writes to the
+ * register file and writes to the spill buffer (indicated by a destination
+ * register named xx).
+ */
+.macro prefill_out ra, rb, sra, srb, srb_hi
+  .ifc \ra,xx
+    .ifc \rb,xx
+            vst1.u16    {\sra,\srb}, [r9:128]! /* both destinations are xx: store the chunk to the spill buffer and advance r9 */
+    .else
+            /* this case is used only for the last tap of uchar1 r=25 */
+            /* discard \sra */
+            vmov.u16    \rb, \srb_hi /* srb_hi is the D register the caller names as the source for the single kept half */
+    .endif
+  .else
+    .ifnc \ra,\sra
+            vmov.u16    \ra, \sra /* omitted when source and destination are the same register */
+    .endif
+    .ifnc \rb,\srb
+            vmov.u16    \rb, \srb
+    .endif
+  .endif
+.endm
+
+/* This macro provides the list of registers representing the window, and the
+ * cases where the register file is too small and a spill buffer is used
+ * instead.
+ * Since several specialisations of each function are generated, this also
+ * culls superfluous iterations, and sets the variable `i` for subsequent
+ * macros indicating the current index into the window.
+ */
+.macro prefill_list, macro, nextmacro, max_r, step, label /* max_r is accepted but not referenced in this body; culling uses the windowsize symbol */
+  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
+    .if windowsize >= (\line * 16) /* emit only the chunks that fit in the current window; line counts 16-lane chunks from the end */
+      .set i, windowsize - (\line * 16) /* i = window index at which this chunk starts */
+\label\macro\line:
+            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step /* next = following stage at this chunk, after = following stage at the next chunk */
+    .endif
+  .endm
+  .if \step > 1
+            ifneeded \macro \nextmacro, 13, 12, xx, xx,  \step, \label /* chunks 13..4 go to the spill buffer (xx, xx) */
+            ifneeded \macro \nextmacro, 12, 11, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro, 11, 10, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro, 10,  9, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro,  9,  8, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro,  8,  7, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro,  7,  6, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro,  6,  5, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro,  5,  4, xx, xx,  \step, \label
+            ifneeded \macro \nextmacro,  4,  3, xx, xx,  \step, \label
+  .else
+            /* q3 normally contains the coefficient table, but it's not fully
+             * used.  In the uchar1, r=25 case the other half of q3 is used for
+             * the last two window taps to avoid falling out to memory.
+             */
+            ifneeded \macro \nextmacro,  4,  3, xx, d7,   \step, \label
+  .endif
+            ifneeded \macro \nextmacro,  3,  2, q4, q5,   \step, \label /* the last three chunks land in registers q4..q9 */
+            ifneeded \macro \nextmacro,  2,  1, q6, q7,   \step, \label
+            ifneeded \macro \nextmacro,  1,  0, q8, q9,   \step, \label
+
+\label\macro\()0:
+            b           \label\()_end /* chunk 0 is past the end of the window: this stage is finished */
+  .purgem ifneeded
+.endm
+
+/* These macros represent the possible stages of filling the window.
+ * Each macro is unrolled enough times that it can fill the entire window
+ * itself, but normally it will have to hand control to subsequent macros
+ * part-way through and this is done using labels named \next and \after, where
+ * \next is the next macro starting at the same window position and \after is
+ * the next macro starting after the current window position.
+ */
+
+/* leftfill: q8 and q9 contain the left padding value.  While the window
+ * extends outside of the image on the left-hand side, and at least 16 more
+ * padding values are needed in the window, store q8 and q9 into the window.
+ * Otherwise skip forward to storing image data.
+ */
+.macro prefill_leftfill, next, after, ra, rb, step
+            cmp         r10, #i+16
+            blo         \next /* legal data starts inside this chunk (r10 < i + 16): hand over to the next stage here */
+            prefill_out \ra, \rb, q8, q9, d19 /* otherwise the whole chunk is left padding */
+.endm
+
+/* leftedge: The very first non-fill or partial-fill chunk from the image is
+ * already loaded (as it was used to calculate the left padding value), so
+ * store it here, and then drop into the regular load/store cycle in the next
+ * macro.
+ */
+.macro prefill_leftedge, next, after, ra, rb, step
+1:          prefill_out \ra, \rb, q10, q11, d23 /* emit the chunk already sitting in q10/q11 */
+            b           \after /* continue with the following stage at the next chunk */
+.endm
+
+/* dofetch: Copy chunks of the image into the window without any complications
+ * from edge conditions.
+ */
+.macro prefill_dofetch, next, after, ra, rb, step
+            cmp         r11, #i+16
+            bls         \next /* legal data does not extend past this chunk (r11 <= i + 16): hand over to the next stage here */
+            bl          fetch_generic_asm /* a full chunk is available: unclamped fetch into q10/q11 */
+            prefill_out \ra, \rb, q10, q11, d23
+.endm
+
+/* rightedge: The last fetch (currently in q10 and q11) may have gone beyond
+ * the right-hand edge of the image.  In that case sweep the last valid pixel
+ * across the rest of the chunk, and in either case prepare padding data in q12
+ * and q13 for the next macro.  This is done in fetch_clampright.
+ * This only happens once before going on to the next macro.
+ * Sometimes leftedge also covers the rightedge case, in which case this has
+ * to be skipped altogether.
+ */
+.macro prefill_rightedge, next, after, ra, rb, step
+            cmp         r11, #i
+            bls         \next /* no legal data at or beyond this chunk (r11 <= i): go straight to the next stage here */
+            bl          fetch_clampright\step /* final, possibly partial, chunk; also leaves padding in q12/q13 */
+            prefill_out \ra, \rb, q10, q11, d23
+            b           \after /* continue with the following stage at the next chunk */
+.endm
+
+/* rightfill: The rest of the window is simply filled with right padding from
+ * q12 and q13.
+ */
+.macro prefill_rightfill, next, after, ra, rb, step
+            prefill_out \ra, \rb, q12, q13, d25 /* next/after are unused here; d25 belongs to q12, not q13, and is only read when rb is a D register (step 1), where q12 and q13 hold the same replicated lane */
+.endm
+
+/* Here all of the macros above are unrolled and laid out in the proper order.
+ */
+.macro prefill_body, max_r, step, label
+            prefill_list leftfill,  leftedge,   \max_r, \step, \label /* each stage is unrolled over every chunk; its labels are the entry points used by the stage before it */
+            prefill_list leftedge,  dofetch,    \max_r, \step, \label
+            prefill_list dofetch,   rightedge,  \max_r, \step, \label
+            prefill_list rightedge, rightfill,  \max_r, \step, \label
+            prefill_list rightfill, oops,       \max_r, \step, \label /* rightfill never uses its next/after arguments, so no oops label is defined or referenced */
+\label\()_end:
+.endm
+
+/* Fill the convolution window with context data.  The aim here is to load
+ * exactly 2*r columns, and in the main loop to read as many columns as will be
+ * written.  This is complicated by the window being divided into chunks at
+ * register boundaries, and the need to handle cases when the input starts very
+ * close to the left or right (or both) edges of the image and the need to fill
+ * the spaces that leaves with left and right edge padding values.
+ *
+ * Input:
+ *      r1 -- src
+ *      r2 -- pitch
+ *      r3 -- count
+ *      r4 -- available image data right of src pointer
+ *      r5 -- r
+ *      r6 -- rup
+ *      r7 -- rdn
+ *      r8 -- available image data left of src pointer
+ *      r9 -- buffer (if needed)
+ * Output:
+ *      r4 -= min(inlen, count + windowsize - centertap)
+ *      r1 += min(inlen, count + windowsize - centertap)
+ * Modifies:
+ *      r10 -- fill start index in the window
+ *      r11 -- fill stop index in the window
+ *      r12 -- scratch
+ */
+.macro prefill step=1, max_r=25, label=xx
+.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15) /* 2 * max_r * step lanes, rounded up to whole 16-lane chunks */
+.set centertap, (windowsize - \max_r * \step)
+            mov         r10, #centertap
+            subs        r10, r10, r8
+            movlo       r10, #0 /* r10 = max(0, centertap - r8): no left padding when enough data lies to the left */
+
+            subs        r11, r4, #windowsize - centertap
+            movhs       r11, #0
+            add         r11, r11, #windowsize /* r11 = windowsize + min(0, r4 - (windowsize - centertap)) */
+
+            /* r10 indicates where in the window legal image data begins.
+             * r11 indicates where in the window legal image data ends.
+             * When starting near the centre of a large image these would be
+             * zero and windowsize respectively, but when starting near the
+             * edges this can change.
+             * When starting on the leftmost pixel, r10 will be centertap.
+             * When starting on the rightmost pixel, r11 will be centertap+1.
+             */
+
+            /* r4 indicates how much data there is between the current pointers
+             * and the right edge of the image.  The pointers currently point
+             * to the data needed at centertap.  The subsequent code will
+             * consume (windowsize - r10) data, but only the data from
+             * centertap to windowsize comes out of r4's budget.
+             */
+1:          subs        r4, r4, #windowsize - centertap
+            movlo       r4, #0 /* clamp the remaining budget at zero */
+
+            /* And the pointers need to rewind to the start of the window.
+             */
+            sub         r1, r1, #centertap
+
+            /* Unless r8 indicated that there wasn't that much data available.
+             */
+            add         r1, r1, r10
+
+
+            /* Get the first chunk, and add padding to align it to the window
+             * if necessary.
+             */
+            bl          fetch_clampleft\step /* bl overwrites lr, so the code expanding this macro must not rely on it */
+
+            /* Sometimes the start and the end of the window are in the same
+             * chunk.  In that case both ends need filler at the outset.
+             */
+            sub         r12, r11, #1
+            eor         r12,  r10, r12
+            cmp         r12, #16 /* below 16 only when r10 and r11 - 1 differ in nothing but their low four bits, i.e. share a chunk */
+            bllo        prefill_sweepright\step
+
+            /* Iterate through all the points in the window and fill them in
+             * with padding or image data as needed.
+             */
+            prefill_body \max_r, \step, \label
+.endm
+
+/* The main body of the convolve functions.  Having already pre-filled the
+ * convolution window with 2*r input values, the logic settles into a regular
+ * pattern of reading and writing at a 1:1 rate until either input or output
+ * expires.  The input leads the output by r values, so when processing all the
+ * way to the right-hand edge, or within r pixels of that edge, the input will
+ * run out first.  In the case of very narrow images, or sub-windows starting
+ * near the right edge, the input may already have run out while the
+ * convolution window was being filled and this loop will start with a
+ * zero-length input.
+ *
+ * Once the input runs out, the rest of the output must be processed by padding
+ * the remainder of the window with pad value from the last valid pixel from
+ * the source.
+ *
+ * Input:
+ *      r0 = dst
+ *      r1 = src
+ *      r2 = pitch
+ *      r3 = count
+ *      r4 = inlen
+ *      r5 = r
+ *      r6 = rup
+ *      r7 = rdn
+ *      r9 = buffer
+ * Modifies
+ *      r8 = fetch code pointer
+ */
+.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
+
+            /* If r4 >= r3 then there's no need for clipping.  The main loop
+             * needs to exit when either r3 or r4 runs out, so clamp r4 to be
+             * no greater than r3 and use r4 for the loop.
+             * However, if r4 comes out of the loop with less than 16 bytes
+             * left, a partial read would be necessary to avoid reading beyond
+             * the end of the image.  To avoid this, clamp r4 to the next
+             * multiple of 16, which is still sufficient to force it out of the
+             * loop but doesn't imply a rewind.
+             */
+            add         r12, r3, #15
+            bic         r12, r12, #15
+            cmp         r4, r12
+            movhi       r4, r12
+
+            /* First calculate the entry-point into the internal fetch logic.
+             * This is done so the same function can service several kernel
+             * sizes.
+             */
+            ldr         r8, 3f
+1:          add         r8, r8, pc
+            sub         r8, r5, LSL #5
+            sub         r8, r5, LSL #4
+            cmp         r5, r6
+            cmpeq       r5, r7
+            beq         5f
+
+            /* if (r != rup || r != rdn) then the address-clamping table should
+             * be used rather than the short-cut version.
+             */
+            ldr         r8, 3f+4
+2:          add         r8, r8, pc
+            sub         r8, r5, LSL #6
+            b           5f
+            .align 3
+3:          .word       \labelnc-1b-8
+            .word       \labelc-2b-8
+
+            /* Main loop: ... */
+            .align 4
+3:          /* first perform a vertical convolution from memory to get the next
+             * 16 taps of the horizontal window into the register file...
+             */
+            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8
+
+            /* ...then perform a horizontal convolution on that window to
+             * produce eight output bytes, and slide the window along.
+             * This has to be done twice to match the 16-way vertical pass.
+             * It would be preferable to have twice the work done in \core, but
+             * that would demand yet another variant on those macros and would
+             * perturb the register allocation severely.
+             */
+            \core
+            vst1.u8     {d31}, [r0]!
+            \core
+            vst1.u8     {d31}, [r0]!
+
+            sub         r3, r3, #16
+5:          subs        r4, r4, #16
+            bhi         3b
+            /* Here there's 16 or fewer bytes available before the edge of the
+             * source image.  r4 holds that count minus 16 (because it was
+             * decremented before the first iteration ran).  The last read may
+             * not be a whole chunk, and beyond that a fill value must be used.
+             *
+             * Of course, none of that matters if there's no more output to
+             * produce...
+             */
+            cmp         r3, #0
+            beq         5f
+
+            /* Oh well. */
+            adds        r4, r4, #16
+            bne         1f
+  .if \step==1
+            vdup.u16    q10, d19[3]
+            vdup.u16    q11, d19[3]
+  .else
+            vmov.u64    d20, d19
+            vmov.u64    d21, d19
+            vmov.u64    d22, d19
+            vmov.u64    d23, d19
+  .endif
+            b           3f
+
+            /* To avoid reading past end of input, rewind pointers by (16-r4)
+             * to ensure that they're exactly 16 bytes from the edge.
+             */
+1:          mov         r11, r4
+            bl          fetch_clampright\step
+            /* Now to put this padding to use, perform any remaining
+             * iterations.  This is done at half the rate of the main loop,
+             * because there's no longer pressure from a 16-lane window filler.
+             */
+3:          \core
+  .if \step==1
+            vdup.u16    q11, d23[3]
+  .else
+            vmov.u64    d22, d23
+  .endif
+            subs        r3, r3, #8
+            blo         4f
+            vst1.u8     {d31}, [r0]!
+            bne         3b
+            b           5f
+
+            /* If the final iteration contained 0 < l < 8 values, then perform
+             * a piecewise store of the final vector.
+             */
+4:          tst         r3, #4
+            beq         1f
+            vst1.u32    {d31[0]}, [r0]!
+            vext.u8     d31, d31, d31, #4
+1:          tst         r3, #2
+            beq         1f
+            vst1.u16    {d31[0]}, [r0]!
+            vext.u8     d31, d31, d31, #2
+1:          tst         r3, #1
+            beq         5f
+            vst1.u8     {d31[0]}, [r0]!
+            vext.u8     d31, d31, d31, #1
+5:          mov         r0, #0
+.endm
+
+.irp r, TUNED_LIST1, 25         /* one routine per radius in TUNED_LIST1, plus one for 25 */
+PRIVATE(convolve1_\r)           /* step=1 convolution; branched to by rsdIntrinsicBlurU1_K */
+            push        {r12,lr}        // lr holds the return label set up by the dispatcher
+
+            prefill     step=1, max_r=\r, label=.Lcnv1_\r
+
+            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
+
+            pop         {r12,pc}        // return; conv_body has left r0 = 0
+END(convolve1_\r)
+.endr
+
+.irp r, TUNED_LIST4, 25         /* one routine per radius in TUNED_LIST4, plus one for 25 */
+PRIVATE(convolve4_\r)           /* step=4 convolution; branched to by rsdIntrinsicBlurU4_K */
+            push        {r12,lr}        // lr holds the return label set up by the dispatcher
+            sub         r9, sp, #0x200          // candidate buffer address, taken before sp moves
+            sub         sp, sp, #0x200 + 0x400  // reserve 0x600 bytes so the aligned buffer fits
+            bic         r9, r9, #0x3fc          // clear address bits 2-9 (rounds r9 downwards)
+
+            /* r9 now points to a 0x200 byte buffer on the stack whose address
+             * has the low 10 bits clear.  This allows easy address calculation
+             * in the wrap-around cases.
+             */
+
+            prefill     step=4, max_r=\r, label=.Lcnv4_\r
+
+            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
+
+            add         sp, sp, #0x200 + 0x400  // release the scratch area reserved above
+            pop         {r12,pc}        // return; conv_body has left r0 = 0
+END(convolve4_\r)
+.endr
+
+/* void rsdIntrinsicBlurU1_K(
+ *                  void *out,      // r0
+ *                  void *in,       // r1
+ *                  size_t w,       // r2
+ *                  size_t h,       // r3
+ *                  size_t p,       // [sp]      (pitch)
+ *                  size_t x,       // [sp,#4]
+ *                  size_t y,       // [sp,#8]
+ *                  size_t count,   // [sp,#12]
+ *                  size_t r,       // [sp,#16]
+ *                  uint16_t *tab); // [sp,#20]
+ * Stack offsets above are relative to sp on entry, before the pushes below. */
+ENTRY(rsdIntrinsicBlurU1_K)     /* loads arguments, then dispatches to a convolve1_<r> routine */
+            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}  // 10 words = 40 bytes
+            vpush       {d8-d15}        // 64 more bytes: stack arguments now start at sp+104
+            ldr         r6, [sp,#112]   // y
+            ldr         r8, [sp,#108]   // x
+            ldr         r5, [sp,#120]   // r
+            sub         r4, r2, r8      // inlen = w - x
+            sub         r7, r3, r6      // h - y
+            ldr         r2, [sp,#104]   // pitch
+            ldr         r3, [sp,#116]   // count
+            sub         r7, r7, #1      // h - y - 1
+
+            ldr         r12, [sp,#124]  // tab
+
+            add         r1, r1, r8      // src += x
+
+            cmp         r6, r5
+            movhi       r6, r5          // rup = min(r, y)
+            cmp         r7, r5
+            movhi       r7, r5          // rdn = min(r, h - y - 1)
+
+            vld1.u16    {d0,d1,d2,d3}, [r12]!   // d0-d3 = tab[0..15]
+            vld1.u16    {d4,d5,d6}, [r12]!      // d4-d6 = tab[16..27]
+
+            adr         lr, 1f          // the convolve routines return to label 1 below
+  .irp r, TUNED_LIST1
+            cmp         r5, #\r
+            bls         convolve1_\r    // first listed size that is >= r handles the call
+  .endr
+            b           convolve1_25    // r larger than every listed size
+
+1:          vpop        {d8-d15}
+            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+END(rsdIntrinsicBlurU1_K)
+
+/* void rsdIntrinsicBlurU4_K(
+ *                  void *out,      // r0
+ *                  void *in,       // r1
+ *                  size_t w,       // r2
+ *                  size_t h,       // r3
+ *                  size_t p,       // [sp]      (pitch)
+ *                  size_t x,       // [sp,#4]
+ *                  size_t y,       // [sp,#8]
+ *                  size_t count,   // [sp,#12]
+ *                  size_t r,       // [sp,#16]
+ *                  uint16_t *tab); // [sp,#20]
+ * Stack offsets above are relative to sp on entry, before the pushes below. */
+ENTRY(rsdIntrinsicBlurU4_K)     /* loads arguments, then dispatches to a convolve4_<r> routine */
+            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}  // 10 words = 40 bytes
+            vpush       {d8-d15}        // 64 more bytes: stack arguments now start at sp+104
+            ldr         r6, [sp,#112]   // y
+            ldr         r8, [sp,#108]   // x
+            ldr         r5, [sp,#120]   // r
+            lsl         r8, r8, #2      // x * 4
+            rsb         r4, r8, r2, LSL #2 // inlen = (w - x) * 4
+            sub         r7, r3, r6      // h - y
+            ldr         r2, [sp,#104]   // pitch
+            ldr         r3, [sp,#116]   // count
+            sub         r7, r7, #1      // h - y - 1
+            lsl         r3, r3, #2      // count * 4
+
+            ldr         r12, [sp,#124]  // tab
+
+            add         r1, r1, r8      // in += x * 4
+
+            cmp         r6, r5
+            movhi       r6, r5          // rup = min(r, y)
+            cmp         r7, r5
+            movhi       r7, r5          // rdn = min(r, h - y - 1)
+
+            vld1.u16    {d0,d1,d2,d3}, [r12]!   // d0-d3 = tab[0..15]
+            vld1.u16    {d4,d5,d6}, [r12]!      // d4-d6 = tab[16..27]
+
+            adr         lr, 1f          // the convolve routines return to label 1 below
+  .irp r, TUNED_LIST4
+            cmp         r5, #\r
+            bls         convolve4_\r    // first listed size that is >= r handles the call
+  .endr
+            b           convolve4_25    // r larger than every listed size
+
+1:          vpop        {d8-d15}
+            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+END(rsdIntrinsicBlurU4_K)
diff --git a/toolkit/ColorMatrix.cpp b/toolkit/ColorMatrix.cpp
new file mode 100644
index 0000000..dd426cf
--- /dev/null
+++ b/toolkit/ColorMatrix.cpp
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+#include <assert.h>
+#include <cstdint>
+#include <sys/mman.h>
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.ColorMatrix"
+
+/*  uint kernel
+ *  Q0  D0:  Load slot for R
+ *      D1:  Load slot for G
+ *  Q1  D2:  Load slot for B
+ *      D3:  Load slot for A
+ *  Q2  D4:  Matrix
+ *      D5:  =
+ *  Q3  D6:  =
+ *      D7:  =
+ *  Q4  D8:  Add R
+ *      D9:
+ *  Q5  D10: Add G
+ *      D11:
+ *  Q6  D12: Add B
+ *      D13:
+ *  Q7  D14: Add A
+ *      D15:
+ *  Q8  D16:  I32: R Sum
+ *      D17:
+ *  Q9  D18:  I32: G Sum
+ *      D19:
+ *  Q10 D20:  I32: B Sum
+ *      D21:
+ *  Q11 D22:  I32: A Sum
+ *      D23:
+ *  Q12 D24:  U16: expanded R
+ *      D25:
+ *  Q13 D26:  U16: expanded G
+ *      D27:
+ *  Q14 D28:  U16: expanded B
+ *      D29:
+ *  Q15 D30:  U16: expanded A
+ *      D31:
+ *
+ */
+
+/*  float kernel
+ *  Q0  D0:  Load slot for R
+ *      D1:  =
+ *  Q1  D2:  Load slot for G
+ *      D3:  =
+ *  Q2  D4:  Load slot for B
+ *      D5:  =
+ *  Q3  D6:  Load slot for A
+ *      D7:  =
+ *  Q4  D8:  Matrix
+ *      D9:  =
+ *  Q5  D10: =
+ *      D11: =
+ *  Q6  D12: =
+ *      D13: =
+ *  Q7  D14: =
+ *      D15: =
+ *  Q8  D16: Add R
+ *      D17: =
+ *  Q9  D18: Add G
+ *      D19: =
+ *  Q10 D20: Add B
+ *      D21: =
+ *  Q11 D22: Add A
+ *      D23: =
+ *  Q12 D24: Sum R
+ *      D25: =
+ *  Q13 D26: Sum G
+ *      D27: =
+ *  Q14 D28: Sum B
+ *      D29: =
+ *  Q15 D30: Sum A
+ *      D31: =
+ *
+ */
+
+typedef union {  // Describes which kernel variant a given matrix/add vector needs.
+    uint64_t key;  // every bitfield below viewed as a single integer
+    struct {
+        uint32_t inVecSize          :2;  // [0 - 1]  input vector size minus one
+        uint32_t outVecSize         :2;  // [2 - 3]  output vector size minus one
+        uint32_t inType             :4;  // [4 - 7]  nonzero when the input is float
+        uint32_t outType            :4;  // [8 - 11] nonzero when the output is float
+        uint32_t dot                :1;  // [12]     first three outputs share coefficients, no add
+        uint32_t _unused1           :1;  // [13]
+        uint32_t copyAlpha          :1;  // [14]     fourth channel passes through unchanged
+        uint32_t _unused2           :1;  // [15]
+        uint32_t coeffMask          :16; // [16-31]  bit i set when matrix coefficient i is nonzero
+        uint32_t addMask            :4;  // [32-35]  bit j set when add term j is nonzero
+    } u;
+} Key_t;
+
+/* The two data types and their values, as specified in the RenderScript documentation.
+ * Only RS_TYPE_UNSIGNED_8 is currently supported.
+ *
+ * TODO: The actual values of these constants are likely not important. We may be
+ * able to simplify the key related code.
+ */
+const int RS_TYPE_UNSIGNED_8 = 8;
+const int RS_TYPE_FLOAT_32 = 2;  // stored in Key_t inType/outType, where nonzero means float
+
+// Re-enable when the intrinsic is fixed.
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+typedef struct {  // Function table shared with the assembly routines declared below.
+    void (*column[4])(void);
+    void (*store)(void);
+    void (*load)(void);
+    void (*store_end)(void);
+    void (*load_end)(void);
+} FunctionTab_t;
+
+extern "C" void rsdIntrinsicColorMatrix_int_K(
+             void *out, void const *in, size_t count,
+             FunctionTab_t const *fns,
+             int16_t const *mult, int32_t const *add);
+
+extern "C" void rsdIntrinsicColorMatrix_float_K(
+             void *out, void const *in, size_t count,
+             FunctionTab_t const *fns,
+             float const *mult, float const *add);
+
+/* The setup functions fill in function tables to be used by the above functions;
+ * this code also eliminates jump-to-another-jump cases by short-circuiting
+ * empty functions.  While it's not performance critical, it works out easier
+ * to write the set-up code in assembly than to try to expose the same symbols
+ * and write the code in C.
+ */
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+             FunctionTab_t *fns,
+             uint32_t mask, int dt, int st);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
+             FunctionTab_t *fns,
+             uint32_t mask, int dt, int st);
+#endif  // ARCH_ARM64_USE_INTRINSICS
+
+class ColorMatrixTask : public Task {  // Applies a 4x4 matrix plus add vector to each cell.
+    const void* mIn;          // input buffer, as passed to the constructor
+    void* mOut;               // output buffer, as passed to the constructor
+    size_t mInputVectorSize;
+    uint32_t mOutstep;        // paddedSize(outputVectorSize)
+    uint32_t mInstep;         // paddedSize(inputVectorSize)
+
+    float mFp[16];            // copy of the caller's matrix
+    float mFpa[4];            // copy of the caller's add vector
+
+    // The following four fields are read as constants
+    // by the SIMD assembly code.
+    int16_t mIp[16];          // mFp scaled by 256 (see updateCoeffCache)
+    int mIpa[4];              // mFpa scaled by 65536 (see updateCoeffCache)
+    float mTmpFp[16];         // mFp scaled by fpMul (see updateCoeffCache)
+    float mTmpFpa[4];         // mFpa scaled by addMul, plus optional 0.5 (see updateCoeffCache)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+    FunctionTab_t mFnTab;
+#endif
+
+    void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
+    void updateCoeffCache(float fpMul, float addMul);
+
+    Key_t mLastKey;
+    unsigned char* mBuf;      // mmap'ed buffer that build() fills with code; null when unset
+    size_t mBufSize;          // size of the mBuf mapping in bytes
+
+    bool build(Key_t key);
+    void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+    Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
+    void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
+#else
+    Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
+    void preLaunch(size_t inVectorSize, size_t outVectorSize);
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
+                    size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
+                    const Restriction* restriction)
+        : Task{sizeX, sizeY, outputVectorSize, true, restriction},
+          mIn{in},
+          mOut{out},
+          mInputVectorSize{inputVectorSize} {
+        mLastKey.key = 0;
+        mBuf = nullptr;
+        mBufSize = 0;
+        mOptKernel = nullptr;
+
+        mOutstep = paddedSize(outputVectorSize);
+        mInstep = paddedSize(inputVectorSize);
+
+        memcpy(mFp, matrix, sizeof(mFp));         // matrix must provide 16 floats
+        memcpy(mFpa, addVector, sizeof(mFpa));    // addVector must provide 4 floats
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+        // For float support, we'll have to pass the type in the constructor too.
+        preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
+#else
+        preLaunch(inputVectorSize, outputVectorSize);
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+    }
+    ~ColorMatrixTask() {
+        if (mBuf) munmap(mBuf, mBufSize);  // releases the mapping created by build()
+        mBuf = nullptr;
+        mOptKernel = nullptr;
+    }
+};
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
+                                  int outType) {
+    Key_t key;
+    key.key = 0;
+
+    // Compute a unique code key for this operation
+
+    // Add to the key the input and output types
+    bool hasFloat = false;
+    if (inType == RS_TYPE_FLOAT_32) {
+        hasFloat = true;
+        key.u.inType = RS_TYPE_FLOAT_32;
+    }
+    if (outType == RS_TYPE_FLOAT_32) {
+        hasFloat = true;
+        key.u.outType = RS_TYPE_FLOAT_32;
+    }
+
+    // Mask in the bits indicating which coefficients in the
+    // color matrix are needed.
+    if (hasFloat) {
+        for (uint32_t i=0; i < 16; i++) {
+            if (fabs(mFp[i]) != 0.f) {
+                key.u.coeffMask |= 1 << i;
+            }
+        }
+        if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
+        if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
+        if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
+        if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;
+
+    } else {
+#else
+Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
+    Key_t key;
+    key.key = 0;
+
+    // Compute a unique code key for this operation
+    {
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+        // Integer path, shared by both signatures: test the fixed-point copies instead.
+        for (uint32_t i=0; i < 16; i++) {
+            if (mIp[i] != 0) {
+                key.u.coeffMask |= 1 << i;
+            }
+        }
+        if (mIpa[0] != 0) key.u.addMask |= 0x1;
+        if (mIpa[1] != 0) key.u.addMask |= 0x2;
+        if (mIpa[2] != 0) key.u.addMask |= 0x4;
+        if (mIpa[3] != 0) key.u.addMask |= 0x8;
+    }
+
+    // Look for a dot product where the r,g,b columns are the same
+    if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
+        (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
+        (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
+        (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {
+
+        if (!key.u.addMask) key.u.dot = 1;  // only when no add term is in use
+    }
+
+    // Is alpha a simple copy: no other input feeds it, its own coefficient is 256, and no add
+    if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
+        key.u.copyAlpha = !(key.u.inType || key.u.outType);  // integer in and out only
+    }
+
+    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
+
+    switch (inVectorSize) {  // drop coefficients that multiply inputs which are absent
+    case 4:
+        key.u.inVecSize = 3;
+        break;
+    case 3:
+        key.u.inVecSize = 2;
+        key.u.coeffMask &= ~0xF000;
+        break;
+    case 2:
+        key.u.inVecSize = 1;
+        key.u.coeffMask &= ~0xFF00;
+        break;
+    default:
+        key.u.coeffMask &= ~0xFFF0;
+        break;
+    }
+
+    switch (outVectorSize) {  // drop coefficients and adds that feed outputs which are absent
+    case 4:
+        key.u.outVecSize = 3;
+        break;
+    case 3:
+        key.u.outVecSize = 2;
+        key.u.coeffMask &= ~0x8888;
+        key.u.addMask &= 7;
+        break;
+    case 2:
+        key.u.outVecSize = 1;
+        key.u.coeffMask &= ~0xCCCC;
+        key.u.addMask &= 3;
+        break;
+    default:
+        key.u.coeffMask &= ~0xEEEE;
+        key.u.addMask &= 1;
+        break;
+    }
+
+    if (key.u.inType && !key.u.outType) {  // float input, integer output
+        key.u.addMask |= 1;  // force the add on for every output that exists
+        if (key.u.outVecSize > 0) key.u.addMask |= 2;
+        if (key.u.outVecSize > 1) key.u.addMask |= 4;
+        if (key.u.outVecSize > 2) key.u.addMask |= 8;
+    }
+
+    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
+    return key;
+}
+
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+
+#define DEF_SYM(x)                                  \
+    extern "C" uint32_t _N_ColorMatrix_##x;      \
+    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
+    extern "C" uint32_t _N_ColorMatrix_##x##_len;  // chunk start, its _end, and its byte length
+
+DEF_SYM(prefix_i)
+DEF_SYM(prefix_f)
+DEF_SYM(postfix1)
+DEF_SYM(postfix2)
+
+DEF_SYM(load_u8_4)
+DEF_SYM(load_u8_3)
+DEF_SYM(load_u8_2)
+DEF_SYM(load_u8_1)
+DEF_SYM(load_u8f_4)
+DEF_SYM(load_u8f_3)
+DEF_SYM(load_u8f_2)
+DEF_SYM(load_u8f_1)
+DEF_SYM(load_f32_4)
+DEF_SYM(load_f32_3)
+DEF_SYM(load_f32_2)
+DEF_SYM(load_f32_1)
+
+DEF_SYM(store_u8_4)  // there is no store_u8_3: build() uses store_u8_4 for 3-element output
+DEF_SYM(store_u8_2)
+DEF_SYM(store_u8_1)
+DEF_SYM(store_f32_4)
+DEF_SYM(store_f32_3)
+DEF_SYM(store_f32_2)
+DEF_SYM(store_f32_1)
+DEF_SYM(store_f32u_4)  // likewise no store_f32u_3: build() uses store_f32u_4 instead
+DEF_SYM(store_f32u_2)
+DEF_SYM(store_f32u_1)
+
+DEF_SYM(unpack_u8_4)
+DEF_SYM(unpack_u8_3)
+DEF_SYM(unpack_u8_2)
+DEF_SYM(unpack_u8_1)
+DEF_SYM(pack_u8_4)
+DEF_SYM(pack_u8_3)
+DEF_SYM(pack_u8_2)
+DEF_SYM(pack_u8_1)
+DEF_SYM(dot)
+DEF_SYM(add_0_u8)
+DEF_SYM(add_1_u8)
+DEF_SYM(add_2_u8)
+DEF_SYM(add_3_u8)
+
+#define ADD_CHUNK(x) \
+    memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
+    buf += _N_ColorMatrix_##x##_len  // copies chunk x to buf and advances buf; two statements
+
+
+static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
+    size_t off = (target - buf - 8) >> 2;  // word offset; -8 as the ARM PC reads 8 bytes ahead
+    assert(((off & 0xff000000) == 0) ||    // the offset must fit the 24-bit signed field
+           ((off & 0xff000000) == 0xff000000));
+    // Writes one 32-bit branch instruction at buf and returns the position just after it.
+    uint32_t op = (condition << 28);  // condition code goes in bits 31:28
+    op |= 0xa << 24;  // branch
+    op |= 0xffffff & off;
+    ((uint32_t *)buf)[0] = op;
+    return buf + 4;
+}
+
+static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
+    assert(vd < 32);  // each argument is a 5-bit register number
+    assert(vm < 32);
+    assert(vn < 32);
+    // Returns only the register fields of an instruction; callers OR in the opcode bits.
+    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);  // vd -> bits 15:12 and 22
+    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);                   // vm -> bits 3:0 and 5
+    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);           // vn -> bits 19:16 and 7
+    return op;
+}
+
+static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+                              uint32_t src_d2_s) {
+    // Emits "vmlal.s16 Q<dest_q>, D<src_d1>, D<src_d2>[src_d2_s]"; returns the next write position.
+    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
+    ((uint32_t *)buf)[0] = op;  // the lane index is folded into the vm field from bit 3 upwards
+    return buf + 4;
+}
+
+static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+                              uint32_t src_d2_s) {
+    // Emits "vmull.s16 Q<dest_q>, D<src_d1>, D<src_d2>[src_d2_s]"; returns the next write position.
+    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
+    ((uint32_t *)buf)[0] = op;  // the lane index is folded into the vm field from bit 3 upwards
+    return buf + 4;
+}
+
+static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
+    // Emits "vqadd.s32 Q<dest_q>, Q<src_q1>, Q<src_q2>"; returns the next write position.
+    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
+    ((uint32_t *)buf)[0] = op;
+    return buf + 4;
+}
+
+static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+                              uint32_t src_d2_s) {
+    // Float multiply-accumulate by scalar. NOTE(review): opcode looks like vmla.f32 Qd, Qn, Dm[x] - confirm.
+    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
+    ((uint32_t *)buf)[0] = op;  // the lane index is folded into the vm field at bit 4
+    return buf + 4;
+}
+
+static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
+                              uint32_t src_d2_s) {
+    // Float multiply by scalar. NOTE(review): opcode looks like vmul.f32 Qd, Qn, Dm[x] - confirm.
+    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
+    ((uint32_t *)buf)[0] = op;  // the lane index is folded into the vm field at bit 4
+    return buf + 4;
+}
+
+static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
+    // Emits "vorr Q<dest_q>, Q<src_q1>, Q<src_q2>"; build() passes src_q1 == src_q2 to copy.
+    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
+    ((uint32_t *)buf)[0] = op;
+    return buf + 4;
+}
+
+static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
+    // Emits "vmov.32 Q<dest_q>, #0"; only a zero immediate is supported.
+    assert(imm == 0);
+    (void) imm; // Avoid unused parameter warnings for non-debug builds
+    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
+    ((uint32_t *)buf)[0] = op;
+    return buf + 4;
+}
+
+static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
+    // Emits "vadd.f32 Q<dest_q>, Q<src_q1>, Q<src_q2>"; returns the next write position.
+    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
+    ((uint32_t *)buf)[0] = op;
+    return buf + 4;
+}
+#endif
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
+                                  const int16_t *coef, uint32_t count);
+extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
+                                  const int16_t *coef, uint32_t count);
+extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
+                                  const int16_t *coef, uint32_t count);
+
+using android::renderscript::Key_t;
+
+void * selectKernel(Key_t key)  // picks one of the kernels declared above, or nullptr for float
+{
+    void * kernel = nullptr;
+
+    // inType, outType float if nonzero
+    if (!(key.u.inType || key.u.outType)) {
+        if (key.u.dot)
+            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
+        else if (key.u.copyAlpha)
+            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
+        else
+            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
+    }
+
+    return kernel;  // the caller must cast this back to the function type
+}
+#endif
+
+bool ColorMatrixTask::build(Key_t key) {  // assembles a kernel for key into mBuf (ARM32 only)
+#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+    mBufSize = 4096;  // NOTE(review): nothing below checks that the emitted code fits
+    //StopWatch build_time("rs cm: build time");
+    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (mBuf == MAP_FAILED) {
+        mBuf = NULL;  // keeps the destructor from calling munmap on MAP_FAILED
+        return false;
+    }
+
+    uint8_t *buf = mBuf;       // write cursor into the code buffer
+    uint8_t *buf2 = nullptr;   // address just after the prefix: target of the loop branch
+
+    int ops[5][4];  // [input i][output j]: 0 = unused, 2 = first term (mul), 3 = later (mul-acc)
+    int opInit[4] = {0, 0, 0, 0};  // becomes 1 once output j has received its first term
+
+    memset(ops, 0, sizeof(ops));
+    for (int i=0; i < 4; i++) {
+        if (key.u.coeffMask & (1 << (i*4))) {
+            ops[i][0] = 0x2 | opInit[0];
+            opInit[0] = 1;
+        }
+        if (!key.u.dot) {  // for a dot product only output 0 is computed
+            if (key.u.coeffMask & (1 << (1 + i*4))) {
+                ops[i][1] = 0x2 | opInit[1];
+                opInit[1] = 1;
+            }
+            if (key.u.coeffMask & (1 << (2 + i*4))) {
+                ops[i][2] = 0x2 | opInit[2];
+                opInit[2] = 1;
+            }
+        }
+        if (!key.u.copyAlpha) {  // with copyAlpha, output 3 is not computed here
+            if (key.u.coeffMask & (1 << (3 + i*4))) {
+                ops[i][3] = 0x2 | opInit[3];
+                opInit[3] = 1;
+            }
+        }
+    }
+
+    if (key.u.inType || key.u.outType) {  // float on either side: emit the float kernel
+        key.u.copyAlpha = 0;
+        ADD_CHUNK(prefix_f);
+        buf2 = buf;
+
+        // Load the incoming r,g,b,a as needed
+        if (key.u.inType) {
+            switch(key.u.inVecSize) {
+            case 3:
+                ADD_CHUNK(load_f32_4);
+                break;
+            case 2:
+                ADD_CHUNK(load_f32_3);
+                break;
+            case 1:
+                ADD_CHUNK(load_f32_2);
+                break;
+            case 0:
+                ADD_CHUNK(load_f32_1);
+                break;
+            }
+        } else {
+            switch(key.u.inVecSize) {
+            case 3:
+                ADD_CHUNK(load_u8f_4);
+                break;
+            case 2:
+                ADD_CHUNK(load_u8f_3);
+                break;
+            case 1:
+                ADD_CHUNK(load_u8f_2);
+                break;
+            case 0:
+                ADD_CHUNK(load_u8f_1);
+                break;
+            }
+        }
+
+        for (int i=0; i < 4; i++) {
+            for (int j=0; j < 4; j++) {
+                switch(ops[i][j]) {
+                case 0:
+                    break;
+                case 2:
+                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
+                    break;
+                case 3:
+                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
+                    break;
+                }
+            }
+        }
+        for (int j=0; j < 4; j++) {  // leave the final value for output j in Qj
+            if (opInit[j]) {
+                if (key.u.addMask & (1 << j)) {
+                    buf = addVADD_F32(buf, j, 12+j, 8+j);
+                } else {
+                    buf = addVORR_32(buf, j, 12+j, 12+j);
+                }
+            } else {
+                if (key.u.addMask & (1 << j)) {
+                    buf = addVORR_32(buf, j, 8+j, 8+j);
+                } else {
+                    buf = addVMOV_32(buf, j, 0);
+                }
+            }
+        }
+
+        if (key.u.outType) {
+            switch(key.u.outVecSize) {
+            case 3:
+                ADD_CHUNK(store_f32_4);
+                break;
+            case 2:
+                ADD_CHUNK(store_f32_3);
+                break;
+            case 1:
+                ADD_CHUNK(store_f32_2);
+                break;
+            case 0:
+                ADD_CHUNK(store_f32_1);
+                break;
+            }
+        } else {
+            switch(key.u.outVecSize) {
+            case 3:
+            case 2:
+                ADD_CHUNK(store_f32u_4);
+                break;
+            case 1:
+                ADD_CHUNK(store_f32u_2);
+                break;
+            case 0:
+                ADD_CHUNK(store_f32u_1);
+                break;
+            }
+        }
+
+
+    } else {
+        // Add the function prefix
+        // Store the address for the loop return
+        ADD_CHUNK(prefix_i);
+        buf2 = buf;
+
+        // Load the incoming r,g,b,a as needed
+        switch(key.u.inVecSize) {
+        case 3:
+            ADD_CHUNK(load_u8_4);
+            if (key.u.copyAlpha) {
+                ADD_CHUNK(unpack_u8_3);
+            } else {
+                ADD_CHUNK(unpack_u8_4);
+            }
+            break;
+        case 2:
+            ADD_CHUNK(load_u8_3);
+            ADD_CHUNK(unpack_u8_3);
+            break;
+        case 1:
+            ADD_CHUNK(load_u8_2);
+            ADD_CHUNK(unpack_u8_2);
+            break;
+        case 0:
+            ADD_CHUNK(load_u8_1);
+            ADD_CHUNK(unpack_u8_1);
+            break;
+        }
+
+        // Add multiply and accumulate
+        // use MULL to init the output register,
+        // use MLAL from there
+        for (int i=0; i < 4; i++) {
+            for (int j=0; j < 4; j++) {
+                switch(ops[i][j]) {
+                case 0:
+                    break;
+                case 2:
+                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
+                    break;
+                case 3:
+                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
+                    break;
+                }
+            }
+        }
+        for (int j=0; j < 4; j++) {  // apply the add term for output j, if there is one
+            if (opInit[j]) {
+                if (key.u.addMask & (1 << j)) {
+                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
+                }
+            } else {
+                if (key.u.addMask & (1 << j)) {
+                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
+                }
+            }
+        }
+
+        // If we have a dot product, perform the special pack.
+        if (key.u.dot) {
+            ADD_CHUNK(pack_u8_1);
+            ADD_CHUNK(dot);
+        } else {
+            switch(key.u.outVecSize) {
+            case 3:
+                if (key.u.copyAlpha) {
+                    ADD_CHUNK(pack_u8_3);
+                } else {
+                    ADD_CHUNK(pack_u8_4);
+                }
+                break;
+            case 2:
+                ADD_CHUNK(pack_u8_3);
+                break;
+            case 1:
+                ADD_CHUNK(pack_u8_2);
+                break;
+            case 0:
+                ADD_CHUNK(pack_u8_1);
+                break;
+            }
+        }
+
+        // Write out result
+        switch(key.u.outVecSize) {
+        case 3:
+        case 2:
+            ADD_CHUNK(store_u8_4);
+            break;
+        case 1:
+            ADD_CHUNK(store_u8_2);
+            break;
+        case 0:
+            ADD_CHUNK(store_u8_1);
+            break;
+        }
+    }
+
+    if (key.u.inType != key.u.outType) {  // NOTE(review): key is a local copy not read again
+        key.u.copyAlpha = 0;
+        key.u.dot = 0;
+    }
+
+    // Loop, branch, and cleanup
+    ADD_CHUNK(postfix1);
+    buf = addBranch(buf, buf2, 0x01);  // conditional branch (condition code 1) back to buf2
+    ADD_CHUNK(postfix2);
+
+    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);  // switch from writable to executable
+    if (ret == -1) {
+        ALOGE("mprotect error %i", ret);  // NOTE(review): ret is always -1 here; errno has the cause
+        return false;  // mBuf stays mapped until the destructor runs
+    }
+
+    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
+    return true;
+#else
+    (void) key; // Avoid unused parameter warning.
+    return false;
+#endif
+}
+
+void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {  // refreshes mIp/mIpa/mTmp*
+    for(int ct=0; ct < 16; ct++) {
+        mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);  // fixed point with 8 fractional bits
+        mTmpFp[ct] = mFp[ct] * fpMul;
+        //ALOGE("mat %i %f  %f", ct, mFp[ct], tmpFp[ct]);
+    }
+
+    float add = 0.f;
+    if (fpMul > 254.f) add = 0.5f;  // extra 0.5 bias, applied only for a large fpMul
+    for(int ct=0; ct < 4; ct++) {
+        mTmpFpa[ct] = mFpa[ct] * addMul + add;
+        //ALOGE("mFpa %i %f  %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
+    }
+
+    for(int ct=0; ct < 4; ct++) {
+        mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);  // fixed point with 16 fractional bits
+    }
+}
+
+
+
+static void One(void *out,
+                const void *py, const float* coeff, const float *add,
+                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
+    // Converts the single cell at py with coeff and add, and writes the result to out.
+    float4 f = 0.f;  // components that are not loaded below stay zero
+    if (fin) {  // float input
+        switch(vsin) {  // vsin is the number of input components minus one
+        case 3:
+            f = ((const float4 *)py)[0];
+            break;
+        case 2:
+            f = ((const float4 *)py)[0];  // reads four components, then discards the fourth
+            f.w = 0.f;
+            break;
+        case 1:
+            f.xy = ((const float2 *)py)[0];
+            break;
+        case 0:
+            f.x = ((const float *)py)[0];
+            break;
+        }
+    } else {  // uchar input, widened to float
+        switch(vsin) {
+        case 3:
+            f = convert<float4>(((const uchar4 *)py)[0]);
+            break;
+        case 2:
+            f = convert<float4>(((const uchar4 *)py)[0]);  // reads four, discards the fourth
+            f.w = 0.f;
+            break;
+        case 1:
+            f.xy = convert<float2>(((const uchar2 *)py)[0]);
+            break;
+        case 0:
+            f.x = (float)(((const uchar *)py)[0]);
+            break;
+        }
+    }
+    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
+
+    float4 sum;  // coeff[i * 4 + j] weights input component i in output component j
+    sum.x = f.x * coeff[0] +
+            f.y * coeff[4] +
+            f.z * coeff[8] +
+            f.w * coeff[12];
+    sum.y = f.x * coeff[1] +
+            f.y * coeff[5] +
+            f.z * coeff[9] +
+            f.w * coeff[13];
+    sum.z = f.x * coeff[2] +
+            f.y * coeff[6] +
+            f.z * coeff[10] +
+            f.w * coeff[14];
+    sum.w = f.x * coeff[3] +
+            f.y * coeff[7] +
+            f.z * coeff[11] +
+            f.w * coeff[15];
+    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
+
+    sum.x += add[0];
+    sum.y += add[1];
+    sum.z += add[2];
+    sum.w += add[3];
+
+
+    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
+    if (fout) {  // float output, stored without clamping
+        switch(vsout) {  // vsout is the number of output components minus one
+        case 3:
+        case 2:
+            ((float4 *)out)[0] = sum;  // three-component output is written as four
+            break;
+        case 1:
+            ((float2 *)out)[0] = sum.xy;
+            break;
+        case 0:
+            ((float *)out)[0] = sum.x;
+            break;
+        }
+    } else {  // uchar output: clamp to [0, 255.5] before narrowing
+        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
+        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
+        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
+        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
+
+        switch(vsout) {
+        case 3:
+        case 2:
+            ((uchar4 *)out)[0] = convert<uchar4>(sum);  // three-component output is written as four
+            break;
+        case 1:
+            ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
+            break;
+        case 0:
+            ((uchar *)out)[0] = sum.x;
+            break;
+        }
+    }
+    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
+    //      ((float *)out)[3]);
+}
+
+/*
+ * Processes the cells [xstart, xend) of one row. `out` and `in` point at the cell
+ * for xstart. When SIMD is enabled, an accelerated routine handles as many cells as
+ * it can; whatever is left is handled one cell at a time by One().
+ */
+void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
+    // Empty (or inverted) range: nothing to do.
+    if (xend <= xstart) {
+        return;
+    }
+
+    const uint32_t vsin = mLastKey.u.inVecSize;
+    const uint32_t vsout = mLastKey.u.outVecSize;
+    const bool floatIn = !!mLastKey.u.inType;
+    const bool floatOut = !!mLastKey.u.outType;
+
+    uint32_t x = xstart;
+    int32_t len = xend - x;
+
+    if (mUsesSimd) {
+        if ((len >= 4) && (mOptKernel != nullptr)) {
+            // The optimized kernel works on groups of 4 cells and needs at least
+            // one full group; it is given the number of groups.
+            mOptKernel(out, in, mIp, len >> 2);
+            // Skip past the cells it handled so the generic loop below finishes
+            // the leftovers.
+            len &= ~3;
+            x += len;
+            out += mOutstep * len;
+            in += mInstep * len;
+        }
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+        else if (mLastKey.u.inType != RS_TYPE_FLOAT_32 &&
+                 mLastKey.u.outType != RS_TYPE_FLOAT_32) {
+            // Integer-only case. The float routine (rsdIntrinsicColorMatrix_float_K)
+            // is deliberately not used because it currently generates off by one
+            // errors; float cells fall through to the generic loop instead.
+            rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
+            x += len;
+            out += mOutstep * len;
+            in += mInstep * len;
+        }
+#endif
+    }
+
+    // Generic path for every cell not consumed above.
+    for (; x != xend; ++x) {
+        One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
+        out += mOutstep;
+        in += mInstep;
+    }
+}
+
+// Prepares the task before processing: refreshes the cached coefficient tables,
+// computes the key describing this configuration and, if no optimized kernel is set
+// or the key differs from mLastKey, rebuilds the architecture-specific kernel state.
+// The signature and the first statements depend on
+// ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT; both variants share the code that
+// follows the #endif.
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
+                                int outType) {
+    // Pick the scale factors passed to updateCoeffCache(fpMul, addMul) from the
+    // input/output type combination.
+    if (inType == outType) {
+        if (outType == RS_TYPE_UNSIGNED_8) {
+            updateCoeffCache(1.f, 255.f);
+        } else {
+            updateCoeffCache(1.f, 1.f);
+        }
+    } else {
+        if (outType == RS_TYPE_UNSIGNED_8) {
+            updateCoeffCache(255.f, 255.f);
+        } else {
+            updateCoeffCache(1.f / 255.f, 1.f);
+        }
+    }
+
+    Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
+#else
+void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
+    updateCoeffCache(1.f, 255.f);
+
+    Key_t key = computeKey(inVectorSize, outVectorSize);
+#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+#if defined(ARCH_X86_HAVE_SSSE3)
+    // x86: kernel selection is commented out below, so this branch only records the
+    // key and leaves mOptKernel untouched.
+    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
+        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
+        // mOptKernel =
+        //     (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
+        mLastKey = key;
+    }
+
+#else //if !defined(ARCH_X86_HAVE_SSSE3)
+    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
+        // Release the previously generated code buffer, if any, before rebuilding.
+        if (mBuf) munmap(mBuf, mBufSize);
+        mBuf = nullptr;
+        mOptKernel = nullptr;
+        // When build() succeeds, mBuf is used as the entry point of the kernel.
+        if (build(key)) {
+            mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
+        }
+#if defined(ARCH_ARM64_USE_INTRINSICS)
+        else {
+            // dt / st: vector size of the output / input, with 4 added when that
+            // side is float.
+            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
+            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
+            uint32_t mm = 0;
+            int i;
+            // Repack the masks into four 5-bit fields, field i at bit offset 5 * i.
+            for (i = 0; i < 4; i++)
+            {
+                // Isolate coeffMask bits i, i + 4, i + 8 and i + 12 ...
+                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
+                // ... and gather them into the low four bits of m.
+                m = ((m * 0x249) >> 9) & 15;
+                // Bit 4 of the field is bit i of addMask.
+                m |= ((key.u.addMask >> i) & 1) << 4;
+                mm |= m << (i * 5);
+            }
+
+            // Fill in mFnTab with the float or integer setup routine.
+            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
+                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
+            } else {
+                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
+            }
+        }
+#endif
+        mLastKey = key;
+    }
+#endif //if !defined(ARCH_X86_HAVE_SSSE3)
+}
+
+/*
+ * Runs kernel() on columns [startX, endX) of every row in [startY, endY). The byte
+ * position of a row's first cell is its cell index times the padded cell size of
+ * the input and output respectively.
+ */
+void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                                  size_t endY) {
+    size_t row = startY;
+    while (row < endY) {
+        // Index of the first cell to process in this row.
+        const size_t firstCell = mSizeX * row + startX;
+        uchar* src = ((uchar*)mIn) + firstCell * paddedSize(mInputVectorSize);
+        uchar* dst = ((uchar*)mOut) + firstCell * paddedSize(mVectorSize);
+        kernel(dst, src, startX, endX);
+        row++;
+    }
+}
+
+static const float fourZeroes[]{0.0f, 0.0f, 0.0f, 0.0f};
+
+/*
+ * Public entry point of the color matrix operation: builds a ColorMatrixTask for
+ * the given buffers and parameters and hands it to the processor. A null addVector
+ * is replaced by four zeroes. When ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE is
+ * defined, an invalid restriction or a vector size outside 1..4 is logged and the
+ * call returns without doing any work.
+ */
+void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
+                                      size_t outputVectorSize, size_t sizeX, size_t sizeY,
+                                      const float* matrix, const float* addVector,
+                                      const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+        return;
+    }
+    const bool inputSizeOk = inputVectorSize >= 1 && inputVectorSize <= 4;
+    if (!inputSizeOk) {
+        ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
+        return;
+    }
+    const bool outputSizeOk = outputVectorSize >= 1 && outputVectorSize <= 4;
+    if (!outputSizeOk) {
+        ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
+        return;
+    }
+#endif
+
+    // Callers may omit the add vector.
+    const float* effectiveAdd = (addVector != nullptr) ? addVector : fourZeroes;
+
+    ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
+                         effectiveAdd, restriction);
+    processor->doTask(&task);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/ColorMatrix_advsimd.S b/toolkit/ColorMatrix_advsimd.S
new file mode 100644
index 0000000..55b0029
--- /dev/null
+++ b/toolkit/ColorMatrix_advsimd.S
@@ -0,0 +1,1277 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ENTRY(f): switch to .text, align, and define f as a global function symbol.
+ * END(f):   record the size of f as the distance from its label to this point.
+ */
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+/* vmxx_f32 i, mask, opd, opa, opb
+ * Conditional float multiply(-accumulate). Emits nothing unless bit `mask` is set in
+ * `i`. If any bit below `mask` is also set in `i`, an earlier term has already
+ * written opd, so accumulate with fmla; otherwise this is the first term and fmul
+ * initialises opd.
+ */
+.macro vmxx_f32 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1)
+        fmla            \opd, \opa, \opb
+    .else
+        fmul            \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+/* vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
+ * Conditional float add. Emits nothing unless bit `mask` is set in `i`. If any bit
+ * below `mask` is also set, emits fadd opd, opa, opb; otherwise nothing has been
+ * accumulated yet and a plain register copy (mov stupidsyntax1, stupidsyntax2) is
+ * emitted instead. Callers in this file pass the destination and addend registers
+ * a second time with a .16b arrangement for that mov.
+ */
+.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1)
+        fadd            \opd, \opa, \opb
+    .else
+        mov             \stupidsyntax1, \stupidsyntax2
+    .endif
+  .endif
+.endm
+
+/* vmxx_s16 i, mask, opd, opa, opb
+ * Conditional signed 16-bit widening multiply(-accumulate). Emits nothing unless bit
+ * `mask` is set in `i`. Accumulates with smlal when any bit below `mask` or the 16
+ * bit is set in `i` (opd already holds a value); otherwise smull initialises opd.
+ */
+.macro vmxx_s16 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1 + 16)
+        smlal           \opd, \opa, \opb
+    .else
+        smull           \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+/* vmxx2_s16 i, mask, opd, opa, opb
+ * Same selection logic as vmxx_s16, but emits the "2" forms (smlal2 / smull2),
+ * which take their multiplicands from the upper half of opa.
+ */
+.macro vmxx2_s16 i, mask, opd, opa, opb
+  .if (\i) & \mask
+    .if (\i) & (\mask - 1 + 16)
+        smlal2          \opd, \opa, \opb
+    .else
+        smull2          \opd, \opa, \opb
+    .endif
+  .endif
+.endm
+
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = params
+ * x4 = column0_fn
+ * x5 = column1_fn
+ * x6 = column2_fn
+ * x7 = column3_fn
+ * x8 = store_fn
+ * x9 = load_fn
+ */
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+/* Integer path, output channel 0, one variant per mask value.
+ * Bits 1/2/4/8 of the mask select whether input channel v12/v13/v14/v15 is multiplied
+ * by coefficient v0.h[0]/v0.h[4]/v1.h[0]/v1.h[4]. v6 receives the sums for the low four
+ * lanes and v7 (via the "2" forms) for the high four lanes. The 32-bit sums are
+ * shifted right by 8 and narrowed with unsigned saturation into v8, then control
+ * continues at the address in x5.
+ * In the first variant the mask is i (0..15), so the `& 16` test is never true and
+ * the dup instructions are not assembled.
+ */
+.align 6
+colormatrix_int_col0_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[0]
+            dup         v7.4s, v4.s[0]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
+            sqshrun     v8.4h, v6.4s, #8
+            sqshrun2    v8.8h, v7.4s, #8
+            br          x5
+
+/* "n" variant: mask is i^31 (16..31), so the 16 bit is always set. v6/v7 are first
+ * loaded with the add term v4.s[0] and every selected product is accumulated. */
+colormatrix_int_col0_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[0]
+            dup         v7.4s, v4.s[0]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
+            sqshrun     v8.4h, v6.4s, #8
+            sqshrun2    v8.8h, v7.4s, #8
+            br          x5
+
+/* Integer path, output channel 1. Same structure as channel 0, using coefficients
+ * v0.h[1]/v0.h[5]/v1.h[1]/v1.h[5] and add term v4.s[1]; the narrowed result goes to
+ * v9 and control continues at the address in x6. With mask i (0..15) the dup is
+ * never assembled.
+ */
+.align 6
+colormatrix_int_col1_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[1]
+            dup         v7.4s, v4.s[1]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
+            sqshrun     v9.4h, v6.4s, #8
+            sqshrun2    v9.8h, v7.4s, #8
+            br          x6
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v6/v7 start from v4.s[1]. */
+colormatrix_int_col1_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[1]
+            dup         v7.4s, v4.s[1]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
+            sqshrun     v9.4h, v6.4s, #8
+            sqshrun2    v9.8h, v7.4s, #8
+            br          x6
+
+/* Integer path, output channel 2. Same structure as channel 0, using coefficients
+ * v0.h[2]/v0.h[6]/v1.h[2]/v1.h[6] and add term v4.s[2]; the narrowed result goes to
+ * v10 and control continues at the address in x7. With mask i (0..15) the dup is
+ * never assembled.
+ */
+.align 6
+colormatrix_int_col2_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[2]
+            dup         v7.4s, v4.s[2]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
+            sqshrun     v10.4h, v6.4s, #8
+            sqshrun2    v10.8h, v7.4s, #8
+            br          x7
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v6/v7 start from v4.s[2]. */
+colormatrix_int_col2_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[2]
+            dup         v7.4s, v4.s[2]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
+            sqshrun     v10.4h, v6.4s, #8
+            sqshrun2    v10.8h, v7.4s, #8
+            br          x7
+
+/* Integer path, output channel 3. Same structure as channel 0, using coefficients
+ * v0.h[3]/v0.h[7]/v1.h[3]/v1.h[7] and add term v4.s[3]; the narrowed result goes to
+ * v11 and control continues at the address in x8. With mask i (0..15) the dup is
+ * never assembled.
+ */
+.align 6
+colormatrix_int_col3_\i:
+      .if \i & 16
+            dup         v6.4s, v4.s[3]
+            dup         v7.4s, v4.s[3]
+      .endif
+            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
+            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
+            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
+            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
+            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
+            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
+            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
+            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
+            sqshrun     v11.4h, v6.4s, #8
+            sqshrun2    v11.8h, v7.4s, #8
+            br          x8
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v6/v7 start from v4.s[3]. */
+colormatrix_int_col3_n\i:
+      .if (\i^31) & 16
+            dup         v6.4s, v4.s[3]
+            dup         v7.4s, v4.s[3]
+      .endif
+            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
+            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
+            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
+            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
+            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
+            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
+            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
+            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
+            sqshrun     v11.4h, v6.4s, #8
+            sqshrun2    v11.8h, v7.4s, #8
+            br          x8
+
+/* Float path, output channel 0, one variant per mask value.
+ * Bits 1/2/4/8 of the mask select whether input channel v12/v13/v14/v15 (first four
+ * lanes) and v20/v21/v22/v23 (second four lanes) is multiplied by
+ * v0.s[0]/v1.s[0]/v2.s[0]/v3.s[0]; results accumulate in v8 and v16. The 16 bit
+ * selects adding v4. Control then continues at the address in x5.
+ * In the first variant the mask is i (0..15), so the vadd_f32 lines emit nothing.
+ */
+.align 5
+colormatrix_float_col0_\i:
+            vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
+            vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
+            vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
+            vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
+            vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s,        v8.16b, v4.16b
+            vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
+            vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
+            vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
+            vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
+            vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s,      v16.16b, v4.16b
+            br          x5
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v4 is added (or simply copied
+ * when no product was selected). */
+.align 4
+colormatrix_float_col0_n\i:
+            vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
+            vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
+            vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
+            vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
+            vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s,     v8.16b, v4.16b
+            vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
+            vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
+            vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
+            vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
+            vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s,   v16.16b, v4.16b
+            br          x5
+
+/* Float path, output channel 1. Same structure as channel 0, using coefficients
+ * v0.s[1]/v1.s[1]/v2.s[1]/v3.s[1] and add term v5; results go to v9 and v17 and
+ * control continues at the address in x6. With mask i (0..15) the vadd_f32 lines
+ * emit nothing.
+ */
+.align 5
+colormatrix_float_col1_\i:
+            vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
+            vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
+            vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
+            vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
+            vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s,        v9.16b, v5.16b
+            vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
+            vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
+            vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
+            vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
+            vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s,      v17.16b, v5.16b
+            br          x6
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v5 is added or copied. */
+.align 4
+colormatrix_float_col1_n\i:
+            vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
+            vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
+            vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
+            vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
+            vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s,     v9.16b, v5.16b
+            vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
+            vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
+            vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
+            vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
+            vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s,   v17.16b, v5.16b
+            br          x6
+
+/* Float path, output channel 2. Same structure as channel 0, using coefficients
+ * v0.s[2]/v1.s[2]/v2.s[2]/v3.s[2] and add term v6; results go to v10 and v18 and
+ * control continues at the address in x7. With mask i (0..15) the vadd_f32 lines
+ * emit nothing.
+ */
+.align 5
+colormatrix_float_col2_\i:
+            vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
+            vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
+            vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
+            vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
+            vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s,      v10.16b, v6.16b
+            vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
+            vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
+            vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
+            vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
+            vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s,      v18.16b, v6.16b
+            br          x7
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v6 is added or copied. */
+.align 4
+colormatrix_float_col2_n\i:
+            vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
+            vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
+            vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
+            vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
+            vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s,   v10.16b, v6.16b
+            vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
+            vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
+            vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
+            vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
+            vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s,   v18.16b, v6.16b
+            br          x7
+
+/* Float path, output channel 3. Same structure as channel 0, using coefficients
+ * v0.s[3]/v1.s[3]/v2.s[3]/v3.s[3] and add term v7; results go to v11 and v19 and
+ * control continues at the address in x8. With mask i (0..15) the vadd_f32 lines
+ * emit nothing.
+ */
+.align 5
+colormatrix_float_col3_\i:
+            vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
+            vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
+            vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
+            vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
+            vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s,      v11.16b, v7.16b
+            vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
+            vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
+            vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
+            vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
+            vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s,      v19.16b, v7.16b
+            br          x8
+
+/* "n" variant: mask i^31 always has the 16 bit set, so v7 is added or copied. */
+.align 4
+colormatrix_float_col3_n\i:
+            vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
+            vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
+            vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
+            vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
+            vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s,  v11.16b, v7.16b
+            vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
+            vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
+            vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
+            vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
+            vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s,  v19.16b, v7.16b
+            br          x8
+
+.endr
+
+/* Load for the float path from 4-channel uchar data: reads 32 bytes from x1
+ * (post-incremented), de-interleaved into four channels of 8 bytes, widens each
+ * channel to 32 bits and converts to float. Lanes 0-3 of each channel end up in
+ * v12..v15 and lanes 4-7 in v20..v23. Continues at the address in x4.
+ */
+.align 6
+colormatrix_float_ldu4:
+            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v23.8h, v23.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl        v15.4s, v23.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            uxtl2       v23.4s, v23.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v15.4s, v15.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            ucvtf       v23.4s, v23.4s
+            br          x4
+
+/* Load for the integer path from 4-channel uchar data: reads 32 bytes from x1
+ * (post-incremented), de-interleaved into v12..v15, and zero-extends each channel
+ * to 16-bit lanes. Continues at the address in x4.
+ */
+.align 5
+colormatrix_int_ldu4:
+            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            uxtl        v15.8h, v15.8b
+            br          x4
+
+/* Load for the float path from 3-channel uchar data. The data is still read as
+ * 4-byte cells (32 bytes via ld4), but only the first three channels are widened
+ * and converted to float: lanes 0-3 in v12..v14, lanes 4-7 in v20..v22. The fourth
+ * loaded channel (v23) is left unconverted. Continues at the address in x4.
+ */
+.align 6
+colormatrix_float_ldu3:
+            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            br          x4
+
+/* Load for the integer path from 3-channel uchar data: reads 4-byte cells (32 bytes
+ * via ld4) but zero-extends only the first three channels (v12..v14) to 16-bit
+ * lanes; v15 keeps the raw bytes of the fourth channel. Continues at x4.
+ */
+colormatrix_int_ldu3:
+            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            br          x4
+
+/* Load for the float path from 1-channel uchar data: reads 8 bytes from x1
+ * (post-incremented), widens to 32 bits and converts to float, with lanes 0-3 in
+ * v12 and lanes 4-7 in v20. Continues at the address in x4.
+ */
+.align 5
+colormatrix_float_ldu1:
+            ld1         {v20.8b}, [x1], #8
+            uxtl        v20.8h, v20.8b
+            uxtl        v12.4s, v20.4h
+            uxtl2       v20.4s, v20.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v20.4s, v20.4s
+            br          x4
+
+/* Load for the float path from 2-channel uchar data: reads 16 bytes from x1
+ * (post-incremented), de-interleaved into two channels, widens to 32 bits and
+ * converts to float, with lanes 0-3 in v12/v13 and lanes 4-7 in v20/v21.
+ * Continues at the address in x4.
+ */
+.align 6
+colormatrix_float_ldu2:
+            ld2         {v20.8b,v21.8b}, [x1], #16
+            uxtl        v20.8h, v20.8b
+            uxtl        v21.8h, v21.8b
+            uxtl        v12.4s, v20.4h
+            uxtl        v13.4s, v21.4h
+            uxtl2       v20.4s, v20.8h
+            uxtl2       v21.4s, v21.8h
+            ucvtf       v12.4s, v12.4s
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            br          x4
+
+/* Load for the integer path from 2-channel uchar data: reads 16 bytes from x1
+ * (post-incremented), de-interleaved into v12/v13, and zero-extends both channels
+ * to 16-bit lanes. Continues at the address in x4.
+ */
+.align 4
+colormatrix_int_ldu2:
+            ld2         {v12.8b,v13.8b}, [x1], #16
+            uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            br          x4
+
+/* Store for the float path to 4-channel uchar data. Each float result (v8..v11 for
+ * lanes 0-3, v16..v19 for lanes 4-7) is converted to signed fixed point with one
+ * fractional bit (fcvtzs #1), then shifted right by one with rounding and narrowed
+ * with unsigned saturation to 16 bits (sqrshrun #1), then saturated to 8 bits
+ * (uqxtn). 32 interleaved bytes are stored at x0 (post-incremented).
+ * x2 is decremented by 8; if that subtraction borrows (x2 was below 8) control goes
+ * to colormatrix_float_end, otherwise to the address in x9.
+ */
+.align 6
+colormatrix_float_stu4:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v27.4s, v11.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            fcvtzs      v31.4s, v19.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun    v27.4h, v27.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            sqrshrun2   v27.8h, v31.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            uqxtn       v26.8b, v26.8h
+            uqxtn       v27.8b, v27.8h
+            subs        x2, x2, #8
+            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+/* Store for the integer path to 4-channel uchar data: saturates the 16-bit results
+ * in v8..v11 to 8 bits and stores 32 interleaved bytes at x0 (post-incremented).
+ * x2 is decremented by 8; on borrow control goes to colormatrix_int_end, otherwise
+ * to the address in x9.
+ */
+.align 5
+colormatrix_int_stu4:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            uqxtn       v15.8b, v11.8h
+            subs        x2, x2, #8
+            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+            blo         colormatrix_int_end
+            br          x9
+
+/* Store for the float path to 3-channel uchar data. The first three channels
+ * (v8..v10 and v16..v18) are converted and narrowed exactly as in the 4-channel
+ * store; the fourth byte of every cell is written as zero (v27). 32 bytes are
+ * stored at x0 (post-incremented). x2 is decremented by 8; on borrow control goes
+ * to colormatrix_float_end, otherwise to the address in x9.
+ */
+.align 6
+colormatrix_float_stu3:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            uqxtn       v26.8b, v26.8h
+            movi        v27.8b, #0
+            subs        x2, x2, #8
+            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+/* Load for the integer path from 1-channel uchar data: reads 8 bytes from x1
+ * (post-incremented) into v12 and zero-extends them to 16-bit lanes.
+ * Continues at the address in x4.
+ */
+.align 4
+colormatrix_int_ldu1:
+            ld1         {v12.8b}, [x1], #8
+            uxtl        v12.8h, v12.8b
+            br          x4
+
+/* Store for the integer path to 3-channel uchar data: saturates v8..v10 to 8 bits,
+ * writes zero as the fourth byte of every cell (v15) and stores 32 bytes at x0
+ * (post-incremented). x2 is decremented by 8; on borrow control goes to
+ * colormatrix_int_end, otherwise to the address in x9.
+ */
+.align 5
+colormatrix_int_stu3:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            movi        v15.8b, #0
+            subs        x2, x2, #8
+            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+            blo         colormatrix_int_end
+            br          x9
+
+/* Store for the float path to 2-channel uchar data: converts v8/v9 (lanes 0-3) and
+ * v16/v17 (lanes 4-7) to fixed point with one fractional bit, rounds and narrows
+ * them with unsigned saturation down to 8 bits, and stores 16 interleaved bytes at
+ * x0 (post-incremented). x2 is decremented by 8; on borrow control goes to
+ * colormatrix_float_end, otherwise to the address in x9.
+ */
+.align 6
+colormatrix_float_stu2:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            uqxtn       v24.8b, v24.8h
+            uqxtn       v25.8b, v25.8h
+            subs        x2, x2, #8
+            st2         {v24.8b,v25.8b}, [x0], #16
+            blo         colormatrix_float_end
+            br          x9
+
+/* Store for the integer path to 2-channel uchar data: saturates v8/v9 to 8 bits and
+ * stores 16 interleaved bytes at x0 (post-incremented). x2 is decremented by 8; on
+ * borrow control goes to colormatrix_int_end, otherwise to the address in x9.
+ */
+.align 5
+colormatrix_int_stu2:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            subs        x2, x2, #8
+            st2         {v12.8b,v13.8b}, [x0], #16
+            blo         colormatrix_int_end
+            br          x9
+
+/* Store for the integer path to 1-channel uchar data: saturates v8 to 8 bits and
+ * stores 8 bytes at x0 (post-incremented). x2 is decremented by 8; on borrow
+ * control goes to colormatrix_int_end, otherwise to the address in x9.
+ */
+.align 5
+colormatrix_int_stu1:
+            uqxtn       v12.8b, v8.8h
+            subs        x2, x2, #8
+            st1         {v12.8b}, [x0], #8
+            blo         colormatrix_int_end
+            br          x9
+
+/* Load for the float path from 3-channel float data: identical to the 4-channel
+ * float load, i.e. two de-interleaving loads of 64 bytes each from x1
+ * (post-incremented) into v12..v15 and v20..v23. Continues at the address in x4.
+ */
+colormatrix_float_ldf3:
+            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+            br          x4
+
+/* Store for the float path to 1-channel uchar data: converts v8 (lanes 0-3) and v16
+ * (lanes 4-7) to fixed point with one fractional bit, rounds and narrows them with
+ * unsigned saturation down to 8 bits, and stores 8 bytes at x0 (post-incremented).
+ * x2 is decremented by 8; on borrow control goes to colormatrix_float_end,
+ * otherwise to the address in x9.
+ */
+.align 6
+colormatrix_float_stu1:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            uqxtn       v24.8b, v24.8h
+            subs        x2, x2, #8
+            st1         {v24.8b}, [x0], #8
+            blo         colormatrix_float_end
+            br          x9
+
+/* Store for the float path to 3-channel float data: the fourth channel registers
+ * (v11, v19) are zeroed and two interleaved 64-byte groups are stored at x0
+ * (post-incremented). x2 is decremented by 8; on borrow control goes to
+ * colormatrix_float_end, otherwise to the address in x9.
+ */
+colormatrix_float_stf3:
+            movi        v11.16b, #0
+            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+            movi        v19.16b, #0
+            subs        x2, x2, #8
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+            blo         colormatrix_float_end
+            br          x9
+
+/* Store for the float path to 4-channel float data: stores v8..v11 and v16..v19 as
+ * two interleaved 64-byte groups at x0 (post-incremented). x2 is decremented by 8;
+ * on borrow control goes to colormatrix_float_end, otherwise to the address in x9.
+ */
+.align 5
+colormatrix_float_stf4:
+            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+            subs        x2, x2, #8
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+            blo         colormatrix_float_end
+            br          x9
+
+/* Load for the float path from 4-channel float data: two de-interleaving loads of
+ * 64 bytes each from x1 (post-incremented) into v12..v15 and v20..v23.
+ * Continues at the address in x4.
+ */
+colormatrix_float_ldf4:
+            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+            br          x4
+
+/* Store for the float path to 2-channel float data: stores v8/v9 and v16/v17 as two
+ * interleaved 32-byte groups at x0 (post-incremented). x2 is decremented by 8; on
+ * borrow control goes to colormatrix_float_end, otherwise to the address in x9.
+ */
+.align 5
+colormatrix_float_stf2:
+            st2         {v8.4s, v9.4s}, [x0], #32
+            subs        x2, x2, #8
+            st2         {v16.4s, v17.4s}, [x0], #32
+            blo         colormatrix_float_end
+            br          x9
+
+/* Load for the float path from 2-channel float data: two de-interleaving loads of
+ * 32 bytes each from x1 (post-incremented) into v12/v13 and v20/v21.
+ * Continues at the address in x4.
+ */
+colormatrix_float_ldf2:
+            ld2         {v12.4s,v13.4s}, [x1], #32
+            ld2         {v20.4s,v21.4s}, [x1], #32
+            br          x4
+
+/* Store for the float path to 1-channel float data: stores v8 and v16 as two
+ * 16-byte groups at x0 (post-incremented). x2 is decremented by 8; on borrow
+ * control goes to colormatrix_float_end, otherwise to the address in x9.
+ */
+.align 5
+colormatrix_float_stf1:
+            st1         {v8.4s}, [x0], #16
+            subs        x2, x2, #8
+            st1         {v16.4s}, [x0], #16
+            blo         colormatrix_float_end
+            br          x9
+
+/* Load for the float path from 1-channel float data: two loads of 16 bytes each
+ * from x1 (post-incremented) into v12 and v20. Continues at the address in x4.
+ */
+colormatrix_float_ldf1:
+            ld1         {v12.4s}, [x1], #16
+            ld1         {v20.4s}, [x1], #16
+            br          x4
+
+/* Partial store for the integer path, 1-channel uchar data. After saturating v8 to
+ * 8 bits, bits 2, 1 and 0 of x2 select storing 4 bytes (byte lanes 4-7), 2 bytes
+ * (lanes 2-3) and 1 byte (lane 1) respectively; lane 0 is never stored. Ends at
+ * colormatrix_int_realend.
+ */
+colormatrix_int_stu1_end:
+            uqxtn       v12.8b, v8.8h
+            tbz         x2, #2, 1f
+            st1         {v12.s}[1], [x0], #4
+1:          tbz         x2, #1, 1f
+            st1         {v12.h}[1], [x0], #2
+1:          tbz         x2, #0, 1f
+            st1         {v12.b}[1], [x0], #1
+1:          b           colormatrix_int_realend
+
+/* Partial store for the integer path, 2-channel uchar data. v8/v9 are saturated to
+ * 8 bits and interleaved into v12 (zip1), giving 2 bytes per cell. Bits 2, 1 and 0
+ * of x2 select storing 8 bytes (cells 4-7), 4 bytes (cells 2-3) and 2 bytes
+ * (cell 1) respectively; cell 0 is never stored. Ends at colormatrix_int_realend.
+ */
+colormatrix_int_stu2_end:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            zip1        v12.16b, v12.16b, v13.16b
+            tbz         x2, #2, 1f
+            st1         {v12.d}[1], [x0], #8
+1:          tbz         x2, #1, 1f
+            st1         {v12.s}[1], [x0], #4
+1:          tbz         x2, #0, 1f
+            st1         {v12.h}[1], [x0], #2
+1:          b           colormatrix_int_realend
+
+/* Partial store for the integer path, 3-channel uchar data. v8..v10 are saturated
+ * to 8 bits and the fourth byte of each cell is zero (v15). Each st4 lane store
+ * writes one 4-byte cell. Bits 2, 1 and 0 of x2 select storing lanes 4-7, lanes 2-3
+ * and lane 1 respectively; lane 0 is never stored. Ends at colormatrix_int_realend.
+ */
+colormatrix_int_stu3_end:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            movi        v15.8b, #0
+            tbz         x2, #2, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_int_realend
+
+colormatrix_int_stu4_end:  /* tail store, 4 bytes per element; x2 = remaining count */
+            uqxtn       v12.8b, v8.8h  /* saturate v8..v11 lanes to unsigned bytes */
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            uqxtn       v15.8b, v11.8h
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_int_realend
+
+
+colormatrix_int_ldu1_end:  /* tail load, 1 byte per element, into the upper half of v15 */
+            tbz         x2, #2, 1f
+            ld1         {v15.s}[3], [x1], #4  /* count bit 2: bytes 12-15 -> lanes 4-7 after uxtl2 */
+1:          tbz         x2, #1, 1f
+            ld1         {v15.h}[5], [x1], #2  /* count bit 1: bytes 10-11 -> lanes 2-3 */
+1:          tbz         x2, #0, 1f
+            ld1         {v15.b}[9], [x1], #1  /* count bit 0: byte 9 -> lane 1 */
+1:          uxtl2       v12.8h, v15.16b  /* widen upper 8 bytes to 16-bit lanes of v12 */
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_int_ldu2_end:  /* tail load, 2 bytes per element */
+            tbz         x2, #2, 1f
+            ld1         {v15.d}[1], [x1], #8  /* count bit 2: elements 4-7 */
+1:          tbz         x2, #1, 1f
+            ld1         {v15.s}[1], [x1], #4  /* count bit 1: elements 2-3 */
+1:          tbz         x2, #0, 1f
+            ld1         {v15.h}[1], [x1], #2  /* count bit 0: element 1 */
+1:          uzp1        v14.16b, v15.16b, v15.16b  /* even bytes: first byte of each element */
+            uzp2        v15.16b, v15.16b, v15.16b  /* odd bytes: second byte of each element */
+            uxtl        v12.8h, v14.8b  /* widen to 16-bit lanes */
+            uxtl        v13.8h, v15.8b
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_int_ldu3_end:  /* tail load, 4 bytes read per element, first 3 used */
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1:          uxtl        v12.8h, v12.8b  /* widen the three used byte vectors to 16-bit lanes */
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b  /* v15 (fourth byte) is not widened here */
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_int_ldu4_end:  /* tail load, 4 bytes per element */
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1:          uxtl        v12.8h, v12.8b  /* widen all four byte vectors to 16-bit lanes */
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            uxtl        v15.8h, v15.8b
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_stu1_end:  /* tail store: floats in v8/v16 -> 1 unsigned byte per element */
+            fcvtzs      v12.4s, v8.4s, #1  /* to signed fixed-point with 1 fraction bit (elements 0-3) */
+            fcvtzs      v13.4s, v16.4s, #1  /* elements 4-7 */
+            sqrshrun    v12.4h, v12.4s, #1  /* rounding shift drops the fraction bit; saturate to unsigned 16-bit */
+            sqrshrun2   v12.8h, v13.4s, #1
+            uqxtn       v12.8b, v12.8h  /* saturate to unsigned bytes */
+            tbz         x2, #2, 1f
+            st1         {v12.s}[1], [x0], #4  /* count bit 2: bytes 4-7 */
+1:          tbz         x2, #1, 1f
+            st1         {v12.h}[1], [x0], #2  /* count bit 1: bytes 2-3 */
+1:          tbz         x2, #0, 1f
+            st1         {v12.b}[1], [x0], #1  /* count bit 0: byte 1 */
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stu2_end:  /* tail store: floats -> 2 unsigned bytes per element */
+            fcvtzs      v12.4s, v8.4s, #1  /* to signed fixed-point with 1 fraction bit */
+            fcvtzs      v13.4s, v9.4s, #1
+            fcvtzs      v14.4s, v16.4s, #1
+            fcvtzs      v15.4s, v17.4s, #1
+            sqrshrun    v12.4h, v12.4s, #1  /* rounding shift by 1, saturate to unsigned 16-bit */
+            sqrshrun    v13.4h, v13.4s, #1
+            sqrshrun    v14.4h, v14.4s, #1
+            sqrshrun    v15.4h, v15.4s, #1
+            zip1        v12.8h, v12.8h, v13.8h  /* interleave the two values of elements 0-3 */
+            zip1        v13.8h, v14.8h, v15.8h  /* interleave the two values of elements 4-7 */
+            uqxtn       v12.8b, v12.8h  /* bytes 0-7: elements 0-3 */
+            uqxtn2      v12.16b, v13.8h  /* bytes 8-15: elements 4-7 */
+            tbz         x2, #2, 1f
+            st1         {v12.d}[1], [x0], #8  /* count bit 2: elements 4-7 */
+1:          tbz         x2, #1, 1f
+            st1         {v12.s}[1], [x0], #4  /* count bit 1: elements 2-3 */
+1:          tbz         x2, #0, 1f
+            st1         {v12.h}[1], [x0], #2  /* count bit 0: element 1 */
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stu3_end:  /* tail store: floats -> 3 unsigned bytes + 1 zero byte per element */
+            fcvtzs      v24.4s, v8.4s, #1  /* elements 0-3: to signed fixed-point with 1 fraction bit */
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1  /* elements 4-7 */
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1  /* rounding shift by 1, saturate to unsigned 16-bit */
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1  /* elements 4-7 go to the upper lanes */
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            uqxtn       v12.8b, v24.8h  /* saturate to unsigned bytes */
+            uqxtn       v13.8b, v25.8h
+            uqxtn       v14.8b, v26.8h
+            movi        v15.8b, #0  /* fourth byte of every element is written as zero */
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stu4_end:  /* tail store: floats -> 4 unsigned bytes per element */
+            fcvtzs      v24.4s, v8.4s, #1  /* elements 0-3: to signed fixed-point with 1 fraction bit */
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v27.4s, v11.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1  /* elements 4-7 */
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            fcvtzs      v31.4s, v19.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1  /* rounding shift by 1, saturate to unsigned 16-bit */
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun    v27.4h, v27.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1  /* elements 4-7 go to the upper lanes */
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            sqrshrun2   v27.8h, v31.4s, #1
+            uqxtn       v12.8b, v24.8h  /* saturate to unsigned bytes */
+            uqxtn       v13.8b, v25.8h
+            uqxtn       v14.8b, v26.8h
+            uqxtn       v15.8b, v27.8h
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stf1_end:  /* tail store, 1 float per element; x2 = remaining count */
+            tbz         x2, #2, 1f
+            st1         {v16.4s}, [x0], #16  /* count bit 2: the four floats in v16 */
+1:          tbz         x2, #1, 1f
+            st1         {v8.d}[1], [x0], #8  /* count bit 1: v8 lanes 2-3 */
+1:          tbz         x2, #0, 1f
+            st1         {v8.s}[1], [x0], #4  /* count bit 0: v8 lane 1 */
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stf2_end:  /* tail store, 2 floats per element; x2 = remaining count */
+            tbz         x2, #2, 1f
+            st2         {v16.4s, v17.4s}, [x0], #32  /* count bit 2: four interleaved elements from v16/v17 */
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 of v8/v9 */
+            st2         {v8.s,v9.s}[2], [x0], #8
+            st2         {v8.s,v9.s}[3], [x0], #8
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 of v8/v9 */
+            st2         {v8.s,v9.s}[1], [x0], #8
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stf3_end:  /* 3-float tail store: zero the fourth vectors, then share the 4-float path */
+            movi        v11.16b, #0
+            movi        v19.16b, #0
+colormatrix_float_stf4_end:  /* tail store, 4 floats (16 bytes) per element */
+            tbz         x2, #2, 1f
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64  /* count bit 2: four interleaved elements */
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 of v8..v11 */
+            st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
+            st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 of v8..v11 */
+            st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
+1:          b           colormatrix_float_realend
+
+colormatrix_float_ldu1_end:  /* tail load: 1 unsigned byte per element -> floats in v12/v20 */
+            tbz         x2, #2, 1f
+            ld1         {v15.s}[1], [x1], #4  /* count bit 2: bytes 4-7 */
+1:          tbz         x2, #1, 1f
+            ld1         {v15.h}[1], [x1], #2  /* count bit 1: bytes 2-3 */
+1:          tbz         x2, #0, 1f
+            ld1         {v15.b}[1], [x1], #1  /* count bit 0: byte 1 */
+1:          uxtl        v15.8h, v15.8b  /* bytes -> 16-bit lanes */
+            uxtl        v12.4s, v15.4h  /* lanes 0-3 -> 32-bit in v12 */
+            uxtl2       v20.4s, v15.8h  /* lanes 4-7 -> 32-bit in v20 */
+            ucvtf       v12.4s, v12.4s  /* unsigned integer -> float */
+            ucvtf       v20.4s, v20.4s
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_ldu2_end:  /* tail load: 2 unsigned bytes per element -> floats */
+            tbz         x2, #2, 1f
+            ld1         {v15.d}[1], [x1], #8  /* count bit 2: elements 4-7 */
+1:          tbz         x2, #1, 1f
+            ld1         {v15.s}[1], [x1], #4  /* count bit 1: elements 2-3 */
+1:          tbz         x2, #0, 1f
+            ld1         {v15.h}[1], [x1], #2  /* count bit 0: element 1 */
+1:          uxtl        v14.8h, v15.8b  /* elements 0-3 widened to 16-bit (still interleaved) */
+            uxtl2       v15.8h, v15.16b  /* elements 4-7 widened to 16-bit */
+            uzp1        v12.8h, v14.8h, v14.8h  /* first value of elements 0-3 */
+            uzp2        v13.8h, v14.8h, v14.8h  /* second value of elements 0-3 */
+            uzp1        v20.8h, v15.8h, v15.8h  /* first value of elements 4-7 */
+            uzp2        v21.8h, v15.8h, v15.8h  /* second value of elements 4-7 */
+            uxtl        v12.4s, v12.4h  /* widen to 32-bit */
+            uxtl        v13.4s, v13.4h
+            uxtl        v20.4s, v20.4h
+            uxtl        v21.4s, v21.4h
+            ucvtf       v12.4s, v12.4s  /* unsigned integer -> float */
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_ldu3_end:  /* tail load: 4 bytes read per element, first 3 converted to float */
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
+1:          uxtl        v20.8h, v20.8b  /* bytes -> 16-bit lanes */
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v12.4s, v20.4h  /* elements 0-3 -> 32-bit in v12..v14 */
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl2       v20.4s, v20.8h  /* elements 4-7 -> 32-bit in v20..v22 */
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            ucvtf       v12.4s, v12.4s  /* unsigned integer -> float */
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_ldu4_end:  /* tail load: 4 unsigned bytes per element -> floats */
+            tbz         x2, #2, 1f  /* count bit 2: lanes 4-7 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
+1:          uxtl        v20.8h, v20.8b  /* bytes -> 16-bit lanes */
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v23.8h, v23.8b
+            uxtl        v12.4s, v20.4h  /* elements 0-3 -> 32-bit in v12..v15 */
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl        v15.4s, v23.4h
+            uxtl2       v20.4s, v20.8h  /* elements 4-7 -> 32-bit in v20..v23 */
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            uxtl2       v23.4s, v23.8h
+            ucvtf       v12.4s, v12.4s  /* unsigned integer -> float */
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v15.4s, v15.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            ucvtf       v23.4s, v23.4s
+            br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_ldf1_end:  /* tail load, 1 float per element; x2 = remaining count */
+            tbz         x2, #2, 1f
+            ld1         {v20.4s}, [x1], #16  /* count bit 2: four floats into v20 */
+1:          tbz         x2, #1, 1f
+            ld1         {v12.d}[1], [x1], #8  /* count bit 1: v12 lanes 2-3 */
+1:          tbz         x2, #0, 1f
+            ld1         {v12.s}[1], [x1], #4  /* count bit 0: v12 lane 1 */
+1:          br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_ldf2_end:  /* tail load, 2 floats per element; x2 = remaining count */
+            tbz         x2, #2, 1f
+            ld2         {v20.4s,v21.4s}, [x1], #32  /* count bit 2: four elements, de-interleaved into v20/v21 */
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 of v12/v13 */
+            ld2         {v12.s,v13.s}[2], [x1], #8
+            ld2         {v12.s,v13.s}[3], [x1], #8
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 of v12/v13 */
+            ld2         {v12.s,v13.s}[1], [x1], #8
+1:          br          x4  /* continue at the address held in x4 */
+
+colormatrix_float_ldf3_end:  /* 3-float tail load shares the 4-float path: 16 bytes are read per element */
+colormatrix_float_ldf4_end:  /* tail load, 4 floats per element; x2 = remaining count */
+            tbz         x2, #2, 1f
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64  /* count bit 2: four elements, de-interleaved */
+1:          tbz         x2, #1, 1f  /* count bit 1: lanes 2-3 of v12..v15 */
+            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
+            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
+1:          tbz         x2, #0, 1f  /* count bit 0: lane 1 of v12..v15 */
+            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
+1:          br          x4  /* continue at the address held in x4 */
+
+/* void rsdIntrinsicColorMatrix_int_K(
+ *          void *out,              // x0
+ *          void const *in,         // x1
+ *          size_t count,           // x2
+ *          fntab_t const *fns,     // x3
+ *          int16_t const *mult,    // x4
+ *          int32_t const *add);    // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_int_K)  /* runs the handler chain from fns over count elements, 8 at a time */
+            sub         x7, sp, #32  /* x7 = address of the upper half of the 64-byte save area */
+            sub         sp, sp, #64
+            st1         {v8.1d-v11.1d}, [sp]  /* save low 64 bits of v8-v11 */
+            st1         {v12.1d-v15.1d}, [x7]  /* save low 64 bits of v12-v15 */
+
+            ld1         {v0.8h,v1.8h}, [x4], #32  /* 16 int16 multipliers from mult */
+            ld1         {v4.4s}, [x5], #16  /* 4 int32 addends from add */
+
+            ldp         x4,x5, [x3],#16  /* fns[0], fns[1]: x4/x5 are reused as handler addresses */
+            ldp         x6,x7, [x3],#16  /* fns[2], fns[3] */
+            ldp         x8,x9, [x3],#16  /* fns[4], fns[5]: the setup routine stores the store/load handlers here */
+
+            dup         v12.4s, v4.s[0]  /* broadcast each addend across a vector */
+            dup         v13.4s, v4.s[1]
+            dup         v14.4s, v4.s[2]
+            dup         v15.4s, v4.s[3]
+            sqshrun     v8.4h, v12.4s, #8  /* v8..v11 = addend >> 8, saturated to unsigned 16-bit, in all 8 lanes */
+            sqshrun2    v8.8h, v12.4s, #8
+            sqshrun     v9.4h, v13.4s, #8
+            sqshrun2    v9.8h, v13.4s, #8
+            sqshrun     v10.4h, v14.4s, #8
+            sqshrun2    v10.8h, v14.4s, #8
+            sqshrun     v11.4h, v15.4s, #8
+            sqshrun2    v11.8h, v15.4s, #8
+
+            subs        x2, x2, #8
+            blo         colormatrix_int_end  /* fewer than 8 elements in total: tail only */
+            br          x9  /* enter the main loop at the load handler */
+
+colormatrix_int_end:  /* reached when fewer than 8 elements remain */
+            adds        x2, x2, #8  /* undo the last subtraction: x2 = remaining count */
+            bls         colormatrix_int_realend  /* nothing left: return */
+            mov         x16, x8  /* remember the main-loop store handler */
+            ldp         x8, x9, [x3], #16  /* fns[6], fns[7]: tail store / tail load handlers */
+            cmp         x4, x16  /* any chain slot that pointed at the main store ... */
+            csel        x4, x8, x4, eq  /* ... is redirected to the tail store */
+            cmp         x5, x16
+            csel        x5, x8, x5, eq
+            cmp         x6, x16
+            csel        x6, x8, x6, eq
+            cmp         x7, x16
+            csel        x7, x8, x7, eq
+            br          x9  /* run the tail load handler */
+
+colormatrix_int_realend:  /* epilogue: restore saved vector halves and return */
+            ld1         {v8.1d-v11.1d}, [sp], #32
+            ld1         {v12.1d-v15.1d}, [sp], #32
+            ret
+END(rsdIntrinsicColorMatrix_int_K)
+
+/* void rsdIntrinsicColorMatrixSetup_int_K(
+ * fntab_t const *fns, // x0 -- NOTE(review): declared const here, but this routine fills the table
+ *          uint32_t mask,      // x1
+ *          int dt,             // x2
+ *          int st);            // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_int_K)  /* fills the 8-entry handler table at fns */
+            adrp        x7, 2f
+            add         x7, x7, :lo12:2f  /* x7 = store table (label 2 in .rodata) */
+            add         x4, x7, x2, LSL #3  /* row dt; each row is two 32-bit offsets */
+            ldrsw       x2, [x4], #4  /* offset of the main-loop store handler */
+            ldrsw       x4, [x4]  /* offset of the matching _end (tail) handler */
+            add         x2, x2, x7  /* offsets are relative to the table base */
+            add         x4, x4, x7
+            adrp        x7, 3f
+            add         x7, x7, :lo12:3f  /* x7 = load table (label 3 in .rodata) */
+            add         x5, x7, x3, LSL #3  /* row st */
+            ldrsw       x3, [x5], #4  /* offset of the main-loop load handler */
+            ldrsw       x5, [x5]  /* offset of the matching _end (tail) handler */
+            add         x3, x3, x7
+            add         x5, x5, x7
+            stp         x2, x3, [x0, #32]  /* fns[4] = store, fns[5] = load */
+            stp         x4, x5, [x0, #48]  /* fns[6] = tail store, fns[7] = tail load */
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+            mov         x3, #4  /* four columns */
+            adrp        x7, 4f
+            add         x7, x7, :lo12:4f  /* x7 = column table (label 4), advanced by 4 per column */
+1:          ands        x2, x1, #15  /* low 4 bits of this column's mask field */
+            beq         9f  /* all zero: store NULL (x2 == 0) */
+            and         x2, x1, #31  /* 5-bit index; bit 4 selects the _n entries */
+            lsl         x2, x2, #4  /* 16 bytes per table row (4 columns x 4 bytes) */
+            ldrsw       x2, [x7, x2]
+            add         x2, x2, x7  /* entries are stored relative to 4b + 4*column */
+9:          str         x2, [x0], #8  /* fns[column] */
+            lsr         x1, x1, #5  /* next column's 5-bit field */
+            add         x7, x7, #4  /* next column within each table row */
+            subs        x3, x3, #1
+            bne         1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+            ldr         x2, [x0]  /* x0 now points at fns[4], the store handler */
+            mov         x3, #4
+1:          ldr         x1, [x0, #-8]!  /* walk backwards: fns[3], fns[2], fns[1], fns[0] */
+            cmp         x1, #0
+            csel        x2, x1, x2, ne  /* keep a non-NULL entry; otherwise reuse the following one */
+            str         x2, [x0]
+            subs        x3, x3, #1
+            bne         1b
+            ret
+
+END(rsdIntrinsicColorMatrixSetup_int_K)
+.rodata
+            .align 4
+2:          .word      colormatrix_int_stu1-2b  /* store table: (main, tail) offset pairs relative to label 2 */
+            .word      colormatrix_int_stu1_end-2b
+            .word      colormatrix_int_stu2-2b
+            .word      colormatrix_int_stu2_end-2b
+            .word      colormatrix_int_stu3-2b
+            .word      colormatrix_int_stu3_end-2b
+            .word      colormatrix_int_stu4-2b
+            .word      colormatrix_int_stu4_end-2b
+3:          .word      colormatrix_int_ldu1-3b  /* load table: (main, tail) offset pairs relative to label 3 */
+            .word      colormatrix_int_ldu1_end-3b
+            .word      colormatrix_int_ldu2-3b
+            .word      colormatrix_int_ldu2_end-3b
+            .word      colormatrix_int_ldu3-3b
+            .word      colormatrix_int_ldu3_end-3b
+            .word      colormatrix_int_ldu4-3b
+            .word      colormatrix_int_ldu4_end-3b
+4:  /* column table: one 16-byte row per 5-bit mask value, one word per column */
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+            .word      colormatrix_int_col0_\i-4b
+            .word      colormatrix_int_col1_\i-4b-4  /* -4/-8/-12: the setup loop adds 4 to its base per column */
+            .word      colormatrix_int_col2_\i-4b-8
+            .word      colormatrix_int_col3_\i-4b-12
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0  /* rows 16-31 (mask bit 4 set), _n variants in descending order */
+            .word      colormatrix_int_col0_n\i-4b
+            .word      colormatrix_int_col1_n\i-4b-4
+            .word      colormatrix_int_col2_n\i-4b-8
+            .word      colormatrix_int_col3_n\i-4b-12
+.endr
+
+
+/* void rsdIntrinsicColorMatrix_float_K(
+ *          void *out,              // x0
+ *          void const *in,         // x1
+ *          size_t count,           // x2
+ *          fntab_t const *fns,     // x3
+ *          float const *mult,      // x4
+ *          float const *add);      // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_float_K)  /* runs the handler chain from fns over count elements, 8 at a time */
+            sub         x7, sp, #32  /* x7 = address of the upper half of the 64-byte save area */
+            sub         sp, sp, #64
+            st1         {v8.1d-v11.1d}, [sp]  /* save low 64 bits of v8-v11 */
+            st1         {v12.1d-v15.1d}, [x7]  /* save low 64 bits of v12-v15 */
+
+            ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64  /* 16 float multipliers from mult */
+            ld1r        {v4.4s}, [x5], #4  /* broadcast each of the 4 float addends */
+            ld1r        {v5.4s}, [x5], #4
+            ld1r        {v6.4s}, [x5], #4
+            ld1r        {v7.4s}, [x5], #4
+
+            ldp         x4,x5, [x3], #16  /* fns[0], fns[1]: x4/x5 are reused as handler addresses */
+            ldp         x6,x7, [x3], #16  /* fns[2], fns[3] */
+            ldp         x8,x9, [x3], #16  /* fns[4], fns[5]: the setup routine stores the store/load handlers here */
+
+            mov         v8.16b, v4.16b  /* v8..v11: addends, for elements 0-3 */
+            mov         v9.16b, v5.16b
+            mov         v10.16b, v6.16b
+            mov         v11.16b, v7.16b
+
+            mov         v16.16b, v4.16b  /* v16..v19: addends, for elements 4-7 */
+            mov         v17.16b, v5.16b
+            mov         v18.16b, v6.16b
+            mov         v19.16b, v7.16b
+
+            subs        x2, x2, #8
+            blo         colormatrix_float_end  /* fewer than 8 elements in total: tail only */
+            br          x9  /* enter the main loop at the load handler */
+
+colormatrix_float_end:  /* reached when fewer than 8 elements remain */
+            adds        x2, x2, #8  /* undo the last subtraction: x2 = remaining count */
+            bls         colormatrix_float_realend  /* nothing left: return through this function's own epilogue */
+            mov         x16, x8  /* remember the main-loop store handler */
+            ldp         x8,x9, [x3], #16  /* fns[6], fns[7]: tail store / tail load handlers */
+            cmp         x4, x16  /* any chain slot that pointed at the main store ... */
+            csel        x4, x8, x4, eq  /* ... is redirected to the tail store */
+            cmp         x5, x16
+            csel        x5, x8, x5, eq
+            cmp         x6, x16
+            csel        x6, x8, x6, eq
+            cmp         x7, x16
+            csel        x7, x8, x7, eq
+            br          x9  /* run the tail load handler */
+
+colormatrix_float_realend:  /* epilogue: restore saved vector halves and return */
+            ld1         {v8.1d-v11.1d}, [sp], #32
+            ld1         {v12.1d-v15.1d}, [sp], #32
+            ret
+END(rsdIntrinsicColorMatrix_float_K)
+
+/* void rsdIntrinsicColorMatrixSetup_float_K(
+ * fntab_t const *fns, // x0 -- NOTE(review): declared const here, but this routine fills the table
+ *          uint32_t mask,      // x1
+ *          int dt,             // x2
+ *          int st);            // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_float_K)  /* fills the 8-entry handler table at fns */
+            adrp        x7, 2f
+            add         x7, x7, :lo12:2f  /* x7 = store table (label 2 in .rodata) */
+            add         x4, x7, x2, LSL #3  /* row dt; each row is two 32-bit offsets */
+            ldrsw       x2, [x4], #4  /* offset of the main-loop store handler */
+            ldrsw       x4, [x4]  /* offset of the matching _end (tail) handler */
+            add         x2, x2, x7  /* offsets are relative to the table base */
+            add         x4, x4, x7
+            adrp        x7, 3f
+            add         x7, x7, :lo12:3f  /* x7 = load table (label 3 in .rodata) */
+            add         x5, x7, x3, LSL #3  /* row st */
+            ldrsw       x3, [x5], #4  /* offset of the main-loop load handler */
+            ldrsw       x5, [x5]  /* offset of the matching _end (tail) handler */
+            add         x3, x3, x7
+            add         x5, x5, x7
+            stp         x2, x3, [x0, #32]  /* fns[4] = store, fns[5] = load */
+            stp         x4, x5, [x0, #48]  /* fns[6] = tail store, fns[7] = tail load */
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+            mov         x3, #4  /* four columns */
+            adrp        x7, 4f
+            add         x7, x7, :lo12:4f  /* x7 = column table (label 4), advanced by 4 per column */
+1:          ands        x2, x1, #15  /* low 4 bits of this column's mask field */
+            beq         9f  /* all zero: store NULL (x2 == 0) */
+            and         x2, x1, #31  /* 5-bit index; bit 4 selects the _n entries */
+            lsl         x2, x2, #4  /* 16 bytes per table row (4 columns x 4 bytes) */
+            ldrsw       x2, [x7, x2]
+            add         x2, x2, x7  /* entries are stored relative to 4b + 4*column */
+9:          str         x2, [x0], #8  /* fns[column] */
+            lsr         x1, x1, #5  /* next column's 5-bit field */
+            add         x7, x7, #4  /* next column within each table row */
+            subs        x3, x3, #1
+            bne         1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+            ldr         x2, [x0]  /* x0 now points at fns[4], the store handler */
+            mov         x3, #4
+1:          ldr         x1, [x0, #-8]!  /* walk backwards: fns[3], fns[2], fns[1], fns[0] */
+            cmp         x1, #0
+            csel        x2, x1, x2, ne  /* keep a non-NULL entry; otherwise reuse the following one */
+            str         x2, [x0]
+            subs        x3, x3, #1
+            bne         1b
+            ret
+
+END(rsdIntrinsicColorMatrixSetup_float_K)
+.rodata
+            .align 4
+2:          .word      colormatrix_float_stu1-2b  /* store table: (main, tail) offset pairs relative to label 2 */
+            .word      colormatrix_float_stu1_end-2b
+            .word      colormatrix_float_stu2-2b
+            .word      colormatrix_float_stu2_end-2b
+            .word      colormatrix_float_stu3-2b
+            .word      colormatrix_float_stu3_end-2b
+            .word      colormatrix_float_stu4-2b
+            .word      colormatrix_float_stu4_end-2b
+            .word      colormatrix_float_stf1-2b  /* rows 4-7: float-output (stf) handlers */
+            .word      colormatrix_float_stf1_end-2b
+            .word      colormatrix_float_stf2-2b
+            .word      colormatrix_float_stf2_end-2b
+            .word      colormatrix_float_stf3-2b
+            .word      colormatrix_float_stf3_end-2b
+            .word      colormatrix_float_stf4-2b
+            .word      colormatrix_float_stf4_end-2b
+3:          .word      colormatrix_float_ldu1-3b  /* load table: (main, tail) offset pairs relative to label 3 */
+            .word      colormatrix_float_ldu1_end-3b
+            .word      colormatrix_float_ldu2-3b
+            .word      colormatrix_float_ldu2_end-3b
+            .word      colormatrix_float_ldu3-3b
+            .word      colormatrix_float_ldu3_end-3b
+            .word      colormatrix_float_ldu4-3b
+            .word      colormatrix_float_ldu4_end-3b
+            .word      colormatrix_float_ldf1-3b  /* rows 4-7: float-input (ldf) handlers */
+            .word      colormatrix_float_ldf1_end-3b
+            .word      colormatrix_float_ldf2-3b
+            .word      colormatrix_float_ldf2_end-3b
+            .word      colormatrix_float_ldf3-3b
+            .word      colormatrix_float_ldf3_end-3b
+            .word      colormatrix_float_ldf4-3b
+            .word      colormatrix_float_ldf4_end-3b
+4:  /* column table: one 16-byte row per 5-bit mask value, one word per column */
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+            .word      colormatrix_float_col0_\i-4b
+            .word      colormatrix_float_col1_\i-4b-4  /* -4/-8/-12: the setup loop adds 4 to its base per column */
+            .word      colormatrix_float_col2_\i-4b-8
+            .word      colormatrix_float_col3_\i-4b-12
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0  /* rows 16-31 (mask bit 4 set), _n variants in descending order */
+            .word      colormatrix_float_col0_n\i-4b
+            .word      colormatrix_float_col1_n\i-4b-4
+            .word      colormatrix_float_col2_n\i-4b-8
+            .word      colormatrix_float_col3_n\i-4b-12
+.endr
diff --git a/toolkit/ColorMatrix_neon.S b/toolkit/ColorMatrix_neon.S
new file mode 100644
index 0000000..ecb8c13
--- /dev/null
+++ b/toolkit/ColorMatrix_neon.S
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define SNIP_START(x) /* export label x at the start of a code snippet */ \
+    .globl x; x:
+
+#define SNIP_END(x) /* export x_end and x_len; the word at x_len holds the snippet's byte length */ \
+    .globl x##_end; x##_end: \
+    .globl x##_len; x##_len: \
+    .word x##_end-x
+
+SNIP_START(_N_ColorMatrix_prefix_i)  /* prologue snippet, integer path */
+    stmfd           sp!, {r4, lr}  /* save r4 and the return address */
+    vpush           {q4-q7}  /* save q4-q7 */
+    vld1.16 {q2}, [r2]!  /* 16 x 16-bit values from [r2] into q2, q3 */
+    vld1.16 {q3}, [r2]!
+    vld1.32 {d8[],d9[]}, [r2]!  /* next four 32-bit values, each broadcast across q4..q7 */
+    vld1.32 {d10[],d11[]}, [r2]!
+    vld1.32 {d12[],d13[]}, [r2]!
+    vld1.32 {d14[],d15[]}, [r2]!
+    veor q0, q0  /* zero q0, q1, q9, q10, q11 */
+    veor q1, q1
+    veor q9, q9
+    veor q10, q10
+    veor q11, q11
+SNIP_END(_N_ColorMatrix_prefix_i)
+
+SNIP_START(_N_ColorMatrix_prefix_f)  /* prologue snippet, float path */
+    stmfd           sp!, {r4, lr}  /* save r4 and the return address */
+    vpush           {q4-q7}  /* save q4-q7 */
+    add r2, #48  /* skip the first 48 bytes of the block at r2 (the 48 bytes prefix_i reads) */
+    vld1.32 {q4}, [r2]!  /* 16 x 32-bit values into q4..q7 */
+    vld1.32 {q5}, [r2]!
+    vld1.32 {q6}, [r2]!
+    vld1.32 {q7}, [r2]!
+    vld1.32 {d16[],d17[]}, [r2]!  /* next four 32-bit values, each broadcast across q8..q11 */
+    vld1.32 {d18[],d19[]}, [r2]!
+    vld1.32 {d20[],d21[]}, [r2]!
+    vld1.32 {d22[],d23[]}, [r2]!
+    veor q1, q1  /* zero q1, q2, q3 */
+    veor q2, q2
+    veor q3, q3
+SNIP_END(_N_ColorMatrix_prefix_f)
+
+SNIP_START(_N_ColorMatrix_postfix1)  /* loop-counter snippet */
+    subs r3, r3, #1  /* decrement r3 and set flags; no branch is emitted here */
+    #bne 1b
+SNIP_END(_N_ColorMatrix_postfix1)
+
+SNIP_START(_N_ColorMatrix_postfix2)  /* epilogue snippet: undoes the prefix snippets' saves and returns */
+
+    #mov r0, #0
+    #ldr r0, [r0]
+
+    #vqadd.s32 q0,q0,q0
+    #vadd.f32 q0,q0,q0
+    #vmul.f32 q0,q0,d0[0]
+    #vmla.f32 q0,q0,d0[0]
+    #vmov q0, q0
+
+
+    vpop            {q4-q7}  /* restore q4-q7 */
+    ldmfd           sp!, {r4, lr}  /* restore r4 and the return address */
+    bx              lr
+SNIP_END(_N_ColorMatrix_postfix2)
+
+SNIP_START(_N_ColorMatrix_load_u8_4)  /* load 4 elements of 4 bytes from [r1], one channel per register d0..d3 */
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!  /* element 0 -> byte lane 0 */
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_4)
+
+SNIP_START(_N_ColorMatrix_load_u8_3)  /* as load_u8_4 (4 bytes read per element), then the fourth channel is cleared */
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+    veor d3, d3  /* d3 = 0 */
+SNIP_END(_N_ColorMatrix_load_u8_3)
+
+SNIP_START(_N_ColorMatrix_load_u8_2)  /* load 4 elements of 2 bytes from [r1] into d0/d1 */
+    vld2.8 {d0[0],d1[0]}, [r1]!  /* element 0 -> byte lane 0 */
+    vld2.8 {d0[1],d1[1]}, [r1]!
+    vld2.8 {d0[2],d1[2]}, [r1]!
+    vld2.8 {d0[3],d1[3]}, [r1]!
+    veor d2, d2  /* unused channels cleared */
+    veor d3, d3
+SNIP_END(_N_ColorMatrix_load_u8_2)
+
+SNIP_START(_N_ColorMatrix_load_u8_1)  /* load 4 single-byte elements from [r1] */
+    vld1.32 {d0[0]}, [r1]!  /* one 32-bit load fills byte lanes 0-3 of d0 */
+    veor d1, d1  /* unused channels cleared */
+    veor d2, d2
+    veor d3, d3
+SNIP_END(_N_ColorMatrix_load_u8_1)
+
+SNIP_START(_N_ColorMatrix_load_u8f_4)  /* load 4 elements of 4 bytes and convert to floats in q0..q3 */
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+    vmovl.u8 q3, d3  /* widen highest channel first: q0/q1 overlap d0..d3 */
+    vmovl.u8 q2, d2
+    vmovl.u8 q1, d1
+    vmovl.u8 q0, d0
+    vmovl.u16 q3, d6  /* 16-bit -> 32-bit */
+    vmovl.u16 q2, d4
+    vmovl.u16 q1, d2
+    vmovl.u16 q0, d0
+    vcvt.f32.s32 q3, q3  /* integer -> float */
+    vcvt.f32.s32 q2, q2
+    vcvt.f32.s32 q1, q1
+    vcvt.f32.s32 q0, q0
+SNIP_END(_N_ColorMatrix_load_u8f_4)
+
+SNIP_START(_N_ColorMatrix_load_u8f_3)  /* load 4 elements (4 bytes read each), convert first 3 channels to float */
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+    vmovl.u8 q2, d2  /* widen highest channel first: q0/q1 overlap d0..d3 */
+    vmovl.u8 q1, d1
+    vmovl.u8 q0, d0
+    vmovl.u16 q2, d4  /* 16-bit -> 32-bit */
+    vmovl.u16 q1, d2
+    vmovl.u16 q0, d0
+    vcvt.f32.s32 q2, q2  /* integer -> float */
+    vcvt.f32.s32 q1, q1
+    vcvt.f32.s32 q0, q0
+    veor q3, q3  /* fourth channel = 0 */
+SNIP_END(_N_ColorMatrix_load_u8f_3)
+
+SNIP_START(_N_ColorMatrix_load_u8f_2)  /* load 4 elements of 2 bytes and convert to floats in q0/q1 */
+    vld2.8 {d0[0],d1[0]}, [r1]!
+    vld2.8 {d0[1],d1[1]}, [r1]!
+    vld2.8 {d0[2],d1[2]}, [r1]!
+    vld2.8 {d0[3],d1[3]}, [r1]!
+    vmovl.u8 q1, d1  /* widen d1 before q0 is overwritten (q0 = d0:d1) */
+    vmovl.u8 q0, d0
+    vmovl.u16 q1, d2  /* 16-bit -> 32-bit */
+    vmovl.u16 q0, d0
+    vcvt.f32.s32 q1, q1  /* integer -> float */
+    vcvt.f32.s32 q0, q0
+    veor q2, q2  /* unused channels = 0 */
+    veor q3, q3
+SNIP_END(_N_ColorMatrix_load_u8f_2)
+
+SNIP_START(_N_ColorMatrix_load_u8f_1)  /* load 4 single-byte elements and convert to floats in q0 */
+    vld1.32 {d0[0]}, [r1]!  /* one 32-bit load fills byte lanes 0-3 of d0 */
+    vmovl.u8 q0, d0  /* 8-bit -> 16-bit */
+    vmovl.u16 q0, d0  /* 16-bit -> 32-bit (lanes 0-3) */
+    vcvt.f32.s32 q0, q0  /* integer -> float */
+    veor q1, q1  /* unused channels = 0 */
+    veor q2, q2
+    veor q3, q3
+SNIP_END(_N_ColorMatrix_load_u8f_1)
+
+SNIP_START(_N_ColorMatrix_load_f32_4)  /* load 4 elements of 4 floats; channel c of element k -> lane k of q<c> */
+    vld4.32 {d0[0],d2[0],d4[0],d6[0]}, [r1]!  /* element 0 */
+    vld4.32 {d0[1],d2[1],d4[1],d6[1]}, [r1]!  /* element 1 */
+    vld4.32 {d1[0],d3[0],d5[0],d7[0]}, [r1]!  /* element 2 */
+    vld4.32 {d1[1],d3[1],d5[1],d7[1]}, [r1]!  /* element 3 */
+SNIP_END(_N_ColorMatrix_load_f32_4)
+
+SNIP_START(_N_ColorMatrix_load_f32_3)  /* load 4 elements of 3 floats, each followed by 4 skipped bytes */
+    vld3.32 {d0[0],d2[0],d4[0]}, [r1]!  /* element 0 */
+    add r1, r1, #4  /* skip the fourth word of the 16-byte element */
+    vld3.32 {d0[1],d2[1],d4[1]}, [r1]!  /* element 1 */
+    add r1, r1, #4
+    vld3.32 {d1[0],d3[0],d5[0]}, [r1]!  /* element 2 */
+    add r1, r1, #4
+    vld3.32 {d1[1],d3[1],d5[1]}, [r1]!  /* element 3 */
+    add r1, r1, #4
+    veor q3, q3  /* fourth channel = 0 */
+SNIP_END(_N_ColorMatrix_load_f32_3)
+
+SNIP_START(_N_ColorMatrix_load_f32_2)  /* load 4 elements of 2 floats into q0/q1 */
+    vld2.32 {d0[0],d2[0]}, [r1]!  /* element 0 */
+    vld2.32 {d0[1],d2[1]}, [r1]!  /* element 1 */
+    vld2.32 {d1[0],d3[0]}, [r1]!  /* element 2 */
+    vld2.32 {d1[1],d3[1]}, [r1]!  /* element 3 */
+    veor q2, q2  /* unused channels = 0 */
+    veor q3, q3
+SNIP_END(_N_ColorMatrix_load_f32_2)
+
+SNIP_START(_N_ColorMatrix_load_f32_1)  /* load 4 single-float elements into q0 */
+    vld1.32 {q0}, [r1]!
+    veor q1, q1  /* unused channels = 0 */
+    veor q2, q2
+    veor q3, q3
+SNIP_END(_N_ColorMatrix_load_f32_1)
+
+
+SNIP_START(_N_ColorMatrix_store_u8_4)  /* store 4 elements of 4 bytes to [r0], interleaving d0..d3 */
+#mov r0, #0
+    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!  /* element 0 from byte lane 0 */
+    vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+    vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+    vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_4)
+
+SNIP_START(_N_ColorMatrix_store_u8_2)  /* store 4 elements of 2 bytes to [r0], interleaving d0/d1 */
+    vst2.8 {d0[0],d1[0]}, [r0]!  /* element 0 from byte lane 0 */
+    vst2.8 {d0[1],d1[1]}, [r0]!
+    vst2.8 {d0[2],d1[2]}, [r0]!
+    vst2.8 {d0[3],d1[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_2)
+
+SNIP_START(_N_ColorMatrix_store_u8_1)  /* store 4 single-byte elements to [r0] */
+    vst1.32 {d0[0]}, [r0]!  /* one 32-bit store writes byte lanes 0-3 of d0 */
+SNIP_END(_N_ColorMatrix_store_u8_1)
+
+
+SNIP_START(_N_ColorMatrix_store_f32u_4)  /* convert floats in q0..q3 to unsigned bytes and store 4 elements of 4 bytes */
+    vcvt.s32.f32 q0, q0  /* float -> signed 32-bit integer */
+    vcvt.s32.f32 q1, q1
+    vcvt.s32.f32 q2, q2
+    vcvt.s32.f32 q3, q3
+    vqmovn.s32 d0, q0  /* saturating narrow to signed 16-bit, into the low half of each q register */
+    vqmovn.s32 d2, q1
+    vqmovn.s32 d4, q2
+    vqmovn.s32 d6, q3
+    vqmovun.s16 d0, q0  /* saturating narrow to unsigned 8-bit; channels packed into d0..d3 */
+    vqmovun.s16 d1, q1
+    vqmovun.s16 d2, q2
+    vqmovun.s16 d3, q3
+    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!  /* element 0 from byte lane 0 */
+    vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+    vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+    vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+    #mov r0, #0
+    #ldr r0, [r0]
+
+SNIP_END(_N_ColorMatrix_store_f32u_4)
+
+SNIP_START(_N_ColorMatrix_store_f32u_2)  /* convert floats in q0/q1 to unsigned bytes and store 4 elements of 2 bytes */
+    vcvt.s32.f32 q0, q0  /* float -> signed 32-bit integer */
+    vcvt.s32.f32 q1, q1
+    vqmovn.s32 d0, q0  /* saturating narrow to signed 16-bit */
+    vqmovn.s32 d2, q1
+    vqmovun.s16 d0, q0  /* saturating narrow to unsigned 8-bit; channels packed into d0/d1 */
+    vqmovun.s16 d1, q1
+    vst2.8 {d0[0],d1[0]}, [r0]!  /* element 0 from byte lane 0 */
+    vst2.8 {d0[1],d1[1]}, [r0]!
+    vst2.8 {d0[2],d1[2]}, [r0]!
+    vst2.8 {d0[3],d1[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32u_2)
+
+SNIP_START(_N_ColorMatrix_store_f32u_1)  /* convert floats in q0 to unsigned bytes and store 4 single-byte elements */
+    vcvt.s32.f32 q0, q0  /* float -> signed 32-bit integer */
+    vqmovn.s32 d0, q0  /* saturating narrow to signed 16-bit */
+    vqmovun.s16 d0, q0  /* saturating narrow to unsigned 8-bit */
+    vst1.32 {d0[0]}, [r0]!  /* write byte lanes 0-3 */
+SNIP_END(_N_ColorMatrix_store_f32u_1)
+
+SNIP_START(_N_ColorMatrix_store_f32_4)  /* store 4 elements of 4 floats to [r0], interleaving q0..q3 */
+    vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!  /* element 0 */
+    vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!  /* element 1 */
+    vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!  /* element 2 */
+    vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!  /* element 3 */
+SNIP_END(_N_ColorMatrix_store_f32_4)
+
+SNIP_START(_N_ColorMatrix_store_f32_3)  /* same instruction sequence as store_f32_4: 16 bytes written per element, including q3's lane */
+    vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]!  /* element 0 */
+    vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]!  /* element 1 */
+    vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]!  /* element 2 */
+    vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]!  /* element 3 */
+SNIP_END(_N_ColorMatrix_store_f32_3)
+
+SNIP_START(_N_ColorMatrix_store_f32_2)  /* store 4 elements of 2 floats to [r0], interleaving q0/q1 */
+    vst2.32 {d0[0],d2[0]}, [r0]!  /* element 0 */
+    vst2.32 {d0[1],d2[1]}, [r0]!  /* element 1 */
+    vst2.32 {d1[0],d3[0]}, [r0]!  /* element 2 */
+    vst2.32 {d1[1],d3[1]}, [r0]!  /* element 3 */
+SNIP_END(_N_ColorMatrix_store_f32_2)
+
+SNIP_START(_N_ColorMatrix_store_f32_1)  /* store 4 single-float elements to [r0] */
+    vst1.32 {q0}, [r0]!
+SNIP_END(_N_ColorMatrix_store_f32_1)
+
+
+SNIP_START(_N_ColorMatrix_unpack_u8_4)  /* zero-extend the four byte channels d0..d3 to 16-bit lanes in q12..q15 */
+    vmovl.u8 q12, d0  /* R */
+    vmovl.u8 q13, d1  /* G */
+    vmovl.u8 q14, d2  /* B */
+    vmovl.u8 q15, d3  /* A */
+SNIP_END(_N_ColorMatrix_unpack_u8_4)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_3)  /* zero-extend three byte channels to 16-bit lanes; fourth output is zero */
+    vmovl.u8 q12, d0  /* R */
+    vmovl.u8 q13, d1  /* G */
+    vmovl.u8 q14, d2  /* B */
+    veor q15, q15  /* q15 = 0 */
+SNIP_END(_N_ColorMatrix_unpack_u8_3)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_2)  /* zero-extend two byte channels to 16-bit lanes; the other two outputs are zero */
+    vmovl.u8 q12, d0  /* R */
+    vmovl.u8 q13, d1  /* G */
+    veor q14, q14  /* q14 = 0 */
+    veor q15, q15  /* q15 = 0 */
+SNIP_END(_N_ColorMatrix_unpack_u8_2)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_1)  /* zero-extend one byte channel to 16-bit lanes; the other three outputs are zero */
+    vmovl.u8 q12, d0  /* R */
+    veor q13, q13  /* q13 = 0 */
+    veor q14, q14  /* q14 = 0 */
+    veor q15, q15  /* q15 = 0 */
+SNIP_END(_N_ColorMatrix_unpack_u8_1)
+
+SNIP_START(_N_ColorMatrix_pack_u8_4)  /* reduce 32-bit results in q8..q11 to unsigned bytes in d0..d3 */
+    vqrshrn.s32 d24, q8, #8  /* rounding shift right by 8, saturating narrow to signed 16-bit */
+    vqrshrn.s32 d26, q9, #8
+    vqrshrn.s32 d28, q10, #8
+    vqrshrn.s32 d30, q11, #8
+    vqmovun.s16 d0, q12  /* saturating narrow to unsigned 8-bit */
+    vqmovun.s16 d1, q13
+    vqmovun.s16 d2, q14
+    vqmovun.s16 d3, q15
+SNIP_END(_N_ColorMatrix_pack_u8_4)
+
+SNIP_START(_N_ColorMatrix_pack_u8_3)  /* reduce 32-bit results in q8..q10 to unsigned bytes in d0..d2 */
+    vqrshrn.s32 d24, q8, #8  /* rounding shift right by 8, saturating narrow to signed 16-bit */
+    vqrshrn.s32 d26, q9, #8
+    vqrshrn.s32 d28, q10, #8
+    vqmovun.s16 d0, q12  /* saturating narrow to unsigned 8-bit */
+    vqmovun.s16 d1, q13
+    vqmovun.s16 d2, q14
+SNIP_END(_N_ColorMatrix_pack_u8_3)
+
+SNIP_START(_N_ColorMatrix_pack_u8_2)  /* reduce 32-bit results in q8/q9 to unsigned bytes in d0/d1 */
+    vqrshrn.s32 d24, q8, #8  /* rounding shift right by 8, saturating narrow to signed 16-bit */
+    vqrshrn.s32 d26, q9, #8
+    vqmovun.s16 d0, q12  /* saturating narrow to unsigned 8-bit */
+    vqmovun.s16 d1, q13
+SNIP_END(_N_ColorMatrix_pack_u8_2)
+
+SNIP_START(_N_ColorMatrix_pack_u8_1)  /* reduce 32-bit results in q8 to unsigned bytes in d0 */
+    vqrshrn.s32 d24, q8, #8  /* rounding shift right by 8, saturating narrow to signed 16-bit */
+    vqmovun.s16 d0, q12  /* saturating narrow to unsigned 8-bit */
+SNIP_END(_N_ColorMatrix_pack_u8_1)
+
+SNIP_START(_N_ColorMatrix_dot)  /* replicate the first channel into the second and third */
+    vmov.u8 d1, d0  /* d1 = d0 */
+    vmov.u8 d2, d0  /* d2 = d0 */
+SNIP_END(_N_ColorMatrix_dot)
+
diff --git a/toolkit/Convolve3x3.cpp b/toolkit/Convolve3x3.cpp
new file mode 100644
index 0000000..51339a2
--- /dev/null
+++ b/toolkit/Convolve3x3.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.Convolve3x3"
+
+namespace android {
+namespace renderscript {
+
+extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2,
+                                          const int16_t* coef, uint32_t count);
+
+// Task that applies a 3x3 convolution to a 2D buffer of uchar vectors. The work is split into
+// tiles that are handed to processData().
+class Convolve3x3Task : public Task {
+    const void* mIn;
+    void* mOut;
+    // Only 9 coefficients are used, but the arrays hold 16 so that the SIMD instructions can load
+    // them in chunks of 8. The padding is zeroed so those loads never read indeterminate values.
+    float mFp[16] = {};
+    int16_t mIp[16] = {};
+
+    void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
+                  const uchar* py2);
+    void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+                    size_t startX, size_t startY, size_t endX, size_t endY);
+
+    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    // coefficients must point to 9 floats, in row major order.
+    Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
+                    const float* coefficients, const Restriction* restriction)
+        : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
+        for (int ct = 0; ct < 9; ct++) {
+            mFp[ct] = coefficients[ct];
+            // Fixed-point copy for the SIMD kernel: scale by 256 and round half away from zero.
+            mIp[ct] = (int16_t)(mFp[ct] * 256.f + (mFp[ct] >= 0 ? 0.5f : -0.5f));
+        }
+    }
+};
+
+/**
+ * Computes one convolution and stores the result in the output. This is used for uchar, uchar2,
+ * uchar3, and uchar4 vectors.
+ *
+ * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4.
+ * @tparam ComputationType Type we use for the intermediate computations.
+ * @param x The index in the row of the value we'll convolve.
+ * @param out The location in the output array where we store the value.
+ * @param py0 The start of the top row.
+ * @param py1 The start of the middle row.
+ * @param py2 The start of the bottom row.
+ * @param coeff Pointer to the float coefficients, in row major format.
+ * @param sizeX The number of cells of one row.
+ */
+template <typename InputOutputType, typename ComputationType>
+static void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
+                         int32_t sizeX) {
+    uint32_t x1 = std::max((int32_t)x - 1, 0);  // Left neighbor, clamped to column 0.
+    uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);  // Right neighbor, clamped to sizeX - 1.
+    // Weighted sum of the 3x3 neighborhood: coeff[0..2] weigh py0, [3..5] py1 and [6..8] py2.
+    ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] +
+                         convert<ComputationType>(py0[x]) * coeff[1] +
+                         convert<ComputationType>(py0[x2]) * coeff[2] +
+                         convert<ComputationType>(py1[x1]) * coeff[3] +
+                         convert<ComputationType>(py1[x]) * coeff[4] +
+                         convert<ComputationType>(py1[x2]) * coeff[5] +
+                         convert<ComputationType>(py2[x1]) * coeff[6] +
+                         convert<ComputationType>(py2[x]) * coeff[7] +
+                         convert<ComputationType>(py2[x2]) * coeff[8];
+    // Add 0.5, then clamp to [0, 255]; presumably convert() truncates, so this rounds - confirm.
+    px = clamp(px + 0.5f, 0.f, 255.f);
+    *out = convert<InputOutputType>(px);
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+/**
+ * Computes one convolution and stores the result in the output. This is used for float, float2,
+ * float3, and float4 vectors.
+ *
+ * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. float4.
+ * @param x The index in the row of the value we'll convolve.
+ * @param out The location in the output array where we store the value.
+ * @param py0 The start of the top row.
+ * @param py1 The start of the middle row.
+ * @param py2 The start of the bottom row.
+ * @param coeff Pointer to the float coefficients, in row major format.
+ * @param sizeX The number of cells of one row.
+ */
+template <typename InputOutputType>
+static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
+                         int32_t sizeX) {
+    uint32_t x1 = std::max((int32_t)x - 1, 0);  // Left neighbor, clamped to column 0.
+    uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);  // Right neighbor, clamped to sizeX - 1.
+    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
+           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
+           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);  // Stored unclamped.
+}
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+/**
+ * This function convolves one line.
+ *
+ * @param pout Where to place the next output.
+ * @param xstart Index in the X direction of where to start.
+ * @param xend End index
+ * @param ppy0 Points to the start of the previous line.
+ * @param ppy1 Points to the start of the current line.
+ * @param ppy2 Points to the start of the next line.
+ */
+void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
+                               const uchar* ppy1, const uchar* ppy2) {
+    uchar4* out = (uchar4*)pout;
+    const uchar4* py0 = (const uchar4*)ppy0;
+    const uchar4* py1 = (const uchar4*)ppy1;
+    const uchar4* py2 = (const uchar4*)ppy2;
+    // Column 0 always takes the scalar path: the SIMD call below reads from column x1 - 1.
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if (x1 == 0) {
+        convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX);
+        x1++;
+        out++;
+    }
+
+    if (x2 > x1) {
+#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
+        if (mUsesSimd) {
+            int32_t len = (x2 - x1 - 1) >> 1;  // Pairs of cells; the last cell stays scalar.
+            if (len > 0) {
+                rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len);
+                x1 += len << 1;
+                out += len << 1;
+            }
+        }
+#endif
+        // Scalar path for the remaining cells, which includes the right edge of the range.
+        while (x1 != x2) {
+            convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX);
+            out++;
+            x1++;
+        }
+    }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+// Convolves cells [xstart, xend) of row currentY of a float image, writing consecutive results
+// to out. Rows above or below the image are replaced by the nearest valid row.
+template <typename T>
+void RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend,
+                                              uint32_t currentY, size_t sizeX, size_t sizeY,
+                                              size_t vectorSize, float* fp) {
+    const size_t rowBytes = sizeX * vectorSize * 4;  // float takes 4 bytes
+    const uchar* base = (const uchar*)in;
+    const uint32_t rowAbove = std::max((int32_t)currentY - 1, 0);
+    const uint32_t rowBelow = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
+    const T* top = (const T*)(base + rowBytes * rowAbove);
+    const T* mid = (const T*)(base + rowBytes * currentY);
+    const T* bot = (const T*)(base + rowBytes * rowBelow);
+    for (uint32_t x = xstart; x < xend; x++) {
+        ConvolveOneF<T>(x, out++, top, mid, bot, fp, sizeX);
+    }
+}
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+// Convolves the tile [startX, endX) x [startY, endY) of a buffer of tightly packed cells, one
+// cell at a time. Rows above or below the image are replaced by the nearest valid row.
+template <typename InputOutputType, typename ComputationType>
+static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+                      size_t startX, size_t startY, size_t endX, size_t endY, float* fp) {
+    const size_t rowBytes = vectorSize * sizeX;
+    for (size_t row = startY; row < endY; row++) {
+        const uint32_t rowAbove = std::max((int32_t)row - 1, 0);
+        const uint32_t rowBelow = std::min((int32_t)row + 1, (int32_t)(sizeY - 1));
+        const InputOutputType* top = (const InputOutputType*)(pin + rowBytes * rowAbove);
+        const InputOutputType* mid = (const InputOutputType*)(pin + rowBytes * row);
+        const InputOutputType* bot = (const InputOutputType*)(pin + rowBytes * rowBelow);
+        InputOutputType* dst = (InputOutputType*)(pout + (row * sizeX + startX) * vectorSize);
+        for (uint32_t col = startX; col < endX; col++, dst++) {
+            convolveOneU<InputOutputType, ComputationType>(col, dst, top, mid, bot, fp, sizeX);
+        }
+    }
+}
+
+// Convolves the tile [startX, endX) x [startY, endY) one row at a time with kernelU4(). Each cell
+// occupies paddedSize(vectorSize) bytes; out-of-image rows are replaced by the nearest valid row.
+void Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
+                                 size_t sizeY, size_t startX, size_t startY, size_t endX,
+                                 size_t endY) {
+    const size_t cellBytes = paddedSize(vectorSize);
+    const size_t rowBytes = cellBytes * sizeX;
+    for (size_t row = startY; row < endY; row++) {
+        const uint32_t rowAbove = std::max((int32_t)row - 1, 0);
+        const uint32_t rowBelow = std::min((int32_t)row + 1, (int32_t)(sizeY - 1));
+        // kernelU4 receives the start of each source row and indexes by column itself.
+        uchar* dst = pout + (row * sizeX + startX) * cellBytes;
+        kernelU4(dst, startX, endX, pin + rowBytes * rowAbove, pin + rowBytes * row,
+                 pin + rowBytes * rowBelow);
+    }
+}
+
+// Convolves one tile, choosing the implementation from the number of channels per cell.
+void Convolve3x3Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                                  size_t endY) {
+    // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
+    // endX, endY);
+    const uchar* input = (const uchar*)mIn;
+    uchar* output = (uchar*)mOut;
+    if (mVectorSize == 1) {
+        convolveU<uchar, float>(input, output, mVectorSize, mSizeX, mSizeY, startX, startY, endX,
+                                endY, mFp);
+    } else if (mVectorSize == 2) {
+        convolveU<uchar2, float2>(input, output, mVectorSize, mSizeX, mSizeY, startX, startY, endX,
+                                  endY, mFp);
+    } else if (mVectorSize == 3 || mVectorSize == 4) {
+        // Three and four channels share one routine; convolveU4 derives the cell size from
+        // paddedSize(vectorSize).
+        convolveU4(input, output, mVectorSize, mSizeX, mSizeY, startX, startY, endX, endY);
+    }
+    // Any other vector size is ignored, as before.
+}
+
+// Public entry point: builds a Convolve3x3Task for the arguments and runs it on the processor.
+void RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX,
+                                      size_t sizeY, const float* coefficients,
+                                      const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    // Validation is compiled in only when requested; on failure the call returns without working.
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) return;
+    const bool vectorSizeInRange = (vectorSize >= 1) && (vectorSize <= 4);
+    if (!vectorSizeInRange) {
+        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+        return;
+    }
+#endif
+    Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
+    processor->doTask(&task);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Convolve5x5.cpp b/toolkit/Convolve5x5.cpp
new file mode 100644
index 0000000..1f3f75c
--- /dev/null
+++ b/toolkit/Convolve5x5.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Convolve5x5"
+
+extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2,
+                                          const void* y3, const void* y4, const int16_t* coef,
+                                          uint32_t count);
+
+// Task that applies a 5x5 convolution to a 2D buffer of uchar vectors. The work is split into
+// tiles that are handed to processData().
+class Convolve5x5Task : public Task {
+    const void* mIn;
+    void* mOut;
+    // Only 25 coefficients are used; the arrays hold 28 so that the SIMD instructions can load
+    // them in three chunks of 8 and one chunk of 4. The 3 padding entries are zeroed.
+    float mFp[28] = {};
+    int16_t mIp[28] = {};
+
+    void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
+                  const uchar* py2, const uchar* py3, const uchar* py4);
+    void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+                    size_t startX, size_t startY, size_t endX, size_t endY);
+
+    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    // coefficients must point to 25 floats, in row major order.
+    Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
+                    const float* coefficients, const Restriction* restriction)
+        : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
+        for (int ct = 0; ct < 25; ct++) {
+            mFp[ct] = coefficients[ct];
+            // Fixed-point copy for the SIMD kernel: scale by 256 and round half away from zero.
+            mIp[ct] = (int16_t)(mFp[ct] * 256.f + (mFp[ct] >= 0 ? 0.5f : -0.5f));
+        }
+    }
+};
+
+template <typename InputOutputType, typename ComputationType>
+static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+                         const InputOutputType* py1, const InputOutputType* py2,
+                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
+                         int32_t width) {
+    uint32_t x0 = std::max((int32_t)x - 2, 0);  // Two to the left, clamped to column 0.
+    uint32_t x1 = std::max((int32_t)x - 1, 0);  // One to the left, clamped to column 0.
+    uint32_t x2 = x;
+    uint32_t x3 = std::min((int32_t)x + 1, width - 1);  // One to the right, clamped to width - 1.
+    uint32_t x4 = std::min((int32_t)x + 2, width - 1);  // Two to the right, clamped to width - 1.
+    // Weighted sum of the 5x5 neighborhood; coeff is row major, five entries per source row.
+    ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] +
+                         convert<ComputationType>(py0[x1]) * coeff[1] +
+                         convert<ComputationType>(py0[x2]) * coeff[2] +
+                         convert<ComputationType>(py0[x3]) * coeff[3] +
+                         convert<ComputationType>(py0[x4]) * coeff[4] +
+
+                         convert<ComputationType>(py1[x0]) * coeff[5] +
+                         convert<ComputationType>(py1[x1]) * coeff[6] +
+                         convert<ComputationType>(py1[x2]) * coeff[7] +
+                         convert<ComputationType>(py1[x3]) * coeff[8] +
+                         convert<ComputationType>(py1[x4]) * coeff[9] +
+
+                         convert<ComputationType>(py2[x0]) * coeff[10] +
+                         convert<ComputationType>(py2[x1]) * coeff[11] +
+                         convert<ComputationType>(py2[x2]) * coeff[12] +
+                         convert<ComputationType>(py2[x3]) * coeff[13] +
+                         convert<ComputationType>(py2[x4]) * coeff[14] +
+
+                         convert<ComputationType>(py3[x0]) * coeff[15] +
+                         convert<ComputationType>(py3[x1]) * coeff[16] +
+                         convert<ComputationType>(py3[x2]) * coeff[17] +
+                         convert<ComputationType>(py3[x3]) * coeff[18] +
+                         convert<ComputationType>(py3[x4]) * coeff[19] +
+
+                         convert<ComputationType>(py4[x0]) * coeff[20] +
+                         convert<ComputationType>(py4[x1]) * coeff[21] +
+                         convert<ComputationType>(py4[x2]) * coeff[22] +
+                         convert<ComputationType>(py4[x3]) * coeff[23] +
+                         convert<ComputationType>(py4[x4]) * coeff[24];
+    px = clamp(px + 0.5f, 0.f, 255.f);  // Add 0.5, then clamp to [0, 255] before converting.
+    *out = convert<InputOutputType>(px);
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+template <typename InputOutputType>
+static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
+                         const InputOutputType* py1, const InputOutputType* py2,
+                         const InputOutputType* py3, const InputOutputType* py4, const float* coeff,
+                         int32_t width) {
+    uint32_t x0 = std::max((int32_t)x - 2, 0);  // Two to the left, clamped to column 0.
+    uint32_t x1 = std::max((int32_t)x - 1, 0);  // One to the left, clamped to column 0.
+    uint32_t x2 = x;
+    uint32_t x3 = std::min((int32_t)x + 1, width - 1);  // One to the right, clamped to width - 1.
+    uint32_t x4 = std::min((int32_t)x + 2, width - 1);  // Two to the right, clamped to width - 1.
+    // Weighted sum of the 5x5 neighborhood; coeff is row major. The result is stored unclamped.
+    InputOutputType px = py0[x0] * coeff[0] + py0[x1] * coeff[1] + py0[x2] * coeff[2] +
+                         py0[x3] * coeff[3] + py0[x4] * coeff[4] +
+
+                         py1[x0] * coeff[5] + py1[x1] * coeff[6] + py1[x2] * coeff[7] +
+                         py1[x3] * coeff[8] + py1[x4] * coeff[9] +
+
+                         py2[x0] * coeff[10] + py2[x1] * coeff[11] + py2[x2] * coeff[12] +
+                         py2[x3] * coeff[13] + py2[x4] * coeff[14] +
+
+                         py3[x0] * coeff[15] + py3[x1] * coeff[16] + py3[x2] * coeff[17] +
+                         py3[x3] * coeff[18] + py3[x4] * coeff[19] +
+
+                         py4[x0] * coeff[20] + py4[x1] * coeff[21] + py4[x2] * coeff[22] +
+                         py4[x3] * coeff[23] + py4[x4] * coeff[24];
+    *out = px;
+}
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+/**
+ * This function convolves one line.
+ *
+ * @param pout Where to place the next output.
+ * @param x1 Index in the X direction of where to start.
+ * @param x2 End index (exclusive).
+ * @param ppy0 Points to the start of the line two above.
+ * @param ppy1 Points to the start of the line one above.
+ * @param ppy2 Points to the start of the current line.
+ * @param ppy3 Points to the start of the line one below.
+ * @param ppy4 Points to the start of the line two below.
+ */
+void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0,
+                               const uchar* ppy1, const uchar* ppy2, const uchar* ppy3,
+                               const uchar* ppy4) {
+    uchar4* out = (uchar4*)pout;
+    const uchar4* py0 = (const uchar4*)ppy0;
+    const uchar4* py1 = (const uchar4*)ppy1;
+    const uchar4* py2 = (const uchar4*)ppy2;
+    const uchar4* py3 = (const uchar4*)ppy3;
+    const uchar4* py4 = (const uchar4*)ppy4;
+    // Columns 0 and 1 always take the scalar path: the SIMD calls below read from column x1 - 2.
+    while ((x1 < x2) && (x1 < 2)) {
+        ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
+        out++;
+        x1++;
+    }
+#if defined(ARCH_X86_HAVE_SSSE3)
+    // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
+    // 3 for end boundary where x may hit the end boundary)
+    if (mUsesSimd && ((x1 + 6) < x2)) {
+        // subtract 3 for end boundary
+        uint32_t len = (x2 - x1 - 3) >> 2;
+        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
+                                  py4 + x1 - 2, mIp, len);
+        out += len << 2;
+        x1 += len << 2;
+    }
+#endif
+    // ARM SIMD: 2 cells per count. Require 5 cells so len >= 1; a count of 0 must not be passed.
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd && ((x1 + 4) < x2)) {
+        uint32_t len = (x2 - x1 - 3) >> 1;
+        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2,
+                                  py4 + x1 - 2, mIp, len);
+        out += len << 1;
+        x1 += len << 1;
+    }
+#endif
+    // Scalar path for the remaining cells, which includes the right edge of the range.
+    while (x1 < x2) {
+        ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX);
+        out++;
+        x1++;
+    }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+// This will need more cleanup before it can be used.
+// Convolves cells [xstart, xend) of row currentY of a float4 image, writing results to out.
+void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out,
+                               uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar* pin = (const uchar*)info->in;
+    const size_t stride = info->stride;
+    // Rows y-2..y+2 clamped to [0, sizeY - 1]; clamping to sizeY would read one row past the end.
+    uint32_t y0 = std::max((int32_t)currentY - 2, 0);
+    uint32_t y1 = std::max((int32_t)currentY - 1, 0);
+    uint32_t y2 = currentY;
+    uint32_t y3 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
+    uint32_t y4 = std::min((int32_t)currentY + 2, (int32_t)(sizeY - 1));
+    // NOTE(review): sizeY and sizeX are not declared in this scope; that is part of the cleanup.
+    const float4* py0 = (const float4*)(pin + stride * y0);
+    const float4* py1 = (const float4*)(pin + stride * y1);
+    const float4* py2 = (const float4*)(pin + stride * y2);
+    const float4* py3 = (const float4*)(pin + stride * y3);
+    const float4* py4 = (const float4*)(pin + stride * y4);
+
+    for (uint32_t x = xstart; x < xend; x++, out++) {
+        ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
+    }
+}
+
+// Convolves cells [xstart, xend) of row currentY of a float2 image, writing results to out.
+void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out,
+                                               uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar* pin = (const uchar*)info->in;
+    const size_t stride = info->stride;
+    // Rows y-2..y+2 clamped to [0, sizeY - 1]; clamping to sizeY would read one row past the end.
+    uint32_t y0 = std::max((int32_t)currentY - 2, 0);
+    uint32_t y1 = std::max((int32_t)currentY - 1, 0);
+    uint32_t y2 = currentY;
+    uint32_t y3 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
+    uint32_t y4 = std::min((int32_t)currentY + 2, (int32_t)(sizeY - 1));
+    // NOTE(review): sizeY, sizeX and mFp are not declared in this function; it cannot build yet.
+    const float2* py0 = (const float2*)(pin + stride * y0);
+    const float2* py1 = (const float2*)(pin + stride * y1);
+    const float2* py2 = (const float2*)(pin + stride * y2);
+    const float2* py3 = (const float2*)(pin + stride * y3);
+    const float2* py4 = (const float2*)(pin + stride * y4);
+    for (uint32_t x = xstart; x < xend; x++, out++) {
+        ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
+    }
+}
+
+// Convolves cells [xstart, xend) of row currentY of a float image, writing results to out.
+void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out,
+                                               uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar* pin = (const uchar*)info->in;
+    const size_t stride = info->stride;
+    // Rows y-2..y+2 clamped to [0, sizeY - 1]; clamping to sizeY would read one row past the end.
+    uint32_t y0 = std::max((int32_t)currentY - 2, 0);
+    uint32_t y1 = std::max((int32_t)currentY - 1, 0);
+    uint32_t y2 = currentY;
+    uint32_t y3 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
+    uint32_t y4 = std::min((int32_t)currentY + 2, (int32_t)(sizeY - 1));
+    // NOTE(review): sizeY, sizeX and mFp are not declared in this function; it cannot build yet.
+    const float* py0 = (const float*)(pin + stride * y0);
+    const float* py1 = (const float*)(pin + stride * y1);
+    const float* py2 = (const float*)(pin + stride * y2);
+    const float* py3 = (const float*)(pin + stride * y3);
+    const float* py4 = (const float*)(pin + stride * y4);
+    for (uint32_t x = xstart; x < xend; x++, out++) {
+        ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX);
+    }
+}
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+// Convolves the tile [startX, endX) x [startY, endY) of a buffer of tightly packed cells, one
+// cell at a time. Rows above or below the image are replaced by the nearest valid row.
+template <typename InputOutputType, typename ComputationType>
+static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
+                      size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) {
+    const size_t rowBytes = vectorSize * sizeX;
+    const int32_t lastRow = (int32_t)(sizeY - 1);
+    for (size_t row = startY; row < endY; row++) {
+        const uint32_t twoUp = std::max((int32_t)row - 2, 0);
+        const uint32_t oneUp = std::max((int32_t)row - 1, 0);
+        const uint32_t same = row;
+        const uint32_t oneDown = std::min((int32_t)row + 1, lastRow);
+        const uint32_t twoDown = std::min((int32_t)row + 2, lastRow);
+        const InputOutputType* r0 = (const InputOutputType*)(pin + rowBytes * twoUp);
+        const InputOutputType* r1 = (const InputOutputType*)(pin + rowBytes * oneUp);
+        const InputOutputType* r2 = (const InputOutputType*)(pin + rowBytes * same);
+        const InputOutputType* r3 = (const InputOutputType*)(pin + rowBytes * oneDown);
+        const InputOutputType* r4 = (const InputOutputType*)(pin + rowBytes * twoDown);
+        InputOutputType* dst = (InputOutputType*)(pout + (row * sizeX + startX) * vectorSize);
+        for (uint32_t col = startX; col < endX; col++, dst++) {
+            ConvolveOneU<InputOutputType, ComputationType>(col, dst, r0, r1, r2, r3, r4, mFp, sizeX);
+        }
+    }
+}
+
+// Convolves the tile [startX, endX) x [startY, endY) one row at a time with kernelU4(). Each cell
+// occupies paddedSize(vectorSize) bytes.
+void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
+                                 size_t sizeY, size_t startX, size_t startY, size_t endX,
+                                 size_t endY) {
+    const size_t cellBytes = paddedSize(vectorSize);
+    const size_t rowBytes = cellBytes * sizeX;
+    const int32_t lastRow = (int32_t)(sizeY - 1);
+    for (size_t row = startY; row < endY; row++) {
+        // Source rows from two above to two below, clamped so that edge rows are repeated.
+        const uint32_t twoUp = std::max((int32_t)row - 2, 0);
+        const uint32_t oneUp = std::max((int32_t)row - 1, 0);
+        const uint32_t same = row;
+        const uint32_t oneDown = std::min((int32_t)row + 1, lastRow);
+        const uint32_t twoDown = std::min((int32_t)row + 2, lastRow);
+        // kernelU4 receives the start of each source row and indexes by column itself.
+        uchar* dst = pout + (row * sizeX + startX) * cellBytes;
+        kernelU4(dst, startX, endX, pin + rowBytes * twoUp, pin + rowBytes * oneUp,
+                 pin + rowBytes * same, pin + rowBytes * oneDown, pin + rowBytes * twoDown);
+    }
+}
+
+// Convolves one tile, choosing the implementation from the number of channels per cell.
+void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                                  size_t endY) {
+    // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
+    // endX, endY);
+    const uchar* input = (const uchar*)mIn;
+    uchar* output = (uchar*)mOut;
+    if (mVectorSize == 1) {
+        convolveU<uchar, float>(input, output, mVectorSize, mSizeX, mSizeY, startX, startY, endX,
+                                endY, mFp);
+    } else if (mVectorSize == 2) {
+        convolveU<uchar2, float2>(input, output, mVectorSize, mSizeX, mSizeY, startX, startY, endX,
+                                  endY, mFp);
+    } else if (mVectorSize == 3 || mVectorSize == 4) {
+        // Three and four channels share one routine; convolveU4 derives the cell size from
+        // paddedSize(vectorSize).
+        convolveU4(input, output, mVectorSize, mSizeX, mSizeY, startX, startY, endX, endY);
+    }
+    // Any other vector size is ignored, as before.
+}
+
+// Public entry point: builds a Convolve5x5Task for the arguments and runs it on the processor.
+void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX,
+                                      size_t sizeY, const float* coefficients,
+                                      const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    // Validation is compiled in only when requested; on failure the call returns without working.
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) return;
+    const bool vectorSizeInRange = (vectorSize >= 1) && (vectorSize <= 4);
+    if (!vectorSizeInRange) {
+        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+        return;
+    }
+#endif
+    Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
+    processor->doTask(&task);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Convolve_advsimd.S b/toolkit/Convolve_advsimd.S
new file mode 100644
index 0000000..0daa0c5
--- /dev/null
+++ b/toolkit/Convolve_advsimd.S
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2012,2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+        x0 = dst
+        x1 = y0 base pointer
+        x2 = y1 base pointer
+        x3 = y2 base pointer
+        x4 = coeffs
+        x5 = length / 2
+*/
+
+#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+        sub             x6, sp, #64  /* x6 = base of a new 64-byte save area */
+        sub             sp, sp, #64  /* Reserve that area on the stack */
+        st1             {v8.1d-v11.1d}, [x6], #32  /* Save the low 64 bits of v8-v11 */
+        st1             {v12.1d-v15.1d}, [x6]  /* Save the low 64 bits of v12-v15 */
+
+        /* Load 16 halfword coefficients into v0 and v1; only v0.h[0..7] and v1.h[0] are used */
+        ld1     {v0.8h, v1.8h}, [x4]
+
+        /* x4 is free once the coefficients are loaded; reuse it as the per-iteration stride of 8 */
+        mov x4, #8
+
+1:
+        /* Load 16 bytes from each row but advance each pointer by only x4 = 8 bytes */
+        ld1     {v13.16b}, [x1], x4
+        ld1     {v14.16b}, [x2], x4
+        ld1     {v15.16b}, [x3], x4
+
+        /* Signal memory for data that will be used in the loop after the next */
+//        prfm        PLDL1KEEP,[x1, x4] // TODO: test this
+//        prfm        PLDL1KEEP,[x2, x4] // TODO: test this
+//        prfm        PLDL1KEEP,[x3, x4] // TODO: test this
+
+        uxtl      v2.8h, v13.8b  /* Zero-extend bytes 0-7 of row y0 to 16 bit */
+        uxtl2     v3.8h, v13.16b  /* Zero-extend bytes 8-15 of row y0 */
+        uxtl      v4.8h, v14.8b  /* Row y1, bytes 0-7 */
+        uxtl2     v5.8h, v14.16b  /* Row y1, bytes 8-15 */
+        uxtl      v6.8h, v15.8b  /* Row y2, bytes 0-7 */
+        uxtl2     v7.8h, v15.16b  /* Row y2, bytes 8-15 */
+
+/*
+        Each row now holds four widened 4-byte cells; both outputs use the middle two:
+        v2lo, v2hi, v3lo, v3hi
+        v4lo, v4hi, v5lo, v5hi
+        v6lo, v6hi, v7lo, v7hi
+*/
+        /* v8 accumulates the first output cell (columns 0-2), v9 the second (columns 1-3) */
+        smull     v8.4s, v2.4h, v0.h[0]
+        smull2    v9.4s, v2.8h, v0.h[0]
+        smlal2    v8.4s, v2.8h, v0.h[1]
+        smlal     v9.4s, v3.4h, v0.h[1]
+        smlal     v8.4s, v3.4h, v0.h[2]
+        smlal2    v9.4s, v3.8h, v0.h[2]
+        smlal     v8.4s, v4.4h, v0.h[3]
+        smlal2    v9.4s, v4.8h, v0.h[3]
+        smlal2    v8.4s, v4.8h, v0.h[4]
+        smlal     v9.4s, v5.4h, v0.h[4]
+        smlal     v8.4s, v5.4h, v0.h[5]
+        smlal2    v9.4s, v5.8h, v0.h[5]
+        smlal     v8.4s, v6.4h, v0.h[6]
+        smlal2    v9.4s, v6.8h, v0.h[6]
+        smlal2    v8.4s, v6.8h, v0.h[7]
+        smlal     v9.4s, v7.4h, v0.h[7]
+        smlal     v8.4s, v7.4h, v1.h[0]
+        smlal2    v9.4s, v7.8h, v1.h[0]
+
+        shrn      v8.4h, v8.4s, #8  /* Shift right by 8 (no rounding) and narrow to 16 bit */
+        shrn2     v8.8h, v9.4s, #8  /* Same for the second cell, into the high half of v8 */
+
+        sqxtun      v8.8b, v8.8h  /* Saturate the signed 16-bit values to unsigned 8 bit */
+        st1         {v8.8b}, [x0], #8  /* Store 8 bytes (two cells) and advance dst */
+
+        /* Are we done yet? x5 is decremented before it is tested, so it must be >= 1 on entry */
+        subs x5, x5, #1
+        bne 1b
+
+        /* We're done, bye! Restore v8-v15 and release the 64-byte save area */
+        ld1             {v8.1d-v11.1d}, [sp], #32
+        ld1             {v12.1d-v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
+/*
+        x0 = dst
+        x1 = y0 base pointer
+        x2 = y1 base pointer
+        x3 = y2 base pointer
+        x4 = y3 base pointer
+        x5 = y4 base pointer
+        x6 = coeffs
+        x7 = length / 2
+*/
+ENTRY(rsdIntrinsicConvolve5x5_K)
+        sub         x8, sp, #64                     // x8 = base of the 64-byte save area
+        sub         sp, sp, #64
+        st1         {v8.1d-v11.1d}, [x8], #32       // save the low 64 bits of v8-v11
+        st1         {v12.1d-v15.1d}, [x8]           // ... and of v12-v15
+
+        /* Load 28 halfwords of coefficients into v0-v3; lanes v0.h[0] .. v3.h[0] are used */
+        ld1         {v0.8h-v2.8h}, [x6], #48
+        ld1         {v3.4h}, [x6], #8
+
+        movi      v15.4s, #0x7f                     // bias added to both sums before narrowing
+
+        /* x6 is no longer needed for the coefficients: keep the 8-byte source advance in it */
+        mov     x6, #8
+
+1:
+        /* Load 24 bytes from each of the first two rows; each pointer advances by x6=#8 only */
+        ld1     {v9.8b-v11.8b}, [x1], x6      //  y0 ( y - 2 )
+        ld1     {v12.8b-v14.8b}, [x2], x6      //  y1 ( y - 1 )
+
+        /* Prefetch for a later iteration (currently disabled) */
+//        prfm        PLDL1KEEP,[x1, x6] // TODO: test this
+//        prfm        PLDL1KEEP,[x2, x6] // TODO: test this
+
+        /* Promoting the 8bit channels to 16bit */
+        uxtl      v9.8h,  v9.8b
+        uxtl      v10.8h, v10.8b
+        uxtl      v11.8h, v11.8b
+        uxtl      v12.8h, v12.8b
+        uxtl      v13.8h, v13.8b
+        uxtl      v14.8h, v14.8b
+
+/*
+        v9-v11 hold the widened row y0 and v12-v14 the widened row y1, as six 4-lane groups
+        each. v4 sums the five groups starting at the first one, v5 the five starting one later.
+*/
+        smull     v4.4s, v9.4h, v0.h[0]             // v4: output lanes 0-3
+        smull2    v5.4s, v9.8h, v0.h[0]             // v5: output lanes 4-7
+        smlal2    v4.4s, v9.8h, v0.h[1]
+        smlal     v5.4s, v10.4h, v0.h[1]
+        smlal     v4.4s, v10.4h, v0.h[2]
+        smlal2    v5.4s, v10.8h, v0.h[2]
+        smlal2    v4.4s, v10.8h, v0.h[3]
+        smlal     v5.4s, v11.4h, v0.h[3]
+        smlal     v4.4s, v11.4h, v0.h[4]
+        smlal2    v5.4s, v11.8h, v0.h[4]
+
+        smlal     v4.4s, v12.4h, v0.h[5]
+        smlal2    v5.4s, v12.8h, v0.h[5]
+        smlal2    v4.4s, v12.8h, v0.h[6]
+        smlal     v5.4s, v13.4h, v0.h[6]
+        smlal     v4.4s, v13.4h, v0.h[7]
+        smlal2    v5.4s, v13.8h, v0.h[7]
+        smlal2    v4.4s, v13.8h, v1.h[0]
+        smlal     v5.4s, v14.4h, v1.h[0]
+        smlal     v4.4s, v14.4h, v1.h[1]
+        smlal2    v5.4s, v14.8h, v1.h[1]
+
+        /* Next 2 rows */
+        /* Load 24 bytes from each of the next two rows; each pointer advances by x6=#8 only */
+        ld1     {v9.8b-v11.8b}, [x3], x6      //  y2 ( y )
+        ld1     {v12.8b-v14.8b}, [x4], x6      //  y3 ( y + 1 )
+
+        /* Prefetch for a later iteration (currently disabled) */
+//        prfm        PLDL1KEEP,[x3, x6] // TODO: test this
+//        prfm        PLDL1KEEP,[x4, x6] // TODO: test this
+
+        /* Promoting the 8bit channels to 16bit */
+        uxtl      v9.8h,  v9.8b
+        uxtl      v10.8h, v10.8b
+        uxtl      v11.8h, v11.8b
+        uxtl      v12.8h, v12.8b
+        uxtl      v13.8h, v13.8b
+        uxtl      v14.8h, v14.8b
+
+/*
+        Same layout as above: v9-v11 now hold the widened row y2 and v12-v14 the widened
+        row y3. The coefficient lanes continue at v1.h[2].
+*/
+        smlal     v4.4s, v9.4h, v1.h[2]
+        smlal2    v5.4s, v9.8h, v1.h[2]
+        smlal2    v4.4s, v9.8h, v1.h[3]
+        smlal     v5.4s, v10.4h, v1.h[3]
+        smlal     v4.4s, v10.4h, v1.h[4]
+        smlal2    v5.4s, v10.8h, v1.h[4]
+        smlal2    v4.4s, v10.8h, v1.h[5]
+        smlal     v5.4s, v11.4h, v1.h[5]
+        smlal     v4.4s, v11.4h, v1.h[6]
+        smlal2    v5.4s, v11.8h, v1.h[6]
+
+        smlal     v4.4s, v12.4h, v1.h[7]
+        smlal2    v5.4s, v12.8h, v1.h[7]
+        smlal2    v4.4s, v12.8h, v2.h[0]
+        smlal     v5.4s, v13.4h, v2.h[0]
+        smlal     v4.4s, v13.4h, v2.h[1]
+        smlal2    v5.4s, v13.8h, v2.h[1]
+        smlal2    v4.4s, v13.8h, v2.h[2]
+        smlal     v5.4s, v14.4h, v2.h[2]
+        smlal     v4.4s, v14.4h, v2.h[3]
+        smlal2    v5.4s, v14.8h, v2.h[3]
+
+        /* Last row */
+        /* Load 24 bytes from the last row; the pointer advances by x6=#8 only */
+        ld1     {v9.8b- v11.8b}, [x5], x6      //  y4 ( y + 2 )
+
+        /* Prefetch for a later iteration (currently disabled) */
+//        prfm        PLDL1KEEP,[x5, x6] // TODO: test this
+
+        /* Promoting the 8bit channels to 16bit */
+        uxtl      v9.8h,  v9.8b
+        uxtl      v10.8h, v10.8b
+        uxtl      v11.8h, v11.8b
+
+/*
+        v9-v11 hold the widened row y4; v12-v14 are not reloaded for this row.
+        The coefficient lanes continue at v2.h[4] and end at v3.h[0].
+*/
+
+        smlal     v4.4s, v9.4h, v2.h[4]
+        smlal2    v5.4s, v9.8h, v2.h[4]
+        smlal2    v4.4s, v9.8h, v2.h[5]
+        smlal     v5.4s, v10.4h, v2.h[5]
+        smlal     v4.4s, v10.4h, v2.h[6]
+        smlal2    v5.4s, v10.8h, v2.h[6]
+        smlal2    v4.4s, v10.8h, v2.h[7]
+        smlal     v5.4s, v11.4h, v2.h[7]
+        smlal     v4.4s, v11.4h, v3.h[0]
+        smlal2    v5.4s, v11.8h, v3.h[0]
+
+        add      v4.4s, v4.4s, v15.4s               // add the 0x7f bias to every 32-bit sum
+        add      v5.4s, v5.4s, v15.4s
+
+/*      Rounding shift right by 8 and narrow 32 -> 16 bit; v5 fills the upper half of v4 */
+        rshrn      v4.4h, v4.4s, #8
+        rshrn2     v4.8h, v5.4s, #8
+
+
+/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+        sqxtun      v4.8b, v4.8h
+
+        st1     {v4.8b}, [x0], #8        // store the 8 output bytes and advance x0 past them
+
+        /* x7 counts 8-byte output groups; the body runs before this test, so x7 must be >= 1 */
+        subs x7, x7, #1
+        bne 1b
+
+        /* Restore the saved registers, release the save area and return */
+        ld1         {v8.1d-v11.1d}, [sp], #32
+        ld1         {v12.1d-v15.1d}, [sp], #32
+        ret
+
+END(rsdIntrinsicConvolve5x5_K)
diff --git a/toolkit/Convolve_neon.S b/toolkit/Convolve_neon.S
new file mode 100644
index 0000000..ee10884
--- /dev/null
+++ b/toolkit/Convolve_neon.S
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+        r0 = dst
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
+        [sp]     = coeffs
+        [sp, #4] = length / 2
+*/
+
+#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+        push            {r4-r8, r10, r11, lr}   @ 8 core registers = 32 bytes
+        vpush           {q4-q7}                 @ 4 quad registers = 64 bytes
+
+        /* The coeffs pointer was on top of the stack at entry, so it now sits above the
+           32+64 bytes pushed here. Load 16 halfwords from it into q0, q1 */
+        ldr r4, [sp, #32+64]
+        vld1.16 {q0, q1}, [r4]
+
+        /* Get the iteration count, the next word on the stack */
+        ldr r4, [sp, #36+64]
+
+        /* Load the frequently used immediate in a register */
+        mov r5, #8
+
+1:
+        /* Load 16 bytes from each row; each pointer advances by r5=#8 only */
+        vld1.8 {q13}, [r1], r5
+        vld1.8 {q14}, [r2], r5
+        vld1.8 {q15}, [r3], r5
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r1, r5]
+        pld         [r2, r5]
+        pld         [r3, r5]
+
+        vmovl.u8 q2, d26        @ widen the unsigned bytes of row y0 to 16 bit
+        vmovl.u8 q3, d27
+        vmovl.u8 q4, d28        @ row y1
+        vmovl.u8 q5, d29
+        vmovl.u8 q6, d30        @ row y2
+        vmovl.u8 q7, d31
+
+/*
+        The widened source array, one row per line and four halfwords per D register, is
+        d4,  d5,  d6,  d7
+        d8,  d9,  d10, d11
+        d12, d13, d14, d15
+*/
+
+        vmull.s16 q8, d4, d0[0]         @ q8: output lanes 0-3, window starts at d4/d8/d12
+        vmlal.s16 q8, d5, d0[1]
+        vmlal.s16 q8, d6, d0[2]
+        vmlal.s16 q8, d8, d0[3]
+        vmlal.s16 q8, d9, d1[0]
+        vmlal.s16 q8, d10, d1[1]
+        vmlal.s16 q8, d12, d1[2]
+        vmlal.s16 q8, d13, d1[3]
+        vmlal.s16 q8, d14, d2[0]        @ d2[0] is the ninth and last coefficient used
+
+        vmull.s16 q9, d5, d0[0]         @ q9: output lanes 4-7, window starts one D register later
+        vmlal.s16 q9, d6, d0[1]
+        vmlal.s16 q9, d7, d0[2]
+        vmlal.s16 q9, d9, d0[3]
+        vmlal.s16 q9, d10, d1[0]
+        vmlal.s16 q9, d11, d1[1]
+        vmlal.s16 q9, d13, d1[2]
+        vmlal.s16 q9, d14, d1[3]
+        vmlal.s16 q9, d15, d2[0]
+
+        vshrn.i32 d16, q8, #8           @ truncating shift right by 8, narrow 32 -> 16 bit
+        vshrn.i32 d17, q9, #8
+
+        vqmovun.s16 d16, q8             @ saturate to unsigned 8 bit
+        vst1.8 d16, [r0]!               @ store the 8 output bytes and advance r0
+
+        /* r4 counts 8-byte output groups; the body runs before this test, so r4 must be >= 1 */
+        subs r4, r4, #1
+        bne 1b
+
+        /* Restore the saved registers and return */
+        vpop            {q4-q7}
+        pop             {r4-r8, r10, r11, lr}
+        bx              lr
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
+/*
+        r0 = dst
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
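+        r4 = y3 base pointer (loaded from the stack, [sp] at entry)
+        r5 = y4 base pointer (loaded from the stack, [sp, #4] at entry)
+        r6 = coeffs ([sp, #8] at entry), then reused for the length ([sp, #12] at entry)
+        r7 = constant 8, the per-iteration advance of the row pointers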
+*/
+ENTRY(rsdIntrinsicConvolve5x5_K)
+        push        {r4-r7, lr}         @ 5 core registers = 20 bytes
+        vpush       {q4-q7}             @ 4 quad registers = 64 bytes
+
+        /* load y3 in r4: the first stack argument, now above the 20+64 bytes pushed here */
+        ldr     r4, [sp, #20 + 64]
+
+        /* load y4 in r5 */
+        ldr     r5, [sp, #24 + 64]
+
+        /* Load the coefficients pointer */
+        ldr     r6, [sp, #28 + 64]
+
+        /* Load 28 halfwords of coefficients into d0-d6; lanes d0[0] .. d6[0] are used */
+        vld1.16     {d0, d1, d2, d3}, [r6]!
+        vld1.16     {d4, d5, d6}, [r6]
+
+        vmov.u32  q15, #0x7f            @ bias added to both sums before narrowing
+
+        /* load the count; r6 is free again after the coefficient loads */
+        ldr     r6, [sp, #32 + 64]
+
+        /* Load the frequently used immediate in a register */
+        mov     r7, #8
+
+1:
+        /* Load 24 bytes from each of the first two rows; each pointer advances by r7=#8 only */
+        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
+        vld1.8  {d27, d28, d29}, [r2], r7      @  y1 ( y - 1 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r1, r7]
+        pld         [r2, r7]
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+        vmovl.u8 q12, d27               @ q12 is d24/d25: overwritten only after both were widened
+        vmovl.u8 q13, d28
+        vmovl.u8 q14, d29
+
+/*
+        q9-q11 (d18-d23) hold the widened row y0 and q12-q14 (d24-d29) the widened row y1.
+        q4 sums five D registers per row starting at d18/d24, q5 the five starting one later.
+*/
+        vmull.s16 q4, d18, d0[0]        @ q4: output lanes 0-3
+        vmlal.s16 q4, d19, d0[1]
+        vmlal.s16 q4, d20, d0[2]
+        vmlal.s16 q4, d21, d0[3]
+        vmlal.s16 q4, d22, d1[0]
+
+        vmlal.s16 q4, d24, d1[1]
+        vmlal.s16 q4, d25, d1[2]
+        vmlal.s16 q4, d26, d1[3]
+        vmlal.s16 q4, d27, d2[0]
+        vmlal.s16 q4, d28, d2[1]
+
+        vmull.s16 q5, d19, d0[0]        @ q5: output lanes 4-7
+        vmlal.s16 q5, d20, d0[1]
+        vmlal.s16 q5, d21, d0[2]
+        vmlal.s16 q5, d22, d0[3]
+        vmlal.s16 q5, d23, d1[0]
+
+        vmlal.s16 q5, d25, d1[1]
+        vmlal.s16 q5, d26, d1[2]
+        vmlal.s16 q5, d27, d1[3]
+        vmlal.s16 q5, d28, d2[0]
+        vmlal.s16 q5, d29, d2[1]
+
+
+        /* Next 2 rows */
+        /* Load 24 bytes from each of the next two rows; each pointer advances by r7=#8 only */
+        vld1.8  {d24, d25, d26}, [r3], r7      @  y2 ( y )
+        vld1.8  {d27, d28, d29}, [r4], r7      @  y3 ( y + 1 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r3, r7]
+        pld         [r4, r7]
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+        vmovl.u8 q12, d27
+        vmovl.u8 q13, d28
+        vmovl.u8 q14, d29
+
+/*
+        Same layout as above: d18-d23 now hold the widened row y2 and d24-d29 the widened
+        row y3. The coefficient lanes continue at d2[2].
+*/
+        vmlal.s16 q4, d18, d2[2]
+        vmlal.s16 q4, d19, d2[3]
+        vmlal.s16 q4, d20, d3[0]
+        vmlal.s16 q4, d21, d3[1]
+        vmlal.s16 q4, d22, d3[2]
+
+        vmlal.s16 q4, d24, d3[3]
+        vmlal.s16 q4, d25, d4[0]
+        vmlal.s16 q4, d26, d4[1]
+        vmlal.s16 q4, d27, d4[2]
+        vmlal.s16 q4, d28, d4[3]
+
+        vmlal.s16 q5, d19, d2[2]
+        vmlal.s16 q5, d20, d2[3]
+        vmlal.s16 q5, d21, d3[0]
+        vmlal.s16 q5, d22, d3[1]
+        vmlal.s16 q5, d23, d3[2]
+
+        vmlal.s16 q5, d25, d3[3]
+        vmlal.s16 q5, d26, d4[0]
+        vmlal.s16 q5, d27, d4[1]
+        vmlal.s16 q5, d28, d4[2]
+        vmlal.s16 q5, d29, d4[3]
+
+        /* Last row */
+        /* Load 24 bytes from the last row; the pointer advances by r7=#8 only */
+        vld1.8  {d24, d25, d26}, [r5], r7      @  y4 ( y + 2 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r5, r7]
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+
+/*
+        d18-d23 hold the widened row y4; no second row is loaded this time.
+        The coefficient lanes continue at d5[0] and end at d6[0].
+*/
+
+        vmlal.s16 q4, d18, d5[0]
+        vmlal.s16 q4, d19, d5[1]
+        vmlal.s16 q4, d20, d5[2]
+        vmlal.s16 q4, d21, d5[3]
+        vmlal.s16 q4, d22, d6[0]
+
+        vmlal.s16 q5, d19, d5[0]
+        vmlal.s16 q5, d20, d5[1]
+        vmlal.s16 q5, d21, d5[2]
+        vmlal.s16 q5, d22, d5[3]
+        vmlal.s16 q5, d23, d6[0]
+
+
+
+        vadd.i32 q4, q4, q15            @ add the 0x7f bias to every 32-bit sum
+        vadd.i32 q5, q5, q15
+
+/*      Rounding shift right by 8 and narrow 32 -> 16 bit; d8/d9 together form q4 */
+        vrshrn.i32 d8, q4, #8
+        vrshrn.i32 d9, q5, #8
+
+
+/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+        vqmovun.s16 d8, q4
+
+        vst1.8 d8, [r0]!           @ store the 8 output bytes and advance r0 past them
+
+        /* r6 counts 8-byte output groups; the body runs before this test, so r6 must be >= 1 */
+        subs r6, r6, #1
+        bne 1b
+
+        /* Restore the saved registers and return */
+        vpop        {q4-q7}
+        pop         {r4-r7, lr}
+        bx          lr
+
+END(rsdIntrinsicConvolve5x5_K)
diff --git a/toolkit/Histogram.cpp b/toolkit/Histogram.cpp
new file mode 100644
index 0000000..86b4bed
--- /dev/null
+++ b/toolkit/Histogram.cpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <array>
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.Histogram"
+
+namespace android {
+namespace renderscript {
+
+class HistogramTask : public Task {
+    const uchar* mIn;        // Input cells, paddedSize(mVectorSize) bytes each.
+    std::vector<int> mSums;  // One block of 256 * paddedSize(mVectorSize) bins per thread.
+    uint32_t mThreadCount;   // How many per-thread blocks mSums contains.
+
+    // Adds one 2D tile of the input to the bins that belong to thread threadIndex.
+    void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                     size_t endY) override;
+    // Row kernels for 1 to 4 channels: count the cells xstart..xend-1 starting at in.
+    void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+    void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+    void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+    void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+
+   public:
+    HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+                  uint32_t threadCount, const Restriction* restriction);
+    void collateSums(int* out);  // Writes the sum of all per-thread blocks to out.
+};
+
+class HistogramDotTask : public Task {
+    const uchar* mIn;        // Input cells, paddedSize(mVectorSize) bytes each.
+    float mDot[4];           // Per-channel weights as floats.
+    int mDotI[4];            // The same weights multiplied by 256 and rounded.
+    std::vector<int> mSums;  // One block of 256 bins per thread.
+    uint32_t mThreadCount;   // How many per-thread blocks mSums contains.
+    // Row kernels for 1 to 4 channels: bin the weighted sum of each cell xstart..xend-1.
+    void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+    void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+    void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+    void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend);
+
+   public:
+    HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+                     uint32_t threadCount, const float* coefficients,
+                     const Restriction* restriction);
+    void collateSums(int* out);  // Writes the sum of all per-thread blocks to out.
+    // Adds one 2D tile of the input to the bins that belong to thread threadIndex.
+    void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                     size_t endY) override;
+};
+
+HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+                             uint32_t threadCount, const Restriction* restriction)
+    : Task{sizeX, sizeY, vectorSize, true, restriction},
+      mIn{in},
+      mSums(256 * paddedSize(vectorSize) * threadCount) {
+    mThreadCount = threadCount;
+}
+
+void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                                size_t endY) {
+    // Adds the tile [startX, endX) x [startY, endY) to the bins that belong to threadIndex.
+    typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
+    KernelFunction kernel;
+    switch (mVectorSize) {
+        case 4:
+            kernel = &HistogramTask::kernelP1U4;
+            break;
+        case 3:
+            kernel = &HistogramTask::kernelP1U3;
+            break;
+        case 2:
+            kernel = &HistogramTask::kernelP1U2;
+            break;
+        case 1:
+            kernel = &HistogramTask::kernelP1U1;
+            break;
+        default:
+            ALOGE("Bad vector size %zd", mVectorSize);
+            return;
+    }
+    // Every thread owns a separate block of 256 * cellSize counters inside mSums.
+    const auto cellSize = paddedSize(mVectorSize);
+    int* sums = &mSums[256 * cellSize * threadIndex];
+    for (size_t y = startY; y < endY; y++) {
+        // Call through the member pointer directly; std::invoke would need <functional>.
+        (this->*kernel)(mIn + (mSizeX * y + startX) * cellSize, sums, startX, endX);
+    }
+}
+
+void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    // Four channels per cell. The bins are interleaved by channel, so the counter for value v
+    // of channel c is sums[v * 4 + c].
+    for (uint32_t x = xstart; x < xend; x++, in += 4) {
+        for (int channel = 0; channel < 4; channel++) {
+            sums[in[channel] * 4 + channel]++;
+        }
+    }
+}
+
+void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    // Three channels stored in four-byte cells; the fourth byte of each cell is not counted.
+    for (uint32_t x = xstart; x < xend; x++, in += 4) {
+        for (int channel = 0; channel < 3; channel++) {
+            sums[in[channel] * 4 + channel]++;
+        }
+    }
+}
+
+void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    // Two interleaved channels: the counter for value v of channel c is sums[v * 2 + c].
+    for (uint32_t x = xstart; x < xend; x++, in += 2) {
+        sums[in[0] * 2]++;
+        sums[in[1] * 2 + 1]++;
+    }
+}
+
+void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    // One byte per cell: the byte value is the bin index.
+    for (uint32_t x = xstart; x < xend; x++) {
+        sums[*in++]++;
+    }
+}
+
+void HistogramTask::collateSums(int* out) {
+    // Adds the per-thread blocks of mSums bin by bin into out, which must hold binCount ints.
+    const auto binCount = 256 * paddedSize(mVectorSize);
+    for (uint32_t bin = 0; bin < binCount; bin++) {
+        out[bin] = mSums[bin];
+        for (uint32_t t = 1; t < mThreadCount; t++) out[bin] += mSums[bin + (binCount * t)];
+    }
+}
+
+HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize,
+                                   uint32_t threadCount, const float* coefficients,
+                                   const Restriction* restriction)
+    : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) {
+    // Sets up the channel weights. coefficients is either null, which selects the default
+    // weights, or an array of which only the first vectorSize entries are read.
+    mThreadCount = threadCount;
+    const float defaults[4] = {0.299f, 0.587f, 0.114f, 0.f};
+    for (size_t i = 0; i < 4; i++) {
+        // histogramDot() validates vectorSize entries only, so never read beyond them.
+        if (coefficients == nullptr) {
+            mDot[i] = defaults[i];
+        } else {
+            mDot[i] = (i < vectorSize) ? coefficients[i] : 0.f;
+        }
+        // Integer weight used by the kernels: scaled by 256 and rounded.
+        mDotI[i] = (int)((mDot[i] * 256.f) + 0.5f);
+    }
+}
+
+void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                                   size_t endY) {
+    // Adds the tile [startX, endX) x [startY, endY) to the 256 bins that belong to threadIndex.
+    typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t);
+    KernelFunction kernel;
+    switch (mVectorSize) {
+        case 4:
+            kernel = &HistogramDotTask::kernelP1L4;
+            break;
+        case 3:
+            kernel = &HistogramDotTask::kernelP1L3;
+            break;
+        case 2:
+            kernel = &HistogramDotTask::kernelP1L2;
+            break;
+        case 1:
+            kernel = &HistogramDotTask::kernelP1L1;
+            break;
+        default:
+            ALOGE("Bad vector size %zd", mVectorSize);  // An error, as in HistogramTask.
+            return;
+    }
+    // Every thread owns a separate block of 256 counters inside mSums.
+    const auto cellSize = paddedSize(mVectorSize);
+    int* sums = &mSums[256 * threadIndex];
+    for (size_t y = startY; y < endY; y++) {
+        // Call through the member pointer directly; std::invoke would need <functional>.
+        (this->*kernel)(mIn + (mSizeX * y + startX) * cellSize, sums, startX, endX);
+    }
+}
+
+void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    for (uint32_t x = xstart; x < xend; x++, in += 4) {  // One weighted 4-channel cell per step.
+        int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]);
+        const int bin = (t + 0x7f) >> 8;  // Reaches 256 if the rounded weights sum past 256.
+        sums[bin > 255 ? 255 : bin]++;    // Clamp so the write stays inside this block of bins.
+    }
+}
+
+void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    for (uint32_t x = xstart; x < xend; x++, in += 4) {  // Three weighted channels, 4-byte cells.
+        int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]);
+        const int bin = (t + 0x7f) >> 8;  // Reaches 256 if the rounded weights sum past 256.
+        sums[bin > 255 ? 255 : bin]++;    // Clamp so the write stays inside this block of bins.
+    }
+}
+
+void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    for (uint32_t x = xstart; x < xend; x++, in += 2) {  // One weighted 2-channel cell per step.
+        int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]);
+        const int bin = (t + 0x7f) >> 8;  // Reaches 256 if the rounded weights sum past 256.
+        sums[bin > 255 ? 255 : bin]++;    // Clamp so the write stays inside this block of bins.
+    }
+}
+
+void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) {
+    for (uint32_t x = xstart; x < xend; x++, in++) {  // One weighted single-byte cell per step.
+        int t = (mDotI[0] * in[0]);
+        const int bin = (t + 0x7f) >> 8;  // Exceeds 255 if the weight is above 1 (unvalidated).
+        sums[bin > 255 ? 255 : bin]++;    // Clamp so the write stays inside this block of bins.
+    }
+}
+
+void HistogramDotTask::collateSums(int* out) {
+    // Adds the per-thread 256-bin blocks of mSums into out, which must hold 256 ints.
+    for (uint32_t bin = 0; bin < 256; bin++) {
+        int total = mSums[bin];
+        for (uint32_t t = 1; t < mThreadCount; t++) total += mSums[bin + (256 * t)];
+        out[bin] = total;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////
+
+void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
+                                    size_t vectorSize, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+        return;
+    }
+    if (!(vectorSize >= 1 && vectorSize <= 4)) {
+        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+        return;
+    }
+#endif
+    // The task keeps one set of bins per thread; collateSums() adds them up into out.
+    HistogramTask job(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), restriction);
+    processor->doTask(&job);
+    job.collateSums(out);
+}
+
+void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY,
+                                       size_t vectorSize, const float* coefficients,
+                                       const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
+        return;
+    }
+    if (!(vectorSize >= 1 && vectorSize <= 4)) {
+        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+        return;
+    }
+    if (coefficients != nullptr) {
+        float total = 0.0f;  // Running sum of the first vectorSize coefficients.
+        for (size_t index = 0; index < vectorSize; index++) {
+            if (coefficients[index] < 0.0f) {
+                ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.",
+                      index, coefficients[index]);
+                return;
+            }
+            total += coefficients[index];
+        }
+        if (total > 1.0f) {
+            ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", total);
+            return;
+        }
+    }
+#endif
+    // The task keeps one set of 256 bins per thread; collateSums() adds them up into out.
+    HistogramDotTask job(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(),
+                         coefficients, restriction);
+    processor->doTask(&job);
+    job.collateSums(out);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/JniEntryPoints.cpp b/toolkit/JniEntryPoints.cpp
new file mode 100644
index 0000000..3bf5911
--- /dev/null
+++ b/toolkit/JniEntryPoints.cpp
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <android/bitmap.h>
+#include <assert.h>
+#include <jni.h>
+
+#include "RenderScriptToolkit.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.JniEntryPoints"
+
+using namespace android::renderscript;
+
+/**
+ * I compared using env->GetPrimitiveArrayCritical vs. env->GetByteArrayElements to get access
+ * to the underlying data. On Pixel 4, it's actually faster to not use critical. The code is left
+ * here if you want to experiment. Note that USE_CRITICAL could block the garbage collector.
+ */
+// #define USE_CRITICAL
+
+class ByteArrayGuard {
+    // Holds the elements of a Java byte[] from construction until destruction (release mode 0).
+    JNIEnv* env;
+    jbyteArray array;
+    jbyte* data;
+   public:
+    ByteArrayGuard(JNIEnv* env, jbyteArray array) : env{env}, array{array} {
+#ifdef USE_CRITICAL
+        data = reinterpret_cast<jbyte*>(env->GetPrimitiveArrayCritical(array, nullptr));
+#else
+        data = env->GetByteArrayElements(array, nullptr);
+#endif
+    }
+    ~ByteArrayGuard() {
+        if (data == nullptr) return;  // The Get call failed, so there is nothing to release.
+#ifdef USE_CRITICAL
+        env->ReleasePrimitiveArrayCritical(array, data, 0);
+#else
+        env->ReleaseByteArrayElements(array, data, 0);
+#endif
+    }
+    uint8_t* get() { return reinterpret_cast<uint8_t*>(data); }  // Null if the Get call failed.
+};
+
+class IntArrayGuard {
+    // Holds the elements of a Java int[] from construction until destruction (release mode 0).
+    JNIEnv* env;
+    jintArray array;
+    jint* data;
+   public:
+    IntArrayGuard(JNIEnv* env, jintArray array) : env{env}, array{array} {
+#ifdef USE_CRITICAL
+        data = reinterpret_cast<jint*>(env->GetPrimitiveArrayCritical(array, nullptr));
+#else
+        data = env->GetIntArrayElements(array, nullptr);
+#endif
+    }
+    ~IntArrayGuard() {
+        if (data == nullptr) return;  // The Get call failed, so there is nothing to release.
+#ifdef USE_CRITICAL
+        env->ReleasePrimitiveArrayCritical(array, data, 0);
+#else
+        env->ReleaseIntArrayElements(array, data, 0);
+#endif
+    }
+    int* get() { return reinterpret_cast<int*>(data); }  // Null if the Get call failed.
+};
+
+class FloatArrayGuard {
+    // Holds the elements of a Java float[] from construction until destruction (release mode 0).
+    JNIEnv* env;
+    jfloatArray array;
+    jfloat* data;
+   public:
+    FloatArrayGuard(JNIEnv* env, jfloatArray array) : env{env}, array{array} {
+#ifdef USE_CRITICAL
+        data = reinterpret_cast<jfloat*>(env->GetPrimitiveArrayCritical(array, nullptr));
+#else
+        data = env->GetFloatArrayElements(array, nullptr);
+#endif
+    }
+    ~FloatArrayGuard() {
+        if (data == nullptr) return;  // The Get call failed, so there is nothing to release.
+#ifdef USE_CRITICAL
+        env->ReleasePrimitiveArrayCritical(array, data, 0);
+#else
+        env->ReleaseFloatArrayElements(array, data, 0);
+#endif
+    }
+    float* get() { return reinterpret_cast<float*>(data); }  // Null if the Get call failed.
+};
+
+class BitmapGuard {
+    // Locks the pixels of an RGBA_8888 or A_8 Java Bitmap until destruction. If any step fails
+    // an error is logged, the guard stays invalid and every member keeps its zero value.
+    JNIEnv* env;
+    jobject bitmap;
+    AndroidBitmapInfo info;
+    int bytesPerPixel;
+    void* bytes;
+    bool valid;
+   public:
+    BitmapGuard(JNIEnv* env, jobject jBitmap)
+        : env{env}, bitmap{jBitmap}, info{}, bytesPerPixel{0}, bytes{nullptr}, valid{false} {
+        if (AndroidBitmap_getInfo(env, bitmap, &info) != ANDROID_BITMAP_RESULT_SUCCESS) {
+            ALOGE("AndroidBitmap_getInfo failed");
+            return;
+        }
+        if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888 &&
+            info.format != ANDROID_BITMAP_FORMAT_A_8) {
+            ALOGE("AndroidBitmap in the wrong format");
+            return;
+        }
+        bytesPerPixel = info.stride / info.width;
+        if (bytesPerPixel != 1 && bytesPerPixel != 4) {
+            ALOGE("Expected a vector size of 1 or 4. Got %d. Extra padding per line not currently "
+                  "supported",
+                  bytesPerPixel);
+            return;
+        }
+        if (AndroidBitmap_lockPixels(env, bitmap, &bytes) != ANDROID_BITMAP_RESULT_SUCCESS) {
+            ALOGE("AndroidBitmap_lockPixels failed");
+            return;
+        }
+        valid = true;  // Only a locked bitmap is unlocked by the destructor.
+    }
+    ~BitmapGuard() {
+        if (valid) {
+            AndroidBitmap_unlockPixels(env, bitmap);
+        }
+    }
+    uint8_t* get() const {
+        assert(valid);
+        return reinterpret_cast<uint8_t*>(bytes);
+    }
+    int width() const { return info.width; }
+    int height() const { return info.height; }
+    int vectorSize() const { return bytesPerPixel; }
+};
+
+/**
+ * Copies the content of Kotlin Range2d object into the equivalent C++ struct.
+ */
+class RestrictionParameter {
+    bool isNull;  // True when no usable Range2d was supplied; get() then returns nullptr.
+    Restriction restriction;
+   public:
+    RestrictionParameter(JNIEnv* env, jobject jRestriction) : isNull{jRestriction == nullptr} {
+        if (isNull) return;
+        /* TODO Measure how long FindClass and related functions take. Consider passing the four
+         * values instead. This would also require setting the default when Range2D is null. */
+        jclass restrictionClass = env->FindClass("android/renderscript/toolkit/Range2d");
+        if (restrictionClass == nullptr) {
+            ALOGE("RenderScriptToolit. Internal error. Could not find the Kotlin Range2d class.");
+            isNull = true;
+            return;
+        }
+        jfieldID startXId = env->GetFieldID(restrictionClass, "startX", "I");
+        jfieldID startYId = env->GetFieldID(restrictionClass, "startY", "I");
+        jfieldID endXId = env->GetFieldID(restrictionClass, "endX", "I");
+        jfieldID endYId = env->GetFieldID(restrictionClass, "endY", "I");
+        if (startXId == nullptr || startYId == nullptr || endXId == nullptr || endYId == nullptr) {
+            ALOGE("RenderScriptToolkit. Internal error. Could not find the Range2d fields.");
+            isNull = true;  // GetIntField must not be called with a null field ID.
+            return;
+        }
+        restriction.startX = env->GetIntField(jRestriction, startXId);
+        restriction.startY = env->GetIntField(jRestriction, startYId);
+        restriction.endX = env->GetIntField(jRestriction, endXId);
+        restriction.endY = env->GetIntField(jRestriction, endYId);
+    }
+    Restriction* get() { return isNull ? nullptr : &restriction; }
+};
+
+extern "C" JNIEXPORT jlong JNICALL
+Java_android_renderscript_toolkit_Toolkit_createNative(JNIEnv* /*env*/, jobject /*thiz*/) {
+    return reinterpret_cast<jlong>(new RenderScriptToolkit());  // Pointer returned as a jlong.
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_destroyNative(
+        JNIEnv* /*env*/, jobject /*thiz*/, jlong native_handle) {
+    // native_handle is interpreted as a RenderScriptToolkit pointer and the object is deleted.
+    delete reinterpret_cast<RenderScriptToolkit*>(native_handle);
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlend(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jbyteArray source_array,
+        jbyteArray dest_array, jint size_x, jint size_y, jobject restriction) {
+    // Blends two byte arrays; the guards release the arrays again when this function returns.
+    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    const auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode);
+    RestrictionParameter range(env, restriction);
+    ByteArrayGuard src(env, source_array);
+    ByteArrayGuard dst(env, dest_array);
+    toolkit->blend(mode, src.get(), dst.get(), size_x, size_y, range.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlendBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jobject source_bitmap,
+        jobject dest_bitmap, jobject restriction) {
+    // Blends two bitmaps; the size passed to blend() is taken from the source bitmap only.
+    RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    const auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode);
+    RestrictionParameter range(env, restriction);
+    BitmapGuard src(env, source_bitmap);
+    BitmapGuard dst(env, dest_bitmap);
+    toolkit->blend(mode, src.get(), dst.get(), src.width(), src.height(), range.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlur(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize,
+        jint size_x, jint size_y, jint radius, jbyteArray output_array, jobject restriction) {
+    // Blurs a byte array; the guards release the arrays again when this function returns.
+    auto* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter range(env, restriction);
+    ByteArrayGuard src(env, input_array);
+    ByteArrayGuard dst(env, output_array);
+    toolkit->blur(src.get(), dst.get(), size_x, size_y, vectorSize, radius, range.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeBlurBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jobject output_bitmap, jint radius, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    BitmapGuard dst{env, output_bitmap};
+    // Width, height and vector size all come from the input bitmap.
+    self->blur(src.get(), dst.get(), src.width(), src.height(), src.vectorSize(), radius,
+               region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeColorMatrix(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jint input_vector_size, jint size_x, jint size_y, jbyteArray output_array,
+        jint output_vector_size, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    ByteArrayGuard src{env, input_array};
+    ByteArrayGuard dst{env, output_array};
+    FloatArrayGuard coefficients{env, jmatrix};
+    FloatArrayGuard offsets{env, add_vector};
+    // colorMatrix() takes both vector sizes ahead of the image dimensions.
+    self->colorMatrix(src.get(), dst.get(), input_vector_size, output_vector_size, size_x, size_y,
+                      coefficients.get(), offsets.get(), region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeColorMatrixBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jobject output_bitmap, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    BitmapGuard dst{env, output_bitmap};
+    FloatArrayGuard coefficients{env, jmatrix};
+    FloatArrayGuard offsets{env, add_vector};
+    // Each bitmap supplies its own vector size; the dimensions come from the input bitmap.
+    self->colorMatrix(src.get(), dst.get(), src.vectorSize(), dst.vectorSize(), src.width(),
+                      src.height(), coefficients.get(), offsets.get(), region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeConvolve(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize,
+        jint size_x, jint size_y, jbyteArray output_array, jfloatArray coefficients,
+        jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    ByteArrayGuard src{env, input_array};
+    ByteArrayGuard dst{env, output_array};
+    FloatArrayGuard kernel{env, coefficients};
+    // The number of coefficients picks the variant: 9 values run convolve3x3 and 25 values
+    // run convolve5x5. Any other count returns without calling the toolkit.
+    // The length is queried after the guards above have been constructed.
+    const jsize kernel_len = env->GetArrayLength(coefficients);
+    if (kernel_len == 9) {
+        self->convolve3x3(src.get(), dst.get(), vectorSize, size_x, size_y, kernel.get(),
+                          region.get());
+    } else if (kernel_len == 25) {
+        self->convolve5x5(src.get(), dst.get(), vectorSize, size_x, size_y, kernel.get(),
+                          region.get());
+    }
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeConvolveBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jobject output_bitmap, jfloatArray coefficients, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    BitmapGuard dst{env, output_bitmap};
+    FloatArrayGuard kernel{env, coefficients};
+    // The number of coefficients picks the variant: 9 values run convolve3x3 and 25 values
+    // run convolve5x5. Any other count returns without calling the toolkit.
+    // Vector size and dimensions are the ones reported by the input bitmap.
+    const jsize kernel_len = env->GetArrayLength(coefficients);
+    if (kernel_len == 9) {
+        self->convolve3x3(src.get(), dst.get(), src.vectorSize(), src.width(), src.height(),
+                          kernel.get(), region.get());
+    } else if (kernel_len == 25) {
+        self->convolve5x5(src.get(), dst.get(), src.vectorSize(), src.width(), src.height(),
+                          kernel.get(), region.get());
+    }
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogram(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jint vector_size, jint size_x, jint size_y, jintArray output_array, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    ByteArrayGuard src{env, input_array};
+    IntArrayGuard dst{env, output_array};
+    // histogram() takes the sizes before the vector size.
+    self->histogram(src.get(), dst.get(), size_x, size_y, vector_size, region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jintArray output_array, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    IntArrayGuard dst{env, output_array};
+    // Width, height and vector size all come from the input bitmap.
+    self->histogram(src.get(), dst.get(), src.width(), src.height(), src.vectorSize(),
+                    region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramDot(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jint vector_size, jint size_x, jint size_y, jintArray output_array,
+        jfloatArray coefficients, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    ByteArrayGuard src{env, input_array};
+    IntArrayGuard dst{env, output_array};
+    FloatArrayGuard weights{env, coefficients};
+    // histogramDot() takes the sizes before the vector size, followed by the coefficients.
+    self->histogramDot(src.get(), dst.get(), size_x, size_y, vector_size, weights.get(),
+                       region.get());
+}
+
+extern "C" JNIEXPORT
+void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeHistogramDotBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jintArray output_array, jfloatArray coefficients, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    IntArrayGuard dst{env, output_array};
+    FloatArrayGuard weights{env, coefficients};
+    // Width, height and vector size all come from the input bitmap.
+    self->histogramDot(src.get(), dst.get(), src.width(), src.height(), src.vectorSize(),
+                       weights.get(), region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jbyteArray output_array, jint size_x, jint size_y, jbyteArray red_table,
+        jbyteArray green_table, jbyteArray blue_table, jbyteArray alpha_table,
+        jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+
+    ByteArrayGuard src{env, input_array};
+    ByteArrayGuard dst{env, output_array};
+    ByteArrayGuard red_lut{env, red_table};
+    ByteArrayGuard green_lut{env, green_table};
+    ByteArrayGuard blue_lut{env, blue_table};
+    ByteArrayGuard alpha_lut{env, alpha_table};
+    // One table per channel, passed in red, green, blue, alpha order.
+    self->lut(src.get(), dst.get(), size_x, size_y, red_lut.get(), green_lut.get(),
+              blue_lut.get(), alpha_lut.get(), region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLutBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jobject output_bitmap, jbyteArray red_table, jbyteArray green_table, jbyteArray blue_table,
+        jbyteArray alpha_table, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+
+    BitmapGuard src{env, input_bitmap};
+    BitmapGuard dst{env, output_bitmap};
+    ByteArrayGuard red_lut{env, red_table};
+    ByteArrayGuard green_lut{env, green_table};
+    ByteArrayGuard blue_lut{env, blue_table};
+    ByteArrayGuard alpha_lut{env, alpha_table};
+    // Dimensions come from the input bitmap; tables go in red, green, blue, alpha order.
+    self->lut(src.get(), dst.get(), src.width(), src.height(), red_lut.get(), green_lut.get(),
+              blue_lut.get(), alpha_lut.get(), region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut3d(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jbyteArray output_array, jint size_x, jint size_y, jbyteArray cube_values, jint cubeSizeX,
+        jint cubeSizeY, jint cubeSizeZ, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    ByteArrayGuard src{env, input_array};
+    ByteArrayGuard dst{env, output_array};
+    ByteArrayGuard cube_data{env, cube_values};
+    // The cube bytes are followed by the cube's three sizes.
+    self->lut3d(src.get(), dst.get(), size_x, size_y, cube_data.get(), cubeSizeX, cubeSizeY,
+                cubeSizeZ, region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeLut3dBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jobject output_bitmap, jbyteArray cube_values, jint cubeSizeX, jint cubeSizeY,
+        jint cubeSizeZ, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    BitmapGuard dst{env, output_bitmap};
+    ByteArrayGuard cube_data{env, cube_values};
+    // Image dimensions come from the input bitmap; the cube sizes are passed through as given.
+    self->lut3d(src.get(), dst.get(), src.width(), src.height(), cube_data.get(), cubeSizeX,
+                cubeSizeY, cubeSizeZ, region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeResize(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jint vector_size, jint input_size_x, jint input_size_y, jbyteArray output_array,
+        jint output_size_x, jint output_size_y, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    ByteArrayGuard src{env, input_array};
+    ByteArrayGuard dst{env, output_array};
+    // resize() takes the input sizes, then the vector size, then the output sizes.
+    self->resize(src.get(), dst.get(), input_size_x, input_size_y, vector_size, output_size_x,
+                 output_size_y, region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeResizeBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap,
+        jobject output_bitmap, jobject restriction) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    RestrictionParameter region{env, restriction};
+    BitmapGuard src{env, input_bitmap};
+    BitmapGuard dst{env, output_bitmap};
+    // Each bitmap supplies its own dimensions; the vector size is read from the input bitmap.
+    self->resize(src.get(), dst.get(), src.width(), src.height(), src.vectorSize(), dst.width(),
+                 dst.height(), region.get());
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeYuvToRgb(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array,
+        jbyteArray output_array, jint size_x, jint size_y, jint format) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    ByteArrayGuard src{env, input_array};
+    ByteArrayGuard dst{env, output_array};
+    // The integer format received from Java is cast to the toolkit's YuvFormat enum.
+    const auto yuv_format = static_cast<RenderScriptToolkit::YuvFormat>(format);
+    self->yuvToRgb(src.get(), dst.get(), size_x, size_y, yuv_format);
+}
+
+extern "C" JNIEXPORT void JNICALL Java_android_renderscript_toolkit_Toolkit_nativeYuvToRgbBitmap(
+        JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint size_x,
+        jint size_y, jobject output_bitmap, jint format) {
+    auto* self = reinterpret_cast<RenderScriptToolkit*>(native_handle);
+    BitmapGuard dst{env, output_bitmap};
+    ByteArrayGuard src{env, input_array};
+    // The sizes are the explicit arguments here; they are not read from the output bitmap.
+    const auto yuv_format = static_cast<RenderScriptToolkit::YuvFormat>(format);
+    self->yuvToRgb(src.get(), dst.get(), size_x, size_y, yuv_format);
+}
diff --git a/toolkit/Lut.cpp b/toolkit/Lut.cpp
new file mode 100644
index 0000000..4ac5cdc
--- /dev/null
+++ b/toolkit/Lut.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.Lut"
+
+namespace android {
+namespace renderscript {
+
+class LutTask : public Task {
+    const uchar4* mIn;         // Source pixels, read as 4-byte vectors.
+    uchar4* mOut;              // Destination pixels, written as 4-byte vectors.
+    const uchar* mRedTable;    // Table indexed by a pixel's x component.
+    const uchar* mGreenTable;  // Table indexed by a pixel's y component.
+    const uchar* mBlueTable;   // Table indexed by a pixel's z component.
+    const uchar* mAlphaTable;  // Table indexed by a pixel's w component.
+
+    // Handles one 2D tile of the work; threadIndex names the thread doing it.
+    void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                     size_t endY) override;
+
+   public:
+    LutTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, const uint8_t* red,
+            const uint8_t* green, const uint8_t* blue, const uint8_t* alpha,
+            const Restriction* restriction)
+        : Task{sizeX, sizeY, 4, true, restriction},
+          mIn(reinterpret_cast<const uchar4*>(input)),
+          mOut(reinterpret_cast<uchar4*>(output)),
+          mRedTable(red),
+          mGreenTable(green),
+          mBlueTable(blue),
+          mAlphaTable(alpha) {}
+};
+
+void LutTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                          size_t endY) {
+    // Walk the tile row by row. Each component of a pixel is replaced by the entry that
+    // component selects in its own table.
+    for (size_t row = startY; row < endY; ++row) {
+        // Index of this row's first vector within the whole buffer.
+        const size_t rowStart = mSizeX * row;
+        for (size_t col = startX; col < endX; ++col) {
+            const uchar4 px = mIn[rowStart + col];
+            mOut[rowStart + col] =
+                uchar4{mRedTable[px.x], mGreenTable[px.y], mBlueTable[px.z], mAlphaTable[px.w]};
+        }
+    }
+}
+
+void RenderScriptToolkit::lut(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+                              const uint8_t* red, const uint8_t* green, const uint8_t* blue,
+                              const uint8_t* alpha, const Restriction* restriction) {
+#if defined(ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE)
+    // Validation builds only: return without doing any work if the restriction is rejected.
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) return;
+#endif
+
+    // Wrap the request in a task object and hand it to the processor.
+    LutTask lutTask(input, output, sizeX, sizeY, red, green, blue, alpha, restriction);
+    processor->doTask(&lutTask);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Lut3d.cpp b/toolkit/Lut3d.cpp
new file mode 100644
index 0000000..f8a7d61
--- /dev/null
+++ b/toolkit/Lut3d.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Lut3d"
+
+/**
+ * Converts a RGBA buffer using a 3D cube.
+ */
+class Lut3dTask : public Task {
+    // Source pixels, read as 4-byte vectors.
+    const uchar4* mIn;
+    // Destination pixels, written as 4-byte vectors.
+    uchar4* mOut;
+    // Cube extent along x, y and z; the fourth lane is set to 0 and not used.
+    int4 mCubeDimension;
+    // The lookup cube's bytes, in row major order.
+    const uchar* mCubeTable;
+
+    /**
+     * Transforms part of one row of the 2D buffer.
+     *
+     * @param in First vector to transform.
+     * @param out Where the first transformed vector is stored.
+     * @param length How many 4-byte vectors to transform.
+     */
+    void kernel(const uchar4* in, uchar4* out, uint32_t length);
+
+    // Handles one 2D tile of the work; threadIndex names the thread doing it.
+    void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                     size_t endY) override;
+
+   public:
+    Lut3dTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+              const uint8_t* cube, int cubeSizeX, int cubeSizeY, int cubeSizeZ,
+              const Restriction* restriction)
+        : Task{sizeX, sizeY, 4, true, restriction},
+          mIn(reinterpret_cast<const uchar4*>(input)),
+          mOut(reinterpret_cast<uchar4*>(output)),
+          mCubeDimension{cubeSizeX, cubeSizeY, cubeSizeZ, 0},
+          mCubeTable(cube) {}
+};
+
+extern "C" void rsdIntrinsic3DLUT_K(void* dst, void const* in, size_t count, void const* lut,
+                                    int32_t pitchy, int32_t pitchz, int dimx, int dimy, int dimz);
+
+void Lut3dTask::kernel(const uchar4* in, uchar4* out, uint32_t length) {
+    uint32_t x1 = 0;  // Index of the next vector to process.
+    uint32_t x2 = length;  // One past the last vector to process.
+
+    const uchar* bp = mCubeTable;
+    // Largest index along each cube axis.
+    int4 dims = mCubeDimension - 1;
+    // coordMul maps a channel value in 0..255 to a cube coordinate with 15 fractional bits.
+    const float4 m = (float4)(1.f / 255.f) * convert<float4>(dims);
+    const int4 coordMul = convert<int4>(m * (float4)0x8000);
+    const size_t stride_y = mCubeDimension.x * 4;  // Bytes per cube row (4 bytes per entry).
+    const size_t stride_z = stride_y * mCubeDimension.y;  // Bytes per cube slice.
+
+    // ALOGE("strides %zu %zu", stride_y, stride_z);
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd) {
+        int32_t len = x2 - x1;
+        if (len > 0) {
+            rsdIntrinsic3DLUT_K(out, in, len, bp, stride_y, stride_z, dims.x, dims.y, dims.z);
+            x1 += len;  // The whole span was handled above, so the loop below does not run.
+            out += len;
+            in += len;
+        }
+    }
+#endif
+    // Scalar path: interpolate between the eight surrounding cube entries in fixed point.
+    while (x1 < x2) {
+        int4 baseCoord = convert<int4>(*in) * coordMul;
+        int4 coord1 = baseCoord >> (int4)15;  // Integer part: the cell's lower corner.
+        // int4 coord2 = min(coord1 + 1, gDims - 1);
+
+        int4 weight2 = baseCoord & 0x7fff;  // Fractional part: weight of the upper neighbor.
+        int4 weight1 = (int4)0x8000 - weight2;  // Complementary weight of the lower neighbor.
+
+        // ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
+        const uchar* bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
+        const uchar4* pt_00 = (const uchar4*)&bp2[0];
+        const uchar4* pt_10 = (const uchar4*)&bp2[stride_y];
+        const uchar4* pt_01 = (const uchar4*)&bp2[stride_z];
+        const uchar4* pt_11 = (const uchar4*)&bp2[stride_y + stride_z];
+        // Load the eight corners; in vXYZ a 1 digit means that axis is stepped by one entry.
+        uint4 v000 = convert<uint4>(pt_00[0]);
+        uint4 v100 = convert<uint4>(pt_00[1]);
+        uint4 v010 = convert<uint4>(pt_10[0]);
+        uint4 v110 = convert<uint4>(pt_10[1]);
+        uint4 v001 = convert<uint4>(pt_01[0]);
+        uint4 v101 = convert<uint4>(pt_01[1]);
+        uint4 v011 = convert<uint4>(pt_11[0]);
+        uint4 v111 = convert<uint4>(pt_11[1]);
+        // Blend along x first (keeping 8 fractional bits), then along y, then along z.
+        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
+        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
+        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
+        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
+
+        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
+        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
+
+        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
+        uint4 v2 = (v + 0x7f) >> (int4)8;  // Drop the 8 fractional bits after adding a bias.
+
+        uchar4 ret = convert<uchar4>(v2);
+        ret.w = in->w;  // The fourth channel is copied from the input, not looked up.
+
+#if 0
+        if (!x1) {
+            ALOGE("in          %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
+            ALOGE("baseCoord   %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z,
+                  baseCoord.w);
+            ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
+            ALOGE("weight1     %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
+            ALOGE("weight2     %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
+
+            ALOGE("v000        %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
+            ALOGE("v100        %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
+            ALOGE("yz00        %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
+            ALOGE("z0          %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
+
+            ALOGE("v           %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
+            ALOGE("v2          %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
+        }
+#endif
+        *out = ret;
+
+        in++;
+        out++;
+        x1++;
+    }
+}
+
+void Lut3dTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                            size_t endY) {
+    const size_t width = endX - startX;  // Vectors handled per row; identical for every row.
+    for (size_t row = startY; row < endY; ++row) {
+        kernel(mIn + (mSizeX * row + startX), mOut + (mSizeX * row + startX), width);
+    }
+}
+
+void RenderScriptToolkit::lut3d(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+                                const uint8_t* cube, size_t cubeSizeX, size_t cubeSizeY,
+                                size_t cubeSizeZ, const Restriction* restriction) {
+#if defined(ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE)
+    // Validation builds only: return without doing any work if the restriction is rejected.
+    if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) return;
+#endif
+
+    // Wrap the request in a task object and hand it to the processor.
+    Lut3dTask cubeTask(input, output, sizeX, sizeY, cube, cubeSizeX, cubeSizeY, cubeSizeZ, restriction);
+    processor->doTask(&cubeTask);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Lut3d_advsimd.S b/toolkit/Lut3d_advsimd.S
new file mode 100644
index 0000000..edcb038
--- /dev/null
+++ b/toolkit/Lut3d_advsimd.S
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
+            /* x6/x7 <- the two lanes' table offsets, made into pointers by adding x3 below */
+            smov        x6, \src0
+            smov        x7, \src1
+
+            add         x6, x6, x3
+            add         x7, x7, x3
+            /* front: 8 bytes at the base (v16/v17), then 8 bytes x4 further on (v18/v19) */
+            ld1         {v16.2s}, [x6], x4
+            ld1         {v17.2s}, [x7], x4
+
+            ld1         {v18.2s}, [x6], x5
+            ld1         {v19.2s}, [x7], x5
+
+            dup         v8.8b, \yr0
+            dup         v9.8b, \yr1
+            /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
+            zip1        v12.16b, v5.16b, v16.16b    // v5 is zeroed by the caller: bytes << 8
+            zip1        v13.16b, v5.16b, v17.16b
+            umlsl       v12.8h, v16.8b, v8.8b
+            umlsl       v13.8h, v17.8b, v9.8b
+            umlal       v12.8h, v18.8b, v8.8b
+            umlal       v13.8h, v19.8b, v9.8b
+            /* x6/x7 are now x4+x5 past the base: load there, then step back by x4 and load */
+            ld1         {v18.2s}, [x6]
+            ld1         {v19.2s}, [x7]
+
+            sub         x6, x6, x4
+            sub         x7, x7, x4
+
+            ld1         {v16.2s}, [x6]
+            ld1         {v17.2s}, [x7]
+
+            /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
+            zip1        v14.16b, v5.16b, v16.16b
+            zip1        v15.16b, v5.16b, v17.16b
+            umlsl       v14.8h, v16.8b, v8.8b
+            umlsl       v15.8h, v17.8b, v9.8b
+            umlal       v14.8h, v18.8b, v8.8b
+            umlal       v15.8h, v19.8b, v9.8b
+
+            /* Z interpolate, lane 0 v12/v14 -> v10 */
+            ushll       v8.4s, v12.4h, #8
+            ushll2      v9.4s, v12.8h, #8
+            umlsl       v8.4s, v12.4h, \zr0
+            umlsl2      v9.4s, v12.8h, \zr0
+            umlal       v8.4s, v14.4h, \zr0
+            umlal2      v9.4s, v14.8h, \zr0
+            rshrn       v10.4h, v8.4s, #8
+            rshrn2      v10.8h, v9.4s, #8
+
+            /* Z interpolate, lane 1 v13/v15 -> v11 */
+            ushll       v8.4s, v13.4h, #8
+            ushll2      v9.4s, v13.8h, #8
+            umlsl       v8.4s, v13.4h, \zr1
+            umlsl2      v9.4s, v13.8h, \zr1
+            umlal       v8.4s, v15.4h, \zr1
+            umlal2      v9.4s, v15.8h, \zr1
+            rshrn       v11.4h, v8.4s, #8
+            rshrn2      v11.8h, v9.4s, #8
+
+            /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
+            ushll       v8.4s, v10.4h, #8
+            ushll       v9.4s, v11.4h, #8
+            umlsl       v8.4s, v10.4h, \xr0
+            umlsl       v9.4s, v11.4h, \xr1
+            umlal2      v8.4s, v10.8h, \xr0
+            umlal2      v9.4s, v11.8h, \xr1
+            shrn        v14.4h, v8.4s, #8
+            shrn2       v14.8h, v9.4s, #8
+
+            /* narrow and pack both lanes into dst: upper half for v20.16b/v21.16b, else lower */
+.ifc \dst, v20.16b
+            uqrshrn2    \dst, v14.8h, #8
+.else ; .ifc \dst, v21.16b
+            uqrshrn2    \dst, v14.8h, #8
+.else
+            uqrshrn     \dst, v14.8h, #8
+.endif ; .endif
+.endm
+
+/* void rsdIntrinsic3DLUT_K(
+ *          void *dst,          // x0
+ *          void const *in,     // x1
+ *          size_t count,       // x2
+ *          void const *lut,    // x3
+ *          int32_t pitchy,     // w4
+ *          int32_t pitchz,     // w5
+ *          int dimx,           // w6
+ *          int dimy,           // w7
+ *          int dimz);          // [sp]
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+            ldr         w8, [sp]                    // w8 = dimz, the one stack argument
+            stp         d8, d9, [sp, #-64]!         // save d8-d15; restored at label 9
+            stp         d10, d11, [sp, #16]
+            stp         d12, d13, [sp, #32]
+            stp         d14, d15, [sp, #48]
+            movi        v4.8b, #1
+            ins         v4.h[0], w6                 // v4.h[0..2] = dimx, dimy, dimz
+            ins         v4.h[1], w7
+            ins         v4.h[2], w8
+            ins         v4.s[2], w4                 // v4.s[2..3] = pitchy, pitchz
+            ins         v4.s[3], w5
+            movi        v5.16b, #0                  // v5 = 0, read by lanepair's zip1
+
+            subs        x2, x2, #8
+            bge         2f                          // at least 8 pixels: full-width loop
+            cmn         x2, #8    // same as cmp x2, #-8
+            ble         9f                          // count <= 0: nothing to do
+            b           4f                          // 1..7 pixels: partial load
+
+            .align 6
+1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+/* x0  = dst
+ * x1  = src
+ * x2  = count
+ * x3  = lut
+ * x4  = pitchy
+ * x5  = pitchz
+ * x6 = offset0
+ * x7 = offset1
+ */
+2:          ld4         {v0.8b-v3.8b}, [x1], #32
+/* v0,v1,v2,v3 source data
+ * v4 dimensions and pitches
+ */
+3:          uxtl        v0.8h, v0.8b
+            uxtl        v1.8h, v1.8b
+            uxtl        v2.8h, v2.8b
+            mul         v0.8h, v0.8h, v4.h[0]
+            mul         v1.8h, v1.8h, v4.h[1]
+            mul         v2.8h, v2.8h, v4.h[2]
+
+/* ursra below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero.  Strictly this is
+ * correct, except for the illegal access problem.
+ */
+            usra        v0.8h, v0.8h, #8
+            usra        v1.8h, v1.8h, #8
+            usra        v2.8h, v2.8h, #8
+
+            ushr        v12.8h, v0.8h, #8
+            ushr        v13.8h, v1.8h, #8
+            ushr        v14.8h, v2.8h, #8
+            bic         v0.8h, #0xff, LSL #8
+            xtn         v1.8b, v1.8h
+            bic         v2.8h, #0xff, LSL #8
+
+/* v0.8h,v1.8b,v2.8h fractional offset
+ * v12.8h,v13.8h,v14.8h integer offset
+ */
+
+            ushll       v6.4s, v12.4h, #2           // offset = x*4 + y*pitchy + z*pitchz
+            ushll2      v7.4s, v12.8h, #2
+            uxtl        v8.4s, v13.4h
+            uxtl2       v9.4s, v13.8h
+            uxtl        v10.4s, v14.4h
+            uxtl2       v11.4s, v14.8h
+            mla         v6.4s, v8.4s,  v4.s[2]
+            mla         v7.4s, v9.4s,  v4.s[2]
+            mla         v6.4s, v10.4s, v4.s[3]
+            mla         v7.4s, v11.4s, v4.s[3]
+
+/* v6,v7 list of table offsets */
+
+        /* lanes 0 and 1 */
+            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
+
+        /* lanes 2 and 3 */
+            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
+
+        /* lanes 4 and 5 */
+            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
+
+        /* lanes 6 and 7 */
+            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
+
+            uzp1        v6.16b, v20.16b, v21.16b    // regroup the packed results for st4
+            uzp2        v7.16b, v20.16b, v21.16b
+            uzp1        v20.16b, v6.16b, v7.16b
+            uzp2        v22.16b, v6.16b, v7.16b
+            mov         v21.d[0], v20.d[1]
+
+            subs        x2, x2, #8
+            mov         v23.8b, v3.8b               // fourth channel passes through unchanged
+
+            bge         1b
+
+            cmn         x2, #8    // same as cmp x2, #-8
+            blt         1f                          // this was the partial batch: partial store
+
+            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+            beq         9f                          // flags still from cmn: nothing left
+
+            /* fill the vector  with a safe value */
+4:          ld4r        {v0.8b-v3.8b}, [x1]
+            tbz         x2, #2, 2f
+            ld4         {v0.b-v3.b}[0], [x1], #4
+            ld4         {v0.b-v3.b}[1], [x1], #4
+            ld4         {v0.b-v3.b}[2], [x1], #4
+            ld4         {v0.b-v3.b}[3], [x1], #4
+2:          tbz         x2, #1, 2f
+            ld4         {v0.b-v3.b}[4], [x1], #4
+            ld4         {v0.b-v3.b}[5], [x1], #4
+2:          tbz         x2, #0, 2f
+            ld4         {v0.b-v3.b}[6], [x1], #4
+2:          b           3b
+
+1:          tst         x2, #4
+            beq         2f
+            st4         {v20.b-v23.b}[0], [x0], #4
+            st4         {v20.b-v23.b}[1], [x0], #4
+            st4         {v20.b-v23.b}[2], [x0], #4
+            st4         {v20.b-v23.b}[3], [x0], #4
+2:          tst         x2, #2
+            beq         2f
+            st4         {v20.b-v23.b}[4], [x0], #4
+            st4         {v20.b-v23.b}[5], [x0], #4
+2:          tst         x2, #1
+            beq         9f
+            st4         {v20.b-v23.b}[6], [x0], #4
+
+9:          ldp         d14, d15, [sp, #48]
+            ldp         d12, d13, [sp, #32]
+            ldp         d10, d11, [sp, #16]
+            ldp         d8, d9, [sp], #64
+            ret
+END(rsdIntrinsic3DLUT_K)
diff --git a/toolkit/Lut3d_neon.S b/toolkit/Lut3d_neon.S
new file mode 100644
index 0000000..9590f9c
--- /dev/null
+++ b/toolkit/Lut3d_neon.S
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart /* begin global function f */
+#define END(f) .fnend; .size f, .-f; /* end function f and record its symbol size */
+/* On ARM, .align n aligns to 2^n bytes, so ENTRY places f on a 16-byte boundary. */
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm /* assemble what follows as ARM (A32) code */
+
+.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
+/* Trilinear table lookup for two pixels: src = two 32-bit byte offsets, dst = two RGBA results. */
+            vmov        r6, r7, \src            /* r6, r7 = the two table offsets held in src */
+/* Clobbers r6, r7, d6-d7 and q8-q15; xr/yr/zr are the per-pixel fractional weights (0-255). */
+            add         r6, r6, r3              /* r3 = lut at the call sites: offset -> address */
+            add         r7, r7, r3
+
+            vld1.u8     d16, [r6], r4           /* entries x, x+1 at (y, z); then r6 += r4 (pitchy) */
+            vld1.u8     d17, [r7], r4
+
+            vld1.u8     d18, [r6], r5           /* entries x, x+1 at (y+1, z); then r6 += r5 (pitchz) */
+            vld1.u8     d19, [r7], r5
+
+            vdup.u8     d6, \yr0                /* Y fraction of the first pixel in every byte lane */
+            vdup.u8     d7, \yr1                /* Y fraction of the second pixel */
+            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
+            vshll.u8    q12, d16, #8            /* a * 256 ...               */
+            vshll.u8    q13, d17, #8
+            vmlsl.u8    q12, d16, d6            /* ... - a * fy ...          */
+            vmlsl.u8    q13, d17, d7
+            vmlal.u8    q12, d18, d6            /* ... + b * fy = a*(256-fy) + b*fy, 16 bits per channel */
+            vmlal.u8    q13, d19, d7
+
+            vld1.u8     d18, [r6]               /* entries x, x+1 at (y+1, z+1) */
+            vld1.u8     d19, [r7]
+
+            sub         r6, r6, r4              /* step back one row: (y, z+1) */
+            sub         r7, r7, r4
+
+            vld1.u8     d16, [r6]               /* entries x, x+1 at (y, z+1) */
+            vld1.u8     d17, [r7]
+
+            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
+            vshll.u8    q14, d16, #8            /* same a*(256-fy) + b*fy blend as the front plane */
+            vshll.u8    q15, d17, #8
+            vmlsl.u8    q14, d16, d6
+            vmlsl.u8    q15, d17, d7
+            vmlal.u8    q14, d18, d6
+            vmlal.u8    q15, d19, d7
+
+            /* Z interpolate, lane 0 q12/q14 -> q10 */
+            vshll.u16   q8, d24, #8             /* front*(256-fz) + rear*fz, computed in 32 bits */
+            vshll.u16   q9, d25, #8
+            vmlsl.u16   q8, d24, \zr0
+            vmlsl.u16   q9, d25, \zr0
+            vmlal.u16   q8, d28, \zr0
+            vmlal.u16   q9, d29, \zr0
+            vrshrn.u32  d20, q8, #8             /* rounding narrow to 16 bits: d20 = entry x */
+            vrshrn.u32  d21, q9, #8             /* d21 = entry x+1 */
+
+            /* Z interpolate, lane 1 q13/q15 -> q11 */
+            vshll.u16   q8, d26, #8
+            vshll.u16   q9, d27, #8
+            vmlsl.u16   q8, d26, \zr1
+            vmlsl.u16   q9, d27, \zr1
+            vmlal.u16   q8, d30, \zr1
+            vmlal.u16   q9, d31, \zr1
+            vrshrn.u32  d22, q8, #8             /* d22 = entry x, d23 = entry x+1 (second pixel) */
+            vrshrn.u32  d23, q9, #8
+
+            /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
+            vshll.u16   q8, d20, #8             /* x*(256-fx) + (x+1)*fx, first pixel in q8 */
+            vshll.u16   q9, d22, #8             /* second pixel in q9 */
+            vmlsl.u16   q8, d20, \xr0
+            vmlsl.u16   q9, d22, \xr1
+            vmlal.u16   q8, d21, \xr0
+            vmlal.u16   q9, d23, \xr1
+            vshrn.u32   d28, q8, #8             /* truncating narrow back to 16 bits per channel */
+            vshrn.u32   d29, q9, #8
+
+            /* pack both pixels of the pair -> dst (4 bytes each) */
+            vqrshrn.u16  \dst, q14, #8          /* saturating, rounding narrow to 8 bits per channel */
+.endm
+
+/* void rsdIntrinsic3DLUT_K(
+ *          void *dst,          // r0, output pixels (4 bytes each)
+ *          void const *in,     // r1, input pixels (4 bytes each)
+ *          size_t count,       // r2, number of pixels to process
+ *          void const *lut,    // r3, base address of the table
+ *          int32_t pitchy,     // [sp], byte step between table rows
+ *          int32_t pitchz,     // [sp+#4], byte step between table planes
+ *          int dimx,           // [sp+#8]   NOTE(review): dimx/dimy/dimz are used directly as
+ *          int dimy,           // [sp+#12]  the 16-bit per-axis scale, so they are presumably
+ *          int dimz);          // [sp+#16]  (size - 1) -- confirm against the caller.
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+            push        {r4,r5,r6,r7}           /* 16 bytes pushed: stack args now start at [sp, #16] */
+            ldr         r4, [sp, #16]           /* r4  = pitchy */
+            ldr         r5, [sp, #20]           /* r5  = pitchz */
+            ldr         r6, [sp, #24]           /* r6  = dimx */
+            ldr         r7, [sp, #28]           /* r7  = dimy */
+            ldr         r12, [sp, #32]          /* r12 = dimz */
+            vpush       {d8-d15}                /* callee-saved under the AAPCS; restored at label 9 */
+
+            vmov.u8     d8, #1
+            vmov.u16    d8[0], r6               /* d8 = {dimx, dimy, dimz, unused} as 16-bit lanes */
+            vmov.u16    d8[1], r7
+            vmov.u16    d8[2], r12
+            vmov        d9, r4, r5              /* d9 = {pitchy, pitchz} as 32-bit lanes */
+
+            subs        r2, #8                  /* r2 = count - 8 from here on */
+            bge         2f                      /* at least 8 pixels: enter the main loop */
+            cmp         r2, #-8
+            ble         9f                      /* r2 <= -8: no pixels to process */
+            b           4f                      /* 1..7 pixels: go straight to the partial load */
+
+            .align 6                            /* 64-byte align the loop */
+1:          vst4.u8     {d12,d13,d14,d15}, [r0]!    /* loop back-edge: store the 8 pixels just computed */
+/* r0  = dst
+ * r1  = src
+ * r2  = count (pixels remaining, minus 8)
+ * r3  = lut
+ * r4  = pitchy
+ * r5  = pitchz
+ * r6 = offset0 (scratch inside lanepair)
+ * r7 = offset1 (scratch inside lanepair)
+ */
+2:          vld4.u8     {d0,d2,d4,d6}, [r1]!    /* load 8 pixels, de-interleaved by channel */
+3:          vmov        d10, d6                 /* keep channel 3; d6 is reused as scratch below */
+/* q0,q1,q2 source channels 0-2 (once widened below); d10 (q5) holds source channel 3
+ * q4 dimensions and pitches
+ * q3, scratch register for scalar access
+ */
+            vmov        q3, q4                  /* 16-bit by-scalar operands must come from d0-d7 */
+            vmovl.u8    q0, d0                  /* widen each channel to 16 bits */
+            vmovl.u8    q1, d2
+            vmovl.u8    q2, d4
+            vmul.u16    q0, q0, d6[0]           /* channel 0 * dimx */
+            vmul.u16    q1, q1, d6[1]           /* channel 1 * dimy */
+            vmul.u16    q2, q2, d6[2]           /* channel 2 * dimz */
+
+/* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero.  Strictly this is
+ * correct, except for the illegal access problem.
+ */
+            vsra.u16    q0, q0, #8              /* v += v >> 8 (truncating shift, see above) */
+            vsra.u16    q1, q1, #8
+            vsra.u16    q2, q2, #8
+
+            vshr.u16    q12, q0, #8             /* high byte = integer table index */
+            vshr.u16    q13, q1, #8
+            vshr.u16    q14, q2, #8
+
+            vbic.u16    q0, #0xff00             /* low byte = fraction, kept in 16-bit lanes */
+            vmovn.u16   d2, q1                  /* y fraction narrowed to bytes (lanepair uses vdup.u8) */
+            vbic.u16    q2, #0xff00
+
+/* q0,d2,q2 fractional offset
+ * q12,q13,q14 integer offset
+ */
+
+            vshll.u16   q6, d24, #2             /* x index * 4 (bytes per entry), widened to 32 bits */
+            vshll.u16   q7, d25, #2
+            vmovl.u16   q8, d26                 /* y index, pixels 0-3 */
+            vmovl.u16   q9, d27                 /* y index, pixels 4-7 */
+            vmovl.u16   q10, d28                /* z index, pixels 0-3 */
+            vmovl.u16   q11, d29                /* z index, pixels 4-7 */
+            vmla.s32    q6, q8,  d9[0]          /* += y index * pitchy */
+            vmla.s32    q7, q9,  d9[0]
+            vmla.s32    q6, q10, d9[1]          /* += z index * pitchz */
+            vmla.s32    q7, q11, d9[1]
+
+/* q6,q7 list of table offsets */
+
+        /* lanes 0 and 1 */
+            lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]
+
+        /* lanes 2 and 3 */
+            lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]
+
+        /* lanes 4 and 5 */
+            lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]
+
+        /* lanes 6 and 7 */
+            lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]
+
+            vuzp.u8     d12, d13                /* regroup the 8 interleaved results by channel ... */
+            vuzp.u8     d14, d15
+            vuzp.u8     d12, d14                /* ... d12-d15 = channels 0-3, the layout vst4 expects */
+            vuzp.u8     d13, d15
+
+            subs        r2, r2, #8
+            vmov.u8     d15, d10                /* channel 3 of the output is the input's channel 3 */
+
+            bge         1b                      /* 8 or more pixels still to load: store and loop */
+
+            cmp         r2, #-8
+            blt         1f                      /* r2 < -8: this was a partial vector -> partial store */
+
+            vst4.u8     {d12,d13,d14,d15}, [r0]!    /* last full vector of 8 */
+
+            beq         9f                      /* flags still from the cmp: r2 == -8, nothing left */
+
+            /* fill the vector with a safe value */
+4:          vld1.u32    {d0[]}, [r1]            /* replicate the next pixel into both halves of d0 */
+            vmov        d2, d0                  /* unused lanes then hold a valid pixel */
+            vmov        d4, d0
+            vmov        d6, d0
+            tst         r2, #4                  /* low 3 bits of r2 = number of pixels remaining */
+            beq         2f
+            vld1.u32    {d0}, [r1]!             /* 4 pixels: two into d0 ... */
+            vld1.u32    {d2}, [r1]!             /* ... and two into d2 */
+2:          tst         r2, #2
+            beq         2f
+            vld1.u32    {d4}, [r1]!             /* 2 pixels */
+2:          tst         r2, #1
+            beq         2f
+            vld1.u32    {d6[0]}, [r1]!          /* 1 pixel */
+2:          vuzp.8      d0, d2                  /* de-interleave into the per-channel layout that */
+            vuzp.8      d4, d6                  /* the vld4 at label 2 produces */
+            vuzp.8      d0, d4
+            vuzp.8      d2, d6
+            b           3b                      /* process like a full vector; r2 is now below zero */
+
+1:          vzip.8      d12, d14                /* partial store: re-interleave the channels so that */
+            vzip.8      d13, d15                /* d12, d13, d14, d15 each hold two whole pixels */
+            vzip.8      d12, d13
+            vzip.8      d14, d15
+            tst         r2, #4
+            beq         2f
+            vst1.u32    {d12,d13}, [r0]!        /* 4 pixels */
+2:          tst         r2, #2
+            beq         2f
+            vst1.u32    {d14}, [r0]!            /* 2 pixels */
+2:          tst         r2, #1
+            beq         9f
+            vst1.u32    {d15[0]}, [r0]!         /* 1 pixel */
+
+9:          mov         r0, #0                  /* r0 is set to 0 although the prototype above is void */
+            vpop        {d8-d15}
+            pop         {r4,r5,r6,r7}
+            bx lr
+END(rsdIntrinsic3DLUT_K)
diff --git a/toolkit/README.txt b/toolkit/README.txt
new file mode 100644
index 0000000..4e08dc5
--- /dev/null
+++ b/toolkit/README.txt
@@ -0,0 +1,9 @@
+This directory will contain the standalone library meant to replace the RenderScript Intrinsics.
+
+The work in this directory is not complete.
+
+To make the review process manageable, a series of smaller CLs will be reviewed and submitted.
+
+While it is initially built with Soong, the end goal is to move this to GitHub once the work
+has been completed. This is a staging area for the reviews.
+
diff --git a/toolkit/RenderScriptToolkit.cpp b/toolkit/RenderScriptToolkit.cpp
new file mode 100644
index 0000000..f110317
--- /dev/null
+++ b/toolkit/RenderScriptToolkit.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RenderScriptToolkit.h"
+
+#include "TaskProcessor.h"
+
+#define LOG_TAG "renderscript.toolkit.RenderScriptToolkit"
+
+namespace android {
+namespace renderscript {
+
+// You will find the implementation of the various transformations in the correspondingly
+// named source file. E.g. RenderScriptToolkit::blur() is found in Blur.cpp.
+
+RenderScriptToolkit::RenderScriptToolkit(int numberOfThreads)  // Builds the owned TaskProcessor.
+    : processor(std::make_unique<TaskProcessor>(numberOfThreads)) {}
+
+// Defined out of line, in this file, because destroying the unique_ptr<TaskProcessor> member
+// needs the complete TaskProcessor type; this keeps TaskProcessor.h out of
+// RenderScriptToolkit.h.
+RenderScriptToolkit::~RenderScriptToolkit() = default;
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/RenderScriptToolkit.h b/toolkit/RenderScriptToolkit.h
new file mode 100644
index 0000000..fb33195
--- /dev/null
+++ b/toolkit/RenderScriptToolkit.h
@@ -0,0 +1,540 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
+#define ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
+
+#include <cstdint>
+#include <memory>
+
+namespace android {
+namespace renderscript {
+
+class TaskProcessor;
+
+/**
+ * A rectangular window of cells that a Toolkit operation is limited to.
+ *
+ * Toolkit methods take an optional pointer to one of these to process only part of the input
+ * tensor. On each axis the start index is included and the end index is not.
+ *
+ * @property startX The first index included on the X axis.
+ * @property endX One past the last index included on the X axis.
+ * @property startY The first index included on the Y axis.
+ * @property endY One past the last index included on the Y axis.
+ */
+struct Restriction {
+    // X axis: indices in [startX, endX).
+    size_t startX, endX;
+    // Y axis: indices in [startY, endY).
+    size_t startY, endY;
+};
+
+/**
+ * A collection of high-performance graphic utility functions like blur and blend.
+ *
+ * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve,
+ * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute
+ * multithreaded on the CPU.
+ *
+ * These functions work over raw byte arrays. You'll need to specify the width and height of
+ * the data to be processed, as well as the number of bytes per pixel. For most use cases,
+ * this will be 4.
+ *
+ * You should instantiate the Toolkit once and reuse it throughout your application.
+ * On instantiation, the Toolkit creates a thread pool that's used for processing all the functions.
+ * You can limit the number of pool threads used by the Toolkit via the constructor. The pool
+ * threads are destroyed once the Toolkit is destroyed, after any pending work is done.
+ *
+ * This library is thread safe. You can call methods from different pool threads. The functions will
+ * execute sequentially.
+ *
+ * A Java/Kotlin Toolkit is available. It calls this library through JNI.
+ *
+ * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared
+ * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However RenderScript
+ * Intrinsics allow more flexibility for the type of allocation supported. In particular, this
+ * toolkit does not support allocations of floats.
+ */
+class RenderScriptToolkit {
+    /** Each Toolkit method call is converted to a Task. The processor owns the thread pool. It
+     * tiles the tasks and schedules them over the pool threads.
+     */
+    std::unique_ptr<TaskProcessor> processor;
+
+   public:
+    /**
+     * Creates the pool threads that are used for processing the method calls.
+     */
+    RenderScriptToolkit(int numberOfThreads = 0);
+    /**
+     * Destroys the thread pool. This stops any in-progress work; the Toolkit methods called from
+     * other pool threads will return without having completed the work. Because of the undefined
+     * state of the output buffers, an application should avoid destroying the Toolkit if other pool
+     * threads are executing Toolkit methods.
+     */
+    ~RenderScriptToolkit();
+
+    /**
+     * Determines how a source buffer is blended into a destination buffer.
+     *
+     * See {@link RenderScriptToolkit::blend}.
+     *
+     * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents
+     * the alpha channel.
+     */
+    enum class BlendingMode {
+        /**
+         * dest = 0
+         *
+         * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0)
+         */
+        CLEAR = 0,
+        /**
+         * dest = src
+         *
+         * Sets each pixel of the destination to the corresponding one in the source.
+         */
+        SRC = 1,
+        /**
+         * dest = dest
+         *
+         * Leaves the destination untouched. This is a no-op.
+         */
+        DST = 2,
+        /**
+         * dest = src + dest * (1.0 - src.a)
+         */
+        SRC_OVER = 3,
+        /**
+         * dest = dest + src * (1.0 - dest.a)
+         */
+        DST_OVER = 4,
+        /**
+         * dest = src * dest.a
+         */
+        SRC_IN = 5,
+        /**
+         * dest = dest * src.a
+         */
+        DST_IN = 6,
+        /**
+         * dest = src * (1.0 - dest.a)
+         */
+        SRC_OUT = 7,
+        /**
+         * dest = dest * (1.0 - src.a)
+         */
+        DST_OUT = 8,
+        /**
+         * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a
+         */
+        SRC_ATOP = 9,
+        /**
+         * dest.rgb = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a
+         */
+        DST_ATOP = 10,
+        /**
+         * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a}
+         *
+         * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor.
+         */
+        XOR = 11,
+        /**
+         * dest = src * dest
+         */
+        MULTIPLY = 12,
+        /**
+         * dest = min(src + dest, 1.0)
+         */
+        ADD = 13,
+        /**
+         * dest = max(dest - src, 0.0)
+         */
+        SUBTRACT = 14
+    };
+
+    /**
+     * Blend a source buffer with the destination buffer.
+     *
+     * Blends a source buffer and a destination buffer, placing the result in the destination
+     * buffer. The blending is done pairwise between two corresponding RGBA values found in
+     * each buffer. The mode parameter specifies one of fifteen blending operations.
+     * See {@link BlendingMode}.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The source and destination buffers must have the same dimensions. Both buffers should be
+     * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout.
+     *
+     * @param mode The specific blending operation to do.
+     * @param source The RGBA input buffer.
+     * @param dst The destination buffer. Used for input and output.
+     * @param sizeX The width of both buffers, as a number of RGBA values.
+     * @param sizeY The height of both buffers, as a number of RGBA values.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void blend(BlendingMode mode, const uint8_t* _Nonnull source, uint8_t* _Nonnull dst,
+               size_t sizeX, size_t sizeY, const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Blur an image.
+     *
+     * Performs a Gaussian blur of the input image and stores the result in the out buffer.
+     *
+     * The radius determines which pixels are used to compute each blurred pixel. This Toolkit
+     * accepts values between 1 and 25. Larger values create a more blurred effect but also
+     * take longer to compute. When the radius extends past the edge, the edge pixel will
+     * be used as a replacement for any pixel that is out of bounds.
+     *
+     * Each input pixel can be represented either by four bytes (RGBA format) or by one byte,
+     * for the less common case of blurring an alpha-channel-only image.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The input and output buffers must have the same dimensions. Both buffers should be
+     * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
+     *
+     * @param in The buffer of the image to be blurred.
+     * @param out The buffer that receives the blurred image.
+     * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
+     * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA.
+     * @param radius The radius of the pixels used to blur.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void blur(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+              size_t vectorSize, int radius, const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Identity matrix that can be passed to the {@link RenderScriptToolkit::colorMatrix} method.
+     *
+     * Using this matrix will result in no change to the pixel through multiplication although
+     * the pixel value can still be modified by the add vector, or transformed to a different
+     * format.
+     */
+    static constexpr float kIdentityMatrix[] =  {
+            1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, 1.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /**
+     * Matrix to turn color pixels to a grey scale.
+     *
+     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert an
+     * image from color to greyscale.
+     */
+    static constexpr float kGreyScaleColorMatrix[] = {
+            0.299f, 0.299f, 0.299f, 0.0f,
+            0.587f, 0.587f, 0.587f, 0.0f,
+            0.114f, 0.114f, 0.114f, 0.0f,
+            0.0f,   0.0f,   0.0f,   1.0f
+    };
+
+    /**
+     * Matrix to convert RGB to YUV.
+     *
+     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
+     * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha
+     * channel) untouched.
+     *
+     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
+     * by this method.
+     */
+    static constexpr float kRgbToYuvMatrix[] = {
+            0.299f, -0.14713f,  0.615f,   0.0f,
+            0.587f, -0.28886f, -0.51499f, 0.0f,
+            0.114f,  0.436f,   -0.10001f, 0.0f,
+            0.0f,    0.0f,      0.0f,     1.0f
+    };
+
+    /**
+     * Matrix to convert YUV to RGB.
+     *
+     * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the
+     * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha
+     * channel) untouched.
+     *
+     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
+     * by this method. Use {@link RenderScriptToolkit::yuvToRgb} to convert these buffers.
+     */
+    static constexpr float kYuvToRgbMatrix[] = {
+            1.0f,      1.0f,     1.0f,     0.0f,
+            0.0f,     -0.39465f, 2.03211f, 0.0f,
+            1.13983f, -0.5806f,  0.0f,     0.0f,
+            0.0f,      0.0f,     0.0f,     1.0f
+    };
+
+    /**
+     * Transform an image using a color matrix.
+     *
+     * Converts a 2D array of vectors of unsigned bytes, multiplying each vector by a 4x4 matrix
+     * and adding an optional vector.
+     *
+     * Each input vector is composed of 1-4 unsigned bytes. If less than 4 bytes, it's extended to
+     * 4, padding with zeroes. The unsigned bytes are converted from 0-255 to 0.0-1.0 floats
+     * before the multiplication is done.
+     *
+     * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
+     * If the output vector size is less than four, the unused channels are discarded.
+     *
+     * If addVector is null, a vector of zeroes is added, i.e. a noop.
+     *
+     * Check kIdentityMatrix, kGreyScaleColorMatrix, kRgbToYuvMatrix, and kYuvToRgbMatrix for sample
+     * matrices. The YUV conversion may not work for all color spaces.
+     *
+     * @param in The buffer of the image to be converted.
+     * @param out The buffer that receives the converted image.
+     * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4.
+     * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4.
+     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+     * @param matrix The 4x4 matrix to multiply, in row major format.
+     * @param addVector A vector of four floats that's added to the result of the multiplication.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void colorMatrix(const void* _Nonnull in, void* _Nonnull out, size_t inputVectorSize,
+                     size_t outputVectorSize, size_t sizeX, size_t sizeY,
+                     const float* _Nonnull matrix, const float* _Nullable addVector = nullptr,
+                     const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Convolve a ByteArray.
+     *
+     * Applies a 3x3 or 5x5 convolution to the input array using the provided coefficients.
+     *
+     * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
+     * The coefficients should be provided in row-major format.
+     *
+     * When the square extends past the edge, the edge values will be used as replacements for
+     * the values that are out of bounds.
+     *
+     * Each input cell is represented by one to four bytes. Each byte is multiplied
+     * and accumulated independently of the other bytes of the cell.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The input and output buffers must have the same dimensions. Both buffers should be
+     * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
+     *
+     * @param in The buffer of the image to be convolved.
+     * @param out The buffer that receives the convolved image.
+     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+     * @param coefficients 9 or 25 multipliers.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void convolve3x3(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX,
+                     size_t sizeY, const float* _Nonnull coefficients,
+                     const Restriction* _Nullable restriction = nullptr);
+
+    void convolve5x5(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX,
+                     size_t sizeY, const float* _Nonnull coefficients,
+                     const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Compute the histogram of an image.
+     *
+     * Tallies how many times each of the 256 possible values of a byte is found in the input.
+     *
+     * An input cell can be represented by one to four bytes. The tally is done independently
+     * for each of the bytes of the cell. Correspondingly, the out array will have
+     * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for
+     * value 1, etc.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The source buffers should be large enough for sizeX * sizeY * vectorSize bytes. The buffers
+     * have a row-major layout. The out buffer should be large enough for 256 * vectorSize ints.
+     *
+     * @param in The buffer of the image to be analyzed.
+     * @param out The resulting vector of counts.
+     * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
+     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void histogram(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY,
+                   size_t vectorSize, const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Compute the histogram of the dot product of an image.
+     *
+     * This method supports cells of 1 to 4 bytes in length. For each cell of the array,
+     * the dot product of its bytes with the provided coefficients is computed. The resulting
+     * floating point value is converted to an unsigned byte and tallied in the histogram.
+     *
+     * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
+     * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
+     *
+     * Each coefficient must be >= 0 and their sum must be 1.0 or less. There must be the same
+     * number of coefficients as vectorSize.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The source buffers should be large enough for sizeX * sizeY * vectorSize bytes. The buffers
+     * have a row-major layout. The out array should be large enough for 256 ints.
+     *
+     * @param in The buffer of the image to be analyzed.
+     * @param out The resulting vector of counts.
+     * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
+     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+     * @param coefficients The values used for the dot product. Can be nullptr.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void histogramDot(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY,
+                      size_t vectorSize, const float* _Nullable coefficients,
+                      const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Transform an image using a look up table
+     *
+     * Transforms an image by using a per-channel lookup table. Each channel of the input has an
+     * independent lookup table. The tables are 256 entries in size and can cover the full value
+     * range of a byte.
+     *
+     * The input array should be in RGBA format, where four consecutive bytes form a cell.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The input and output buffers must have the same dimensions. Both buffers should be
+     * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout.
+     *
+     * @param in The buffer of the image to be transformed.
+     * @param out The buffer that receives the transformed image.
+     * @param sizeX The width of both buffers, as a number of 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 4 byte cells.
+     * @param red An array of 256 values that's used to convert the R channel.
+     * @param green An array of 256 values that's used to convert the G channel.
+     * @param blue An array of 256 values that's used to convert the B channel.
+     * @param alpha An array of 256 values that's used to convert the A channel.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void lut(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+             const uint8_t* _Nonnull red, const uint8_t* _Nonnull green,
+             const uint8_t* _Nonnull blue, const uint8_t* _Nonnull alpha,
+             const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Transform an image using a 3D look up table
+     *
+     * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
+     * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
+     * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
+     * is stored in the output.
+     *
+     * The input array should be in RGBA format, where four consecutive bytes form a cell.
+     * The fourth byte of each input cell is ignored.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The input and output buffers must have the same dimensions. Both buffers should be
+     * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout.
+     *
+     * @param in The buffer of the image to be transformed.
+     * @param out The buffer that receives the transformed image.
+     * @param sizeX The width of both buffers, as a number of 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 4 byte cells.
+     * @param cube The translation cube, in row-major format.
+     * @param cubeSizeX The number of RGBA entries in the cube in the X direction.
+     * @param cubeSizeY The number of RGBA entries in the cube in the Y direction.
+     * @param cubeSizeZ The number of RGBA entries in the cube in the Z direction.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void lut3d(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+               const uint8_t* _Nonnull cube, size_t cubeSizeX, size_t cubeSizeY, size_t cubeSizeZ,
+               const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * Resize an image.
+     *
+     * Resizes an image using bicubic interpolation.
+     *
+     * This method supports cells of 1 to 4 bytes in length. Each byte of the cell is
+     * interpolated independently from the others.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of the output buffer. The corresponding scaled range of the input will be used.  If provided,
+     * the range must be wholly contained within the dimensions described by outputSizeX and
+     * outputSizeY.
+     *
+     * The input and output buffers have a row-major layout. They should be large enough for
+     * inputSizeX * inputSizeY and outputSizeX * outputSizeY cells, respectively.
+     *
+     * @param in The buffer of the image to be resized.
+     * @param out The buffer that receives the resized image.
+     * @param inputSizeX The width of the input buffer, as a number of 1-4 byte cells.
+     * @param inputSizeY The height of the input buffer, as a number of 1-4 byte cells.
+     * @param vectorSize The number of bytes in each cell of both buffers. A value from 1 to 4.
+     * @param outputSizeX The width of the output buffer, as a number of 1-4 byte cells.
+     * @param outputSizeY The height of the output buffer, as a number of 1-4 byte cells.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    void resize(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t inputSizeX,
+                size_t inputSizeY, size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
+                const Restriction* _Nullable restriction = nullptr);
+
+    /**
+     * The YUV formats supported by yuvToRgb.
+     */
+    enum class YuvFormat {
+        NV21 = 0x11,
+        YV12 = 0x32315659,
+    };
+
+    /**
+     * Convert an image from YUV to RGB.
+     *
+     * Converts an Android YUV buffer to RGB. The input buffer should be
+     * supplied in one of the supported YUV formats (see YuvFormat).
+     * The output is RGBA; the alpha channel will be set to 255.
+     *
+     * Note that for YV12 and a sizeX that's not a multiple of 32, the
+     * RenderScript Intrinsic may not have converted the image correctly.
+     * This Toolkit method should.
+     *
+     * @param in The buffer of the image to be converted.
+     * @param out The buffer that receives the converted image.
+     * @param sizeX The width in pixels of the image. Must be even.
+     * @param sizeY The height in pixels of the image.
+     * @param format Either YV12 or NV21.
+     */
+    void yuvToRgb(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY,
+                  YuvFormat format);
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H
diff --git a/toolkit/Resize.cpp b/toolkit/Resize.cpp
new file mode 100644
index 0000000..624ae8e
--- /dev/null
+++ b/toolkit/Resize.cpp
@@ -0,0 +1,769 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <math.h>
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#if defined(ARCH_X86_HAVE_AVX2)
+#include <stdint.h>
+#include <x86intrin.h>
+#include <xmmintrin.h>
+#endif
+
+#define LOG_TAG "renderscript.toolkit.Resize"
+
+namespace android {
+namespace renderscript {
+
+class ResizeTask : public Task {  // Bicubic resize of one image, processed tile by tile.
+    const uchar* mIn;  // Input image buffer.
+    uchar* mOut;  // Output image buffer.
+    float mScaleX;  // inputSizeX / outputSizeX, computed in the constructor.
+    float mScaleY;  // inputSizeY / outputSizeY, computed in the constructor.
+    size_t mInputSizeX;  // Input width, in cells.
+    size_t mInputSizeY;  // Input height, in cells.
+    // Per-row kernels, one per cell width; each fills output cells [xstart, xend) of row currentY.
+    void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+    void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+    void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+    void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+    void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+    void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
+               size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
+               const Restriction* restriction)
+        : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},  // Sized by the output.
+          mIn{input},
+          mOut{output},
+          mInputSizeX{inputSizeX},
+          mInputSizeY{inputSizeY} {
+        mScaleX = static_cast<float>(inputSizeX) / outputSizeX;  // Input cells per output cell.
+        mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
+    }
+};
+
+void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                             size_t endY) {
+    typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);
+
+    // Choose the row kernel for the cell size; 3-byte cells share the 4-byte kernel.
+    KernelFunction kernel = nullptr;
+    switch (mVectorSize) {
+        case 4:
+        case 3:
+            kernel = &ResizeTask::kernelU4;
+            break;
+        case 2:
+            kernel = &ResizeTask::kernelU2;
+            break;
+        case 1:
+            kernel = &ResizeTask::kernelU1;
+            break;
+        default:
+            ALOGE("Bad vector size %zu", mVectorSize);
+            return;  // Never invoke an unset member-function pointer.
+    }
+    // Run the kernel once per output row of the tile, over columns [startX, endX).
+    for (size_t y = startY; y < endY; y++) {
+        size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
+        uchar* out = mOut + offset;
+        std::invoke(kernel, this, out, startX, endX, y);
+    }
+}
+
+static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
+    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+            + x * (3.f * (p1 - p2) + p3 - p0)));  // Cubic in x: yields p1 at x=0, p2 at x=1.
+}
+
+static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
+    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+            + x * (3.f * (p1 - p2) + p3 - p0)));  // Cubic in x: yields p1 at x=0, p2 at x=1.
+}
+
+
+#if defined(ARCH_X86_HAVE_AVX2)  // Variant that evaluates two sub-terms with FMA intrinsics.
+static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
+   return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 +  // next line: 4*p2 - p3
+           _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2),_mm_set1_ps(p3)))
+           + x * (_mm_cvtss_f32(_mm_fmadd_ss (_mm_set1_ps(3.f),_mm_set1_ps(p1 - p2),
+                                              _mm_set1_ps(p3 - p0))))));  // 3*(p1-p2) + (p3-p0)
+
+}
+#else  // Plain scalar form of the same polynomial.
+static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
+    //ALOGI("CP, %f, %f, %f, %f, %f", p0, p1, p2, p3, x);
+    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+            + x * (3.f * (p1 - p2) + p3 - p0)));  // Cubic in x: yields p1 at x=0, p2 at x=1.
+}
+#endif
+
+static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
+                         float xf, float yf, int width) {  // Returns one bicubic sample.
+    int startx = (int) floor(xf - 1);  // First of the four source columns used.
+    xf = xf - floor(xf);  // Fractional position between columns xs1 and xs2.
+    int maxx = width - 1;
+    int xs0 = std::max(0, startx + 0);
+    int xs1 = std::max(0, startx + 1);
+    int xs2 = std::min(maxx, startx + 2);
+    int xs3 = std::min(maxx, startx + 3);
+    // Interpolate along x within each of the four rows; edge columns were clamped above.
+    float4 p0  = cubicInterpolate(convert<float4>(yp0[xs0]),
+                                  convert<float4>(yp0[xs1]),
+                                  convert<float4>(yp0[xs2]),
+                                  convert<float4>(yp0[xs3]), xf);
+
+    float4 p1  = cubicInterpolate(convert<float4>(yp1[xs0]),
+                                  convert<float4>(yp1[xs1]),
+                                  convert<float4>(yp1[xs2]),
+                                  convert<float4>(yp1[xs3]), xf);
+
+    float4 p2  = cubicInterpolate(convert<float4>(yp2[xs0]),
+                                  convert<float4>(yp2[xs1]),
+                                  convert<float4>(yp2[xs2]),
+                                  convert<float4>(yp2[xs3]), xf);
+
+    float4 p3  = cubicInterpolate(convert<float4>(yp3[xs0]),
+                                  convert<float4>(yp3[xs1]),
+                                  convert<float4>(yp3[xs2]),
+                                  convert<float4>(yp3[xs3]), xf);
+
+    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);  // Then interpolate the row results along y.
+    p = clamp(p + 0.5f, 0.f, 255.f);  // Bias by 0.5 and limit to [0, 255] before converting.
+    return convert<uchar4>(p);
+}
+
+static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
+                         float xf, float yf, int width) {  // Returns one bicubic sample.
+    int startx = (int) floor(xf - 1);  // First of the four source columns used.
+    xf = xf - floor(xf);  // Fractional position between columns xs1 and xs2.
+    int maxx = width - 1;
+    int xs0 = std::max(0, startx + 0);
+    int xs1 = std::max(0, startx + 1);
+    int xs2 = std::min(maxx, startx + 2);
+    int xs3 = std::min(maxx, startx + 3);
+    // Interpolate along x within each of the four rows; edge columns were clamped above.
+    float2 p0  = cubicInterpolate(convert<float2>(yp0[xs0]),
+                                  convert<float2>(yp0[xs1]),
+                                  convert<float2>(yp0[xs2]),
+                                  convert<float2>(yp0[xs3]), xf);
+
+    float2 p1  = cubicInterpolate(convert<float2>(yp1[xs0]),
+                                  convert<float2>(yp1[xs1]),
+                                  convert<float2>(yp1[xs2]),
+                                  convert<float2>(yp1[xs3]), xf);
+
+    float2 p2  = cubicInterpolate(convert<float2>(yp2[xs0]),
+                                  convert<float2>(yp2[xs1]),
+                                  convert<float2>(yp2[xs2]),
+                                  convert<float2>(yp2[xs3]), xf);
+
+    float2 p3  = cubicInterpolate(convert<float2>(yp3[xs0]),
+                                  convert<float2>(yp3[xs1]),
+                                  convert<float2>(yp3[xs2]),
+                                  convert<float2>(yp3[xs3]), xf);
+
+    float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);  // Then interpolate the row results along y.
+    p = clamp(p + 0.5f, 0.f, 255.f);  // Bias by 0.5 and limit to [0, 255] before converting.
+    return convert<uchar2>(p);
+}
+
+static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
+                        float xf, float yf, int width) {  // Returns one bicubic sample.
+    int startx = (int) floor(xf - 1);  // First of the four source columns used.
+    xf = xf - floor(xf);  // Fractional position between columns xs1 and xs2.
+    int maxx = width - 1;
+    int xs0 = std::max(0, startx + 0);
+    int xs1 = std::max(0, startx + 1);
+    int xs2 = std::min(maxx, startx + 2);
+    int xs3 = std::min(maxx, startx + 3);
+    // Interpolate along x within each of the four rows; edge columns were clamped above.
+    float p0  = cubicInterpolate((float)yp0[xs0], (float)yp0[xs1],
+                                 (float)yp0[xs2], (float)yp0[xs3], xf);
+    float p1  = cubicInterpolate((float)yp1[xs0], (float)yp1[xs1],
+                                 (float)yp1[xs2], (float)yp1[xs3], xf);
+    float p2  = cubicInterpolate((float)yp2[xs0], (float)yp2[xs1],
+                                 (float)yp2[xs2], (float)yp2[xs3], xf);
+    float p3  = cubicInterpolate((float)yp3[xs0], (float)yp3[xs1],
+                                 (float)yp3[xs2], (float)yp3[xs3], xf);
+
+    float p  = cubicInterpolate(p0, p1, p2, p3, yf);  // Then interpolate the row results along y.
+    p = clamp(p + 0.5f, 0.f, 255.f);  // Bias by 0.5 and limit to [0, 255]; the cast truncates.
+    //ALOGI("CUC,%f,%u", p, (uchar)p);
+    return (uchar)p;
+}
+
+extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
+
+extern "C" void rsdIntrinsicResizeB4_K(
+            uchar4 *dst,
+            size_t count,
+            uint32_t xf,
+            uint32_t xinc,
+            uchar4 const *srcn,
+            uchar4 const *src0,
+            uchar4 const *src1,
+            uchar4 const *src2,
+            size_t xclip,
+            size_t avail,
+            uint64_t osc_ctl,
+            int32_t const *yr);
+
+extern "C" void rsdIntrinsicResizeB2_K(
+            uchar2 *dst,
+            size_t count,
+            uint32_t xf,
+            uint32_t xinc,
+            uchar2 const *srcn,
+            uchar2 const *src0,
+            uchar2 const *src1,
+            uchar2 const *src2,
+            size_t xclip,
+            size_t avail,
+            uint64_t osc_ctl,
+            int32_t const *yr);
+
+extern "C" void rsdIntrinsicResizeB1_K(
+            uchar *dst,
+            size_t count,
+            uint32_t xf,
+            uint32_t xinc,
+            uchar const *srcn,
+            uchar const *src0,
+            uchar const *src1,
+            uchar const *src2,
+            size_t xclip,
+            size_t avail,
+            uint64_t osc_ctl,
+            int32_t const *yr);
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+static void mkYCoeff(int32_t *yr, float yf) {  // Fills yr[0..3] from the fractional offset yf.
+    int32_t yf1 = rint(yf * 0x10000);  // yf scaled by 2^16 (fixed point).
+    int32_t yf2 = rint(yf * yf * 0x10000);  // yf squared, same scale.
+    int32_t yf3 = rint(yf * yf * yf * 0x10000);  // yf cubed, same scale.
+    // Four weights built from those powers; callers pass yr on to the rsdIntrinsicResizeB?_K code.
+    yr[0] = -(2 * yf2 - yf3 - yf1) >> 1;
+    yr[1] = (3 * yf3 - 5 * yf2 + 0x20000) >> 1;
+    yr[2] = (-3 * yf3 + 4 * yf2 + yf1) >> 1;
+    yr[3] = -(yf3 - yf2) >> 1;
+}
+#endif
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
+                         float xf, float yf, int width) {  // Returns one bicubic sample.
+    int startx = (int) floor(xf - 1);  // First of the four source columns used.
+    xf = xf - floor(xf);  // Fractional position between columns xs1 and xs2.
+    int maxx = width - 1;
+    int xs0 = std::max(0, startx + 0);
+    int xs1 = std::max(0, startx + 1);
+    int xs2 = std::min(maxx, startx + 2);
+    int xs3 = std::min(maxx, startx + 3);
+    // Interpolate along x within each of the four rows; edge columns were clamped above.
+    float4 p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
+                                  yp0[xs2], yp0[xs3], xf);
+    float4 p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
+                                  yp1[xs2], yp1[xs3], xf);
+    float4 p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
+                                  yp2[xs2], yp2[xs3], xf);
+    float4 p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
+                                  yp3[xs2], yp3[xs3], xf);
+
+    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);  // Then interpolate the row results along y.
+    return p;
+}
+
+static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
+                         float xf, float yf, int width) {  // Returns one bicubic sample.
+    int startx = (int) floor(xf - 1);  // First of the four source columns used.
+    xf = xf - floor(xf);  // Fractional position between columns xs1 and xs2.
+    int maxx = width - 1;
+    int xs0 = std::max(0, startx + 0);
+    int xs1 = std::max(0, startx + 1);
+    int xs2 = std::min(maxx, startx + 2);
+    int xs3 = std::min(maxx, startx + 3);
+    // Interpolate along x within each of the four rows; edge columns were clamped above.
+    float2 p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
+                                  yp0[xs2], yp0[xs3], xf);
+    float2 p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
+                                  yp1[xs2], yp1[xs3], xf);
+    float2 p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
+                                  yp2[xs2], yp2[xs3], xf);
+    float2 p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
+                                  yp3[xs2], yp3[xs3], xf);
+
+    float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);  // Then interpolate the row results along y.
+    return p;
+}
+
+static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
+                        float xf, float yf, int width) {  // Returns one bicubic sample.
+    int startx = (int) floor(xf - 1);  // First of the four source columns used.
+    xf = xf - floor(xf);  // Fractional position between columns xs1 and xs2.
+    int maxx = width - 1;
+    int xs0 = std::max(0, startx + 0);
+    int xs1 = std::max(0, startx + 1);
+    int xs2 = std::min(maxx, startx + 2);
+    int xs3 = std::min(maxx, startx + 3);
+    // Interpolate along x within each of the four rows; edge columns were clamped above.
+    float p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
+                                 yp0[xs2], yp0[xs3], xf);
+    float p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
+                                 yp1[xs2], yp1[xs3], xf);
+    float p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
+                                 yp2[xs2], yp2[xs3], xf);
+    float p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
+                                 yp3[xs2], yp3[xs3], xf);
+
+    float p  = cubicInterpolate(p0, p1, p2, p3, yf);  // Then interpolate the row results along y.
+    return p;
+}
+#endif
+
+void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar *pin = mIn;
+    const int srcHeight = mInputSizeY;
+    const int srcWidth = mInputSizeX;
+    const size_t stride = mInputSizeX * paddedSize(mVectorSize);
+    // Computes output cells [xstart, xend) of output row currentY, writing uchar4 cells at outPtr.
+    // Source y coordinate for the centre of this output row.
+#if defined(ARCH_X86_HAVE_AVX2)
+    float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
+                                          _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
+#else
+    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
+#endif
+
+    // Four source rows around yf, clamped at the image edges; yf then keeps only its fraction.
+    int starty = (int) floor(yf - 1);
+    yf = yf - floor(yf);
+    int maxy = srcHeight - 1;
+    int ys0 = std::max(0, starty + 0);
+    int ys1 = std::max(0, starty + 1);
+    int ys2 = std::min(maxy, starty + 2);
+    int ys3 = std::min(maxy, starty + 3);
+
+    const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
+    const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
+    const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
+    const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
+
+    uchar4 *out = ((uchar4 *)outPtr);
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // With ARM intrinsics compiled in, SIMD enabled and an x scale below 4, use the assembly kernel.
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
+        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+        long xf16 = rint(xf * 0x10000);
+        uint32_t xinc16 = rint(mScaleX * 0x10000);
+
+        int xoff = (xf16 >> 16) - 1;
+        int xclip = std::max(0, xoff) - xoff;
+        int len = x2 - x1;
+
+        int32_t yr[4];
+        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
+        mkYCoeff(yr, yf);
+
+        xoff += xclip;
+
+        rsdIntrinsicResizeB4_K(
+                out, len,
+                xf16 & 0xffff, xinc16,
+                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
+                xclip, srcWidth - xoff + xclip,
+                osc_ctl, yr);
+        out += len;
+        x1 += len;
+    }
+#endif
+    // Scalar path for every cell the block above did not produce.
+    while(x1 < x2) {
+#if defined(ARCH_X86_HAVE_AVX2)
+        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
+                                              _mm_set1_ps(0.5f)));
+#else
+        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+#endif
+        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+        out++;
+        x1++;
+    }
+}
+
+void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar *pin = mIn;
+    const int srcHeight = mInputSizeY;
+    const int srcWidth = mInputSizeX;
+    const size_t stride = mInputSizeX * mVectorSize;
+    // Computes output cells [xstart, xend) of output row currentY, writing uchar2 cells at outPtr.
+    // Source y coordinate for the centre of this output row.
+#if defined(ARCH_X86_HAVE_AVX2)
+    float yf = _mm_cvtss_f32(
+            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
+#else
+    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
+#endif
+    // Four source rows around yf, clamped at the image edges; yf then keeps only its fraction.
+    int starty = (int) floor(yf - 1);
+    yf = yf - floor(yf);
+    int maxy = srcHeight - 1;
+    int ys0 = std::max(0, starty + 0);
+    int ys1 = std::max(0, starty + 1);
+    int ys2 = std::min(maxy, starty + 2);
+    int ys3 = std::min(maxy, starty + 3);
+
+    const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
+    const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
+    const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
+    const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
+
+    uchar2 *out = ((uchar2 *)outPtr);
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // With ARM intrinsics compiled in, SIMD enabled and an x scale below 4, use the assembly kernel.
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
+        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+        long xf16 = rint(xf * 0x10000);
+        uint32_t xinc16 = rint(mScaleX * 0x10000);
+
+        int xoff = (xf16 >> 16) - 1;
+        int xclip = std::max(0, xoff) - xoff;
+        int len = x2 - x1;
+
+        int32_t yr[4];
+        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
+        mkYCoeff(yr, yf);
+
+        xoff += xclip;
+
+        rsdIntrinsicResizeB2_K(
+                out, len,
+                xf16 & 0xffff, xinc16,
+                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
+                xclip, srcWidth - xoff + xclip,
+                osc_ctl, yr);
+        out += len;
+        x1 += len;
+    }
+#endif
+    // Scalar path for every cell the block above did not produce.
+    while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
+                                              _mm_set1_ps(0.5f)));
+#else
+        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+#endif
+        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+        out++;
+        x1++;
+    }
+}
+
+void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
+    const uchar *pin = mIn;
+    const int srcHeight = mInputSizeY;
+    const int srcWidth = mInputSizeX;
+    const size_t stride = mInputSizeX * mVectorSize;
+    // Computes output cells [xstart, xend) of output row currentY, writing uchar cells at outPtr.
+    // ALOGI("Toolkit   ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
+    // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
+    // Source y coordinate for the centre of this output row.
+#if defined(ARCH_X86_HAVE_AVX2)
+    float yf = _mm_cvtss_f32(
+            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(mScaleY), _mm_set1_ps(0.5f)));
+#else
+    float yf = (currentY + 0.5f) * mScaleY - 0.5f;
+#endif
+    // Four source rows around yf, clamped at the image edges; yf then keeps only its fraction.
+    int starty = (int) floor(yf - 1);
+    yf = yf - floor(yf);
+    int maxy = srcHeight - 1;
+    int ys0 = std::max(0, starty + 0);
+    int ys1 = std::min(maxy, std::max(0, starty + 1));
+    int ys2 = std::min(maxy, starty + 2);
+    int ys3 = std::min(maxy, starty + 3);
+
+    const uchar *yp0 = pin + stride * ys0;
+    const uchar *yp1 = pin + stride * ys1;
+    const uchar *yp2 = pin + stride * ys2;
+    const uchar *yp3 = pin + stride * ys3;
+
+    uchar *out = ((uchar *)outPtr);
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // With ARM intrinsics compiled in, SIMD enabled and an x scale below 4, use the assembly kernel.
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
+        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+        long xf16 = rint(xf * 0x10000);
+        uint32_t xinc16 = rint(mScaleX * 0x10000);
+
+        int xoff = (xf16 >> 16) - 1;
+        int xclip = std::max(0, xoff) - xoff;
+        int len = x2 - x1;
+
+        int32_t yr[4];
+        uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
+        mkYCoeff(yr, yf);
+
+        // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
+        // xclip %d, len %d, osc_ctl %lu)",
+        //       ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
+        //       osc_ctl);
+        // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
+        // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
+        // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
+
+        xoff += xclip;
+
+        rsdIntrinsicResizeB1_K(
+                out, len,
+                xf16 & 0xffff, xinc16,
+                yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
+                xclip, srcWidth - xoff + xclip,
+                osc_ctl, yr);
+        out += len;
+        x1 += len;
+    }
+#endif
+    // Scalar path for every cell the block above did not produce.
+    while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(mScaleX) ,
+                                              _mm_set1_ps(0.5f)));
+#else
+        float xf = (x1 + 0.5f) * mScaleX - 0.5f;
+#endif
+
+        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+        out++;
+        x1++;
+    }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar *pin = mIn;
+    const int srcHeight = inputSizeY;
+    const int srcWidth = inputSizeX;
+    const size_t stride = sizeX * vectorSize;
+    // NOTE(review): names without the m prefix here are not ResizeTask members; confirm intent.
+#if defined(ARCH_X86_HAVE_AVX2)
+    float yf = _mm_cvtss_f32(
+            _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
+#else
+    float yf = (currentY + 0.5f) * scaleY - 0.5f;
+#endif
+    // Four source rows around yf, clamped at the image edges; yf then keeps only its fraction.
+    int starty = (int) floor(yf - 1);
+    yf = yf - floor(yf);
+    int maxy = srcHeight - 1;
+    int ys0 = std::max(0, starty + 0);
+    int ys1 = std::max(0, starty + 1);
+    int ys2 = std::min(maxy, starty + 2);
+    int ys3 = std::min(maxy, starty + 3);
+
+    const float4 *yp0 = (const float4 *)(pin + stride * ys0);
+    const float4 *yp1 = (const float4 *)(pin + stride * ys1);
+    const float4 *yp2 = (const float4 *)(pin + stride * ys2);
+    const float4 *yp3 = (const float4 *)(pin + stride * ys3);
+
+    float4 *out = ((float4 *)outPtr);
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // Produce output cells [xstart, xend) one at a time.
+    while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
+                                              _mm_set1_ps(0.5f)));
+#else
+        float xf = (x1 + 0.5f) * scaleX - 0.5f;
+#endif
+
+        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+        out++;
+        x1++;
+    }
+}
+
+void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar *pin = mIn;
+    const int srcHeight = inputSizeY;
+    const int srcWidth = inputSizeX;
+    const size_t stride = sizeX * vectorSize;
+    // NOTE(review): names without the m prefix here are not ResizeTask members; confirm intent.
+
+#if defined(ARCH_X86_HAVE_AVX2)
+    float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
+                                          _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
+#else
+    float yf = (currentY + 0.5f) * scaleY - 0.5f;
+#endif
+    // Four source rows around yf, clamped at the image edges; yf then keeps only its fraction.
+    int starty = (int) floor(yf - 1);
+    yf = yf - floor(yf);
+    int maxy = srcHeight - 1;
+    int ys0 = std::max(0, starty + 0);
+    int ys1 = std::max(0, starty + 1);
+    int ys2 = std::min(maxy, starty + 2);
+    int ys3 = std::min(maxy, starty + 3);
+
+    const float2 *yp0 = (const float2 *)(pin + stride * ys0);
+    const float2 *yp1 = (const float2 *)(pin + stride * ys1);
+    const float2 *yp2 = (const float2 *)(pin + stride * ys2);
+    const float2 *yp3 = (const float2 *)(pin + stride * ys3);
+
+    float2 *out = ((float2 *)outPtr);
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // Produce output cells [xstart, xend) one at a time.
+    while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
+                                              _mm_set1_ps(0.5f)));
+#else
+        float xf = (x1 + 0.5f) * scaleX - 0.5f;
+#endif
+
+        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+        out++;
+        x1++;
+    }
+}
+
+void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    const uchar *pin = mIn;
+    const int srcHeight = inputSizeY;
+    const int srcWidth = inputSizeX;
+    const size_t stride = sizeX * vectorSize;
+    // NOTE(review): names without the m prefix here are not ResizeTask members; confirm intent.
+
+#if defined(ARCH_X86_HAVE_AVX2)
+    float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
+                                          _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
+#else
+    float yf = (currentY + 0.5f) * scaleY - 0.5f;
+#endif
+    // Four source rows around yf, clamped at the image edges; yf then keeps only its fraction.
+    int starty = (int) floor(yf - 1);
+    yf = yf - floor(yf);
+    int maxy = srcHeight - 1;
+    int ys0 = std::max(0, starty + 0);
+    int ys1 = std::max(0, starty + 1);
+    int ys2 = std::min(maxy, starty + 2);
+    int ys3 = std::min(maxy, starty + 3);
+
+    const float *yp0 = (const float *)(pin + stride * ys0);
+    const float *yp1 = (const float *)(pin + stride * ys1);
+    const float *yp2 = (const float *)(pin + stride * ys2);
+    const float *yp3 = (const float *)(pin + stride * ys3);
+
+    float *out = ((float *)outPtr);
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // Produce output cells [xstart, xend) one at a time.
+    while(x1 < x2) {
+
+#if defined(ARCH_X86_HAVE_AVX2)
+        float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
+                                              _mm_set1_ps(0.5f)));
+#else
+        float xf = (x1 + 0.5f) * scaleX - 0.5f;
+#endif
+
+        *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
+        out++;
+        x1++;
+    }
+}
+
+void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
+{
+    // NOTE(review): ResizeTask declares no preLaunch, mAlloc or mRootPtr; likely stale - confirm.
+    //check the data type to determine F or U.
+    if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
+        switch(mAlloc->getType()->getElement()->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelU1;
+            break;
+        case 2:
+            mRootPtr = &kernelU2;
+            break;
+        case 3:
+        case 4:
+            mRootPtr = &kernelU4;
+            break;
+        }
+    } else {
+        switch(mAlloc->getType()->getElement()->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelF1;
+            break;
+        case 2:
+            mRootPtr = &kernelF2;
+            break;
+        case 3:
+        case 4:
+            mRootPtr = &kernelF4;
+            break;
+        }
+    }
+}
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
+                                 size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
+                                 size_t outputSizeY, const Restriction* restriction) {
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE  // Checks exist only when this macro is defined.
+    if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
+        return;
+    }
+    if (vectorSize < 1 || vectorSize > 4) {
+        ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
+        return;
+    }
+#endif
+    // Wrap the request in a ResizeTask and run it through the processor.
+    ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
+                    outputSizeX, outputSizeY, restriction);
+    processor->doTask(&task);
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Resize_advsimd.S b/toolkit/Resize_advsimd.S
new file mode 100644
index 0000000..59e735c
--- /dev/null
+++ b/toolkit/Resize_advsimd.S
@@ -0,0 +1,754 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
+ * integer (bicubic has a little overshoot).  It would also be possible to add
+ * a temporary DC bias to eliminate the sign bit for more precision, but that's
+ * extra arithmetic.
+ */
+.set VERTBITS, 14
+
+/* The size of the scratch buffer in which we store our vertically convolved
+ * intermediates.
+ */
+.set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
+.set CHUNKSIZE, (1 << CHUNKSHIFT)
+
+/* The number of components processed in a single iteration of the innermost
+ * loop.
+ */
+.set VECSHIFT, 3
+.set VECSIZE, (1<<VECSHIFT)
+
+/* Read four different lines (except at edges where addresses may be clamped,
+ * which is why we don't simply take base and stride registers), and multiply
+ * and accumulate them by the coefficients in v3[0..3], leaving the results in
+ * v12.  This gives eight 16-bit results representing a horizontal line of 2-8
+ * input pixels (depending on number of components per pixel) to be fed into
+ * the horizontal scaling pass.
+ *
+ * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
+ * known to represent negative values and UMLSL is used to implement this).
+ * Output is VERTBITS signed fixed-point, which must leave room for a little
+ * bit of overshoot beyond [0,1.0).
+ */
+.macro vert8, dstlo=v12.4h, dsthi=v12.8h
+        ld1         {v8.8b}, [x4], #8
+        ld1         {v9.8b}, [x5], #8
+        ld1         {v10.8b}, [x6], #8
+        ld1         {v11.8b}, [x7], #8
+        uxtl        v8.8h, v8.8b
+        uxtl        v9.8h, v9.8b
+        uxtl        v10.8h, v10.8b
+        uxtl        v11.8h, v11.8b
+        umull       v12.4s, v9.4h, v3.h[1]
+        umull2      v13.4s, v9.8h, v3.h[1]
+        umlsl       v12.4s, v8.4h, v3.h[0]
+        umlsl2      v13.4s, v8.8h, v3.h[0]
+        umlal       v12.4s, v10.4h, v3.h[2]
+        umlal2      v13.4s, v10.8h, v3.h[2]
+        umlsl       v12.4s, v11.4h, v3.h[3]
+        umlsl2      v13.4s, v11.8h, v3.h[3]
+
+        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
+         * minus VERTBITS (the number of fraction bits we want to keep from
+         * here on).
+         */
+        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
+        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
+.endm
+
+/* As above, but only four 16-bit results: into the high half of v12 by default, else narrowed into \dst.
+ */
+.macro vert4, dst=v12.8h
+        ld1         {v8.s}[0], [x4], #4
+        ld1         {v9.s}[0], [x5], #4
+        ld1         {v10.s}[0], [x6], #4
+        ld1         {v11.s}[0], [x7], #4
+        uxtl        v8.8h, v8.8b
+        uxtl        v9.8h, v9.8b
+        uxtl        v10.8h, v10.8b
+        uxtl        v11.8h, v11.8b
+        umull       v12.4s, v9.4h, v3.h[1]
+        umlsl       v12.4s, v8.4h, v3.h[0]
+        umlal       v12.4s, v10.4h, v3.h[2]
+        umlsl       v12.4s, v11.4h, v3.h[3]
+.ifc \dst,v12.8h
+        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
+.else
+        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
+.endif
+.endm
+
+
+/* During horizontal resize having CHUNKSIZE input available means being able
+ * to produce a varying amount of output, depending on the phase of the data.
+ * This function calculates the minimum number of VECSIZE chunks extracted from
+ * a CHUNKSIZE window (x1), and the threshold value for when the count will be
+ * one higher than that (x0).
+ * These work out, conveniently, to be the quotient and remainder from:
+ *      ((CHUNKSIZE << 16) + xinc * VECSIZE - 1) / (xinc * VECSIZE)
+ *
+ * The two values are packed together in a uint64_t (quotient in the high 32
+ * bits, remainder in the low bits); this is used as an arithmetic short-cut later on.
+ */
+/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
+ENTRY(rsdIntrinsicResize_oscctl_K)
+        lsl         x2, x0, #VECSHIFT
+        mov         x0, #(CHUNKSIZE << 16) - 1
+        add         x0, x0, x2
+        udiv        x1, x0, x2
+        msub        x0, x1, x2, x0
+        add         x0, x0, x1, LSL #32
+        ret
+END(rsdIntrinsicResize_oscctl_K)
+
+/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
+ * For the most part the vertical pass (the outer loop) is the same for all
+ * versions.  Exceptions are handled in-line with conditional assembly.
+ */
+.irp comp, 1, 2, 4
+.if \comp == 1
+.set COMPONENT_SHIFT, 0
+.elseif \comp == 2
+.set COMPONENT_SHIFT, 1
+.elseif \comp == 4
+.set COMPONENT_SHIFT, 2
+.else
+.error "Unknown component count"
+.endif
+.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
+.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
+
+.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
+
+/* void rsdIntrinsicResizeB1_K(
+ *             uint8_t * restrict dst,          // x0
+ *             size_t count,                    // x1
+ *             uint32_t xf,                     // x2
+ *             uint32_t xinc,                   // x3
+ *             uint8_t const * restrict srcn,   // x4
+ *             uint8_t const * restrict src0,   // x5
+ *             uint8_t const * restrict src1,   // x6
+ *             uint8_t const * restrict src2,   // x7
+ *             size_t xclip,                    // [sp,#0]  -> [sp,#80] -> x13
+ *             size_t avail,                    // [sp,#8]  -> [sp,#88] -> x11
+ *             uint64_t osc_ctl,                // [sp,#16] -> [sp,#96] -> x10
+ *             int32 const *yr,                 // [sp,#24] -> [sp,#104] -> v4   (copied to v3   for scalar access)
+ */
+ENTRY(rsdIntrinsicResizeB\comp\()_K)
+            sub         x8, sp, #48
+            sub         sp, sp, #80
+            st1         {v8.1d - v11.1d}, [sp]
+            st1         {v12.1d - v15.1d}, [x8]
+            str         x19, [x8, #32]
+
+            /* align the working buffer on the stack to make it easy to use bit
+             * twiddling for address calculations.
+             */
+            sub         x12, sp, #BUFFER_SIZE
+            bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
+
+            ldr         x8, [sp,#104]           // yr
+            adrp        x9, intrinsic_resize_consts
+            add         x9, x9, :lo12:intrinsic_resize_consts
+            ld1         {v4.4s}, [x8]
+            ld1         {v5.8h}, [x9]
+            sqxtun      v4.4h, v4.4s            // yr
+            dup         v6.8h, w2
+            dup         v7.8h, w3
+            mla         v6.8h, v5.8h, v7.8h     // vxf
+            shl         v7.8h, v7.8h, #VECSHIFT // vxinc
+
+            /* Compute starting condition for oscillator used to compute ahead
+             * of time how many iterations are possible before needing to
+             * refill the working buffer.  This is based on the fixed-point
+             * index of the last element in the vector of pixels processed in
+             * each iteration, counting up until it would overflow.
+             */
+            sub         x8, x2, x3
+            lsl         x9, x3, #VECSHIFT
+            add         x8, x8, x9
+
+            ldr         x10, [sp,#96]           // osc_ctl
+            ldp         x13,x11, [sp,#80]       // xclip, avail
+
+            mov         x19, sp
+            mov         sp, x12
+
+            /* x4-x7 contain pointers to the four lines of input to be
+             * convolved.  These pointers have been clamped vertically and
+             * horizontally (which is why it's not a simple row/stride pair),
+             * and the xclip argument (now in x13) indicates how many pixels
+             * from true the x position of the pointer is.  This value should
+             * be 0, 1, or 2 only.
+             *
+             * Start by placing four pixels worth of input at the far end of
+             * the buffer.  As many as two of these may be clipped, so four
+             * pixels are fetched, and then the first pixel is duplicated and
+             * the data shifted according to xclip.  The source pointers are
+             * then also adjusted according to xclip so that subsequent fetches
+             * match.
+             */
+            mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
+            sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
+            add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
+            add         x14, x14, #4 * COMPONENT_COUNT * 2
+.if \comp == 1
+            vert4       v12.4h
+            dup         v11.4h, v12.h[0]
+            st1         {v11.4h,v12.4h}, [x12]
+            ld1         {v12.4h}, [x14]
+            st1         {v12.4h}, [x15]
+.elseif \comp == 2
+            vert8
+            dup         v11.4s, v12.s[0]
+            st1         {v11.8h,v12.8h}, [x12]
+            ld1         {v12.8h}, [x14]
+            st1         {v12.8h}, [x15]
+.elseif \comp == 4
+            vert8       v14.4h, v14.8h
+            vert8       v15.4h, v15.8h
+            dup         v12.2d, v14.d[0]
+            dup         v13.2d, v14.d[0]
+            st1         {v12.8h,v13.8h}, [x12], #32
+            st1         {v14.8h,v15.8h}, [x12]
+            sub         x12, x12, #32
+            ld1         {v11.8h,v12.8h}, [x14]
+            st1         {v11.8h,v12.8h}, [x15]
+.endif
+            /* Count off four pixels into the working buffer.
+             */
+            sub         x11, x11, #4
+            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
+             * were read unconditionally, but some may have been discarded by
+             * xclip, so we rewind the pointers to compensate.
+             */
+            sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
+            sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
+            sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
+            sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)
+
+            /* First tap starts where we just pre-filled, at the end of the
+             * buffer.
+             */
+            add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16
+
+            /* Use overflowing arithmetic to implement wraparound array
+             * indexing.
+             */
+            lsl         x2, x2, #(47 - CHUNKSHIFT)
+            lsl         x3, x3, #(47 - CHUNKSHIFT)
+
+
+            /* Start of outermost loop.
+             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
+             * number of iterations of the inner loop that can be performed and
+             * get into that.
+             *
+             * The fill is complicated by the possibility of running out of
+             * input before the scratch buffer is filled.  If this isn't a risk
+             * then it's handled by the simple loop at 2:, otherwise the
+             * horrible loop at 3:.
+             */
+1:          mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
+            subs        x11, x11, #CHUNKSIZE
+            bge         2f                      /* if at least CHUNKSIZE are available... */
+            add         x11, x11, #CHUNKSIZE    /* if they're not... */
+            b           4f
+            /* basic fill loop, processing 8 bytes at a time until there are
+             * fewer than eight bytes available.
+             */
+3:          vert8
+            sub         x11, x11, #8 / COMPONENT_COUNT
+            st1         {v12.8h}, [x12], #16
+4:          cmp         x11, #8 / COMPONENT_COUNT - 1
+            bgt         3b
+.if \comp == 4
+            blt         3f
+            /* The last pixel (four bytes) if necessary */
+            vert4
+.else
+            cmp         x11, #1
+            blt         3f
+            /* The last pixels if necessary */
+            sub         x4, x4, #8
+            sub         x5, x5, #8
+            sub         x6, x6, #8
+            sub         x7, x7, #8
+            add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
+            add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
+            add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
+            add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
+            vert8
+            sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
+            sub         sp, sp, #32
+            sub         x11, x11, #16
+.if \comp == 1
+            dup         v13.8h, v12.h[7]
+.elseif \comp == 2
+            dup         v13.4s, v12.s[3]
+.endif
+            st1         {v12.8h,v13.8h}, [sp]
+            ld1         {v12.8h}, [x11]
+            add         sp, sp, #32
+            b           4f
+.endif
+            /* Keep filling until we get to the end of this chunk of the buffer */
+3:
+.if \comp == 1
+            dup         v12.8h, v12.h[7]
+.elseif \comp == 2
+            dup         v12.4s, v12.s[3]
+.elseif \comp == 4
+            dup         v12.2d, v12.d[1]
+.endif
+4:          st1         {v12.8h}, [x12], #16
+            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+            bne         3b
+            b           4f
+
+.align 4
+2:          /* Quickly pull a chunk of data into the working buffer.
+             */
+            vert8
+            st1         {v12.8h}, [x12], #16
+            vert8
+            st1         {v12.8h}, [x12], #16
+            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+            bne         2b
+            cmp         x11, #0
+            bne         3f
+4:          /* if we end with 0 pixels left we'll have nothing handy to spread
+             * across to the right, so we rewind a bit.
+             */
+            mov         x11, #1
+            sub         x4, x4, #COMPONENT_COUNT
+            sub         x5, x5, #COMPONENT_COUNT
+            sub         x6, x6, #COMPONENT_COUNT
+            sub         x7, x7, #COMPONENT_COUNT
+3:          /* copy four taps (width of cubic window) to far end for overflow
+             * address handling
+             */
+            sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+            eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+            ld1         {v14.4h}, [x13]
+.elseif \comp == 2
+            ld1         {v14.8h}, [x13]
+.elseif \comp == 4
+            ld1         {v14.8h,v15.8h}, [x13]
+.endif
+            add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+            st1         {v14.4h}, [x13]
+.elseif \comp == 2
+            st1         {v14.8h}, [x13]
+.elseif \comp == 4
+            st1         {v14.8h,v15.8h}, [x13]
+.endif
+            /* The high 32-bits of x10 contains the maximum possible iteration
+             * count, but if x8 is greater than the low 32-bits of x10 then
+             * this indicates that the count must be reduced by one for this
+             * iteration to avoid reading past the end of the available data.
+             */
+            sub         x13, x10, x8
+            lsr         x13, x13, #32
+
+            madd        x8, x13, x9, x8
+            sub         x8, x8, #(CHUNKSIZE << 16)
+
+            /* prefer to count pixels, rather than vectors, to clarify the tail
+             * store case on exit.
+             */
+            lsl         x13, x13, #VECSHIFT
+            cmp         x13, x1
+            csel        x13, x1, x13, gt
+
+            sub         x1, x1, x13
+
+            lsl         x13, x13, #COMPONENT_SHIFT
+
+            mov         w14, #0x8000
+            movi        v30.8h, #3
+            dup         v31.8h, w14
+
+            cmp         x13, #0
+            bgt         3f
+            cmp         x1, #0
+            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
+            b           9f
+
+            .align 4
+2:          /* Inner loop continues here, but starts at 3:, see end of loop
+             * below for explanation. */
+.if LOOP_OUTPUT_SIZE == 4
+            st1         {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+            st1         {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+            st1         {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+            st1         {v8.16b,v9.16b}, [x0], #32
+.endif
+            /* Inner loop:  here the four x coefficients for each tap are
+             * calculated in vector code, and the addresses are calculated in
+             * scalar code, and these calculations are interleaved.
+             */
+3:          ushr        v8.8h, v6.8h, #1            // sxf
+            lsr         x14, x2, #(63 - CHUNKSHIFT)
+            sqrdmulh    v9.8h, v8.8h, v8.8h         // sxf**2
+            add         x2, x2, x3
+            sqrdmulh    v10.8h, v9.8h, v8.8h        // sxf**3
+            lsr         x15, x2, #(63 - CHUNKSHIFT)
+            sshll       v11.4s, v9.4h, #2
+            sshll2      v12.4s, v9.8h, #2
+            add         x2, x2, x3
+            smlsl       v11.4s, v10.4h, v30.4h
+            smlsl2      v12.4s, v10.8h, v30.8h
+            lsr         x16, x2, #(63 - CHUNKSHIFT)
+
+            shadd       v0.8h, v10.8h, v8.8h
+            add         x2, x2, x3
+            sub         v0.8h, v9.8h, v0.8h
+            lsr         x17, x2, #(63 - CHUNKSHIFT)
+
+            saddw       v1.4s, v11.4s, v9.4h
+            saddw2      v13.4s, v12.4s, v9.8h
+            add         x2, x2, x3
+            shrn        v1.4h, v1.4s, #1
+            shrn2       v1.8h, v13.4s, #1
+            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+            sub         v1.8h, v1.8h, v31.8h
+            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+
+            saddw       v2.4s, v11.4s, v8.4h
+            saddw2      v13.4s, v12.4s, v8.8h
+            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+            shrn        v2.4h, v2.4s, #1
+            shrn2       v2.8h, v13.4s, #1
+            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+            neg         v2.8h, v2.8h
+
+            shsub       v3.8h, v10.8h, v9.8h
+
+            /* increment the x fractional parts (overflow is ignored, as the
+             * scalar arithmetic shadows this addition with full precision).
+             */
+            add         v6.8h, v6.8h, v7.8h
+
+            /* At this point we have four pointers in x14-x17, pointing to the
+             * four taps in the scratch buffer that must be convolved together
+             * to produce an output pixel (one output pixel per pointer).
+             * These pointers usually overlap, but their spacing is irregular
+             * so resolving the redundancy through L1 is a pragmatic solution.
+             *
+             * The scratch buffer is made of signed 16-bit data, holding over
+             * some extra precision, and overshoot, from the vertical pass.
+             *
+             * We also have the 16-bit unsigned fixed-point weights for each
+             * of the four taps in v0 - v3.  That's eight pixels worth of
+             * coefficients when we have only four pointers, so calculations
+             * for four more pixels are interleaved with the fetch and permute
+             * code for each variant in the following code.
+             *
+             * The data arrangement is less than ideal for any pixel format,
+             * but permuting loads help to mitigate most of the problems.
+             *
+             * Note also that the two outside taps of a bicubic are negative,
+             * but these coefficients are unsigned.  The sign is hard-coded by
+             * use of multiply-and-subtract operations.
+             */
+.if \comp == 1
+            /* The uchar 1 case.
+             * Issue one lanewise ld4.h to load four consecutive pixels from
+             * one pointer (one pixel) into four different registers; then load
+             * four consecutive s16 values from the next pointer (pixel) into
+             * the next lane of those four registers, etc., so that we finish
+             * with v12 - v15 representing the four taps, and each lane
+             * representing a separate pixel.
+             *
+             * The first ld4 uses a splat to avoid any false dependency on
+             * the previous state of the register.
+             */
+            ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
+            lsr         x14, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
+            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x15, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
+            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x16, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
+            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x17, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
+            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+            ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
+            ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
+            ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]
+
+            smull       v8.4s, v12.4h, v0.4h
+            smull2      v9.4s, v12.8h, v0.8h
+            smlsl       v8.4s, v13.4h, v1.4h
+            smlsl2      v9.4s, v13.8h, v1.8h
+            smlsl       v8.4s, v14.4h, v2.4h
+            smlsl2      v9.4s, v14.8h, v2.8h
+            smlal       v8.4s, v15.4h, v3.4h
+            smlal2      v9.4s, v15.8h, v3.8h
+
+            subs        x13, x13, #LOOP_OUTPUT_SIZE
+
+            sqrshrn     v8.4h, v8.4s, #15
+            sqrshrn2    v8.8h, v9.4s, #15
+
+            sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
+.elseif \comp == 2
+            /* The uchar2 case:
+             * This time load pairs of values into adjacent lanes in v12 - v15
+             * by aliasing them as u32 data; leaving room for only four pixels,
+             * so the process has to be done twice.  This also means that the
+             * coefficient registers fail to align with the coefficient data
+             * (eight separate pixels), so that has to be doubled-up to match.
+             */
+            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+            lsr         x14, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x15, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x16, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x17, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+
+            /* double-up coefficients to align with component pairs */
+            zip1        v16.8h, v0.8h, v0.8h
+            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+            zip1        v17.8h, v1.8h, v1.8h
+            zip1        v18.8h, v2.8h, v2.8h
+            zip1        v19.8h, v3.8h, v3.8h
+
+            smull       v8.4s, v12.4h, v16.4h
+            smull2      v9.4s, v12.8h, v16.8h
+            smlsl       v8.4s, v13.4h, v17.4h
+            smlsl2      v9.4s, v13.8h, v17.8h
+            smlsl       v8.4s, v14.4h, v18.4h
+            smlsl2      v9.4s, v14.8h, v18.8h
+            smlal       v8.4s, v15.4h, v19.4h
+            smlal2      v9.4s, v15.8h, v19.8h
+
+            sqrshrn     v8.4h, v8.4s, #15
+            sqrshrn2    v8.8h, v9.4s, #15
+
+            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+
+            /* double-up coefficients to align with component pairs */
+            zip2        v16.8h, v0.8h, v0.8h
+            zip2        v17.8h, v1.8h, v1.8h
+            zip2        v18.8h, v2.8h, v2.8h
+            zip2        v19.8h, v3.8h, v3.8h
+
+            smull       v10.4s, v12.4h, v16.4h
+            smull2      v11.4s, v12.8h, v16.8h
+            smlsl       v10.4s, v13.4h, v17.4h
+            smlsl2      v11.4s, v13.8h, v17.8h
+            smlsl       v10.4s, v14.4h, v18.4h
+            smlsl2      v11.4s, v14.8h, v18.8h
+            smlal       v10.4s, v15.4h, v19.4h
+            smlal2      v11.4s, v15.8h, v19.8h
+
+            subs        x13, x13, #LOOP_OUTPUT_SIZE
+
+            sqrshrn     v9.4h, v10.4s, #15
+            sqrshrn2    v9.8h, v11.4s, #15
+
+            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
+            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
+.elseif \comp == 4
+            /* The uchar4 case.
+             * This case is comparatively painless because four s16s are the
+             * smallest addressable unit for a vmul-by-scalar.  Rather than
+             * permute the data, simply arrange the multiplies to suit the way
+             * the data comes in.  That's a lot of data, though, so things
+             * progress in pairs of pixels at a time.
+             */
+            ld1         {v12.8h,v13.8h}, [x14]
+            lsr         x14, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld1         {v14.8h,v15.8h}, [x15]
+            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x15, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+
+            smull       v8.4s, v12.4h, v0.h[0]
+            smull       v9.4s, v14.4h, v0.h[1]
+            smlsl2      v8.4s, v12.8h, v1.h[0]
+            smlsl2      v9.4s, v14.8h, v1.h[1]
+            smlsl       v8.4s, v13.4h, v2.h[0]
+            smlsl       v9.4s, v15.4h, v2.h[1]
+            smlal2      v8.4s, v13.8h, v3.h[0]
+            smlal2      v9.4s, v15.8h, v3.h[1]
+
+            /* And two more...  */
+            ld1         {v12.8h,v13.8h}, [x16]
+            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x16, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+            ld1         {v14.8h,v15.8h}, [x17]
+            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+            lsr         x17, x2, #(63 - CHUNKSHIFT)
+            add         x2, x2, x3
+
+            sqrshrn     v8.4h, v8.4s, #15
+            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+            sqrshrn2    v8.8h, v9.4s, #15
+
+            smull       v10.4s, v12.4h, v0.h[2]
+            smull       v11.4s, v14.4h, v0.h[3]
+            smlsl2      v10.4s, v12.8h, v1.h[2]
+            smlsl2      v11.4s, v14.8h, v1.h[3]
+            smlsl       v10.4s, v13.4h, v2.h[2]
+            smlsl       v11.4s, v15.4h, v2.h[3]
+            smlal2      v10.4s, v13.8h, v3.h[2]
+            smlal2      v11.4s, v15.8h, v3.h[3]
+
+            sqrshrn     v9.4h, v10.4s, #15
+            sqrshrn2    v9.8h, v11.4s, #15
+
+            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
+            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
+
+            /* And two more...  */
+            ld1         {v12.8h,v13.8h}, [x14]
+            ld1         {v14.8h,v15.8h}, [x15]
+
+            smull       v10.4s, v12.4h, v0.h[4]
+            smull       v11.4s, v14.4h, v0.h[5]
+            smlsl2      v10.4s, v12.8h, v1.h[4]
+            smlsl2      v11.4s, v14.8h, v1.h[5]
+            smlsl       v10.4s, v13.4h, v2.h[4]
+            smlsl       v11.4s, v15.4h, v2.h[5]
+            smlal2      v10.4s, v13.8h, v3.h[4]
+            smlal2      v11.4s, v15.8h, v3.h[5]
+
+            /* And two more...  */
+            ld1         {v12.8h,v13.8h}, [x16]
+            ld1         {v14.8h,v15.8h}, [x17]
+
+            subs        x13, x13, #LOOP_OUTPUT_SIZE
+
+            sqrshrn     v9.4h, v10.4s, #15
+            sqrshrn2    v9.8h, v11.4s, #15
+
+            smull       v10.4s, v12.4h, v0.h[6]
+            smull       v11.4s, v14.4h, v0.h[7]
+            smlsl2      v10.4s, v12.8h, v1.h[6]
+            smlsl2      v11.4s, v14.8h, v1.h[7]
+            smlsl       v10.4s, v13.4h, v2.h[6]
+            smlsl       v11.4s, v15.4h, v2.h[7]
+            smlal2      v10.4s, v13.8h, v3.h[6]
+            smlal2      v11.4s, v15.8h, v3.h[7]
+
+            sqrshrn     v10.4h, v10.4s, #15
+            sqrshrn2    v10.8h, v11.4s, #15
+
+            sqrshrun     v9.8b, v9.8h, #VERTBITS - 8
+            sqrshrun2    v9.16b, v10.8h, #VERTBITS - 8
+.endif
+            bgt         2b      /* continue inner loop */
+            /* The inner loop has already been limited to ensure that none of
+             * the earlier iterations could overfill the output, so the store
+             * appears within the loop but after the conditional branch (at the
+             * top).  At the end, provided it won't overfill, perform the final
+             * store here.  If it would, then break out to the tricky tail case
+             * instead.
+             */
+            blt         1f
+            /* Store the amount of data appropriate to the configuration of the
+             * instance being assembled.
+             */
+.if LOOP_OUTPUT_SIZE == 4
+            st1         {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+            st1         {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+            st1         {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+            st1         {v8.16b,v9.16b}, [x0], #32
+.endif
+            b           1b              /* resume outer loop */
+            /* Partial tail store case:
+             * Different versions of the code need different subsets of the
+             * following partial stores.  Here the number of components and the
+             * size of the chunk of data produced by each inner loop iteration
+             * is tested to figure out whether or not each phrase is relevant.
+             */
+.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
+1:          tst         x13, #16
+            beq         1f
+            st1         {v8.16b}, [x0], #16
+            mov         v8.16b, v9.16b
+.endif
+.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
+1:          tst         x13, #8
+            beq         1f
+            st1         {v8.8b}, [x0], #8
+            ext         v8.16b, v8.16b, v8.16b, #8
+.endif
+.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
+1:          tst         x13, #4
+            beq         1f
+            st1         {v8.s}[0], [x0], #4
+            ext         v8.8b, v8.8b, v8.8b, #4
+.endif
+.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
+1:          tst         x13, #2
+            beq         1f
+            st1         {v8.h}[0], [x0], #2
+            ext         v8.8b, v8.8b, v8.8b, #2
+.endif
+.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
+1:          tst         x13, #1
+            beq         1f
+            st1         {v8.b}[0], [x0], #1
+.endif
+1:
+9:          mov         sp, x19
+            ld1         {v8.1d - v11.1d}, [sp], #32
+            ld1         {v12.1d - v15.1d}, [sp], #32
+            ldr         x19, [sp], #16
+            ret
+END(rsdIntrinsicResizeB\comp\()_K)
+.endr
+
+.rodata
+intrinsic_resize_consts:          .hword      0, 1, 2, 3, 4, 5, 6, 7
diff --git a/toolkit/Resize_neon.S b/toolkit/Resize_neon.S
new file mode 100644
index 0000000..eb7f694
--- /dev/null
+++ b/toolkit/Resize_neon.S
@@ -0,0 +1,799 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
+ * integer (bicubic has a little overshoot).  It would also be possible to add
+ * a temporary DC bias to eliminate the sign bit for more precision, but that's
+ * extra arithmetic.
+ */
+.set VERTBITS, 14
+
+/* The size of the scratch buffer in which we store our vertically convolved
+ * intermediates.
+ */
+.set CHUNKSHIFT, 7
+.set CHUNKSIZE, (1 << CHUNKSHIFT)
+
+/* The number of components processed in a single iteration of the innermost
+ * loop.
+ */
+.set VECSHIFT, 3
+.set VECSIZE, (1<<VECSHIFT)
+
+/* Read four lines via r4-r7 (post-incremented; addresses may be clamped at
+ * edges, which is why we don't simply take base and stride registers),
+ * multiply-accumulate them by the coefficients in d6[0..3], and leave the
+ * results in dstlo:dsthi (q12 by default; q8-q13 are clobbered).  This gives
+ * eight 16-bit results representing a horizontal line of 2-8 input pixels
+ * (depending on number of components per pixel) for the horizontal pass.
+ *
+ * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
+ * known to represent negative values and VMLS is used to implement this).
+ * Output is VERTBITS signed fixed-point, which must leave room for a little
+ * bit of overshoot beyond [0,1.0).
+ */
+.macro vert8, dstlo=d24, dsthi=d25
+        vld1.u8     d16, [r4]!              /* eight bytes from each of the */
+        vld1.u8     d18, [r5]!              /* four lines, advancing every  */
+        vld1.u8     d20, [r6]!              /* pointer by eight             */
+        vld1.u8     d22, [r7]!
+        vmovl.u8    q8, d16                 /* widen u8 to u16 */
+        vmovl.u8    q9, d18
+        vmovl.u8    q10, d20
+        vmovl.u8    q11, d22
+        vmull.u16   q12, d18, d6[1]         /* + line r5 * coeff[1], lanes 0-3 */
+        vmull.u16   q13, d19, d6[1]         /* same, lanes 4-7 */
+        vmlsl.u16   q12, d16, d6[0]         /* - line r4 * coeff[0] */
+        vmlsl.u16   q13, d17, d6[0]
+        vmlal.u16   q12, d20, d6[2]         /* + line r6 * coeff[2] */
+        vmlal.u16   q13, d21, d6[2]
+        vmlsl.u16   q12, d22, d6[3]         /* - line r7 * coeff[3] */
+        vmlsl.u16   q13, d23, d6[3]
+
+        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
+         * minus VERTBITS (the number of fraction bits we want to keep from
+         * here on).
+         */
+        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS     /* saturate, narrow to s16 */
+        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS
+.endm
+
+/* As vert8, but four bytes per line in and four 16-bit results out, in d25.
+ */
+.macro vert4
+        vld1.u32    d16[0], [r4]!           /* four bytes per line into lane 0, */
+        vld1.u32    d18[0], [r5]!           /* advancing every pointer by four  */
+        vld1.u32    d20[0], [r6]!
+        vld1.u32    d22[0], [r7]!
+        vmovl.u8    q8, d16                 /* widen u8 to u16; only the low */
+        vmovl.u8    q9, d18                 /* halves (d16, d18, d20, d22)   */
+        vmovl.u8    q10, d20                /* are used below                */
+        vmovl.u8    q11, d22
+        vmull.u16   q12, d18, d6[1]         /* + line r5 * coeff[1] */
+        vmlsl.u16   q12, d16, d6[0]         /* - line r4 * coeff[0] */
+        vmlal.u16   q12, d20, d6[2]         /* + line r6 * coeff[2] */
+        vmlsl.u16   q12, d22, d6[3]         /* - line r7 * coeff[3] */
+        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS    /* saturate, narrow to s16 */
+.endm
+
+
+/* During horizontal resize, having CHUNKSIZE input available means being able
+ * to produce a varying amount of output, depending on the phase of the data.
+ * This function calculates the minimum number of VECSIZE chunks extracted from
+ * a CHUNKSIZE window (r1), and the threshold value for when the count will be
+ * one higher than that (r0).
+ * These work out, conveniently, to be the quotient and remainder from:
+ *      ((CHUNKSIZE << 16) + xinc * VECSIZE - 1) / (xinc * VECSIZE)
+ *
+ * The two values can be packed together in a uint64_t for convenience; and
+ * they are, in fact, used this way as an arithmetic short-cut later on.
+ */
+
+/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
+ENTRY(rsdIntrinsicResize_oscctl_K)
+        lsl         r2, r0, #VECSHIFT       /* r2 = divisor = xinc * VECSIZE */
+        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
+        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
+        add         r0, r0, r2              /* r0 = dividend = (CHUNKSIZE << 16) - 1 + divisor */
+#if defined(ARCH_ARM_USE_UDIV)
+        udiv        r1, r0, r2              /* r1 = quotient */
+        mls         r0, r1, r2, r0          /* r0 = remainder = r0 - r1 * r2 */
+#else
+        clz         r3, r2                  /* no udiv: shift-and-subtract division */
+        clz         r1, r0
+        subs        r3, r3, r1              /* shift that aligns the top set bits */
+        movlt       r3, #0                  /* clamp a negative shift to zero */
+        mov         r1, #1
+        lsl         r2, r2, r3              /* r2 = aligned divisor */
+        lsl         r3, r1, r3              /* r3 = quotient bit for that alignment */
+        mov         r1, #0                  /* r1 = quotient accumulator */
+1:      cmp         r2, r0
+        addls       r1, r3                  /* if divisor <= remainder: set the bit... */
+        subls       r0, r2                  /* ...and subtract the divisor */
+        lsrs        r3, r3, #1              /* next lower bit; Z is set after bit 0 */
+        lsr         r2, r2, #1              /* (no S suffix: flags from lsrs survive) */
+        bne         1b
+#endif
+        bx          lr                      /* r0 = remainder, r1 = quotient */
+END(rsdIntrinsicResize_oscctl_K)
+
+/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
+ * For the most part the vertical pass (the outer loop) is the same for all
+ * versions.  Exceptions are handled in-line with conditional assembly.
+ */
+.irp comp, 1, 2, 4
+.if \comp == 1
+.set COMPONENT_SHIFT, 0
+.elseif \comp == 2
+.set COMPONENT_SHIFT, 1
+.elseif \comp == 4
+.set COMPONENT_SHIFT, 2
+.else
+.error "Unknown component count"
+.endif
+.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
+.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
+
+.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
+.set OSC_STORE, (BUFFER_SIZE + 0)
+.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
+.set OSCCTL_STORE, (BUFFER_SIZE + 8)
+.set AVAIL_STORE, (BUFFER_SIZE + 16)
+.set SP_STORE, (BUFFER_SIZE + 24)   /* should be +20, but rounded up to make a legal constant somewhere */
+
+/* void rsdIntrinsicResizeB\comp\()_K(
+ *             uint8_t * restrict dst,          // r0
+ *             size_t count,                    // r1
+ *             uint32_t xf,                     // r2
+ *             uint32_t xinc,                   // r3
+ *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
+ *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
+ *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
+ *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
+ *             size_t xclip,                    // [sp,#16] -> [sp,#120]
+ *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
+ *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
+ *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
+ */
+ENTRY(rsdIntrinsicResizeB\comp\()_K)
+            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+            vpush       {d8-d15}
+
+            /* align the working buffer on the stack to make it easy to use bit
+             * twiddling for address calculations and bounds tests.
+             */
+            sub         r12, sp, #BUFFER_SIZE + 32
+            mov         lr, sp
+            bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
+            mov         sp, r12
+            str         lr, [sp,#SP_STORE]
+
+            ldr         r8, [lr,#136]           // yr
+            adr         r9, 8f
+            vld1.s32    {q4}, [r8]
+            vld1.s16    {q5}, [r9]
+            vqmovun.s32 d8, q4                  // yr
+            vdup.s16    q6, r2
+            vdup.s16    q7, r3
+            vmla.s16    q6, q5, q7              // vxf
+            vshl.s16    q7, q7, #VECSHIFT       // vxinc
+
+            ldrd        r4,r5, [lr,#104]        // srcn, src0
+            ldrd        r6,r7, [lr,#112]        // src1, src2
+
+            /* Compute starting condition for oscillator used to compute ahead
+             * of time how many iterations are possible before needing to
+             * refill the working buffer.  This is based on the fixed-point
+             * index of the last element in the vector of pixels processed in
+             * each iteration, counting up until it would overflow.
+             */
+            sub         r8, r2, r3
+            mov         r9, r3, LSL #VECSHIFT
+            add         r8, r8, r9
+
+            ldrd        r10,r11, [lr,#128]      // osc_ctl
+
+            str         r8, [sp,#OSC_STORE]
+            str         r9, [sp,#OSCSTEP_STORE]
+            str         r10, [sp,#OSCCTL_STORE]
+            str         r11, [sp,#OSCCTL_STORE+4]
+            ldrd        r10,r11, [lr,#120]      // xclip,avail
+
+
+            /* r4-r7 contain pointers to the four lines of input to be
+             * convolved.  These pointers have been clamped vertically and
+             * horizontally (which is why it's not a simple row/stride pair),
+             * and the xclip argument (now in r10) indicates how many pixels
+             * from true the x position of the pointer is.  This value should
+             * be 0, 1, or 2 only.
+             *
+             * Start by placing four pixels worth of input at the far end of
+             * the buffer.  As many as two of these may be clipped, so four
+             * pixels are fetched, and then the first pixel is duplicated and
+             * the data shifted according to xclip.  The source pointers are
+             * then also adjusted according to xclip so that subsequent fetches
+             * match.
+             */
+            vmov        d6, d8  /* make y coeffs available for vert4 and vert8 macros */
+
+            sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
+            add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
+            add         r8, r8, #4 * COMPONENT_COUNT * 2
+.if \comp == 1
+            vert4
+            vdup.s16    d24, d25[0]
+            vst1.s16    {q12}, [r12]
+            vld1.s16    {d24}, [r8]
+            vst1.s16    {d24}, [r9]
+.elseif \comp == 2
+            vert8
+            vdup.u32    q11, d24[0]
+            vst1.s16    {q11,q12}, [r12]
+            vld1.s16    {q12}, [r8]
+            vst1.s16    {q12}, [r9]
+.elseif \comp == 4
+            vert8       d28, d29
+            vert8       d30, d31
+            vmov.u64    d24, d28
+            vmov.u64    d25, d28
+            vmov.u64    d26, d28
+            vmov.u64    d27, d28
+            vst1.s16    {q12,q13}, [r12]!
+            vst1.s16    {q14,q15}, [r12]
+            sub         r12, r12, #32
+            vld1.s16    {q11,q12}, [r8]
+            vst1.s16    {q11,q12}, [r9]
+.endif
+            /* Count off four pixels into the working buffer, and move count to
+             * its new home.
+             */
+            sub         lr, r11, #4
+            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
+             * were read unconditionally, but some may have been discarded by
+             * xclip, so we rewind the pointers to compensate.
+             */
+            sub         r4, r4, r10, LSL #COMPONENT_SHIFT
+            sub         r5, r5, r10, LSL #COMPONENT_SHIFT
+            sub         r6, r6, r10, LSL #COMPONENT_SHIFT
+            sub         r7, r7, r10, LSL #COMPONENT_SHIFT
+
+            /* First tap starts where we just pre-filled, at the end of the
+             * buffer.
+             */
+            add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16
+
+            /* Use overflowing arithmetic to implement wraparound array
+             * indexing.
+             */
+            mov         r2, r2, LSL #(15 - CHUNKSHIFT)
+            mov         r3, r3, LSL #(15 - CHUNKSHIFT)
+
+            str         lr, [sp,#AVAIL_STORE]
+
+            /* Start of outermost loop.
+             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
+             * number of iterations of the inner loop that can be performed and
+             * get into that.
+             *
+             * The fill is complicated by the possibility of running out of
+             * input before the scratch buffer is filled.  If this isn't a risk
+             * then it's handled by the simple loop at 2:, otherwise the
+             * horrible loop at 3:.
+             */
+1:          ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
+            vmov        d6, d8              /* put y scaling coefficients somewhere handy */
+            subs        lr, #CHUNKSIZE
+            bge         2f                  /* if at least CHUNKSIZE are available... */
+            add         lr, #CHUNKSIZE      /* if they're not... */
+            b           4f
+            /* ..just sneaking a literal in here after this unconditional branch.. */
+8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
+            /* basic fill loop, processing 8 bytes at a time until there are
+             * fewer than eight bytes available.
+             */
+3:          vert8
+            sub         lr, lr, #8 / COMPONENT_COUNT
+            vst1.s16    {q12}, [r12]!
+4:          cmp         lr, #8 / COMPONENT_COUNT - 1
+            bgt         3b
+.if \comp == 4
+            blt         3f
+            /* The last pixel (four bytes) if necessary */
+            vert4
+.else
+            cmp         lr, #1
+            blt         3f
+            /* The last pixels if necessary */
+            sub         r4, r4, #8
+            sub         r5, r5, #8
+            sub         r6, r6, #8
+            sub         r7, r7, #8
+            add         r4, r4, lr, LSL #COMPONENT_SHIFT
+            add         r5, r5, lr, LSL #COMPONENT_SHIFT
+            add         r6, r6, lr, LSL #COMPONENT_SHIFT
+            add         r7, r7, lr, LSL #COMPONENT_SHIFT
+            vert8
+            sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
+            sub         sp, sp, #32
+            sub         lr, lr, #16
+.if \comp == 1
+            vdup.s16    q13, d25[3]
+.elseif \comp == 2
+            vdup.u32    q13, d25[1]
+.endif
+            vst1.s16    {q12,q13}, [sp]
+            vld1.s16    {q12}, [lr]
+            add         sp, sp, #32
+            b           4f
+.endif
+            /* Keep filling until we get to the end of this chunk of the buffer */
+3:
+.if \comp == 1
+            vdup.s16    q12, d25[3]
+.elseif \comp == 2
+            vdup.u32    q12, d25[1]
+.elseif \comp == 4
+            vmov.u64    d24, d25
+.endif
+4:          vst1.s16    {q12}, [r12]!
+            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+            bne         3b
+            b           4f
+
+.align 4
+2:          /* Quickly pull a chunk of data into the working buffer.
+             */
+            vert8
+            vst1.s16    {q12}, [r12]!
+            vert8
+            vst1.s16    {q12}, [r12]!
+            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+            bne         2b
+            cmp         lr, #0
+            bne         3f
+4:          /* if we end with 0 pixels left we'll have nothing handy to spread
+             * across to the right, so we rewind a bit.
+             */
+            mov         lr, #1
+            sub         r4, r4, #COMPONENT_COUNT
+            sub         r5, r5, #COMPONENT_COUNT
+            sub         r6, r6, #COMPONENT_COUNT
+            sub         r7, r7, #COMPONENT_COUNT
+3:          str         lr, [sp,#AVAIL_STORE]       /* done with available pixel count */
+            add         lr, sp, #OSC_STORE
+            ldrd        r8,r9, [lr,#0]              /* need osc, osc_step soon */
+            ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */
+
+            /* copy four taps (width of cubic window) to far end for overflow
+             * address handling
+             */
+            sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
+            eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+            vld1.s16    {d28}, [lr]
+.elseif \comp == 2
+            vld1.s16    {q14}, [lr]
+.elseif \comp == 4
+            vld1.s16    {q14,q15}, [lr]
+.endif
+            add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+            vst1.s16    {d28}, [lr]
+.elseif \comp == 2
+            vst1.s16    {q14}, [lr]
+.elseif \comp == 4
+            vst1.s16    {q14,q15}, [lr]
+.endif
+            /* r11 contains the maximum possible iteration count, but if r8 is
+             * greater than r10 then this indicates that the count must be
+             * reduced by one for this iteration to avoid reading past the end
+             * of the available data.
+             */
+            cmp             r10, r8
+            sbc         lr, r11, #0
+
+            mla         r8, lr, r9, r8
+            sub         r8, r8, #(CHUNKSIZE << 16)
+
+            str         r8, [sp,#OSC_STORE]         /* done with osc */
+
+            /* prefer to count pixels, rather than vectors, to clarify the tail
+             * store case on exit.
+             */
+            mov         lr, lr, LSL #VECSHIFT
+            cmp         lr, r1
+            movgt       lr, r1
+
+            sub         r1, r1, lr
+
+            mov         lr, lr, LSL #COMPONENT_SHIFT
+
+            vmov.i16    d10, #3
+            vmov.i16    d11, #0x8000
+
+            cmp         lr, #0
+            bgt         3f
+            cmp         r1, #0
+            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
+            b           9f
+
+            .align 4
+2:          /* Inner loop continues here, but starts at 3:, see end of loop
+             * below for explanation. */
+.if LOOP_OUTPUT_SIZE == 4
+            vst1.u32    {d16[0]}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 8
+            vst1.u8     {d16}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 16
+            vst1.u8     {q8}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 32
+            vst1.u8     {q8,q9}, [r0]!
+.endif
+            /* Inner loop:  here the four x coefficients for each tap are
+             * calculated in vector code, and the addresses are calculated in
+             * scalar code, and these calculations are interleaved.
+             */
+3:          vshr.u16    q8, q6, #1
+            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
+            vqrdmulh.s16 q9, q8, q8
+            add         r2, r2, r3
+            vqrdmulh.s16 q10, q9, q8
+            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
+            vshll.s16   q11, d18, #2
+            vshll.s16   q12, d19, #2
+            add         r2, r2, r3
+            vmlsl.s16   q11, d20, d10
+            vmlsl.s16   q12, d21, d10
+            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
+
+            vhadd.s16   q0, q10, q8
+            add         r2, r2, r3
+            vsub.s16    q0, q9, q0
+            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
+
+            vaddw.s16   q1, q11, d18
+            vaddw.s16   q13, q12, d19
+            add         r2, r2, r3
+            vshrn.s32   d2, q1, #1
+            vshrn.s32   d3, q13, #1
+            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+            vsub.s16    d2, d2, d11
+            vsub.s16    d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
+            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+
+            vaddw.s16   q2, q11, d16
+            vaddw.s16   q13, q12, d17
+            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+            vshrn.s32   d4, q2, #1
+            vshrn.s32   d5, q13, #1
+            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+            vneg.s16    q2, q2
+
+            vhsub.s16   q3, q10, q9
+
+            /* increment the x fractional parts (overflow is ignored, as the
+             * scalar arithmetic shadows this addition with full precision).
+             */
+            vadd.s16    q6, q6, q7
+
+            /* At this point we have four pointers in r8-r11, pointing to the
+             * four taps in the scratch buffer that must be convolved together
+             * to produce an output pixel (one output pixel per pointer).
+             * These pointers usually overlap, but their spacing is irregular
+             * so resolving the redundancy through L1 is a pragmatic solution.
+             *
+             * The scratch buffer is made of signed 16-bit data, holding over
+             * some extra precision, and overshoot, from the vertical pass.
+             *
+             * We also have the 16-bit unsigned fixed-point weights for each
+             * of the four taps in q0 - q3.  That's eight pixels worth of
+             * coefficients when we have only four pointers, so calculations
+             * for four more pixels are interleaved with the fetch and permute
+             * code for each variant in the following code.
+             *
+             * The data arrangement is less than ideal for any pixel format,
+             * but permuting loads help to mitigate most of the problems.
+             *
+             * Note also that the two outside taps of a bicubic are negative,
+             * but these coefficients are unsigned.  The sign is hard-coded by
+             * use of multiply-and-subtract operations.
+             */
+.if \comp == 1
+            /* The uchar1 case.
+             * Issue one lanewise vld4.s16 to load four consecutive pixels from
+             * one pointer (one pixel) into four different registers; then load
+             * four consecutive s16 values from the next pointer (pixel) into
+             * the next lane of those four registers, etc., so that we finish
+             * with q12 - q15 representing the four taps, and each lane
+             * representing a separate pixel.
+             *
+             * The first vld4 uses a splat to avoid any false dependency on
+             * the previous state of the register.
+             */
+            vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
+            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
+            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
+            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
+            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
+            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+            vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
+            vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
+            vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]
+
+            vmull.s16   q8, d24, d0
+            vmull.s16   q9, d25, d1
+            vmlsl.s16   q8, d26, d2
+            vmlsl.s16   q9, d27, d3
+            vmlsl.s16   q8, d28, d4
+            vmlsl.s16   q9, d29, d5
+            vmlal.s16   q8, d30, d6
+            vmlal.s16   q9, d31, d7
+
+            subs        lr, lr, #LOOP_OUTPUT_SIZE
+
+            vqrshrn.s32 d16, q8, #15
+            vqrshrn.s32 d17, q9, #15
+
+            vqrshrun.s16 d16, q8, #VERTBITS - 8
+.elseif \comp == 2
+            /* The uchar2 case:
+             * This time load pairs of values into adjacent lanes in q12 - q15
+             * by aliasing them as u32 data; leaving room for only four pixels,
+             * so the process has to be done twice.  This also means that the
+             * coefficient registers fail to align with the coefficient data
+             * (eight separate pixels), so that has to be doubled-up to match.
+             */
+            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
+            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
+            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
+            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
+            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+
+            /* double-up coefficients to align with component pairs */
+            vmov        d20, d0
+            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+            vmov        d21, d2
+            vmov        d22, d4
+            vmov        d23, d6
+            vzip.s16    d0, d20
+            vzip.s16    d2, d21
+            vzip.s16    d4, d22
+            vzip.s16    d6, d23
+
+            vmull.s16   q8, d24, d0
+            vmull.s16   q9, d25, d20
+            vmlsl.s16   q8, d26, d2
+            vmlsl.s16   q9, d27, d21
+            vmlsl.s16   q8, d28, d4
+            vmlsl.s16   q9, d29, d22
+            vmlal.s16   q8, d30, d6
+            vmlal.s16   q9, d31, d23
+
+            vqrshrn.s32 d16, q8, #15
+            vqrshrn.s32 d17, q9, #15
+
+            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
+            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
+            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
+            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
+
+            /* double-up coefficients to align with component pairs */
+            vmov        d0, d1
+            vmov        d2, d3
+            vmov        d4, d5
+            vmov        d6, d7
+            vzip.s16    d0, d1
+            vzip.s16    d2, d3
+            vzip.s16    d4, d5
+            vzip.s16    d6, d7
+
+            vmull.s16   q10, d24, d0
+            vmull.s16   q11, d25, d1
+            vmlsl.s16   q10, d26, d2
+            vmlsl.s16   q11, d27, d3
+            vmlsl.s16   q10, d28, d4
+            vmlsl.s16   q11, d29, d5
+            vmlal.s16   q10, d30, d6
+            vmlal.s16   q11, d31, d7
+
+            subs        lr, lr, #LOOP_OUTPUT_SIZE
+
+            vqrshrn.s32 d18, q10, #15
+            vqrshrn.s32 d19, q11, #15
+
+            vqrshrun.s16 d16, q8, #VERTBITS - 8
+            vqrshrun.s16 d17, q9, #VERTBITS - 8
+.elseif \comp == 4
+            /* The uchar4 case.
+             * This case is comparatively painless because four s16s are the
+             * smallest addressable unit for a vmul-by-scalar.  Rather than
+             * permute the data, simply arrange the multiplies to suit the way
+             * the data comes in.  That's a lot of data, though, so things
+             * progress in pairs of pixels at a time.
+             */
+            vld1.s16    {q12,q13}, [r8]
+            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld1.s16    {q14,q15}, [r9]
+            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
+            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+
+            vmull.s16   q8, d24, d0[0]
+            vmull.s16   q9, d28, d0[1]
+            vmlsl.s16   q8, d25, d2[0]
+            vmlsl.s16   q9, d29, d2[1]
+            vmlsl.s16   q8, d26, d4[0]
+            vmlsl.s16   q9, d30, d4[1]
+            vmlal.s16   q8, d27, d6[0]
+            vmlal.s16   q9, d31, d6[1]
+
+            /* And two more...  */
+            vld1.s16    {q12,q13}, [r10]
+            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
+            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+            vld1.s16    {q14,q15}, [r11]
+            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
+            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
+            add         r2, r2, r3
+
+            vqrshrn.s32 d16, q8, #15
+            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
+            vqrshrn.s32 d17, q9, #15
+
+            vmull.s16   q10, d24, d0[2]
+            vmull.s16   q11, d28, d0[3]
+            vmlsl.s16   q10, d25, d2[2]
+            vmlsl.s16   q11, d29, d2[3]
+            vmlsl.s16   q10, d26, d4[2]
+            vmlsl.s16   q11, d30, d4[3]
+            vmlal.s16   q10, d27, d6[2]
+            vmlal.s16   q11, d31, d6[3]
+
+            vqrshrn.s32 d18, q10, #15
+            vqrshrn.s32 d19, q11, #15
+
+            vqrshrun.s16 d16, q8, #VERTBITS - 8
+            vqrshrun.s16 d17, q9, #VERTBITS - 8
+
+            /* And two more...  */
+            vld1.s16    {q12,q13}, [r8]
+            vld1.s16    {q14,q15}, [r9]
+
+            vmull.s16   q10, d24, d1[0]
+            vmull.s16   q11, d28, d1[1]
+            vmlsl.s16   q10, d25, d3[0]
+            vmlsl.s16   q11, d29, d3[1]
+            vmlsl.s16   q10, d26, d5[0]
+            vmlsl.s16   q11, d30, d5[1]
+            vmlal.s16   q10, d27, d7[0]
+            vmlal.s16   q11, d31, d7[1]
+
+            /* And two more...  */
+            vld1.s16    {q12,q13}, [r10]
+            vld1.s16    {q14,q15}, [r11]
+
+            subs        lr, lr, #LOOP_OUTPUT_SIZE
+
+            vqrshrn.s32 d18, q10, #15
+            vqrshrn.s32 d19, q11, #15
+
+            vmull.s16   q10, d24, d1[2]
+            vmull.s16   q11, d28, d1[3]
+            vmlsl.s16   q10, d25, d3[2]
+            vmlsl.s16   q11, d29, d3[3]
+            vmlsl.s16   q10, d26, d5[2]
+            vmlsl.s16   q11, d30, d5[3]
+            vmlal.s16   q10, d27, d7[2]
+            vmlal.s16   q11, d31, d7[3]
+
+            vqrshrn.s32 d20, q10, #15
+            vqrshrn.s32 d21, q11, #15
+
+            vqrshrun.s16 d18, q9, #VERTBITS - 8
+            vqrshrun.s16 d19, q10, #VERTBITS - 8
+.endif
+            bgt         2b      /* continue inner loop */
+            /* The inner loop has already been limited to ensure that none of
+             * the earlier iterations could overfill the output, so the store
+             * appears within the loop but after the conditional branch (at the
+             * top).  At the end, provided it won't overfill, perform the final
+             * store here.  If it would, then break out to the tricky tail case
+             * instead.
+             */
+            blt         1f
+            /* Store the amount of data appropriate to the configuration of the
+             * instance being assembled.
+             */
+.if LOOP_OUTPUT_SIZE == 4
+            vst1.u32    {d16[0]}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 8
+            vst1.u8     {d16}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 16
+            vst1.u8     {q8}, [r0]!
+.elseif LOOP_OUTPUT_SIZE == 32
+            vst1.u8     {q8,q9}, [r0]!
+.endif
+            b           1b              /* resume outer loop */
+            /* Partial tail store case:
+             * Different versions of the code need different subsets of the
+             * following partial stores.  Here the number of components and the
+             * size of the chunk of data produced by each inner loop iteration
+             * is tested to figure out whether or not each phrase is relevant.
+             */
+.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
+1:          tst         lr, #16
+            beq         1f
+            vst1.u8     {q8}, [r0]!
+            vmov        q8, q9
+.endif
+.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
+1:          tst         lr, #8
+            beq         1f
+            vst1.u8     {d16}, [r0]!
+            vmov.u8     d16, d17
+.endif
+.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
+1:          tst         lr, #4
+            beq         1f
+            vst1.u32    {d16[0]}, [r0]!
+            vext.u32    d16, d16, d16, #1
+.endif
+.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
+1:          tst         lr, #2
+            beq         1f
+            vst1.u16    {d16[0]}, [r0]!
+            vext.u16    d16, d16, d16, #1
+.endif
+.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
+1:          tst         lr, #1
+            beq         1f
+            vst1.u8     {d16[0]}, [r0]!
+.endif
+1:
+9:          ldr         sp, [sp,#SP_STORE]
+            vpop        {d8-d15}
+            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+END(rsdIntrinsicResizeB\comp\()_K)
+.endr
diff --git a/toolkit/TaskProcessor.cpp b/toolkit/TaskProcessor.cpp
new file mode 100644
index 0000000..d9ae83c
--- /dev/null
+++ b/toolkit/TaskProcessor.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TaskProcessor.h"
+
+#include <assert.h>
+#include <sys/prctl.h>
+
+#include "RenderScriptToolkit.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.TaskProcessor"
+
+namespace android {
+namespace renderscript {
+
+int Task::setTiling(unsigned int targetTileSizeInBytes) {
+    // Targets below 1000 bytes have empirically performed poorly, so enforce that floor.
+    const unsigned int tileBytes = std::max(1000u, targetTileSizeInBytes);
+    // A cell is mVectorSize bytes today. Float support would make it mVectorSize * 4.
+    const size_t bytesPerCell = mVectorSize;
+    const size_t cellBudget = tileBytes / bytesPerCell;
+    assert(cellBudget > 0);
+
+    // Extent of the work: the whole array, or just the restricted rectangle.
+    size_t spanX = mSizeX;
+    size_t spanY = mSizeY;
+    if (mRestriction != nullptr) {
+        assert(mRestriction->endX > mRestriction->startX);
+        assert(mRestriction->endY > mRestriction->startY);
+        spanX = mRestriction->endX - mRestriction->startX;
+        spanY = mRestriction->endY - mRestriction->startY;
+    }
+
+    // Favor wide tiles: the SIMD code is more efficient on long rows. Pick how many tiles a
+    // row needs, then split the row evenly among them, rounding up so that the last tile
+    // of the row still covers every remaining cell.
+    mTilesPerRow = divideRoundingUp(spanX, cellBudget);
+    mCellsPerTileX = divideRoundingUp(spanX, mTilesPerRow);
+
+    // Same approach vertically: the rows a tile can hold given its width, then the number
+    // of tiles per column, then an even split of the rows.
+    const size_t rowBudget = divideRoundingUp(cellBudget, mCellsPerTileX);
+    mTilesPerColumn = divideRoundingUp(spanY, rowBudget);
+    mCellsPerTileY = divideRoundingUp(spanY, mTilesPerColumn);
+
+    // The tiles form a grid of mTilesPerRow by mTilesPerColumn.
+    const size_t tileCount = mTilesPerRow * mTilesPerColumn;
+    return tileCount;
+}
+
+void Task::processTile(unsigned int threadIndex, size_t tileIndex) {
+    // Bounds of the overall work area: the restriction if there is one, the whole array
+    // otherwise. The end coordinates are exclusive.
+    const bool restricted = mRestriction != nullptr;
+    const size_t workStartX = restricted ? mRestriction->startX : 0;
+    const size_t workStartY = restricted ? mRestriction->startY : 0;
+    const size_t workEndX = restricted ? mRestriction->endX : mSizeX;
+    const size_t workEndY = restricted ? mRestriction->endY : mSizeY;
+
+    // The tiles form a 2D grid and are numbered row by row. Recover the column and the
+    // row of this tile in that grid from its index.
+    const size_t gridColumn = tileIndex % mTilesPerRow;
+    const size_t gridRow = tileIndex / mTilesPerRow;
+
+    // Convert the grid position to cell coordinates. The tile sizes were rounded up, so
+    // the last tile of a row or of a column is clipped to the work area.
+    const size_t tileStartX = workStartX + gridColumn * mCellsPerTileX;
+    const size_t tileStartY = workStartY + gridRow * mCellsPerTileY;
+    const size_t tileEndX = std::min(tileStartX + mCellsPerTileX, workEndX);
+    const size_t tileEndY = std::min(tileStartY + mCellsPerTileY, workEndY);
+
+    // Call the derived class to do the specific work. A tile covers entire rows when it
+    // starts at column 0 and ends at the full width of the array.
+    const bool coversEntireRows = tileStartX == 0 && tileEndX == mSizeX;
+    if (!(mPrefersDataAsOneRow && coversEntireRows)) {
+        processData(threadIndex, tileStartX, tileStartY, tileEndX, tileEndY);
+        return;
+    }
+
+    // Some ops are not 2D: hand them all the rows of the tile as one single row of
+    // mSizeX * rowCount cells starting at (0, tileStartY), so that they can cover the tile
+    // with fewer calls.
+    const size_t rowCount = tileEndY - tileStartY;
+    processData(threadIndex, 0, tileStartY, mSizeX * rowCount, tileStartY + 1);
+}
+
+TaskProcessor::TaskProcessor(unsigned int numThreads)
+    : mUsesSimd{cpuSupportsSimd()},
+      /* If the requested number of threads is 0, we'll decide based on the number of cores.
+       * Through empirical testing, we've found that using more than 6 threads does not help.
+       * There may be more optimal choices to make depending on the SoC but we'll stick to
+       * this simple heuristic for now. hardware_concurrency() is allowed to return 0 when
+       * the count is unknown; clamp it to 1 so the unsigned subtraction cannot wrap around.
+       * We re-use the thread that calls doTask, so the pool has one thread less than that.
+       */
+      mNumberOfPoolThreads{
+              numThreads ? numThreads - 1
+                         : std::min(6u, std::max(1u, std::thread::hardware_concurrency()) - 1)} {
+    for (size_t i = 0; i < mNumberOfPoolThreads; i++) {  // Index 0 is the thread calling doTask.
+        mPoolThreads.emplace_back(
+                std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false));
+    }
+}
+
+TaskProcessor::~TaskProcessor() {
+    {
+        std::lock_guard<std::mutex> queueGuard(mQueueMutex);
+        mStopThreads = true;  // Guarded by mQueueMutex, hence the lock.
+        mWorkAvailableOrStop.notify_all();
+    }
+    // Wait for every pool thread to leave its work loop.
+    for (std::thread& worker : mPoolThreads) {
+        worker.join();
+    }
+}
+
+void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) {
+    if (threadIndex != 0) {
+        // Set the name of the thread, except for thread 0, which is not part of the pool.
+        // PR_SET_NAME takes a maximum of 16 characters, including the terminating null.
+        char name[16]{"RenderScToolkit"};
+        prctl(PR_SET_NAME, name, 0, 0, 0);
+        // ALOGI("Starting thread%d", threadIndex);
+    }
+
+    std::unique_lock<std::mutex> lock(mQueueMutex);  // Released in wait() and per tile.
+    while (true) {
+        mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() REQUIRES(mQueueMutex) {
+            return mStopThreads || (mTilesNotYetStarted > 0) ||
+                   (returnWhenNoWork && (mTilesNotYetStarted == 0));
+        });
+        // ALOGI("Woke thread%d", threadIndex);
+
+        // This ScopedLockAssertion is to help the compiler when it checks thread annotations
+        // to realize that we have the lock. It's however not completely true; we don't
+        // hold the lock while processing the tile.
+        // TODO Figure out how to fix that.
+        android::base::ScopedLockAssertion lockAssert(mQueueMutex);
+        if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) {
+            break;  // Stopping, or the caller does not want to wait for more work.
+        }
+
+        while (mTilesNotYetStarted > 0 && !mStopThreads) {
+            // This picks the tiles in decreasing order but that does not matter.
+            int myTile = --mTilesNotYetStarted;
+            mTilesInProcess++;
+            lock.unlock();  // Do not hold the queue lock while processing the tile.
+            {
+                // We won't be executing this code unless the main thread is
+                // holding the mTaskMutex lock, which guards mCurrentTask.
+                // The compiler can't figure this out.
+                android::base::ScopedLockAssertion lockAssert(mTaskMutex);
+                mCurrentTask->processTile(threadIndex, myTile);
+            }
+            lock.lock();  // Needed again to update the counters.
+            mTilesInProcess--;
+            if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) {
+                mWorkIsFinished.notify_one();  // Wakes waitForPoolWorkersToComplete().
+            }
+        }
+    }
+    // if (threadIndex != 0) {
+    //     ALOGI("Ending thread%d", threadIndex);
+    // }
+}
+
+void TaskProcessor::doTask(Task* task) {
+    std::lock_guard<std::mutex> lockGuard(mTaskMutex);  // Only one task runs at a time.
+    task->setUsesSimd(mUsesSimd);
+    mCurrentTask = task;
+    // Notify the thread pool of available work.
+    startWork(task);
+    // Start processing some of the tiles on the calling thread.
+    processTilesOfWork(0, true);
+    // Wait for all the pool workers to complete.
+    waitForPoolWorkersToComplete();
+    mCurrentTask = nullptr;  // Done; do not keep a pointer to the caller's task.
+}
+
+void TaskProcessor::startWork(Task* task) {
+    /**
+     * The size in bytes that we're hoping each tile will be. If this value is too small,
+     * we'll spend too much time in synchronization. If it's too large, some cores may be
+     * idle while others still have a lot of work to do. Ideally, it would depend on the
+     * device we're running. 16k is the same value used by RenderScript and seems reasonable
+     * from ad-hoc tests.
+     */
+    const size_t targetTileSize = 16 * 1024;
+
+    std::lock_guard<std::mutex> lock(mQueueMutex);  // Guards the tile counters.
+    assert(mTilesInProcess == 0);  // No tile of a previous task may still be running.
+    mTilesNotYetStarted = task->setTiling(targetTileSize);  // Also the number of tiles.
+    mWorkAvailableOrStop.notify_all();  // Wake the waiting pool threads.
+}
+
+void TaskProcessor::waitForPoolWorkersToComplete() {
+    std::unique_lock<std::mutex> lock(mQueueMutex);
+    // wait() checks the predicate, i.e. the lambda, before blocking. We therefore return
+    // right away if the workers finished, and signaled mWorkIsFinished, before the main
+    // thread got here. The predicate also protects against spurious wakeups.
+    mWorkIsFinished.wait(lock, [this]() REQUIRES(mQueueMutex) {
+        return mTilesNotYetStarted == 0 && mTilesInProcess == 0;
+    });
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/TaskProcessor.h b/toolkit/TaskProcessor.h
new file mode 100644
index 0000000..4d274fa
--- /dev/null
+++ b/toolkit/TaskProcessor.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
+#define ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
+
+#include <android-base/thread_annotations.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+namespace android {
+namespace renderscript {
+
+/**
+ * Description of the data to be processed for one Toolkit method call, e.g. one blur or one
+ * blend operation.
+ *
+ * The data to be processed is a 2D array of cells. Each cell is a vector of 1 to 4 unsigned bytes.
+ * The most typical configuration is a 2D array of uchar4 used to represent RGBA images.
+ *
+ * This is a base class. There will be a subclass for each Toolkit op.
+ *
+ * Typical usage of a derived class would look like:
+ *    BlurTask task(in, out, sizeX, sizeY, vectorSize, etc);
+ *    processor->doTask(&task);
+ *
+ * The TaskProcessor should call setTiling() and setUsesSimd() once, before calling processTile().
+ * Other classes should not call setTiling(), setUsesSimd(), and processTile().
+ */
+class Task {
+   protected:
+    /**
+     * Number of cells in the X direction.
+     */
+    const size_t mSizeX;
+    /**
+     * Number of cells in the Y direction.
+     */
+    const size_t mSizeY;
+    /**
+     * Number of elements in a vector (cell). From 1-4.
+     */
+    const size_t mVectorSize;
+    /**
+     * Whether the task prefers the processData call to represent the work to be done as
+     * one line rather than a rectangle. This would be the case for work that doesn't involve
+     * vertical neighbors, e.g. blend or histogram. A task would prefer this to minimize the
+     * number of SIMD calls to make, i.e. have one call that covers all the rows.
+     *
+     * This setting will be used only when a tile covers the entire width of the data to be
+     * processed.
+     */
+    const bool mPrefersDataAsOneRow;
+    /**
+     * Whether the processor we're working on supports SIMD operations.
+     */
+    bool mUsesSimd = false;
+
+   private:
+    /**
+     * If not null, we'll process a subset of the whole 2D array. This specifies the restriction.
+     */
+    const struct Restriction* mRestriction;
+
+    /**
+     * We'll divide the work into rectangular tiles. See setTiling().
+     */
+
+    /**
+     * Size of a tile in the X direction, as a number of cells.
+     */
+    size_t mCellsPerTileX = 0;
+    /**
+     * Size of a tile in the Y direction, as a number of cells.
+     */
+    size_t mCellsPerTileY = 0;
+    /**
+     * Number of tiles per row of the restricted area we're working on.
+     */
+    size_t mTilesPerRow = 0;
+    /**
+     * Number of tiles per column of the restricted area we're working on.
+     */
+    size_t mTilesPerColumn = 0;
+
+   public:
+    /**
+     * Construct a task.
+     *
+     * sizeX and sizeY should be greater than 0. vectorSize should be between 1 and 4.
+     * The restriction should outlive this instance. The Toolkit validates the
+     * arguments so we won't do that again here.
+     */
+    Task(size_t sizeX, size_t sizeY, size_t vectorSize, bool prefersDataAsOneRow,
+         const Restriction* restriction)
+        : mSizeX{sizeX},
+          mSizeY{sizeY},
+          mVectorSize{vectorSize},
+          mPrefersDataAsOneRow{prefersDataAsOneRow},
+          mRestriction{restriction} {}
+    virtual ~Task() {}
+
+    void setUsesSimd(bool uses) { mUsesSimd = uses; }
+
+    /**
+     * Divide the work into a number of tiles that can be distributed to the various threads.
+     * A tile will be a rectangular region. To be robust, we'll want to handle regular cases
+     * like 400x300 but also unusual ones like 1x120000, 120000x1, 1x1.
+     *
+     * We have a target size for the tiles, which corresponds roughly to how much data a thread
+     * will want to process before checking for more work. If the target is set too low, we'll spend
+     * more time in synchronization. If it's too large, some cores may not be used as efficiently.
+     *
+     * This method returns the number of tiles.
+     *
+     * @param targetTileSizeInBytes Target size. Values less than 1000 will be treated as 1000.
+     */
+    int setTiling(unsigned int targetTileSizeInBytes);
+
+    /**
+     * This is called by the TaskProcessor to instruct the task to process a tile.
+     *
+     * @param threadIndex The index of the thread that's processing the tile.
+     * @param tileIndex The index of the tile to process.
+     */
+    void processTile(unsigned int threadIndex, size_t tileIndex);
+
+   private:
+    /**
+     * Call to the derived class to process the data bounded by the rectangle specified
+     * by (startX, startY) and (endX, endY). The end values are EXCLUDED. This rectangle
+     * will be contained within the restriction, if one is provided.
+     */
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) = 0;
+};
+
+/**
+ * There's one instance of the task processor for the Toolkit. This class owns the thread pool,
+ * and dispatches the tiles of work to the threads.
+ */
+class TaskProcessor {
+    /**
+     * Does this processor support SIMD-like instructions?
+     */
+    const bool mUsesSimd;
+    /**
+     * The number of separate threads we'll spawn. It's one less than the number of threads that
+     * do the work as the client thread that starts the work will also be used.
+     */
+    const unsigned int mNumberOfPoolThreads;
+    /**
+     * Ensures that only one task is done at a time.
+     */
+    std::mutex mTaskMutex;
+    /**
+     * Ensures consistent access to the shared queue state.
+     */
+    std::mutex mQueueMutex;
+    /**
+     * The thread pool workers.
+     */
+    std::vector<std::thread> mPoolThreads;
+    /**
+     * The task being processed, if any. We only do one task at a time. We could create a queue
+     * of tasks but using a mTaskMutex is sufficient for now.
+     */
+    Task* mCurrentTask GUARDED_BY(mTaskMutex) = nullptr;
+    /**
+     * Signals that the mPoolThreads should terminate.
+     */
+    bool mStopThreads GUARDED_BY(mQueueMutex) = false;
+    /**
+     * Signaled when work is available or the mPoolThreads need to shut down. mStopThreads is used
+     * to distinguish between the two.
+     */
+    std::condition_variable mWorkAvailableOrStop;
+    /**
+     * Signaled when the work for the task is finished.
+     */
+    std::condition_variable mWorkIsFinished;
+    /**
+     * A user task, e.g. a blend or a blur, is split into a number of tiles. When a thread starts
+     * working on a new tile, it uses this count to identify which tile to work on. The tile
+     * number is sufficient to determine the boundaries of the data to process.
+     *
+     * The number of tiles left to process.
+     */
+    int mTilesNotYetStarted GUARDED_BY(mQueueMutex) = 0;
+    /**
+     * The number of tiles currently being processed. Must not be greater than
+     * mNumberOfPoolThreads + 1.
+     */
+    int mTilesInProcess GUARDED_BY(mQueueMutex) = 0;
+
+    /**
+     * Determines how we'll tile the work and signals the thread pool of available work.
+     *
+     * @param task The task to be performed.
+     */
+    void startWork(Task* task) REQUIRES(mTaskMutex);
+
+    /**
+     * Tells the thread to start processing work off the queue.
+     *
+     * The flag is used to prevent the main thread from blocking forever if the work is
+     * so trivial that the worker threads complete the work before the main thread calls this
+     * method.
+     *
+     * @param threadIndex The index (0..mNumberOfPoolThreads) this thread will be referred to by.
+     * @param returnWhenNoWork If there's no work, return immediately.
+     */
+    void processTilesOfWork(int threadIndex, bool returnWhenNoWork);
+
+    /**
+     * Wait for the pool workers to complete the work on the current task.
+     */
+    void waitForPoolWorkersToComplete();
+
+   public:
+    /**
+     * Create the processor.
+     *
+     * @param numThreads The total number of threads to use. If 0, we'll decide based on system
+     * properties.
+     */
+    explicit TaskProcessor(unsigned int numThreads = 0);
+
+    ~TaskProcessor();
+
+    /**
+     * Do the specified task. Returns only after the task has been completed.
+     */
+    void doTask(Task* task);
+
+    /**
+     * Some Tasks need to allocate temporary storage for each worker thread.
+     * This provides the number of threads.
+     */
+    unsigned int getNumberOfThreads() const { return mNumberOfPoolThreads + 1; }
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H
diff --git a/toolkit/TestTaskProcessor.cpp b/toolkit/TestTaskProcessor.cpp
new file mode 100644
index 0000000..36a94f4
--- /dev/null
+++ b/toolkit/TestTaskProcessor.cpp
@@ -0,0 +1,105 @@
+#include <array>
+
+#include "TaskProcessor.h"
+
+/**
+ * Sets all entries of the buffer to a value that depends on its coordinate and a delta.
+ */
+class SimpleTask : public android::renderscript::Task {
+    uint8_t* mBuffer;  // Not owned. Holds mSizeX * mSizeY * mVectorSize bytes.
+    uint8_t mDelta;    // Mixed into every value written, to tell the runs apart.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;  // Fails to compile if the base changes.
+
+   public:
+    SimpleTask(uint8_t* buffer, size_t vectorSize, size_t sizeX, size_t sizeY, uint8_t delta)
+        : Task{sizeX, sizeY, vectorSize, false, nullptr}, mBuffer{buffer}, mDelta{delta} {}
+};
+
+/** Derives a byte from the x, y coordinates and a delta. Only the low 8 bits are kept. */
+static uint8_t newValue(size_t x, size_t y, uint8_t delta) {
+    const size_t packedCoordinates = ((x & 0xff) << 4) | (y & 0xff);
+    const size_t shifted = packedCoordinates + delta;
+    return shifted;
+}
+
+void SimpleTask::processData(int /*threadIndex*/, size_t startX, size_t startY, size_t endX,
+                             size_t endY) {
+    for (size_t row = startY; row < endY; row++) {
+        for (size_t col = startX; col < endX; col++) {
+            // Accumulate rather than assign, so that a cell processed more than once ends
+            // up with the wrong value. This assumes the buffer starts set at 0.
+            uint8_t* cell = mBuffer + (row * mSizeX + col) * mVectorSize;
+            for (size_t lane = 0; lane < mVectorSize; lane++) {
+                cell[lane] += newValue(col, row, mDelta + lane);
+            }
+        }
+    }
+}
+
+/**
+ * Returns true if all the entries of the vector are the expected value.
+ * Prints an error if not.
+ */
+bool verifyAllTheSame(const std::vector<uint8_t>& buffer, size_t vectorSize, size_t sizeX,
+                      size_t sizeY, uint8_t delta) {
+    for (size_t row = 0; row < sizeY; row++) {
+        for (size_t col = 0; col < sizeX; col++) {
+            const uint8_t* cell = buffer.data() + (row * sizeX + col) * vectorSize;
+            for (size_t lane = 0; lane < vectorSize; lane++) {
+                const uint8_t wanted = newValue(col, row, delta + lane);
+                if (cell[lane] != wanted) {
+                    printf("Test Error at %zu, %zu. Expected %u found %u instead\n", col, row,
+                           wanted, cell[lane]);
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+/**
+ * Create a buffer of the specified size, set each entry of that buffer
+ * to the specified value using TaskProcessor, and verify the results.
+ */
+void testOne(android::renderscript::TaskProcessor* processor, uint8_t delta, size_t vectorSize,
+             size_t sizeX, size_t sizeY) {
+    // The vector is zero-initialized, which SimpleTask relies on as it accumulates.
+    std::vector<uint8_t> cells(sizeX * sizeY * vectorSize);
+    SimpleTask fillTask{cells.data(), vectorSize, sizeX, sizeY, delta};
+    processor->doTask(&fillTask);
+    // verifyAllTheSame() reports a mismatch itself. Only the success is printed here.
+    if (verifyAllTheSame(cells, vectorSize, sizeX, sizeY, delta)) {
+        printf("Test %u: All good!\n", delta);
+    }
+}
+
+int main() {
+    std::vector<std::thread> testThreads;
+
+    // Test with multiple threads, on two processors, to help find synchronization errors.
+    android::renderscript::TaskProcessor processorA(1);
+    android::renderscript::TaskProcessor processorB(4);
+    android::renderscript::TaskProcessor* const processors[] = {&processorA, &processorB};
+
+    // The arguments of testOne() that follow the processor.
+    struct TestCase {
+        uint8_t delta;
+        size_t vectorSize, sizeX, sizeY;
+    };
+    const TestCase testCases[] = {{1, 4, 30, 40},     {2, 4, 800, 600}, {3, 1, 123, 47},
+                                  {5, 2, 5000, 8000}, {6, 3, 26000, 1}, {7, 4, 1, 26000},
+                                  {8, 4, 1000, 1000}, {9, 1, 1, 1}};
+    for (const TestCase& test : testCases) {
+        for (android::renderscript::TaskProcessor* processor : processors) {
+            testThreads.emplace_back(testOne, processor, test.delta, test.vectorSize,
+                                     test.sizeX, test.sizeY);
+        }
+    }
+
+    for (std::thread& runner : testThreads) {
+        runner.join();
+    }
+    return 0;
+}
diff --git a/toolkit/Utils.cpp b/toolkit/Utils.cpp
new file mode 100644
index 0000000..8ec9fbe
--- /dev/null
+++ b/toolkit/Utils.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Utils.h"
+
+#include <cpu-features.h>
+
+#include "RenderScriptToolkit.h"
+
+namespace android {
+namespace renderscript {
+
+#define LOG_TAG "renderscript.toolkit.Utils"
+
+bool cpuSupportsSimd() {
+    const AndroidCpuFamily cpuFamily = android_getCpuFamily();
+    const uint64_t cpuFeatures = android_getCpuFeatures();
+
+    // Each CPU family advertises its SIMD extension through a different feature bit.
+    switch (cpuFamily) {
+        case ANDROID_CPU_FAMILY_ARM:
+            return (cpuFeatures & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+        case ANDROID_CPU_FAMILY_ARM64:
+            return (cpuFeatures & ANDROID_CPU_ARM64_FEATURE_ASIMD) != 0;
+        case ANDROID_CPU_FAMILY_X86:
+        case ANDROID_CPU_FAMILY_X86_64:
+            // The x86 check is for SSSE3, not the older SSE3.
+            return (cpuFeatures & ANDROID_CPU_X86_FEATURE_SSSE3) != 0;
+        default:
+            return false;
+    }
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction) {
+    if (restriction == nullptr) {  // No restriction: the whole array is processed.
+        return true;
+    }
+    if (restriction->startX >= sizeX || restriction->endX > sizeX) {
+        ALOGE("%s. sizeX should be greater than restriction->startX and greater or equal to "
+              "restriction->endX. %zu, %zu, and %zu were provided respectively.",
+              tag, sizeX, restriction->startX, restriction->endX);
+        return false;
+    }
+    if (restriction->startY >= sizeY || restriction->endY > sizeY) {  // Either bound suffices.
+        ALOGE("%s. sizeY should be greater than restriction->startY and greater or equal to "
+              "restriction->endY. %zu, %zu, and %zu were provided respectively.",
+              tag, sizeY, restriction->startY, restriction->endY);
+        return false;
+    }
+    if (restriction->startX >= restriction->endX) {  // An empty range is rejected too.
+        ALOGE("%s. Restriction startX should be less than endX. "
+              "%zu and %zu were provided respectively.",
+              tag, restriction->startX, restriction->endX);
+        return false;
+    }
+    if (restriction->startY >= restriction->endY) {
+        ALOGE("%s. Restriction startY should be less than endY. "
+              "%zu and %zu were provided respectively.",
+              tag, restriction->startY, restriction->endY);
+        return false;
+    }
+    return true;
+}
+#endif
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/Utils.h b/toolkit/Utils.h
new file mode 100644
index 0000000..ff9eb43
--- /dev/null
+++ b/toolkit/Utils.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
+#define ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
+
+#include <android/log.h>
+
+namespace android {
+namespace renderscript {
+
+/* The Toolkit does not support floating point buffers but the original RenderScript Intrinsics
+ * did for some operations. That code was preserved and protected by
+ * ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT.
+ */
+// TODO: On final packaging, decide whether this should be defined in the build file, and for which
+// config. #define ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
+
+/* If we release the Toolkit as a C++ API, we'll want to enable validation at the C++ level
+ * by uncommenting this define.
+ *
+ * If we only have a Java/Kotlin API, the Kotlin layer does validation. We don't need to duplicate
+ * this effort.
+ */
+#define ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+
+#define ALOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
+#define ALOGW(...) __android_log_print(ANDROID_LOG_WARN, LOG_TAG, __VA_ARGS__)
+#define ALOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)
+
+using uchar = unsigned char;
+using uint = unsigned int;
+using ushort = unsigned short;
+
+using uint8_t = uchar;
+using uint16_t = ushort;
+using uint32_t = uint;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+
+template <typename TO, typename TI>
+inline TO convert(TI i) {
+    // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
+    // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
+    return __builtin_convertvector(i, TO);  // Element-wise vector conversion.
+}
+
+template <>
+inline uchar convert(float i) {
+    // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
+    // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
+    return (uchar)i;  // Scalar case: a plain cast, with no range check.
+}
+
+template <>
+inline float convert(uchar i) {
+    // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0);
+    // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255);
+    return (float)i;  // Scalar case: a plain cast.
+}
+
+inline int4 clamp(int4 amount, int low, int high) {
+    int4 r;  // Each component is clamped independently to [low, high].
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+    return r;
+}
+
+inline float4 clamp(float4 amount, float low, float high) {
+    float4 r;  // Each component is clamped independently to [low, high].
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+    return r;
+}
+
+inline int2 clamp(int2 amount, int low, int high) {
+    int2 r;  // Each component is clamped independently to [low, high].
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    return r;
+}
+
+inline float2 clamp(float2 amount, float low, float high) {
+    float2 r;  // Each component is clamped independently to [low, high].
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    return r;
+}
+
+inline int clamp(int amount, int low, int high) {
+    return amount >= low ? (amount <= high ? amount : high) : low;
+}
+
+inline float clamp(float amount, float low, float high) {
+    return amount < low ? low : (amount > high ? high : amount);  // A NaN amount is returned.
+}
+
+#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
+struct Restriction;
+
+bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction);
+#endif
+
+/**
+ * Returns true if the processor we're running on supports the SIMD instructions that are
+ * used in our assembly code.
+ */
+bool cpuSupportsSimd();
+
+inline size_t divideRoundingUp(size_t a, size_t b) {
+    return (a % b != 0) ? a / b + 1 : a / b;  // One more when there is a remainder.
+}
+
+inline size_t paddedSize(size_t size) {
+    return size + (size == 3 ? 1 : 0);  // Only a size of 3 is padded, to 4.
+}
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H
diff --git a/toolkit/YuvToRgb.cpp b/toolkit/YuvToRgb.cpp
new file mode 100644
index 0000000..2da0f5c
--- /dev/null
+++ b/toolkit/YuvToRgb.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+
+#include "RenderScriptToolkit.h"
+#include "TaskProcessor.h"
+#include "Utils.h"
+
+#define LOG_TAG "renderscript.toolkit.YuvToRgb"
+
+namespace android {
+namespace renderscript {
+
+inline size_t roundUpTo16(size_t val) {  // Rounds val up to the next multiple of 16.
+    return ((val + 15) >> 4) << 4;
+}
+
+// Task that converts an NV21 or YV12 image into four-byte output pixels, one tile at a time.
+class YuvToRgbTask : public Task {
+    uchar4* mOut;       // Output image; the pixel at (x, y) is mOut[y * mSizeX + x].
+    size_t mCstep;      // Byte distance between the chroma samples of neighbouring pixel pairs.
+    size_t mStrideY;    // Bytes per row of Y samples.
+    size_t mStrideU;    // Bytes per row of the data mInU points into.
+    size_t mStrideV;    // Bytes per row of the data mInV points into.
+    const uchar* mInY;  // First Y sample.
+    const uchar* mInU;  // First U sample.
+    const uchar* mInV;  // First V sample.
+
+    // Converts pixels [xstart, xend) of row currentY; out receives the pixel at xstart.
+    void kernel(uchar4* out, uint32_t xstart, uint32_t xend, uint32_t currentY);
+    // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
+    virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
+                             size_t endY) override;
+
+   public:
+    // Records where each plane of the sizeX by sizeY input starts and how its rows are laid out.
+    YuvToRgbTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
+                 RenderScriptToolkit::YuvFormat format)
+        : Task{sizeX, sizeY, 4, false, nullptr}, mOut{reinterpret_cast<uchar4*>(output)} {
+        if (format == RenderScriptToolkit::YuvFormat::NV21) {
+            // Unpadded Y rows, then rows of interleaved chroma bytes with V first and U second.
+            mInY = reinterpret_cast<const uchar*>(input);
+            mStrideY = mStrideU = mStrideV = sizeX;
+            mInV = mInY + mStrideY * sizeY;
+            mInU = mInV + 1;
+            mCstep = 2;
+        } else if (format == RenderScriptToolkit::YuvFormat::YV12) {
+            // Separate planes. Y rows, and then chroma rows, are padded to multiples of 16 bytes.
+            mInY = reinterpret_cast<const uchar*>(input);
+            mStrideY = roundUpTo16(sizeX);
+            mStrideU = mStrideV = roundUpTo16(mStrideY >> 1);
+            // As written, U is read directly after the Y plane and V after the U plane.
+            mInU = mInY + mStrideY * sizeY;
+            mInV = mInU + mStrideV * sizeY / 2;
+            mCstep = 1;
+        }
+    }
+};
+
+// Converts the tile [startX, endX) x [startY, endY) by running kernel() once per row.
+void YuvToRgbTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
+                               size_t endY) {
+    for (size_t row = startY; row < endY; row++) {
+        // kernel() receives a pointer to the first output pixel of this row's span.
+        kernel(mOut + (mSizeX * row + startX), startX, endX, row);
+    }
+}
+
+// Converts one Y, U, V sample triple into an opaque RGBA pixel using integer arithmetic only.
+static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
+    // Remove the offsets carried by the samples: 16 for luma, 128 for both chroma values.
+    const int luma = static_cast<int>(y) - 16;
+    const int cb = static_cast<int>(u) - 128;
+    const int cr = static_cast<int>(v) - 128;
+
+    // The coefficients are scaled by 256. Adding 128 before shifting right by 8 rounds the
+    // result to the nearest integer instead of always rounding down.
+    const int red = (luma * 298 + cr * 409 + 128) >> 8;
+    const int green = (luma * 298 - cb * 100 - cr * 208 + 128) >> 8;
+    const int blue = (luma * 298 + cb * 516 + 128) >> 8;
+
+    // Saturates a channel to the 0..255 range of an output byte.
+    const auto toByte = [](int channel) {
+        if (channel < 0) {
+            return static_cast<uchar>(0);
+        }
+        if (channel > 255) {
+            return static_cast<uchar>(255);
+        }
+        return static_cast<uchar>(channel);
+    };
+
+    uchar4 pixel;
+    pixel.x = toByte(red);
+    pixel.y = toByte(green);
+    pixel.z = toByte(blue);
+    pixel.w = 255;  // Alpha: always opaque.
+
+    return pixel;
+}
+
+extern "C" {  // NEON kernels. xstart is size_t because the assembly reads the whole register.
+void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, size_t xstart, size_t xend);
+void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, size_t xstart, size_t xend);
+void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart,
+                        size_t xend);
+}
+
+// Converts pixels [xstart, xend) of row currentY to RGBA. out points at the pixel for xstart.
+void YuvToRgbTask::kernel(uchar4 *out, uint32_t xstart, uint32_t xend, uint32_t currentY) {
+    // Row pointers. Two consecutive image rows share one row of chroma, hence currentY >> 1.
+    const uchar *y = mInY + (currentY * mStrideY);
+    const uchar *v = mInV + ((currentY >> 1) * mStrideV);
+    const uchar *u = mInU + ((currentY >> 1) * mStrideU);
+
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    /*
+    ALOGE("pinY, %p, Y, %p, currentY, %d, strideY, %zu", pinY, y, currentY, mStrideY);
+    ALOGE("pinU, %p, U, %p, currentY, %d, strideU, %zu", pinU, u, currentY, mStrideU);
+    ALOGE("pinV, %p, V, %p, currentY, %d, strideV, %zu", pinV, v, currentY, mStrideV);
+    ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX,
+          cp->alloc->mHal.drvState.lod[0].dimY);
+    ALOGE("info->dim.x, %d, info->dim.y, %d", info->dim.x, info->dim.y);
+    uchar* pinY = (uchar*)mInY;
+    uchar* pinU = (uchar*)mInU;
+    uchar* pinV = (uchar*)mInV;
+    ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinY, pinY[0], pinY[1], pinY[2], pinY[3], pinY[4], pinY[5], pinY[6], pinY[7], pinY[8],
+          pinY[9], pinY[10], pinY[11], pinY[12], pinY[13], pinY[14], pinY[15]);
+    ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinY, pinY[16], pinY[17], pinY[18], pinY[19], pinY[20], pinY[21], pinY[22], pinY[23],
+          pinY[24], pinY[25], pinY[26], pinY[27], pinY[28], pinY[29], pinY[30], pinY[31]);
+    ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinY, pinY[32], pinY[33], pinY[34], pinY[35], pinY[36], pinY[37], pinY[38], pinY[39],
+          pinY[40], pinY[41], pinY[42], pinY[43], pinY[44], pinY[45], pinY[46], pinY[47]);
+
+    ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinU, pinU[0], pinU[1], pinU[2], pinU[3], pinU[4], pinU[5], pinU[6], pinU[7], pinU[8],
+          pinU[9], pinU[10], pinU[11], pinU[12], pinU[13], pinU[14], pinU[15]);
+    ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinU, pinU[16], pinU[17], pinU[18], pinU[19], pinU[20], pinU[21], pinU[22], pinU[23],
+          pinU[24], pinU[25], pinU[26], pinU[27], pinU[28], pinU[29], pinU[30], pinU[31]);
+    ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinU, pinU[32], pinU[33], pinU[34], pinU[35], pinU[36], pinU[37], pinU[38], pinU[39],
+          pinU[40], pinU[41], pinU[42], pinU[43], pinU[44], pinU[45], pinU[46], pinU[47]);
+
+    ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinV, pinV[0], pinV[1], pinV[2], pinV[3], pinV[4], pinV[5], pinV[6], pinV[7], pinV[8],
+          pinV[9], pinV[10], pinV[11], pinV[12], pinV[13], pinV[14], pinV[15]);
+    ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinV, pinV[16], pinV[17], pinV[18], pinV[19], pinV[20], pinV[21], pinV[22], pinV[23],
+          pinV[24], pinV[25], pinV[26], pinV[27], pinV[28], pinV[29], pinV[30], pinV[31]);
+    ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
+          "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
+          pinV, pinV[32], pinV[33], pinV[34], pinV[35], pinV[36], pinV[37], pinV[38], pinV[39],
+          pinV[40], pinV[41], pinV[42], pinV[43], pinV[44], pinV[45], pinV[46], pinV[47]);
+    */
+
+    /* If we start on an odd pixel then deal with it here and bump things along
+     * so that subsequent code can carry on with even-odd pairing assumptions.
+     */
+    if((x1 & 1) && (x2 > x1)) {
+        int cx = (x1 >> 1) * mCstep;
+        *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
+        out++;
+        x1++;
+    }
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
+    if((x2 > x1) && mUsesSimd) {
+        int32_t len = x2 - x1;
+        // The assembly routines advance the destination by xstart pixels themselves, so they
+        // must be given the start of the output row rather than the position of pixel x1.
+        if (mCstep == 1) {
+            rsdIntrinsicYuv2_K(out - x1, y, u, v, x1, x2);
+            x1 += len;
+            out += len;
+        } else if (mCstep == 2) {
+            // Check for proper interleave
+            intptr_t ipu = (intptr_t)u;
+            intptr_t ipv = (intptr_t)v;
+
+            if (ipu == (ipv + 1)) {
+                rsdIntrinsicYuv_K(out - x1, y, v, x1, x2);
+                x1 += len;
+                out += len;
+            } else if (ipu == (ipv - 1)) {
+                rsdIntrinsicYuvR_K(out - x1, y, u, x1, x2);
+                x1 += len;
+                out += len;
+            }
+        }
+    }
+#endif
+
+    // Scalar path for whatever is left; two neighbouring pixels share one chroma sample.
+    while(x1 < x2) {
+        int cx = (x1 >> 1) * mCstep;
+        *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
+        out++;
+        x1++;
+        if (x1 < x2) {  // An odd-length span has no second pixel; stay inside [xstart, xend).
+            *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
+            out++;
+            x1++;
+        }
+    }
+}
+
+void RenderScriptToolkit::yuvToRgb(const uint8_t* input, uint8_t* output, size_t sizeX,
+                                   size_t sizeY, YuvFormat format) {
+    YuvToRgbTask conversion{input, output, sizeX, sizeY, format};  // Describes the conversion.
+    processor->doTask(&conversion);  // Hand the conversion to the toolkit's task processor.
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/toolkit/YuvToRgb_advsimd.S b/toolkit/YuvToRgb_advsimd.S
new file mode 100644
index 0000000..bb4b7ae
--- /dev/null
+++ b/toolkit/YuvToRgb_advsimd.S
@@ -0,0 +1,377 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register.  This macro will be called from within several different wrapper
+ * variants for different data layouts.  Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively.  U and V are in
+ * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
+ * are pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern, regu=v10, regv=v11
+        /* v0   out R_lo / even R_lo accumulator
+         * v1   out G_lo / even G_lo accumulator
+         * v2   out B_lo / even B_lo accumulator
+         * v3   out A_lo / const 0xff*ff
+         * v4   out R_hi / even R_hi accumulator
+         * v5   out G_hi / even G_hi accumulator
+         * v6   out B_hi / even B_hi accumulator
+         * v7   out A_hi / const 0xff*ff
+         * v8   even Y   / G_lo luma tmp
+         * v9   odd Y    / G_hi luma tmp
+         * \regu in U
+         * \regv in V
+         * v12  R_lo luma tmp
+         * v13  B_lo luma tmp
+         * v14  R_hi luma tmp
+         * v15  B_hi luma tmp
+         * v16  odd R_lo accumulator
+         * v17  odd G_lo accumulator
+         * v18  odd B_lo accumulator
+         * v19  multiplier extra bits low
+         * v20  odd R_hi accumulator
+         * v21  odd G_hi accumulator
+         * v22  odd B_hi accumulator
+         * v23  multiplier extra bits high
+         * v24  constant 149
+         * v25  constant 50
+         * v26  constant 104
+         * v27  constant 204
+         * v28  constant 254
+         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+         */
+
+        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
+        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
+        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
+        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149
+
+        umull       v8.8h, \regu\().8b, v25.8b     // g2 = u * 50 + v * 104
+        umlal       v8.8h, \regv\().8b, v26.8b
+        umull2      v9.8h, \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
+        umlal2      v9.8h, \regv\().16b, v26.16b
+
+        ushr        v19.16b, \regv\().16b, #1      // v19 = v >> 1, for all 16 bytes
+        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
+        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)
+
+        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
+        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)
+
+        ushll       v19.8h, \regu\().8b,  #2       // v19 = u << 2, widened to 16 bits
+        ushll2      v23.8h, \regu\().16b, #2       // v23 = u_hi << 2, widened to 16 bits
+        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
+        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)
+
+        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
+        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)
+
+        umull       v12.8h, \regv\().8b, v27.8b    // r2 = v * 204
+        umull       v13.8h, \regu\().8b, v28.8b    // b2 = u * 254
+
+        umull2      v14.8h, \regv\().16b, v27.16b  // r2_hi = v_hi * 204
+        umull2      v15.8h, \regu\().16b, v28.16b  // b2_hi = u_hi * 254
+
+        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
+        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
+        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
+        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1
+
+        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
+        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
+        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
+        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1
+
+        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
+        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
+        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
+        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
+        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqrshrn     v0.8b,  v0.8h,  #6          // r0 = satu8((r0 + 32) >> 6)
+        uqrshrn     v16.8b, v16.8h, #6          // r1 = satu8((r1 + 32) >> 6)
+        uqrshrn     v1.8b,  v1.8h,  #7          // g0 = satu8((g0 + 64) >> 7)
+        uqrshrn     v17.8b, v17.8h, #7          // g1 = satu8((g1 + 64) >> 7)
+        uqrshrn     v2.8b,  v2.8h,  #6          // b0 = satu8((b0 + 32) >> 6)
+        uqrshrn     v18.8b, v18.8h, #6          // b1 = satu8((b1 + 32) >> 6)
+
+        uqrshrn     v4.8b,  v4.8h,  #6          // r0_hi = satu8((r0_hi + 32) >> 6)
+        uqrshrn     v20.8b, v20.8h, #6          // r1_hi = satu8((r1_hi + 32) >> 6)
+        uqrshrn     v5.8b,  v5.8h,  #7          // g0_hi = satu8((g0_hi + 64) >> 7)
+        uqrshrn     v21.8b, v21.8h, #7          // g1_hi = satu8((g1_hi + 64) >> 7)
+        uqrshrn     v6.8b,  v6.8h,  #6          // b0_hi = satu8((b0_hi + 32) >> 6)
+        uqrshrn     v22.8b, v22.8h, #6          // b1_hi = satu8((b1_hi + 32) >> 6)
+
+        zip1        v0.16b, v0.16b, v16.16b     // R_lo: even (r0) and odd (r1) results interleaved
+        zip1        v1.16b, v1.16b, v17.16b     // G_lo: even (g0) and odd (g1) results interleaved
+        zip1        v2.16b, v2.16b, v18.16b     // B_lo: even (b0) and odd (b1) results interleaved
+
+        zip1        v4.16b, v4.16b, v20.16b     // R_hi: even and odd results interleaved
+        zip1        v5.16b, v5.16b, v21.16b     // G_hi: even and odd results interleaved
+        zip1        v6.16b, v6.16b, v22.16b     // B_hi: even and odd results interleaved
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+        movi        v24.16b, #149               // multiplier applied to every Y sample
+        movi        v25.16b, #50                // U multiplier for green
+        movi        v26.16b, #104               // V multiplier for green
+        movi        v27.16b, #204               // V multiplier for red
+        movi        v28.16b, #254               // U multiplier for blue
+        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        dup         v29.8h, w5                  // bias subtracted from red
+        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        dup         v30.8h, w5                  // bias added to green
+        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        dup         v31.8h, w5                  // bias subtracted from blue
+
+        movi        v3.16b, #0xff               // alpha for the pixels stored from v0-v3
+        movi        v7.16b, #0xff               // alpha for the pixels stored from v4-v7
+
+        subs        x2, x2, #32                 // x2 = pixels left, biased by -32
+        bhs         1f                          // at least 32 pixels: run the main loop
+        b           2f                          // otherwise go straight to the tail
+
+        .align 4
+1:      ld2         {v8.16b,v9.16b}, [x1], #32  // 32 Y bytes: even ones to v8, odd ones to v9
+  .if \interleaved
+        ld2         {v10.16b,v11.16b}, [x3], #32
+  .else
+        ld1         {v10.16b}, [x3], #16
+        ld1         {v11.16b}, [x4], #16
+  .endif
+
+  .if \swapuv
+        \kernel regu=v11, regv=v10
+  .else
+        \kernel
+  .endif
+
+        subs        x2, x2, #32
+
+        st4         {v0.16b - v3.16b}, [x0], #64  // 16 RGBA pixels
+        st4         {v4.16b - v7.16b}, [x0], #64  // 16 more RGBA pixels
+
+        bhs         1b
+
+2:      adds        x2, x2, #32                 // remove the bias: x2 = pixels left, below 32
+        beq         2f                          // nothing left to do
+
+        /* To handle the tail portion of the data (something less than 32
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * interaction between neighbouring pixels is constrained to odd
+         * boundaries where the load operations don't interfere.
+         */
+        movi        v8.8b, #0
+        movi        v9.8b, #0
+        movi        v10.8b, #0
+        movi        v11.8b, #0
+
+        tbz         x2, #4, 1f                  // 16 pixels in the tail?
+        ld1         {v9.16b}, [x1], #16
+  .if \interleaved
+        ld1         {v11.16b}, [x3], #16
+  .else
+        ld1         {v10.d}[1], [x3], #8
+        ld1         {v11.d}[1], [x4], #8
+  .endif
+1:      tbz         x2, #3, 1f                  // 8 pixels?
+        ld1         {v8.d}[1], [x1], #8
+  .if \interleaved
+        ld1         {v10.d}[1], [x3], #8
+  .else
+        ld1         {v10.s}[1], [x3], #4
+        ld1         {v11.s}[1], [x4], #4
+  .endif
+1:      tbz         x2, #2, 1f                  // 4 pixels?
+        ld1         {v8.s}[1], [x1], #4
+  .if \interleaved
+        ld1         {v10.s}[1], [x3], #4
+  .else
+        ld1         {v10.h}[1], [x3], #2
+        ld1         {v11.h}[1], [x4], #2
+  .endif
+1:      tbz         x2, #1, 1f                  // 2 pixels?
+        ld1         {v8.h}[1], [x1], #2
+  .if \interleaved
+        ld1         {v10.h}[1], [x3], #2
+  .else
+        ld1         {v10.b}[1], [x3], #1
+        ld1         {v11.b}[1], [x4], #1
+  .endif
+1:      tbz         x2, #0, 1f                  // 1 pixel?
+        ld1         {v8.b}[1], [x1], #1
+  .if \interleaved
+        ld1         {v10.h}[0], [x3], #2
+  .else
+        ld1         {v10.b}[0], [x3], #1
+        ld1         {v11.b}[0], [x4], #1
+  .endif
+
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point if necessary.
+         */
+1:      mov         v12.16b, v8.16b
+        uzp1        v8.16b, v12.16b, v9.16b     // v8 = even Y bytes
+        uzp2        v9.16b, v12.16b, v9.16b     // v9 = odd Y bytes
+  .if \interleaved
+        mov         v12.16b, v10.16b
+        uzp1        v10.16b, v12.16b, v11.16b
+        uzp2        v11.16b, v12.16b, v11.16b
+  .endif
+
+  .if \swapuv
+        \kernel regu=v11, regv=v10
+  .else
+        \kernel
+  .endif
+
+        /* As above but with the output; structured stores for partial vectors
+         * aren't available, so the data is re-packed first and stored linearly.
+         */
+        zip1        v16.16b, v0.16b, v2.16b
+        zip2        v18.16b, v0.16b, v2.16b
+        zip1        v17.16b, v1.16b, v3.16b
+        zip2        v19.16b, v1.16b, v3.16b
+        zip1        v0.16b, v16.16b, v17.16b
+        zip2        v1.16b, v16.16b, v17.16b
+        zip1        v2.16b, v18.16b, v19.16b
+        zip2        v3.16b, v18.16b, v19.16b
+
+        /* Luckily v4-v7 don't need to be unzipped because they form a complete
+         * set of four and can be stored using st4. */
+
+        tbz         x2, #4, 1f
+        st4         {v4.16b - v7.16b}, [x0], #64  // 16 pixels
+1:      tbz         x2, #3, 1f
+        st1         {v2.16b,v3.16b}, [x0], #32    // 8 pixels
+1:      tbz         x2, #2, 1f
+        st1         {v1.16b}, [x0], #16           // 4 pixels
+1:      tbz         x2, #1, 1f
+        st1         {v0.d}[1], [x0], #8           // 2 pixels
+1:      tbz         x2, #0, 2f
+        st1         {v0.s}[1], [x0], #4           // 1 pixel
+2:
+.endm
+
+
+/*  void rsdIntrinsicYuv2_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uin,    // x2
+ *          void const *vin,    // x3
+ *          size_t xstart,      // x4
+ *          size_t xend);       // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+        lsr         x6, x4, #1                  // x6 = xstart / 2, the offset into the U and V rows
+        add         x0, x0, x4, LSL #2          // out += xstart * 4 bytes
+        add         x1, x1, x4                  // yin += xstart
+        add         x4, x3, x6                  // x4 = vin + xstart / 2
+        add         x3, x2, x6                  // x3 = uin + xstart / 2
+        sub         x2, x5, x6, LSL #1          // x2 = xend - 2 * (xstart / 2), the pixel count
+
+        sub         x6, sp, #32                 // x6 = upper half of the 64-byte save area
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]      // save the low 64 bits of v8-v11
+        st1         {v12.1d - v15.1d}, [x6]     // save the low 64 bits of v12-v15
+
+        wrap_line yuvkern, 0
+
+        ld1         {v8.1d - v11.1d}, [sp], #32   // restore v8-v11 and pop 32 bytes
+        ld1         {v12.1d - v15.1d}, [sp], #32  // restore v12-v15 and pop 32 bytes
+        ret
+END(rsdIntrinsicYuv2_K)
+
+/*  void rsdIntrinsicYuv_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+        bic         x5, x3, #1                  // x5 = xstart with bit 0 cleared
+        add         x0, x0, x5, LSL #2          // out += x5 * 4 bytes
+        add         x1, x1, x5                  // yin += x5
+        add         x3, x2, x5                  // x3 = uvin + x5
+        sub         x2, x4, x5                  // x2 = xend - x5, the pixel count
+
+        sub         x5, sp, #32                 // x5 = upper half of the 64-byte save area
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]      // save the low 64 bits of v8-v11
+        st1         {v12.1d - v15.1d}, [x5]     // save the low 64 bits of v12-v15
+
+        wrap_line yuvkern, 1, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32   // restore v8-v11 and pop 32 bytes
+        ld1         {v12.1d - v15.1d}, [sp], #32  // restore v12-v15 and pop 32 bytes
+        ret
+END(rsdIntrinsicYuv_K)
+
+/*  void rsdIntrinsicYuvR_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+        bic         x5, x3, #1                  // x5 = xstart with bit 0 cleared
+        add         x0, x0, x5, LSL #2          // out += x5 * 4 bytes
+        add         x1, x1, x5                  // yin += x5
+        add         x3, x2, x5                  // x3 = uvin + x5
+        sub         x2, x4, x5                  // x2 = xend - x5, the pixel count
+
+        sub         x5, sp, #32                 // x5 = upper half of the 64-byte save area
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]      // save the low 64 bits of v8-v11
+        st1         {v12.1d - v15.1d}, [x5]     // save the low 64 bits of v12-v15
+
+        wrap_line yuvkern, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32   // restore v8-v11 and pop 32 bytes
+        ld1         {v12.1d - v15.1d}, [sp], #32  // restore v12-v15 and pop 32 bytes
+        ret
+END(rsdIntrinsicYuvR_K)
diff --git a/toolkit/YuvToRgb_neon.S b/toolkit/YuvToRgb_neon.S
new file mode 100644
index 0000000..5c3bce4
--- /dev/null
+++ b/toolkit/YuvToRgb_neon.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register.  This macro will be called from within several different wrapper
+ * variants for different data layouts.  Y data starts in q8, but with the even
+ * and odd bytes split into d16 and d17 respectively.  U and V are in d20
+ * and d21.  Working constants are pre-loaded into q13-q15, and q3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern
+        vmov.i8     d15, #149           // multiplier applied to every Y sample
+
+        vmull.u8    q1, d16, d15        // g0 = y0 * 149
+        vmull.u8    q5, d17, d15        // g1 = y1 * 149
+
+        vmov.i8     d14, #50            // U multiplier for green
+        vmov.i8     d15, #104           // V multiplier for green
+        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
+        vmlal.u8    q8, d21, d15
+
+        vshr.u8     d14, d21, #1        // d14 = v >> 1
+        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
+        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)
+
+        vshll.u8    q7, d20, #2         // q7 = u << 2, widened to 16 bits
+        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
+        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)
+
+        vmov.i8     d14, #204           // V multiplier for red
+        vmov.i8     d15, #254           // U multiplier for blue
+        vmull.u8    q11, d21, d14       // r2 = v * 204
+        vmull.u8    q12, d20, d15       // b2 = u * 254
+
+        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
+        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
+        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
+        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1
+
+        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
+        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
+        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        vqrshrn.u16 d0, q0, #6          // d0 = satu8((r0 + 32) >> 6)
+        vqrshrn.u16 d1, q1, #7          // d1 = satu8((g0 + 64) >> 7)
+        vqrshrn.u16 d2, q4, #6          // d2 = satu8((r1 + 32) >> 6)
+        vqrshrn.u16 d3, q5, #7          // d3 = satu8((g1 + 64) >> 7)
+        vqrshrn.u16 d4, q2, #6          // d4 = satu8((b0 + 32) >> 6)
+        vqrshrn.u16 d5, q6, #6          // d5 = satu8((b1 + 32) >> 6)
+
+        vzip.u8     q0, q1              // q0 = R, q1 = G, with even and odd results interleaved
+        vzip.u8     d4, d5              // q2 = B, with even and odd results interleaved
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        vdup.i16    q13, r5             // bias subtracted from red
+        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        vdup.i16    q14, r5             // bias added to green
+        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        vdup.i16    q15, r5             // bias subtracted from blue
+
+        vmov.i8     q3, #0xff           // constant alpha channel
+
+        subs        r2, #16             // r2 = pixels left, biased by -16
+        bhs         1f                  // at least 16 pixels: run the main loop
+        b           2f                  // otherwise go straight to the tail
+
+        .align 4
+1:      vld2.u8     {d16,d17}, [r1]!    // 16 Y bytes: even ones to d16, odd ones to d17
+        pld         [r1, #256]
+  .if \interleaved
+        vld2.u8     {d20,d21}, [r3]!
+    .if \swapuv
+        vswp        d20, d21
+    .endif
+        pld         [r3, #256]
+  .else
+        vld1.u8     d20, [r3]!
+        vld1.u8     d21, [r4]!
+        pld         [r3, #128]
+        pld         [r4, #128]
+  .endif
+
+        \kernel
+
+        subs        r2, #16
+
+        vst4.u8     {d0,d2,d4,d6}, [r0]!    // 8 RGBA pixels
+        vst4.u8     {d1,d3,d5,d7}, [r0]!    // 8 more RGBA pixels
+
+        bhs         1b
+
+2:      adds        r2, #16             // remove the bias: r2 = pixels left, below 16
+        beq         2f                  // nothing left to do
+
+        /* To handle the tail portion of the data (something less than 16
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * interaction between neighbouring pixels is constrained to odd
+         * boundaries where the load operations don't interfere.
+         */
+        vmov.i8     q8, #0
+        vmov.i8     q10, #0
+
+        tst         r2, #8              // 8 pixels in the tail?
+        beq         1f
+        vld1.u8     d17, [r1]!
+  .if \interleaved
+        vld1.u8     d21, [r3]!
+  .else
+        vld1.u32    d20[1], [r3]!
+        vld1.u32    d21[1], [r4]!
+  .endif
+
+1:      tst         r2, #4              // 4 pixels?
+        beq         1f
+        vld1.u32    d16[1], [r1]!
+  .if \interleaved
+        vld1.u32    d20[1], [r3]!
+  .else
+        vld1.u16    d20[1], [r3]!
+        vld1.u16    d21[1], [r4]!
+  .endif
+1:      tst         r2, #2              // 2 pixels?
+        beq         1f
+        vld1.u16    d16[1], [r1]!
+  .if \interleaved
+        vld1.u16    d20[1], [r3]!
+  .else
+        vld1.u8     d20[1], [r3]!
+        vld1.u8     d21[1], [r4]!
+  .endif
+1:      tst         r2, #1              // 1 pixel?
+        beq         1f
+        vld1.u8     d16[1], [r1]!
+  .if \interleaved
+        vld1.u16    d20[0], [r3]!
+  .else
+        vld1.u8     d20[0], [r3]!
+        vld1.u8     d21[0], [r4]!
+  .endif
+
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point if necessary.
+         */
+1:      vuzp.8      d16, d17            // d16 = even Y bytes, d17 = odd Y bytes
+  .if \interleaved
+        vuzp.8      d20, d21
+    .if \swapuv
+        vswp        d20, d21
+    .endif
+  .endif
+
+        \kernel
+
+        /* As above but with the output; structured stores for partial vectors
+         * aren't available, so the data is re-packed first and stored linearly.
+         */
+        vzip.8  q0, q2
+        vzip.8  q1, q3
+        vzip.8  q0, q1
+        vzip.8  q2, q3
+
+1:      tst         r2, #8
+        beq         1f
+        vst1.u8     {d4,d5,d6,d7}, [r0]!    // 8 pixels
+
+1:      tst         r2, #4
+        beq         1f
+        vst1.u8     {d2,d3}, [r0]!          // 4 pixels
+1:      tst         r2, #2
+        beq         1f
+        vst1.u8     d1, [r0]!               // 2 pixels
+1:      tst         r2, #1
+        beq         2f
+        vst1.u32    d0[1], [r0]!            // 1 pixel
+2:
+.endm
+
+
+/*  void rsdIntrinsicYuv2_K(
+ *          void *out,          // r0
+ *          void const *yin,    // r1
+ *          void const *uin,    // r2
+ *          void const *vin,    // r3
+ *          size_t xstart,      // [sp]
+ *          size_t xend);       // [sp+#4]
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+        push        {r4,r5}
+        ldr         r5, [sp, #8]        // r5 = xstart; stack arguments sit above the 8 pushed bytes
+        mov         r4, r3              // r4 = vin
+        mov         r3, r2              // r3 = uin
+        ldr         r2, [sp, #12]       // r2 = xend
+
+        add         r0, r5, LSL #2      // out += xstart * 4 bytes
+        add         r1, r5              // yin += xstart
+        add         r3, r5, LSR #1      // uin += xstart / 2
+        add         r4, r5, LSR #1      // vin += xstart / 2
+        sub         r2, r5              // r2 = xend - xstart, the pixel count
+
+        vpush       {d8-d15}            // the kernel uses q4-q7 as scratch
+
+        wrap_line yuvkern, 0
+
+        vpop        {d8-d15}
+        pop         {r4,r5}
+        bx lr
+END(rsdIntrinsicYuv2_K)
+
+/*  void rsdIntrinsicYuv_K(
+ *          void *out,          // r0
+ *          void const *yin,    // r1
+ *          void const *uvin,   // r2
+ *          size_t xstart,      // r3
+ *          size_t xend);       // [sp]
+ */
+ENTRY(rsdIntrinsicYuv_K)
+        push        {r4,r5}
+        bic         r4, r3, #1          // r4 = xstart with bit 0 cleared
+        add         r3, r2, r4          // r3 = uvin + r4
+        ldr         r2, [sp, #8]        // r2 = xend; stack arguments sit above the 8 pushed bytes
+
+        add         r0, r4, LSL #2      // out += r4 * 4 bytes
+        add         r1, r4              // yin += r4
+        sub         r2, r4              // r2 = xend - r4, the pixel count
+
+        vpush       {d8-d15}            // the kernel uses q4-q7 as scratch
+
+        wrap_line yuvkern, 1, 1
+
+        vpop        {d8-d15}
+        pop         {r4,r5}
+        bx lr
+END(rsdIntrinsicYuv_K)
+
+/*  void rsdIntrinsicYuvR_K(
+ *          void *out,          // r0
+ *          void const *yin,    // r1
+ *          void const *uvin,   // r2
+ *          size_t xstart,      // r3
+ *          size_t xend);       // [sp]
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+        push        {r4,r5}
+        bic         r4, r3, #1          // r4 = xstart with bit 0 cleared
+        add         r3, r2, r4          // r3 = uvin + r4
+        ldr         r2, [sp, #8]        // r2 = xend; stack arguments sit above the 8 pushed bytes
+
+        add         r0, r4, LSL #2      // out += r4 * 4 bytes
+        add         r1, r4              // yin += r4
+        sub         r2, r4              // r2 = xend - r4, the pixel count
+
+        vpush       {d8-d15}            // the kernel uses q4-q7 as scratch
+
+        wrap_line yuvkern, 1
+
+        vpop        {d8-d15}
+        pop         {r4,r5}
+        bx lr
+END(rsdIntrinsicYuvR_K)
diff --git a/toolkit/java/Toolkit.kt b/toolkit/java/Toolkit.kt
new file mode 100644
index 0000000..438f241
--- /dev/null
+++ b/toolkit/java/Toolkit.kt
@@ -0,0 +1,1566 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.android.renderscript
+
+import android.graphics.Bitmap
+import java.lang.IllegalArgumentException
+
+// Library name placed at the start of every error message raised by the Toolkit.
+private const val externalName: String = "RenderScript Toolkit"
+
+/**
+ * A collection of high-performance graphic utility functions like blur and blend.
+ *
+ * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve,
+ * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute
+ * multithreaded on the CPU.
+ *
+ * Most of the functions have two variants: one that manipulates Bitmaps, the other ByteArrays.
+ * For ByteArrays, you need to specify the width and height of the data to be processed, as
+ * well as the number of bytes per pixel. For most use cases, this will be 4.
+ *
+ * The Toolkit creates a thread pool that's used for processing the functions. The threads live
+ * for the duration of the application. They can be destroyed by calling the method shutdown().
+ *
+ * This library is thread safe. You can call methods from different threads. The functions will
+ * execute sequentially.
+ *
+ * A native C++ version of this Toolkit is available. Check the RenderScriptToolkit.h file in the
+ * cpp directory.
+ *
+ * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared
+ * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However RenderScript
+ * Intrinsics allow more flexibility for the type of allocation supported. In particular, this
+ * toolkit does not support allocations of floats.
+ */
+object Toolkit {
+    /**
+     * Blends a source buffer with the destination buffer.
+     *
+     * Blends a source buffer and a destination buffer, placing the result in the destination
+     * buffer. The blending is done pairwise between two corresponding RGBA values found in
+     * each buffer. The mode parameter specifies one of fifteen supported blending operations.
+     * See [BlendingMode].
+     *
+     * A variant of this method is also available to blend Bitmaps.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The source and destination buffer must have the same dimensions. Both arrays should have
+     * a size greater or equal to sizeX * sizeY * 4. The buffers have a row-major layout.
+     *
+     * @param mode The specific blending operation to do.
+     * @param sourceArray The RGBA input buffer.
+     * @param destArray The destination buffer. Used for input and output.
+     * @param sizeX The width of both buffers, as a number of RGBA values.
+     * @param sizeY The height of both buffers, as a number of RGBA values.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    @JvmOverloads
+    fun blend(
+        mode: BlendingMode,
+        sourceArray: ByteArray,
+        destArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        restriction: Range2d? = null
+    ) {
+        // The required size is computed as a Long so that large dimensions cannot overflow.
+        require(sourceArray.size >= sizeX.toLong() * sizeY * 4) {
+            "$externalName blend. sourceArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*4 > ${sourceArray.size}."
+        }
+        require(destArray.size >= sizeX.toLong() * sizeY * 4) {
+            "$externalName blend. destArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*4 > ${destArray.size}."
+        }
+        validateRestriction("blend", sizeX, sizeY, restriction)
+        nativeBlend(nativeHandle, mode.value, sourceArray, destArray, sizeX, sizeY, restriction)
+    }
+
+    /**
+     * Blends a source bitmap with the destination bitmap.
+     *
+     * Blends a source bitmap and a destination bitmap, placing the result in the destination
+     * bitmap. The blending is done pairwise between two corresponding RGBA values found in
+     * each bitmap. The mode parameter specifies one of fifteen supported blending operations.
+     * See [BlendingMode].
+     *
+     * A variant of this method is available to blend ByteArrays.
+     *
+     * The bitmaps should have identical width and height, and have a config of ARGB_8888.
+     * Bitmaps with a stride different than width * vectorSize are not currently supported.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each bitmap. If provided, the range must be wholly contained with the dimensions
+     * of the bitmap.
+     *
+     * @param mode The specific blending operation to do.
+     * @param sourceBitmap The RGBA input buffer.
+     * @param destBitmap The destination buffer. Used for input and output.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     */
+    @JvmOverloads
+    fun blend(
+        mode: BlendingMode,
+        sourceBitmap: Bitmap,
+        destBitmap: Bitmap,
+        restriction: Range2d? = null
+    ) {
+        validateBitmap("blend", sourceBitmap)
+        validateBitmap("blend", destBitmap)
+        require(
+            sourceBitmap.width == destBitmap.width &&
+                    sourceBitmap.height == destBitmap.height
+        ) {
+            "$externalName blend. Source and destination bitmaps should be the same size. " +
+                    "${sourceBitmap.width}x${sourceBitmap.height} and " +
+                    "${destBitmap.width}x${destBitmap.height} provided."
+        }
+        require(sourceBitmap.config == destBitmap.config) {
+            "$externalName blend. Source and destination bitmaps should have the same " +
+                    "config. ${sourceBitmap.config} and ${destBitmap.config} provided."
+        }
+        validateRestriction("blend", sourceBitmap.width, sourceBitmap.height, restriction)
+
+        nativeBlendBitmap(nativeHandle, mode.value, sourceBitmap, destBitmap, restriction)
+    }
+
+    /**
+     * Blurs an image.
+     *
+     * Performs a Gaussian blur of an image and returns result in a ByteArray buffer. A variant of
+     * this method is available to blur Bitmaps.
+     *
+     * The radius determines which pixels are used to compute each blurred pixels. This Toolkit
+     * accepts values between 1 and 25. Larger values create a more blurred effect but also
+     * take longer to compute. When the radius extends past the edge, the edge pixel will
+     * be used as replacement for the pixel that's out of bounds.
+     *
+     * Each input pixel can either be represented by four bytes (RGBA format) or one byte
+     * for the less common blurring of alpha channel only image.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY. NOTE: The output buffer will still be full size, with the
+     * section that's not blurred all set to 0. This is to stay compatible with RenderScript.
+     *
+     * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. It has a
+     * row-major layout.
+     *
+     * @param inputArray The buffer of the image to be blurred.
+     * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA.
+     * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells.
+     * @param radius The radius of the pixels used to blur, a value from 1 to 25.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The blurred pixels, a ByteArray of the same size as inputArray.
+     */
+    @JvmOverloads
+    fun blur(
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        radius: Int = 5,
+        restriction: Range2d? = null
+    ): ByteArray {
+        require(vectorSize == 1 || vectorSize == 4) {
+            "$externalName blur. The vectorSize should be 1 or 4. $vectorSize provided."
+        }
+        // The required size is computed as a Long so that large dimensions cannot overflow.
+        require(inputArray.size >= sizeX.toLong() * sizeY * vectorSize) {
+            "$externalName blur. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*$vectorSize > ${inputArray.size}."
+        }
+        require(radius in 1..25) {
+            "$externalName blur. The radius should be between 1 and 25. $radius provided."
+        }
+        validateRestriction("blur", sizeX, sizeY, restriction)
+        val outputArray = ByteArray(inputArray.size)
+        nativeBlur(
+            nativeHandle, inputArray, vectorSize, sizeX, sizeY, radius, outputArray, restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Blurs an image.
+     *
+     * Performs a Gaussian blur of a Bitmap and returns result as a Bitmap. A variant of
+     * this method is available to blur ByteArrays.
+     *
+     * The radius determines which pixels are used to compute each blurred pixels. This Toolkit
+     * accepts values between 1 and 25. Larger values create a more blurred effect but also
+     * take longer to compute. When the radius extends past the edge, the edge pixel will
+     * be used as replacement for the pixel that's out of bounds.
+     *
+     * This method supports input Bitmap of config ARGB_8888 and ALPHA_8. Bitmaps with a stride
+     * different than width * vectorSize are not currently supported. The returned Bitmap has the
+     * same config.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of the bitmap. If provided, the range must be wholly contained within the dimensions
+     * of the input bitmap. NOTE: The output Bitmap will still be full size, with the
+     * section that's not blurred all set to 0. This is to stay compatible with RenderScript.
+     *
+     * @param inputBitmap The buffer of the image to be blurred.
+     * @param radius The radius of the pixels used to blur, a value from 1 to 25. Default is 5.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The blurred Bitmap.
+     */
+    @JvmOverloads
+    fun blur(inputBitmap: Bitmap, radius: Int = 5, restriction: Range2d? = null): Bitmap {
+        // Validate every argument before anything is handed to native code.
+        validateBitmap("blur", inputBitmap)
+        require(1 <= radius && radius <= 25) {
+            "$externalName blur. The radius should be between 1 and 25. $radius provided."
+        }
+        validateRestriction("blur", inputBitmap.width, inputBitmap.height, restriction)
+        return createCompatibleBitmap(inputBitmap).also { blurred ->
+            nativeBlurBitmap(nativeHandle, inputBitmap, blurred, radius, restriction)
+        }
+    }
+
+    /**
+     * Identity matrix that can be passed to the [colorMatrix] method.
+     *
+     * Using this matrix will result in no change to the pixel through multiplication although
+     * the pixel value can still be modified by the add vector, or transformed to a different
+     * format.
+     */
+    val identityMatrix: FloatArray
+        get() {
+            // A new array per access: start from zeroes and set the four diagonal entries.
+            val matrix = FloatArray(16)
+            for (i in 0 until 4) matrix[i * 5] = 1f
+            return matrix
+        }
+
+    /**
+     * Matrix to turn color pixels to a grey scale.
+     *
+     * Use this matrix with the [colorMatrix] method to convert an
+     * image from color to greyscale.
+     */
+    val greyScaleColorMatrix: FloatArray
+        get() {
+            // Each of the first three rows repeats one weight across its first three
+            // entries; the last row is (0, 0, 0, 1).
+            val (r, g, b) = floatArrayOf(0.299f, 0.587f, 0.114f)
+            return floatArrayOf(r, r, r, 0f, g, g, g, 0f, b, b, b, 0f, 0f, 0f, 0f, 1f)
+        }
+
+    /**
+     * Matrix to convert RGB to YUV.
+     *
+     * Use this matrix with the [colorMatrix] method to convert the
+     * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha
+     * channel) untouched.
+     *
+     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
+     * by this method.
+     */
+    val rgbToYuvMatrix: FloatArray
+        get() = listOf(
+            floatArrayOf(0.299f, -0.14713f, 0.615f, 0f),
+            floatArrayOf(0.587f, -0.28886f, -0.51499f, 0f),
+            floatArrayOf(0.114f, 0.436f, -0.10001f, 0f),
+            floatArrayOf(0f, 0f, 0f, 1f)
+        ).flatMap { row -> row.asList() }.toFloatArray()
+
+    /**
+     * Matrix to convert YUV to RGB.
+     *
+     * Use this matrix with the [colorMatrix] method to convert the
+     * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha
+     * channel) untouched.
+     *
+     * This is a simplistic conversion. Most YUV buffers have more complicated format, not supported
+     * by this method. Use [yuvToRgb] to convert these buffers.
+     */
+    val yuvToRgbMatrix: FloatArray
+        // Assembled one row of four values at a time; every access returns a newly
+        // allocated array of 16 floats.
+        get() = floatArrayOf(1f, 1f, 1f, 0f) +
+            floatArrayOf(0f, -0.39465f, 2.03211f, 0f) +
+            floatArrayOf(1.13983f, -0.5806f, 0f, 0f) +
+            floatArrayOf(0f, 0f, 0f, 1f)
+
+    /**
+     * Transform an image using a color matrix.
+     *
+     * Converts a 2D array of vectors of unsigned bytes, multiplying each vectors by a 4x4 matrix
+     * and adding an optional vector.
+     *
+     * Each input vector is composed of 1-4 unsigned bytes. If less than 4 bytes, it's extended to
+     * 4, padding with zeroes. The unsigned bytes are converted from 0-255 to 0.0-1.0 floats
+     * before the multiplication is done.
+     *
+     * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
+     * If the output vector size is less than four, the unused channels are discarded.
+     *
+     * If addVector is not specified, a vector of zeroes is added, i.e. a noop.
+     *
+     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
+     *
+     * Check identityMatrix, greyScaleColorMatrix, rgbToYuvMatrix, and yuvToRgbMatrix for sample
+     * matrices. The YUV conversion may not work for all color spaces.
+     *
+     * @param inputArray The buffer of the image to be converted.
+     * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4.
+     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+     * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4.
+     * @param matrix The 4x4 matrix to multiply, in row major format.
+     * @param addVector A vector of four floats that's added to the result of the multiplication.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The converted buffer.
+     */
+    @JvmOverloads
+    fun colorMatrix(
+        inputArray: ByteArray,
+        inputVectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        outputVectorSize: Int,
+        matrix: FloatArray,
+        addVector: FloatArray = floatArrayOf(0f, 0f, 0f, 0f),
+        restriction: Range2d? = null
+    ): ByteArray {
+        require(inputVectorSize in 1..4) {
+            "$externalName colorMatrix. The inputVectorSize should be between 1 and 4. " +
+                    "$inputVectorSize provided."
+        }
+        require(outputVectorSize in 1..4) {
+            "$externalName colorMatrix. The outputVectorSize should be between 1 and 4. " +
+                    "$outputVectorSize provided."
+        }
+        // The required size is computed as a Long so that large dimensions cannot overflow.
+        require(inputArray.size >= sizeX.toLong() * sizeY * inputVectorSize) {
+            "$externalName colorMatrix. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*$inputVectorSize > ${inputArray.size}."
+        }
+        require(matrix.size == 16) {
+            "$externalName colorMatrix. matrix should have 16 entries. ${matrix.size} provided."
+        }
+        require(addVector.size == 4) {
+            "$externalName colorMatrix. addVector should have 4 entries. " +
+                    "${addVector.size} provided."
+        }
+        validateRestriction("colorMatrix", sizeX, sizeY, restriction)
+        val outputArray = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
+        nativeColorMatrix(
+            nativeHandle, inputArray, inputVectorSize, sizeX, sizeY, outputArray, outputVectorSize,
+            matrix, addVector, restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Transform an image using a color matrix.
+     *
+     * Converts a bitmap, multiplying each RGBA value by a 4x4 matrix and adding an optional vector.
+     * Each byte of the RGBA is converted from 0-255 to 0.0-1.0 floats before the multiplication
+     * is done.
+     *
+     * Bitmaps with a stride different than width * vectorSize are not currently supported.
+     *
+     * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output.
+     *
+     * If addVector is not specified, a vector of zeroes is added, i.e. a noop.
+     *
+     * Check identityMatrix, greyScaleColorMatrix, rgbToYuvMatrix, and yuvToRgbMatrix for sample
+     * matrices. The YUV conversion may not work for all color spaces.
+     *
+     * @param inputBitmap The image to be converted.
+     * @param matrix The 4x4 matrix to multiply, in row major format.
+     * @param addVector A vector of four floats that's added to the result of the multiplication.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The converted Bitmap.
+     */
+    @JvmOverloads
+    fun colorMatrix(
+        inputBitmap: Bitmap,
+        matrix: FloatArray,
+        addVector: FloatArray = floatArrayOf(0f, 0f, 0f, 0f),
+        restriction: Range2d? = null
+    ): Bitmap {
+        // Validate every argument before anything is handed to native code.
+        validateBitmap("colorMatrix", inputBitmap)
+        require(matrix.size == 16) {
+            "$externalName colorMatrix. matrix should have 16 entries. ${matrix.size} provided."
+        }
+        require(addVector.size == 4) {
+            // Report the offending size, as the matrix check above does.
+            "$externalName colorMatrix. addVector should have 4 entries. " +
+                    "${addVector.size} provided."
+        }
+        validateRestriction("colorMatrix", inputBitmap.width, inputBitmap.height, restriction)
+
+        // The result is written to a separate bitmap obtained from createCompatibleBitmap().
+        val outputBitmap = createCompatibleBitmap(inputBitmap)
+        nativeColorMatrixBitmap(
+            nativeHandle, inputBitmap, outputBitmap,
+            matrix, addVector, restriction
+        )
+        return outputBitmap
+    }
+
+    /**
+     * Convolve a ByteArray.
+     *
+     * Applies a 3x3 or 5x5 convolution to the input array using the provided coefficients.
+     * A variant of this method is available to convolve Bitmaps.
+     *
+     * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
+     * The coefficients should be provided in row-major format.
+     *
+     * When the square extends past the edge, the edge values will be used as replacement for the
+     * values that are out of bounds.
+     *
+     * Each input cell can either be represented by one to four bytes. Each byte is multiplied
+     * and accumulated independently of the other bytes of the cell.
+     *
+     * An optional range parameter can be set to restrict the convolve operation to a rectangular
+     * subset of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY. NOTE: The output buffer will still be full size, with the
+     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
+     *
+     * The source array should be large enough for sizeX * sizeY * vectorSize bytes. It has a
+     * row-major layout. The output array will have the same dimensions.
+     *
+     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
+     *
+     * @param inputArray The buffer of the image to be convolved.
+     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+     * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells.
+     * @param coefficients A FloatArray of size 9 or 25, containing the multipliers.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The convolved array.
+     */
+    @JvmOverloads
+    fun convolve(
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        coefficients: FloatArray,
+        restriction: Range2d? = null
+    ): ByteArray {
+        require(vectorSize in 1..4) {
+            "$externalName convolve. The vectorSize should be between 1 and 4. " +
+                    "$vectorSize provided."
+        }
+        // The required size is computed as a Long so that large dimensions cannot overflow.
+        require(inputArray.size >= sizeX.toLong() * sizeY * vectorSize) {
+            "$externalName convolve. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*$vectorSize > ${inputArray.size}."
+        }
+        require(coefficients.size == 9 || coefficients.size == 25) {
+            "$externalName convolve. Only 3x3 or 5x5 convolutions are supported. " +
+                    "${coefficients.size} coefficients provided."
+        }
+        validateRestriction("convolve", sizeX, sizeY, restriction)
+        val outputArray = ByteArray(inputArray.size)
+        nativeConvolve(
+            nativeHandle,
+            inputArray,
+            vectorSize,
+            sizeX,
+            sizeY,
+            outputArray,
+            coefficients,
+            restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Convolve a Bitmap.
+     *
+     * Applies a 3x3 or 5x5 convolution to the input Bitmap using the provided coefficients.
+     * A variant of this method is available to convolve ByteArrays. Bitmaps with a stride different
+     * than width * vectorSize are not currently supported.
+     *
+     * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed.
+     * The coefficients should be provided in row-major format.
+     *
+     * Each input cell can either be represented by one to four bytes. Each byte is multiplied
+     * and accumulated independently of the other bytes of the cell.
+     *
+     * An optional range parameter can be set to restrict the convolve operation to a rectangular
+     * subset of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY. NOTE: The output Bitmap will still be full size, with the
+     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
+     *
+     * @param inputBitmap The image to be convolved.
+     * @param coefficients A FloatArray of size 9 or 25, containing the multipliers.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The convolved Bitmap.
+     */
+    @JvmOverloads
+    fun convolve(
+        inputBitmap: Bitmap,
+        coefficients: FloatArray,
+        restriction: Range2d? = null
+    ): Bitmap {
+        validateBitmap("convolve", inputBitmap)
+        // 25 coefficients describe a 5x5 kernel and 9 a 3x3 one; nothing else is accepted.
+        require(coefficients.size == 25 || coefficients.size == 9) {
+            "$externalName convolve. Only 3x3 or 5x5 convolutions are supported. " +
+                    "${coefficients.size} coefficients provided."
+        }
+        validateRestriction("convolve", inputBitmap, restriction)
+        return createCompatibleBitmap(inputBitmap).also { convolved ->
+            nativeConvolveBitmap(nativeHandle, inputBitmap, convolved, coefficients, restriction)
+        }
+    }
+
+    /**
+     * Compute the histogram of an image.
+     *
+     * Tallies how many times each of the 256 possible values of a byte is found in the input.
+     * A variant of this method is available to do the histogram of a Bitmap.
+     *
+     * An input cell can be represented by one to four bytes. The tally is done independently
+     * for each of the bytes of the cell. Correspondingly, the returned IntArray will have
+     * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for
+     * value 1, etc.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. It has a
+     * row-major layout.
+     *
+     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
+     *
+     * @param inputArray The buffer of the image to be analyzed.
+     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+     * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The resulting array of counts.
+     */
+    @JvmOverloads
+    fun histogram(
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        restriction: Range2d? = null
+    ): IntArray {
+        require(vectorSize in 1..4) {
+            "$externalName histogram. The vectorSize should be between 1 and 4. " +
+                    "$vectorSize provided."
+        }
+        // The required size is computed as a Long so that large dimensions cannot overflow.
+        require(inputArray.size >= sizeX.toLong() * sizeY * vectorSize) {
+            "$externalName histogram. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*$vectorSize > ${inputArray.size}."
+        }
+        validateRestriction("histogram", sizeX, sizeY, restriction)
+        val outputArray = IntArray(256 * paddedSize(vectorSize))
+        nativeHistogram(
+            nativeHandle,
+            inputArray,
+            vectorSize,
+            sizeX,
+            sizeY,
+            outputArray,
+            restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Compute the histogram of an image.
+     *
+     * Tallies how many times each of the 256 possible values of a byte is found in the bitmap.
+     * This method supports Bitmaps of config ARGB_8888 and ALPHA_8.
+     *
+     * For ARGB_8888, the tally is done independently of the four bytes. Correspondingly, the
+     * returned IntArray will have 4 * 256 entries. The counts for value 0 are consecutive,
+     * followed by those for value 1, etc.
+     *
+     * For ALPHA_8, an IntArray of size 256 is returned.
+     *
+     * Bitmaps with a stride different than width * vectorSize are not currently supported.
+     *
+     * A variant of this method is available to do the histogram of a ByteArray.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY.
+     *
+     * @param inputBitmap The bitmap to be analyzed.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The resulting array of counts.
+     */
+    @JvmOverloads
+    fun histogram(inputBitmap: Bitmap, restriction: Range2d? = null): IntArray {
+        validateBitmap("histogram", inputBitmap)
+        validateRestriction("histogram", inputBitmap, restriction)
+
+        // The result holds 256 counters per unit of vectorSize(inputBitmap); the native
+        // call fills them in.
+        val cellSize = vectorSize(inputBitmap)
+        return IntArray(256 * cellSize).also { tallies ->
+            nativeHistogramBitmap(nativeHandle, inputBitmap, tallies, restriction)
+        }
+    }
+
+    /**
+     * Compute the histogram of the dot product of an image.
+     *
+     * This method supports cells of 1 to 4 bytes in length. For each cell of the array,
+     * the dot product of its bytes with the provided coefficients is computed. The resulting
+     * floating point value is converted to an unsigned byte and tallied in the histogram.
+     *
+     * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
+     * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
+     *
+     * Each coefficients must be >= 0 and their sum must be 1.0 or less. There must be the same
+     * number of coefficients as vectorSize.
+     *
+     * A variant of this method is available to do the histogram of a Bitmap.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY.
+     *
+     * The source buffer should be large enough for sizeX * sizeY * vectorSize bytes. The returned
+     * array will have 256 ints.
+     *
+     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
+     *
+     * @param inputArray The buffer of the image to be analyzed.
+     * @param vectorSize The number of bytes in each cell, a value from 1 to 4.
+     * @param sizeX The width of the input buffers, as a number of 1 to 4 byte cells.
+     * @param sizeY The height of the input buffers, as a number of 1 to 4 byte cells.
+     * @param coefficients The dot product multipliers. Size should equal vectorSize. Can be null.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The resulting vector of counts.
+     */
+    @JvmOverloads
+    fun histogramDot(
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        coefficients: FloatArray? = null,
+        restriction: Range2d? = null
+    ): IntArray {
+        require(vectorSize in 1..4) {
+            "$externalName histogramDot. The vectorSize should be between 1 and 4. " +
+                    "$vectorSize provided."
+        }
+        // The required size is computed as a Long so that large dimensions cannot overflow.
+        require(inputArray.size >= sizeX.toLong() * sizeY * vectorSize) {
+            "$externalName histogramDot. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*$vectorSize > ${inputArray.size}."
+        }
+        validateHistogramDotCoefficients(coefficients, vectorSize)
+        validateRestriction("histogramDot", sizeX, sizeY, restriction)
+        val outputArray = IntArray(256)
+        val actualCoefficients = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
+        nativeHistogramDot(
+            nativeHandle,
+            inputArray,
+            vectorSize,
+            sizeX,
+            sizeY,
+            outputArray,
+            actualCoefficients,
+            restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Compute the histogram of the dot product of an image.
+     *
+     * This method supports Bitmaps of config ARGB_8888 and ALPHA_8. For each pixel of the bitmap,
+     * the dot product of its bytes with the provided coefficients is computed. The resulting
+     * floating point value is converted to an unsigned byte and tallied in the histogram.
+     *
+     * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used,
+     * i.e. the values [0.299f, 0.587f, 0.114f, 0.f].
+     *
+     * Each coefficient must be >= 0 and their sum must be 1.0 or less. For ARGB_8888, four values
+     * must be provided; for ALPHA_8, one.
+     *
+     * Bitmaps with a stride different than width * vectorSize are not currently supported.
+     *
+     * A variant of this method is available to do the histogram of a ByteArray.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of the bitmap. If provided, the range must be wholly contained within the width and
+     * height of inputBitmap.
+     *
+     * The returned array will have 256 ints.
+     *
+     * @param inputBitmap The bitmap to be analyzed.
+     * @param coefficients The one or four values used for the dot product. Can be null.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The resulting vector of counts.
+     */
+    @JvmOverloads
+    fun histogramDot(
+        inputBitmap: Bitmap,
+        coefficients: FloatArray? = null,
+        restriction: Range2d? = null
+    ): IntArray {
+        val tag = "histogramDot"
+        validateBitmap(tag, inputBitmap)
+        validateHistogramDotCoefficients(coefficients, vectorSize(inputBitmap))
+        validateRestriction(tag, inputBitmap, restriction)
+
+        // Fall back to the RGBA luminosity weights when the caller supplies none.
+        val weights = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
+        return IntArray(256).also { counts ->
+            nativeHistogramDotBitmap(nativeHandle, inputBitmap, counts, weights, restriction)
+        }
+    }
+
+    /**
+     * Transform an image using a look up table
+     *
+     * Transforms an image by using a per-channel lookup table. Each channel of the input has an
+     * independent lookup table. The tables are 256 entries in size and can cover the full value
+     * range of a byte.
+     *
+     * The input array should be in RGBA format, where four consecutive bytes form a cell.
+     * A variant of this method is available to transform a Bitmap.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
+     * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+     *
+     * The source array should be large enough for sizeX * sizeY * 4 bytes. The returned
+     * array has the same dimensions as the input. The arrays have a row-major layout.
+     *
+     * @param inputArray The buffer of the image to be transformed.
+     * @param sizeX The width of both buffers, as a number of 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 4 byte cells.
+     * @param table The four arrays of 256 values that are used to convert each channel.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The transformed image.
+     */
+    @JvmOverloads
+    fun lut(
+        inputArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        table: LookupTable,
+        restriction: Range2d? = null
+    ): ByteArray {
+        // Every cell is a 4 byte RGBA value, so the input must hold sizeX * sizeY * 4 bytes.
+        require(inputArray.size >= sizeX * sizeY * 4) {
+            "$externalName lut. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*4 > ${inputArray.size}."
+        }
+        validateRestriction("lut", sizeX, sizeY, restriction)
+        val outputArray = ByteArray(inputArray.size)
+        nativeLut(
+            nativeHandle,
+            inputArray,
+            outputArray,
+            sizeX,
+            sizeY,
+            table.red,
+            table.green,
+            table.blue,
+            table.alpha,
+            restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Transform an image using a look up table
+     *
+     * Transforms an image by using a per-channel lookup table. Each channel of the input has an
+     * independent lookup table. The tables are 256 entries in size and can cover the full value
+     * range of a byte.
+     *
+     * The input Bitmap should be in config ARGB_8888. A variant of this method is available to
+     * transform a ByteArray. Bitmaps with a stride different than width * vectorSize are not
+     * currently supported.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained within the width and
+     * height of inputBitmap. NOTE: The output Bitmap will still be full size, with the
+     * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+     *
+     * @param inputBitmap The buffer of the image to be transformed.
+     * @param table The four arrays of 256 values that are used to convert each channel.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The transformed image.
+     */
+    @JvmOverloads
+    fun lut(
+        inputBitmap: Bitmap,
+        table: LookupTable,
+        restriction: Range2d? = null
+    ): Bitmap {
+        // The per-channel tables need 4 byte cells, so ALPHA_8 bitmaps are rejected here.
+        validateBitmap("lut", inputBitmap, alphaAllowed = false)
+        validateRestriction("lut", inputBitmap, restriction)
+        val outputBitmap = createCompatibleBitmap(inputBitmap)
+        nativeLutBitmap(
+            nativeHandle,
+            inputBitmap,
+            outputBitmap,
+            table.red,
+            table.green,
+            table.blue,
+            table.alpha,
+            restriction
+        )
+        return outputBitmap
+    }
+
+    /**
+     * Transform an image using a 3D look up table
+     *
+     * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
+     * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
+     * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
+     * is returned in the output array.
+     *
+     * The input array should be in RGBA format, where four consecutive bytes form a cell.
+     * The fourth byte of each input cell is ignored. A variant of this method is also available
+     * to transform Bitmaps.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
+     * section that's not transformed all set to 0. This is to stay compatible with RenderScript.
+     *
+     * The source array should be large enough for sizeX * sizeY * 4 bytes. The returned
+     * array will have the same dimensions. The arrays have a row-major layout.
+     *
+     * @param inputArray The buffer of the image to be transformed.
+     * @param sizeX The width of both buffers, as a number of 4 byte cells.
+     * @param sizeY The height of both buffers, as a number of 4 byte cells.
+     * @param cube The translation cube.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The transformed image.
+     */
+    @JvmOverloads
+    fun lut3d(
+        inputArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        cube: Rgba3dArray,
+        restriction: Range2d? = null
+    ): ByteArray {
+        // Every cell is a 4 byte RGBA value, so the input must hold sizeX * sizeY * 4 bytes.
+        require(inputArray.size >= sizeX * sizeY * 4) {
+            "$externalName lut3d. inputArray is too small for the given dimensions. " +
+                    "$sizeX*$sizeY*4 > ${inputArray.size}."
+        }
+        val cubeSizes = intArrayOf(cube.sizeX, cube.sizeY, cube.sizeZ)
+        require(cubeSizes.all { it in 2..256 }) {
+            "$externalName lut3d. The dimensions of the cube should be between 2 and 256. " +
+                    "(${cube.sizeX}, ${cube.sizeY}, ${cube.sizeZ}) provided."
+        }
+        validateRestriction("lut3d", sizeX, sizeY, restriction)
+
+        val outputArray = ByteArray(inputArray.size)
+        // The cube is handed to the native code as its raw bytes plus its three dimensions.
+        nativeLut3d(
+            nativeHandle, inputArray, outputArray, sizeX, sizeY, cube.values, cube.sizeX,
+            cube.sizeY, cube.sizeZ, restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Transform an image using a 3D look up table
+     *
+     * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G,
+     * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest
+     * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry
+     * is returned in the output array.
+     *
+     * The input bitmap should be in config ARGB_8888. The A channel is preserved. A variant of this
+     * method is also available to transform ByteArray. Bitmaps with a stride different than
+     * width * vectorSize are not currently supported.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of each buffer. If provided, the range must be wholly contained with the dimensions
+     * described by sizeX and sizeY. NOTE: The output array will still be full size, with the
+     * section that's not convolved all set to 0. This is to stay compatible with RenderScript.
+     *
+     * The source array should be large enough for sizeX * sizeY * vectorSize bytes. The returned
+     * array will have the same dimensions. The arrays have a row-major layout.
+     *
+     * @param inputBitmap The image to be transformed.
+     * @param cube The translation cube.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return The transformed image.
+     */
+    @JvmOverloads
+    fun lut3d(inputBitmap: Bitmap, cube: Rgba3dArray, restriction: Range2d? = null): Bitmap {
+        // Same constraints as the ByteArray variant: 4 byte RGBA cells and a 2..256 cube.
+        validateBitmap("lut3d", inputBitmap, alphaAllowed = false)
+        require(cube.sizeX in 2..256 && cube.sizeY in 2..256 && cube.sizeZ in 2..256) {
+            "$externalName lut3d. The dimensions of the cube should be between 2 and 256. " +
+                    "(${cube.sizeX}, ${cube.sizeY}, ${cube.sizeZ}) provided."
+        }
+        validateRestriction("lut3d", inputBitmap, restriction)
+        val outputBitmap = createCompatibleBitmap(inputBitmap)
+        nativeLut3dBitmap(
+            nativeHandle, inputBitmap, outputBitmap, cube.values, cube.sizeX,
+            cube.sizeY, cube.sizeZ, restriction
+        )
+        return outputBitmap
+    }
+
+    /**
+     * Resize an image.
+     *
+     * Resizes an image using bicubic interpolation.
+     *
+     * This method supports elements of 1 to 4 bytes in length. Each byte of the element is
+     * interpolated independently from the others.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of the output buffer. The corresponding scaled range of the input will be used. If provided,
+     * the range must be wholly contained with the dimensions described by outputSizeX and
+     * outputSizeY.
+     *
+     * The input and output arrays have a row-major layout. The input array should be
+     * large enough for inputSizeX * inputSizeY * vectorSize bytes.
+     *
+     * Like the RenderScript Intrinsics, vectorSize of size 3 are padded to occupy 4 bytes.
+     *
+     * @param inputArray The buffer of the image to be resized.
+     * @param vectorSize The number of bytes in each element of both buffers. A value from 1 to 4.
+     * @param inputSizeX The width of the input buffer, as a number of 1-4 byte elements.
+     * @param inputSizeY The height of the input buffer, as a number of 1-4 byte elements.
+     * @param outputSizeX The width of the output buffer, as a number of 1-4 byte elements.
+     * @param outputSizeY The height of the output buffer, as a number of 1-4 byte elements.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return An array that contains the rescaled image.
+     */
+    @JvmOverloads
+    fun resize(
+        inputArray: ByteArray,
+        vectorSize: Int,
+        inputSizeX: Int,
+        inputSizeY: Int,
+        outputSizeX: Int,
+        outputSizeY: Int,
+        restriction: Range2d? = null
+    ): ByteArray {
+        require(vectorSize in 1..4) {
+            "$externalName resize. The vectorSize should be between 1 and 4. $vectorSize provided."
+        }
+        // Elements of vectorSize 3 are padded to 4 bytes, in the input as well as the output.
+        require(inputArray.size >= inputSizeX * inputSizeY * paddedSize(vectorSize)) {
+            "$externalName resize. inputArray is too small for the given dimensions. " +
+                    "$inputSizeX*$inputSizeY*${paddedSize(vectorSize)} > ${inputArray.size}."
+        }
+        validateRestriction("resize", outputSizeX, outputSizeY, restriction)
+        val outputArray = ByteArray(outputSizeX * outputSizeY * paddedSize(vectorSize))
+        nativeResize(
+            nativeHandle,
+            inputArray,
+            vectorSize,
+            inputSizeX,
+            inputSizeY,
+            outputArray,
+            outputSizeX,
+            outputSizeY,
+            restriction
+        )
+        return outputArray
+    }
+
+    /**
+     * Resize an image.
+     *
+     * Resizes an image using bicubic interpolation.
+     *
+     * This method supports input Bitmap of config ARGB_8888 and ALPHA_8. The returned Bitmap
+     * has the same config. Bitmaps with a stride different than width * vectorSize are not
+     * currently supported.
+     *
+     * An optional range parameter can be set to restrict the operation to a rectangular subset
+     * of the output buffer. The corresponding scaled range of the input will be used. If provided,
+     * the range must be wholly contained with the dimensions described by outputSizeX and
+     * outputSizeY.
+     *
+     * @param inputBitmap The Bitmap to be resized.
+     * @param outputSizeX The width of the output buffer, as a number of 1-4 byte elements.
+     * @param outputSizeY The height of the output buffer, as a number of 1-4 byte elements.
+     * @param restriction When not null, restricts the operation to a 2D range of pixels.
+     * @return A Bitmap that contains the rescaled image.
+     */
+    @JvmOverloads
+    fun resize(
+        inputBitmap: Bitmap,
+        outputSizeX: Int,
+        outputSizeY: Int,
+        restriction: Range2d? = null
+    ): Bitmap {
+        validateBitmap("resize", inputBitmap)
+        validateRestriction("resize", outputSizeX, outputSizeY, restriction)
+        // The output shares the input config, as documented: ALPHA_8 in gives ALPHA_8 out.
+        val outputBitmap = Bitmap.createBitmap(outputSizeX, outputSizeY, inputBitmap.config)
+        nativeResizeBitmap(nativeHandle, inputBitmap, outputBitmap, restriction)
+        return outputBitmap
+    }
+
+    /**
+     * Convert an image from YUV to RGB.
+     *
+     * Converts a YUV buffer to RGB. The input array should be supplied in a supported YUV format.
+     * The output is RGBA; the alpha channel will be set to 255.
+     *
+     * Note that for YV12 and a sizeX that's not a multiple of 32, the RenderScript Intrinsic may
+     * not have converted the image correctly. This Toolkit method should.
+     *
+     * @param inputArray The buffer of the image to be converted.
+     * @param sizeX The width in pixels of the image.
+     * @param sizeY The height in pixels of the image.
+     * @param format Either YV12 or NV21.
+     * @return The converted image as a byte array.
+     */
+    fun yuvToRgb(inputArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
+        require(sizeX % 2 == 0 && sizeY % 2 == 0) {
+            "$externalName yuvToRgb. Non-even dimensions are not supported. " +
+                    "$sizeX and $sizeY were provided."
+        }
+        // The result holds one 4 byte RGBA cell per pixel and is filled by the native code.
+        return ByteArray(sizeX * sizeY * 4).also { rgba ->
+            nativeYuvToRgb(nativeHandle, inputArray, rgba, sizeX, sizeY, format.value)
+        }
+    }
+
+    /**
+     * Convert an image from YUV to an RGB Bitmap.
+     *
+     * Converts a YUV buffer to an RGB Bitmap. The input array should be supplied in a supported
+     * YUV format. The output is RGBA; the alpha channel will be set to 255.
+     *
+     * Note that for YV12 and a sizeX that's not a multiple of 32, the RenderScript Intrinsic may
+     * not have converted the image correctly. This Toolkit method should.
+     *
+     * @param inputArray The buffer of the image to be converted.
+     * @param sizeX The width in pixels of the image.
+     * @param sizeY The height in pixels of the image.
+     * @param format Either YV12 or NV21.
+     * @return The converted image.
+     */
+    fun yuvToRgbBitmap(inputArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): Bitmap {
+        require(sizeX % 2 == 0 && sizeY % 2 == 0) {
+            "$externalName yuvToRgbBitmap. Non-even dimensions are not supported. " +
+                    "$sizeX and $sizeY were provided."
+        }
+        // The result is a fresh ARGB_8888 bitmap of the image size, filled by the native code.
+        return Bitmap.createBitmap(sizeX, sizeY, Bitmap.Config.ARGB_8888).also { rgbBitmap ->
+            nativeYuvToRgbBitmap(nativeHandle, inputArray, sizeX, sizeY, rgbBitmap, format.value)
+        }
+    }
+
+    private var nativeHandle: Long = 0 // Must precede init: initializers run in textual order.
+
+    init {
+        System.loadLibrary("renderscript-toolkit")
+        nativeHandle = createNative()
+    }
+
+    /**
+     * Shutdown the thread pool.
+     *
+     * Waits for the threads to complete their work and destroys them.
+     *
+     * An application should call this method only if it is sure that it won't call the
+     * toolkit again, as it is irreversible.
+     */
+    fun shutdown() {
+        destroyNative(nativeHandle)
+        nativeHandle = 0
+    }
+
+    private external fun createNative(): Long // Its result is kept in nativeHandle.
+
+    private external fun destroyNative(nativeHandle: Long) // Called by shutdown().
+
+    private external fun nativeBlend(
+        nativeHandle: Long,
+        mode: Int,
+        sourceArray: ByteArray,
+        destArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        restriction: Range2d?
+    )
+
+    private external fun nativeBlendBitmap(
+        nativeHandle: Long,
+        mode: Int,
+        sourceBitmap: Bitmap,
+        destBitmap: Bitmap,
+        restriction: Range2d?
+    )
+
+    private external fun nativeBlur(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        radius: Int,
+        outputArray: ByteArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeBlurBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputBitmap: Bitmap,
+        radius: Int,
+        restriction: Range2d?
+    )
+
+    private external fun nativeColorMatrix(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        inputVectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        outputArray: ByteArray,
+        outputVectorSize: Int,
+        matrix: FloatArray,
+        addVector: FloatArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeColorMatrixBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputBitmap: Bitmap,
+        matrix: FloatArray,
+        addVector: FloatArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeConvolve(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        outputArray: ByteArray,
+        coefficients: FloatArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeConvolveBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputBitmap: Bitmap,
+        coefficients: FloatArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeHistogram(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        outputArray: IntArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeHistogramBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputArray: IntArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeHistogramDot(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        outputArray: IntArray,
+        coefficients: FloatArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeHistogramDotBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputArray: IntArray,
+        coefficients: FloatArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeLut(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        outputArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        red: ByteArray,
+        green: ByteArray,
+        blue: ByteArray,
+        alpha: ByteArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeLutBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputBitmap: Bitmap,
+        red: ByteArray,
+        green: ByteArray,
+        blue: ByteArray,
+        alpha: ByteArray,
+        restriction: Range2d?
+    )
+
+    private external fun nativeLut3d(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        outputArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        cube: ByteArray,
+        cubeSizeX: Int,
+        cubeSizeY: Int,
+        cubeSizeZ: Int,
+        restriction: Range2d?
+    )
+
+    private external fun nativeLut3dBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputBitmap: Bitmap,
+        cube: ByteArray,
+        cubeSizeX: Int,
+        cubeSizeY: Int,
+        cubeSizeZ: Int,
+        restriction: Range2d?
+    )
+
+    private external fun nativeResize(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        vectorSize: Int,
+        inputSizeX: Int,
+        inputSizeY: Int,
+        outputArray: ByteArray,
+        outputSizeX: Int,
+        outputSizeY: Int,
+        restriction: Range2d?
+    )
+
+    private external fun nativeResizeBitmap(
+        nativeHandle: Long,
+        inputBitmap: Bitmap,
+        outputBitmap: Bitmap,
+        restriction: Range2d?
+    )
+
+    private external fun nativeYuvToRgb(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        outputArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        format: Int
+    )
+
+    private external fun nativeYuvToRgbBitmap(
+        nativeHandle: Long,
+        inputArray: ByteArray,
+        sizeX: Int,
+        sizeY: Int,
+        outputBitmap: Bitmap,
+        value: Int
+    )
+}
+
+/**
+ * Determines how a source buffer is blended into a destination buffer.
+ * See [Toolkit.blend].
+ *
+ * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents
+ * the alpha channel.
+ */
+enum class BlendingMode(val value: Int) {
+    /**
+     * dest = 0
+     *
+     * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0)
+     */
+    CLEAR(0),
+
+    /**
+     * dest = src
+     *
+     * Sets each pixel of the destination to the corresponding one in the source.
+     */
+    SRC(1),
+
+    /**
+     * dest = dest
+     *
+     * Leaves the destination untouched. This is a no-op.
+     */
+    DST(2),
+
+    /**
+     * dest = src + dest * (1.0 - src.a)
+     */
+    SRC_OVER(3),
+
+    /**
+     * dest = dest + src * (1.0 - dest.a)
+     */
+    DST_OVER(4),
+
+    /**
+     * dest = src * dest.a
+     */
+    SRC_IN(5),
+
+    /**
+     * dest = dest * src.a
+     */
+    DST_IN(6),
+
+    /**
+     * dest = src * (1.0 - dest.a)
+     */
+    SRC_OUT(7),
+
+    /**
+     * dest = dest * (1.0 - src.a)
+     */
+    DST_OUT(8),
+
+    /**
+     * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a
+     */
+    SRC_ATOP(9),
+
+    /**
+     * dest.rgb = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a
+     */
+    DST_ATOP(10),
+
+    /**
+     * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a}
+     *
+     * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor.
+     */
+    XOR(11),
+
+    /**
+     * dest = src * dest
+     */
+    MULTIPLY(12),
+
+    /**
+     * dest = min(src + dest, 1.0)
+     */
+    ADD(13),
+
+    /**
+     * dest = max(dest - src, 0.0)
+     */
+    SUBTRACT(14)
+}
+
+/**
+ * A translation table used by the lut method. For each potential red, green, blue, and alpha
+ * value, specifies its replacement value.
+ *
+ * Every table starts out as the identity mapping (0 maps to 0, 1 to 1, ... 255 to 255), so
+ * you only need to overwrite the entries you want translated.
+ */
+class LookupTable {
+    var red: ByteArray = ByteArray(256) { index -> index.toByte() }
+    var green: ByteArray = ByteArray(256) { index -> index.toByte() }
+    var blue: ByteArray = ByteArray(256) { index -> index.toByte() }
+    var alpha: ByteArray = ByteArray(256) { index -> index.toByte() }
+}
+
+/**
+ * The YUV formats supported by yuvToRgb. The value is the number handed to the native code.
+ */
+enum class YuvFormat(val value: Int) {
+    NV21(0x11), // Same number as android.graphics.ImageFormat.NV21 - TODO confirm intent.
+    YV12(0x32315659), // Same number as android.graphics.ImageFormat.YV12 - TODO confirm intent.
+}
+
+/**
+ * Define a range of data to process.
+ *
+ * This class is used to restrict a [Toolkit] operation to a rectangular subset of the input
+ * tensor.
+ *
+ * @property startX The index of the first value to be included on the X axis.
+ * @property endX The index after the last value to be included on the X axis.
+ * @property startY The index of the first value to be included on the Y axis.
+ * @property endY The index after the last value to be included on the Y axis.
+ */
+data class Range2d(
+    val startX: Int,
+    val endX: Int,
+    val startY: Int,
+    val endY: Int
+) {
+    constructor() : this(0, 0, 0, 0) // Empty range; validateRestriction rejects it as is.
+}
+
+/** RGBA vectors of a 3D table, packed 4 bytes per entry: x varies fastest, then y, then z. */
+class Rgba3dArray(val values: ByteArray, val sizeX: Int, val sizeY: Int, val sizeZ: Int) {
+    init {
+        require(values.size >= sizeX * sizeY * sizeZ * 4)
+    }
+
+    /** Returns a copy of the four bytes stored at (x, y, z). */
+    operator fun get(x: Int, y: Int, z: Int): ByteArray {
+        val start = indexOfVector(x, y, z)
+        return values.copyOfRange(start, start + 4)
+    }
+
+    /** Overwrites the four bytes stored at (x, y, z) with [value]. */
+    operator fun set(x: Int, y: Int, z: Int, value: ByteArray) {
+        require(value.size == 4)
+        value.copyInto(values, indexOfVector(x, y, z))
+    }
+
+    private fun indexOfVector(x: Int, y: Int, z: Int): Int {
+        require(x in 0 until sizeX)
+        require(y in 0 until sizeY)
+        require(z in 0 until sizeZ)
+        return ((z * sizeY + y) * sizeX + x) * 4
+    }
+}
+
+/**
+ * Checks that [inputBitmap] has a supported config (ARGB_8888, plus ALPHA_8 when
+ * [alphaAllowed]) and no row padding. Throws IllegalArgumentException otherwise.
+ */
+internal fun validateBitmap(function: String, inputBitmap: Bitmap, alphaAllowed: Boolean = true) {
+    if (alphaAllowed) {
+        require(
+            inputBitmap.config == Bitmap.Config.ARGB_8888 ||
+                    inputBitmap.config == Bitmap.Config.ALPHA_8
+        ) {
+            "$externalName. $function supports only ARGB_8888 and ALPHA_8 bitmaps. " +
+                    "${inputBitmap.config} provided."
+        }
+    } else {
+        require(inputBitmap.config == Bitmap.Config.ARGB_8888) {
+            "$externalName. $function supports only ARGB_8888. " +
+                    "${inputBitmap.config} provided."
+        }
+    }
+    require(inputBitmap.width * vectorSize(inputBitmap) == inputBitmap.rowBytes) {
+        "$externalName $function. Only bitmaps with rowSize equal to the width * vectorSize are " +
+                "currently supported. Provided were rowBytes=${inputBitmap.rowBytes}, " +
+                "width=${inputBitmap.width}, and vectorSize=${vectorSize(inputBitmap)}."
+    }
+}
+
+internal fun createCompatibleBitmap(inputBitmap: Bitmap) = // New bitmap; pixels are not copied.
+    Bitmap.createBitmap(inputBitmap.width, inputBitmap.height, inputBitmap.config)
+
+internal fun validateHistogramDotCoefficients(
+    coefficients: FloatArray?,
+    vectorSize: Int
+) {
+    // A null array is always acceptable; there is nothing further to check in that case.
+    val weights = coefficients ?: return
+    require(weights.size == vectorSize) {
+        "$externalName histogramDot. The coefficients should be null or have $vectorSize values."
+    }
+    var total = 0f
+    weights.forEachIndexed { i, weight ->
+        require(weight >= 0.0f) {
+            "$externalName histogramDot. Coefficients should not be negative. " +
+                    "Coefficient $i was $weight."
+        }
+        total += weight
+    }
+    require(total <= 1.0f) {
+        "$externalName histogramDot. Coefficients should add to 1 or less. Their sum is $total."
+    }
+}
+
+// Bitmap flavor: the restriction is checked against the bitmap's own width and height.
+internal fun validateRestriction(tag: String, bitmap: Bitmap, restriction: Range2d? = null): Unit =
+    validateRestriction(tag, bitmap.width, bitmap.height, restriction)
+
+/**
+ * Throws IllegalArgumentException unless [restriction] is null or a non-empty in-bounds range.
+ */
+internal fun validateRestriction(
+    tag: String, sizeX: Int, sizeY: Int, restriction: Range2d? = null
+) {
+    if (restriction == null) return
+    require(restriction.startX in 0 until sizeX && restriction.endX <= sizeX) {
+        "$externalName $tag. restriction.startX should be at least 0 and less than sizeX, and " +
+                "restriction.endX at most sizeX. ${restriction.startX}, $sizeX, " +
+                "and ${restriction.endX} were provided respectively."
+    }
+    require(restriction.startY in 0 until sizeY && restriction.endY <= sizeY) {
+        "$externalName $tag. restriction.startY should be at least 0 and less than sizeY, and " +
+                "restriction.endY at most sizeY. ${restriction.startY}, $sizeY, " +
+                "and ${restriction.endY} were provided respectively."
+    }
+    require(restriction.startX < restriction.endX) {
+        "$externalName $tag. Restriction startX should be less than endX. " +
+                "${restriction.startX} and ${restriction.endX} were provided respectively."
+    }
+    require(restriction.startY < restriction.endY) {
+        "$externalName $tag. Restriction startY should be less than endY. " +
+                "${restriction.startY} and ${restriction.endY} were provided respectively."
+    }
+}
+
+// Number of bytes taken by one pixel of the bitmap; any config other than ARGB_8888 and
+// ALPHA_8 is rejected with an IllegalArgumentException.
+internal fun vectorSize(bitmap: Bitmap): Int = when (bitmap.config) {
+    Bitmap.Config.ALPHA_8 -> 1
+    Bitmap.Config.ARGB_8888 -> 4
+    else -> throw IllegalArgumentException(
+        "$externalName. Only ARGB_8888 and ALPHA_8 Bitmap are supported."
+    )
+}
+
+internal fun paddedSize(vectorSize: Int) = when (vectorSize) { 3 -> 4; else -> vectorSize }
diff --git a/toolkit/test/AllTests.kt b/toolkit/test/AllTests.kt
new file mode 100644
index 0000000..5833795
--- /dev/null
+++ b/toolkit/test/AllTests.kt
@@ -0,0 +1,1244 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO Rename to something better
+package com.example.testapp
+
+import android.content.Context
+import android.graphics.Bitmap
+import android.graphics.BitmapFactory
+import android.renderscript.RenderScript
+import android.renderscript.toolkit.BlendingMode
+import android.renderscript.toolkit.LookupTable
+import android.renderscript.toolkit.Range2d
+import android.renderscript.toolkit.Rgba3dArray
+import android.renderscript.toolkit.Toolkit
+import android.renderscript.toolkit.YuvFormat
+import kotlin.math.abs
+import kotlin.math.min
+
+/**
+ * One input shape to exercise in a test: a [sizeX] by [sizeY] buffer together with an optional
+ * [restriction], which is null when the whole buffer should be processed.
+ */
+data class TestLayout(
+    val sizeX: Int,
+    val sizeY: Int,
+    val restriction: Range2d?
+)
+
+// List of layouts (sizeX, sizeY, optional restriction) to try when generating random data.
+val commonLayoutsToTry = listOf(
+    // Small layouts to start with
+    TestLayout(3, 4, null),
+    TestLayout(3, 4, Range2d(0, 1, 0, 3)),
+    TestLayout(3, 4, Range2d(2, 3, 1, 4)),
+    TestLayout(10, 14, null),
+    TestLayout(10, 14, Range2d(2, 3, 8, 14)),
+    // The size of most CTS intrinsic tests
+    TestLayout(160, 100, null),
+    TestLayout(125, 227, Range2d(50, 125, 100, 227)),
+    // A larger one
+    TestLayout(800, 600, null),
+    // Weirdly shaped ones
+    TestLayout(1, 1, null), // A single item
+    // TODO This size makes Intrinsic Blur fail.
+    TestLayout(16000, 1, null), // One large row
+    TestLayout(1, 16000, null), // One large column
+    // A very large test
+    TestLayout(1024, 2048, null),
+)
+
+
+class Tester(context: Context, private val validate: Boolean) {
+    private val renderscriptContext = RenderScript.create(context)
+    private val toolkit = Toolkit()
+    private val testImage1 = BitmapFactory.decodeResource(context.resources, R.drawable.img800x450a)
+    private val testImage2 = BitmapFactory.decodeResource(context.resources, R.drawable.img800x450b)
+
+    // Fail at construction time if either decoded test image has an unexpected format.
+    init {
+        for (image in arrayOf(testImage1, testImage2)) {
+            validateTestImage(image)
+        }
+    }
+
+    /**
+     * Verifies that a test image is in a format that works for our tests: ARGB_8888, with rows
+     * that carry no extra padding, and a byte count equal to rowBytes * height.
+     *
+     * @throws IllegalArgumentException if any of these conditions does not hold.
+     */
+    private fun validateTestImage(bitmap: Bitmap) {
+        require(bitmap.config == Bitmap.Config.ARGB_8888) {
+            "Test bitmaps must be ARGB_8888. ${bitmap.config} was provided."
+        }
+        require(bitmap.rowBytes == bitmap.width * 4) {
+            "Can't handle bitmaps that have extra padding. " +
+                "${bitmap.rowBytes} != ${bitmap.width} * 4."
+        }
+        require(bitmap.byteCount == bitmap.rowBytes * bitmap.height) {
+            "Unexpected bitmap byte count. " +
+                "${bitmap.byteCount} != ${bitmap.rowBytes} * ${bitmap.height}."
+        }
+    }
+
+    /** Destroys the RenderScript context that was created for this tester. */
+    fun destroy(): Unit = renderscriptContext.destroy()
+
+    /**
+     * Runs every test group in a fixed order, printing progress as it goes.
+     *
+     * @param timer passed to each test group to record timings.
+     * @return one line per test group ("<name> succeeded" or "<name> FAILED! ..."), joined by
+     * newlines.
+     */
+    @ExperimentalUnsignedTypes
+    fun testAll(timer: TimingTracker): String {
+        val namedTests = listOf(
+            "blend" to ::testBlend,
+            "blur" to ::testBlur,
+            "colorMatrix" to ::testColorMatrix,
+            "convolve" to ::testConvolve,
+            "histogram" to ::testHistogram,
+            "lut" to ::testLut,
+            "lut3d" to ::testLut3d,
+            "resize" to ::testResize,
+            "yuvToRgb" to ::testYuvToRgb,
+        )
+        val report = namedTests.map { (name, runTest) ->
+            println("Doing $name")
+            val outcome = if (runTest(timer)) "succeeded" else "FAILED! FAILED! FAILED! FAILED!"
+            val line = "$name $outcome"
+            println("      $line")
+            line
+        }
+
+        return report.joinToString("\n")
+    }
+
+    /**
+     * Exercises every [BlendingMode] on the two test bitmaps (whole image and a restricted
+     * range) and on random data for each of the common layouts.
+     *
+     * For a given mode the three checks are always all executed (non-short-circuit `and`);
+     * iteration over the modes stops at the first mode that fails.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testBlend(timer: TimingTracker): Boolean {
+        return BlendingMode.values().all { mode ->
+            val fullBitmapOk = testOneBitmapBlend(timer, testImage1, testImage2, mode, null)
+            val restrictedBitmapOk = testOneBitmapBlend(
+                timer, testImage1, testImage2, mode,
+                Range2d(6, 23, 2, 4)
+            )
+            val randomDataOk = commonLayoutsToTry.all { layout ->
+                testOneRandomBlend(timer, layout.sizeX, layout.sizeY, mode, layout.restriction)
+            }
+            fullBitmapOk and restrictedBitmapOk and randomDataOk
+        }
+    }
+
+    /**
+     * Runs one blend comparison on randomly generated source and destination arrays.
+     *
+     * The intrinsic and the Toolkit blends are always executed and timed, each on its own copy
+     * of the destination. The reference blend and the comparison only run when [validate] is
+     * true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomBlend(
+        timer: TimingTracker,
+        sizeX: Int,
+        sizeY: Int,
+        mode: BlendingMode,
+        restriction: Range2d?
+    ): Boolean {
+        val sourceArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
+        val destArray = randomByteArray(0x2932147, sizeX, sizeY, 4)
+        // Make clones because these will be modified by the blend.
+        val intrinsicDestArray = destArray.clone()
+        val referenceDestArray = destArray.clone()
+        val toolkitDestArray = destArray.clone()
+
+        timer.measure("IntrinsicBlend") {
+            intrinsicBlend(
+                renderscriptContext, mode, sourceArray, intrinsicDestArray, sizeX, sizeY,
+                restriction
+            )
+        }
+        timer.measure("ToolkitBlend") {
+            toolkit.blend(mode, sourceArray, toolkitDestArray, sizeX, sizeY, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        timer.measure("ReferenceBlend") {
+            referenceBlend(mode, sourceArray, referenceDestArray, sizeX, sizeY, restriction)
+        }
+
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame(
+            "Blend_$mode", intrinsicDestArray, referenceDestArray, toolkitDestArray
+        ) {
+            println("blend $mode ($sizeX, $sizeY) $restriction")
+            logArray("Blend_$mode src", sourceArray, 48)
+            logArray("Blend_$mode dst", destArray, 48)
+            logArray("Blend_$mode reference out", referenceDestArray, 48)
+            logArray("Blend_$mode intrinsic out", intrinsicDestArray, 48)
+            logArray("Blend_$mode toolkit   out", toolkitDestArray, 48)
+        }
+    }
+
+    /**
+     * Runs one blend comparison using bitmaps as source and destination.
+     *
+     * The destination bitmap is duplicated for each implementation. The intrinsic and the
+     * Toolkit blends are always executed and timed; the reference blend and the comparison only
+     * run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapBlend(
+        timer: TimingTracker,
+        sourceBitmap: Bitmap,
+        destBitmap: Bitmap,
+        mode: BlendingMode,
+        restriction: Range2d?
+    ): Boolean {
+        // Make clones because these will be modified by the blend.
+        val intrinsicDestBitmap = duplicateBitmap(destBitmap)
+        val toolkitDestBitmap = duplicateBitmap(destBitmap)
+        val referenceDestBitmap = duplicateBitmap(destBitmap)
+
+        timer.measure("IntrinsicBlend") {
+            intrinsicBlend(
+                renderscriptContext, mode, sourceBitmap, intrinsicDestBitmap, restriction
+            )
+        }
+        timer.measure("ToolkitBlend") {
+            toolkit.blend(mode, sourceBitmap, toolkitDestBitmap, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        // The reference implementation works on byte arrays, so the bitmaps are converted.
+        // Note that the source conversion happens inside the timed section.
+        val referenceDestArray = getBitmapBytes(referenceDestBitmap)
+        timer.measure("ReferenceBlend") {
+            referenceBlend(
+                mode, getBitmapBytes(sourceBitmap), referenceDestArray, sourceBitmap.width,
+                sourceBitmap.height, restriction
+            )
+        }
+
+        val intrinsicDestArray = getBitmapBytes(intrinsicDestBitmap)
+        val toolkitDestArray = getBitmapBytes(toolkitDestBitmap)
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame(
+            "BlendBitmap_$mode", intrinsicDestArray, referenceDestArray, toolkitDestArray
+        ) {
+            println("BlendBitmap $mode $restriction")
+            //logArray("BlendBitmap_$mode src", sourceArray, 48)
+            //logArray("BlendBitmap_$mode dst", destArray, 48)
+            logArray("BlendBitmap_$mode reference out", referenceDestArray, 48)
+            logArray("BlendBitmap_$mode intrinsic out", intrinsicDestArray, 48)
+            logArray("BlendBitmap_$mode toolkit   out", toolkitDestArray, 48)
+        }
+    }
+
+    /**
+     * Exercises blur with radii 1, 3, 8 and 25 on the first test bitmap (whole image and a
+     * restricted range) and on random data of vector size 1 and 4 for each common layout.
+     *
+     * For a given radius the three checks are always all executed (non-short-circuit `and`);
+     * iteration over the radii stops at the first radius that fails.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testBlur(timer: TimingTracker): Boolean {
+        return arrayOf(1, 3, 8, 25).all { radius ->
+            val fullBitmapOk = testOneBitmapBlur(timer, testImage1, radius, null)
+            val restrictedBitmapOk =
+                testOneBitmapBlur(timer, testImage1, radius, Range2d(6, 23, 2, 4))
+            val randomDataOk = commonLayoutsToTry.all { layout ->
+                arrayOf(1, 4).all { vectorSize ->
+                    testOneRandomBlur(
+                        timer, vectorSize, layout.sizeX, layout.sizeY, radius, layout.restriction
+                    )
+                }
+            }
+            fullBitmapOk and restrictedBitmapOk and randomDataOk
+        }
+    }
+
+    /**
+     * Runs one blur comparison on a randomly generated input array.
+     *
+     * The intrinsic and the Toolkit blurs are always executed and timed. The reference blur and
+     * the comparison only run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomBlur(
+        timer: TimingTracker,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        radius: Int,
+        restriction: Range2d?
+    ): Boolean {
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, vectorSize)
+        val intrinsicOutArray = timer.measure("IntrinsicBlur") {
+            intrinsicBlur(
+                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, radius, restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitBlur") {
+            toolkit.blur(inputArray, vectorSize, sizeX, sizeY, radius, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceBlur") {
+            referenceBlur(inputArray, vectorSize, sizeX, sizeY, radius, restriction)
+        }
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame("blur", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("blur $vectorSize ($sizeX, $sizeY) radius = $radius $restriction")
+            logArray("blur input        ", inputArray)
+            logArray("blur reference out", referenceOutArray)
+            logArray("blur intrinsic out", intrinsicOutArray)
+            logArray("blur toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Runs one blur comparison using a bitmap as input.
+     *
+     * The intrinsic and the Toolkit blurs are always executed and timed. The reference blur,
+     * which operates on the bitmap's bytes, and the comparison only run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapBlur(
+        timer: TimingTracker,
+        bitmap: Bitmap,
+        radius: Int,
+        restriction: Range2d?
+    ): Boolean {
+        val intrinsicOutArray = timer.measure("IntrinsicBlur") {
+            intrinsicBlur(renderscriptContext, bitmap, radius, restriction)
+        }
+
+        val toolkitOutBitmap = timer.measure("ToolkitBlur") {
+            toolkit.blur(bitmap, radius, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceBlur") {
+            referenceBlur(
+                getBitmapBytes(bitmap),
+                vectorSizeOfBitmap(bitmap),
+                bitmap.width,
+                bitmap.height,
+                radius,
+                restriction
+            )
+        }
+
+        // The Toolkit returned a bitmap; convert it to bytes for the comparison.
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame("blur", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("BlurBitmap ${bitmap.config} $radius $restriction")
+            logArray("blur reference out", referenceOutArray)
+            logArray("blur intrinsic out", intrinsicOutArray)
+            logArray("blur toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /** Selects which color matrix the colorMatrix tests apply. */
+    enum class ColorMatrixConversionType {
+        RGB_TO_YUV, // The tests use toolkit.rgbToYuvMatrix.
+        YUV_TO_RGB, // The tests use toolkit.yuvToRgbMatrix.
+        GREYSCALE, // The tests use toolkit.greyScaleColorMatrix.
+        RANDOM // The tests use a seeded random 4x4 float matrix.
+    }
+
+    /**
+     * Exercises every [ColorMatrixConversionType] on the first test bitmap (whole image and a
+     * restricted range) and on random data, for all combinations of input and output vector
+     * sizes from 1 to 4 and each common layout.
+     *
+     * For a given conversion the three checks are always all executed (non-short-circuit
+     * `and`); iteration over the conversions stops at the first one that fails.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testColorMatrix(timer: TimingTracker): Boolean {
+        return ColorMatrixConversionType.values().all { conversion ->
+            val fullBitmapOk = testOneBitmapColorMatrix(timer, testImage1, conversion, null)
+            val restrictedBitmapOk = testOneBitmapColorMatrix(
+                timer,
+                testImage1,
+                conversion,
+                Range2d(6, 23, 2, 4)
+            )
+            val randomDataOk = commonLayoutsToTry.all { layout ->
+                (1..4).all { inputVectorSize ->
+                    (1..4).all { outputVectorSize ->
+                        testOneRandomColorMatrix(
+                            timer,
+                            inputVectorSize,
+                            layout.sizeX,
+                            layout.sizeY,
+                            outputVectorSize,
+                            conversion,
+                            layout.restriction
+                        )
+                    }
+                }
+            }
+            fullBitmapOk and restrictedBitmapOk and randomDataOk
+        }
+    }
+
+    /**
+     * Runs one color matrix comparison on a randomly generated input array.
+     *
+     * The matrix is chosen by [conversion] and the add vector is a seeded random array. The
+     * intrinsic and the Toolkit implementations are always executed and timed; the reference
+     * implementation and the comparison only run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomColorMatrix(
+        timer: TimingTracker,
+        inputVectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        outputVectorSize: Int,
+        conversion: ColorMatrixConversionType,
+        restriction: Range2d?
+    ): Boolean {
+        // The input buffer is sized with the padded vector size (3 is stored as 4).
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(inputVectorSize))
+        val addVector = randomFloatArray(0x243238, 4, 1, 1, 0.3f)
+        val matrix = when (conversion) {
+            ColorMatrixConversionType.RGB_TO_YUV -> toolkit.rgbToYuvMatrix
+            ColorMatrixConversionType.YUV_TO_RGB -> toolkit.yuvToRgbMatrix
+            ColorMatrixConversionType.GREYSCALE -> toolkit.greyScaleColorMatrix
+            ColorMatrixConversionType.RANDOM -> randomFloatArray(0x234348, 4, 4, 1)
+        }
+
+        val intrinsicOutArray = timer.measure("IntrinsicColorMatrix") {
+            intrinsicColorMatrix(
+                renderscriptContext,
+                conversion,
+                inputArray,
+                inputVectorSize,
+                sizeX,
+                sizeY,
+                outputVectorSize,
+                matrix,
+                addVector,
+                restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitColorMatrix") {
+            toolkit.colorMatrix(
+                inputArray,
+                inputVectorSize,
+                sizeX,
+                sizeY,
+                outputVectorSize,
+                matrix,
+                addVector,
+                restriction
+            )
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceColorMatrix") {
+            referenceColorMatrix(
+                inputArray, inputVectorSize, sizeX, sizeY, outputVectorSize, matrix, addVector,
+                restriction
+            )
+        }
+
+        // The extra boolean is true only for 3-element output vectors; presumably it tells
+        // validateSame to skip the padding element -- confirm in validateSame.
+        return validateSame("colorMatrix", intrinsicOutArray, referenceOutArray, toolkitOutArray,
+            outputVectorSize == 3) {
+            println("colorMatrix ($sizeX, $sizeY) $inputVectorSize->$outputVectorSize $restriction")
+            logArray("colorMatrix matrix   ", matrix, 16)
+            logArray("colorMatrix addVector", addVector, 4)
+            logArray("colorMatrix in           ", inputArray)
+            logArray("colorMatrix reference out", referenceOutArray, 300)
+            logArray("colorMatrix intrinsic out", intrinsicOutArray, 300)
+            logArray("colorMatrix toolkit   out", toolkitOutArray, 300)
+        }
+    }
+
+    /**
+     * Runs one color matrix comparison using a bitmap as input.
+     *
+     * The matrix is chosen by [conversion] and the add vector is a seeded random array. The
+     * intrinsic and the Toolkit implementations are always executed and timed; the reference
+     * implementation, which operates on the bitmap's bytes, and the comparison only run when
+     * [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapColorMatrix(
+        timer: TimingTracker,
+        bitmap: Bitmap,
+        conversion: ColorMatrixConversionType,
+        restriction: Range2d?
+    ): Boolean {
+        val addVector = randomFloatArray(0x243238, 4, 1, 1, 0.3f)
+        val matrix = when (conversion) {
+            ColorMatrixConversionType.RGB_TO_YUV -> toolkit.rgbToYuvMatrix
+            ColorMatrixConversionType.YUV_TO_RGB -> toolkit.yuvToRgbMatrix
+            ColorMatrixConversionType.GREYSCALE -> toolkit.greyScaleColorMatrix
+            ColorMatrixConversionType.RANDOM -> randomFloatArray(0x234348, 4, 4, 1)
+        }
+
+        val intrinsicOutArray = timer.measure("IntrinsicColorMatrix") {
+            intrinsicColorMatrix(
+                renderscriptContext, conversion, bitmap, matrix, addVector, restriction
+            )
+        }
+        val toolkitOutBitmap = timer.measure("ToolkitColorMatrix") {
+            toolkit.colorMatrix(bitmap, matrix, addVector, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        // Input and output vector sizes are both taken from the bitmap.
+        val referenceOutArray = timer.measure("ReferenceColorMatrix") {
+            referenceColorMatrix(
+                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+                vectorSizeOfBitmap(bitmap), matrix, addVector, restriction
+            )
+        }
+
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame("ColorMatrix", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("colorMatrixBitmap $restriction")
+            logArray("colorMatrixBitmap matrix   ", matrix, 16)
+            logArray("colorMatrixBitmap addVector", addVector, 4)
+            logArray("colorMatrixBitmap reference out", referenceOutArray)
+            logArray("colorMatrixBitmap intrinsic out", intrinsicOutArray)
+            logArray("colorMatrixBitmap toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Exercises convolve with a random 3x3 and a random 5x5 coefficient set on the first test
+     * bitmap (whole image and a restricted range) and on random data of vector sizes 1 to 4 for
+     * each common layout.
+     *
+     * For a given coefficient set the three checks are always all executed (non-short-circuit
+     * `and`); iteration over the coefficient sets stops at the first one that fails.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testConvolve(timer: TimingTracker): Boolean {
+        val kernel3x3 = randomFloatArray(0x2937021, 3, 3, 1, 0.1f)
+        val kernel5x5 = randomFloatArray(0x2937021, 5, 5, 1, 0.05f)
+        return listOf(kernel3x3, kernel5x5).all { coefficients ->
+            val fullBitmapOk = testOneBitmapConvolve(timer, testImage1, coefficients, null)
+            val restrictedBitmapOk =
+                testOneBitmapConvolve(timer, testImage1, coefficients, Range2d(6, 23, 2, 4))
+            val randomDataOk = commonLayoutsToTry.all { layout ->
+                (1..4).all { vectorSize ->
+                    testOneRandomConvolve(
+                        timer,
+                        vectorSize,
+                        layout.sizeX,
+                        layout.sizeY,
+                        coefficients,
+                        layout.restriction
+                    )
+                }
+            }
+            fullBitmapOk and restrictedBitmapOk and randomDataOk
+        }
+    }
+
+    /**
+     * Runs one convolve comparison on a randomly generated input array.
+     *
+     * The intrinsic and the Toolkit implementations are always executed and timed. The
+     * reference implementation and the comparison only run when [validate] is true.
+     *
+     * @param coefficients the convolution coefficients; 9 values are reported as "convolve3x3",
+     * any other count as "convolve5x5".
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomConvolve(
+        timer: TimingTracker,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        coefficients: FloatArray,
+        restriction: Range2d?
+    ): Boolean {
+        // The input buffer is sized with the padded vector size (3 is stored as 4).
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
+
+        val intrinsicOutArray = timer.measure("IntrinsicConvolve") {
+            intrinsicConvolve(
+                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitConvolve") {
+            toolkit.convolve(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceConvolve") {
+            referenceConvolve(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
+        }
+
+        val task = if (coefficients.size == 9) "convolve3x3 $vectorSize" else "convolve5x5 $vectorSize"
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame(task, intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("Convolve $vectorSize ($sizeX, $sizeY) $restriction")
+            logArray("Convolve coefficients", coefficients, 25)
+            logArray("Convolve in           ", inputArray)
+            logArray("Convolve reference out", referenceOutArray)
+            logArray("Convolve intrinsic out", intrinsicOutArray)
+            logArray("Convolve toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Runs one convolve comparison using a bitmap as input.
+     *
+     * The intrinsic and the Toolkit implementations are always executed and timed. The
+     * reference implementation, which operates on the bitmap's bytes, and the comparison only
+     * run when [validate] is true.
+     *
+     * @param coefficients the convolution coefficients; 9 values are reported as "convolve3x3",
+     * any other count as "convolve5x5".
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapConvolve(
+        timer: TimingTracker,
+        bitmap: Bitmap,
+        coefficients: FloatArray,
+        restriction: Range2d?
+    ): Boolean {
+        val intrinsicOutArray = timer.measure("IntrinsicConvolve") {
+            intrinsicConvolve(renderscriptContext, bitmap, coefficients, restriction)
+        }
+        val toolkitOutBitmap = timer.measure("ToolkitConvolve") {
+            toolkit.convolve(bitmap, coefficients, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceConvolve") {
+            referenceConvolve(
+                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+                coefficients, restriction
+            )
+        }
+
+        val task = if (coefficients.size == 9) "convolve3x3" else "convolve5x5"
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame(task, intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("ConvolveBitmap $restriction")
+            logArray("ConvolveBitmap coefficients", coefficients, 25)
+            //logArray("ConvolveBitmap in           ", inputArray)
+            logArray("ConvolveBitmap reference out", referenceOutArray)
+            logArray("ConvolveBitmap intrinsic out", intrinsicOutArray)
+            logArray("ConvolveBitmap toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Exercises histogram and histogramDot on the first test bitmap and on random data of
+     * vector sizes 1 to 4 for each common layout.
+     *
+     * The five bitmap checks and the random-data sweep are always all executed
+     * (non-short-circuit `and`). Inside the sweep the three checks for one vector size
+     * short-circuit, and the sweep stops at the first failing combination.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testHistogram(timer: TimingTracker): Boolean {
+        val coefficients = floatArrayOf(0.1f, 0.3f, 0.5f, 0.05f)
+
+        val histogramOk = testOneBitmapHistogram(timer, testImage1, null)
+        val restrictedHistogramOk =
+            testOneBitmapHistogram(timer, testImage1, Range2d(6, 23, 2, 4))
+        val defaultDotOk = testOneBitmapHistogramDot(timer, testImage1, null, null)
+        val dotOk = testOneBitmapHistogramDot(timer, testImage1, coefficients, null)
+        val restrictedDotOk =
+            testOneBitmapHistogramDot(timer, testImage1, coefficients, Range2d(6, 23, 2, 4))
+        val randomDataOk = commonLayoutsToTry.all { layout ->
+            (1..4).all { vectorSize ->
+                testOneRandomHistogram(
+                    timer, vectorSize, layout.sizeX, layout.sizeY, layout.restriction
+                ) &&
+                    testOneRandomHistogramDot(
+                        timer,
+                        vectorSize,
+                        layout.sizeX,
+                        layout.sizeY,
+                        null,
+                        layout.restriction
+                    ) &&
+                    testOneRandomHistogramDot(
+                        timer,
+                        vectorSize,
+                        layout.sizeX,
+                        layout.sizeY,
+                        coefficients.sliceArray(0 until vectorSize),
+                        layout.restriction
+                    )
+            }
+        }
+
+        return histogramOk and restrictedHistogramOk and defaultDotOk and dotOk and
+            restrictedDotOk and randomDataOk
+    }
+
+    /**
+     * Runs one histogram comparison on a randomly generated input array.
+     *
+     * The intrinsic and the Toolkit implementations are always executed and timed. The
+     * reference implementation and the comparison only run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomHistogram(
+        timer: TimingTracker,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        restriction: Range2d?
+    ): Boolean {
+        // The input buffer is sized with the padded vector size (3 is stored as 4).
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
+
+        val intrinsicOutput = timer.measure("IntrinsicHistogram") {
+            intrinsicHistogram(
+                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, restriction
+            )
+        }
+        val toolkitOutput = timer.measure("ToolkitHistogram") {
+            toolkit.histogram(inputArray, vectorSize, sizeX, sizeY, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutput = timer.measure("ReferenceHistogram") {
+            referenceHistogram(
+                inputArray, vectorSize, sizeX, sizeY, restriction
+            )
+        }
+
+        // NOTE(review): the extra 0 argument is presumably the allowed difference between
+        // outputs -- confirm in validateSame.
+        return validateSame("histogram", intrinsicOutput, referenceOutput, toolkitOutput, 0) {
+            println("histogram $vectorSize ($sizeX, $sizeY) $restriction")
+            logArray("histogram in           ", inputArray, 200)
+            logArray("histogram reference out", referenceOutput, 200)
+            logArray("histogram intrinsic out", intrinsicOutput, 200)
+            logArray("histogram toolkit   out", toolkitOutput, 200)
+        }
+    }
+
+    /**
+     * Runs one histogram comparison using a bitmap as input.
+     *
+     * The intrinsic and the Toolkit implementations are always executed and timed. The
+     * reference implementation, which operates on the bitmap's bytes, and the comparison only
+     * run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapHistogram(
+        timer: TimingTracker,
+        bitmap: Bitmap,
+        restriction: Range2d?
+    ): Boolean {
+        val intrinsicOutput = timer.measure("IntrinsicHistogram") {
+            intrinsicHistogram(renderscriptContext, bitmap, restriction)
+        }
+        val toolkitOutput = timer.measure("ToolkitHistogram") {
+            toolkit.histogram(bitmap, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutput = timer.measure("ReferenceHistogram") {
+            referenceHistogram(
+                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+                restriction
+            )
+        }
+
+        // NOTE(review): the extra 0 argument is presumably the allowed difference between
+        // outputs -- confirm in validateSame.
+        return validateSame("histogram", intrinsicOutput, referenceOutput, toolkitOutput, 0) {
+            println("HistogramBitmap $restriction")
+            logArray("HistogramBitmap reference out", referenceOutput)
+            logArray("HistogramBitmap intrinsic out", intrinsicOutput)
+            logArray("HistogramBitmap toolkit   out", toolkitOutput)
+        }
+    }
+
+    /**
+     * Runs one histogramDot comparison on a randomly generated input array.
+     *
+     * The intrinsic and the Toolkit implementations are always executed and timed. The
+     * reference implementation and the comparison only run when [validate] is true.
+     *
+     * @param coefficients the coefficients handed to all three implementations; may be null.
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomHistogramDot(
+        timer: TimingTracker,
+        vectorSize: Int,
+        sizeX: Int,
+        sizeY: Int,
+        coefficients: FloatArray?, restriction: Range2d?
+    ): Boolean {
+        // The input buffer is sized with the padded vector size (3 is stored as 4).
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, paddedSize(vectorSize))
+
+        val intrinsicOutArray = timer.measure("IntrinsicHistogramDot") {
+            intrinsicHistogramDot(
+                renderscriptContext, inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitHistogramDot") {
+            toolkit.histogramDot(
+                inputArray, vectorSize, sizeX, sizeY, coefficients, restriction
+            )
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceHistogramDot") {
+            referenceHistogramDot(inputArray, vectorSize, sizeX, sizeY, coefficients, restriction)
+        }
+
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame("histogramDot", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("histogramDot $vectorSize ($sizeX, $sizeY) $restriction")
+            logArray("histogramDot coefficients ", coefficients)
+            logArray("histogramDot in           ", inputArray)
+            logArray("histogramDot reference out", referenceOutArray, 256)
+            logArray("histogramDot intrinsic out", intrinsicOutArray, 256)
+            logArray("histogramDot toolkit   out", toolkitOutArray, 256)
+        }
+    }
+
+    /**
+     * Runs one histogramDot comparison using a bitmap as input.
+     *
+     * The intrinsic and the Toolkit implementations are always executed and timed. The
+     * reference implementation, which operates on the bitmap's bytes, and the comparison only
+     * run when [validate] is true.
+     *
+     * @param coefficients the coefficients handed to all three implementations; may be null.
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapHistogramDot(
+        timer: TimingTracker,
+        bitmap: Bitmap,
+        coefficients: FloatArray?,
+        restriction: Range2d?
+    ): Boolean {
+        val intrinsicOutArray = timer.measure("IntrinsicHistogramDot") {
+            intrinsicHistogramDot(renderscriptContext, bitmap, coefficients, restriction)
+        }
+        val toolkitOutArray = timer.measure("ToolkitHistogramDot") {
+            toolkit.histogramDot(bitmap, coefficients, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceHistogramDot") {
+            referenceHistogramDot(
+                getBitmapBytes(bitmap),
+                vectorSizeOfBitmap(bitmap),
+                bitmap.width,
+                bitmap.height,
+                coefficients,
+                restriction
+            )
+        }
+
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame(
+            "HistogramDotBitmap",
+            intrinsicOutArray,
+            referenceOutArray,
+            toolkitOutArray
+        ) {
+            println("HistogramDotBitmap $restriction")
+            logArray("HistogramDotBitmap coefficients ", coefficients)
+            //logArray("HistogramDotBitmap in           ", inputArray)
+            logArray("HistogramDotBitmap reference out", referenceOutArray, 256)
+            logArray("HistogramDotBitmap intrinsic out", intrinsicOutArray, 256)
+            logArray("HistogramDotBitmap toolkit   out", toolkitOutArray, 256)
+        }
+    }
+
+    /**
+     * Exercises the lookup table operation on the first test bitmap (whole image and a
+     * restricted range) and on random data for each common layout.
+     *
+     * All three checks are always executed (non-short-circuit `and`).
+     */
+    @ExperimentalUnsignedTypes
+    private fun testLut(timer: TimingTracker): Boolean {
+        val fullBitmapOk = testOneBitmapLut(timer, testImage1, null)
+        val restrictedBitmapOk = testOneBitmapLut(timer, testImage1, Range2d(6, 23, 2, 4))
+        val randomDataOk = commonLayoutsToTry.all { layout ->
+            testOneRandomLut(timer, layout.sizeX, layout.sizeY, layout.restriction)
+        }
+        return fullBitmapOk and restrictedBitmapOk and randomDataOk
+    }
+
+    /**
+     * Runs one lookup table comparison on a randomly generated input array.
+     *
+     * Four seeded random 256-entry tables (red, green, blue, alpha) are generated and used by
+     * all implementations. The intrinsic and the Toolkit implementations are always executed and
+     * timed; the reference implementation and the comparison only run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomLut(
+        timer: TimingTracker,
+        sizeX: Int,
+        sizeY: Int,
+        restriction: Range2d?
+    ): Boolean {
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
+        val newRed = randomByteArray(0x32425, 256, 1, 1)
+        val newGreen = randomByteArray(0x1F3225, 256, 1, 1)
+        val newBlue = randomByteArray(0x32D4F27, 256, 1, 1)
+        val newAlpha = randomByteArray(0x3A20001, 256, 1, 1)
+        // The Toolkit and the reference take the tables bundled in a LookupTable, while the
+        // intrinsic wrapper receives the four arrays individually.
+        val table = LookupTable()
+        table.red = newRed
+        table.blue = newBlue
+        table.green = newGreen
+        table.alpha = newAlpha
+
+        val intrinsicOutArray = timer.measure("IntrinsicLUT") {
+            intrinsicLut(
+                renderscriptContext, inputArray, sizeX, sizeY, newRed, newGreen, newBlue, newAlpha,
+                restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitLUT") {
+            toolkit.lut(inputArray, sizeX, sizeY, table, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceLUT") {
+            referenceLut(inputArray, sizeX, sizeY, table, restriction)
+        }
+
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame("LUT", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("lut ($sizeX, $sizeY) $restriction")
+            logArray("LUT red  ", newRed, 256)
+            logArray("LUT green", newGreen, 256)
+            logArray("LUT blue ", newBlue, 256)
+            logArray("LUT alpha", newAlpha, 256)
+            logArray("LUT in           ", inputArray)
+            logArray("LUT reference out", referenceOutArray)
+            logArray("LUT intrinsic out", intrinsicOutArray)
+            logArray("LUT toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Runs one lookup table comparison using a bitmap as input.
+     *
+     * Four seeded random 256-entry tables (red, green, blue, alpha) are generated and used by
+     * all implementations. The intrinsic and the Toolkit implementations are always executed and
+     * timed; the reference implementation, which operates on the bitmap's bytes, and the
+     * comparison only run when [validate] is true.
+     *
+     * @return true when validation is disabled; otherwise the result of validateSame for the
+     * three outputs.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapLut(
+        timer: TimingTracker,
+        bitmap: Bitmap,
+        restriction: Range2d?
+    ): Boolean {
+        val newRed = randomByteArray(0x32425, 256, 1, 1)
+        val newGreen = randomByteArray(0x1F3225, 256, 1, 1)
+        val newBlue = randomByteArray(0x32D4F27, 256, 1, 1)
+        val newAlpha = randomByteArray(0x3A20001, 256, 1, 1)
+        // The Toolkit and the reference take the tables bundled in a LookupTable, while the
+        // intrinsic wrapper receives the four arrays individually.
+        val table = LookupTable()
+        table.red = newRed
+        table.blue = newBlue
+        table.green = newGreen
+        table.alpha = newAlpha
+
+        val intrinsicOutArray = timer.measure("IntrinsicLUT") {
+            intrinsicLut(
+                renderscriptContext, bitmap, newRed, newGreen, newBlue, newAlpha, restriction
+            )
+        }
+        val toolkitOutBitmap = timer.measure("ToolkitLUT") {
+            toolkit.lut(bitmap, table, restriction)
+        }
+        // Timing-only runs stop here: the reference implementation is never executed.
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceLUT") {
+            referenceLut(
+                getBitmapBytes(bitmap),
+                bitmap.width,
+                bitmap.height,
+                table,
+                restriction
+            )
+        }
+
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+        // The trailing lambda prints diagnostic context; presumably validateSame only invokes
+        // it on a mismatch -- confirm in validateSame.
+        return validateSame("LutBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("LutBitmap $restriction")
+            logArray("LutBitmap red  ", newRed, 256)
+            logArray("LutBitmap green", newGreen, 256)
+            logArray("LutBitmap blue ", newBlue, 256)
+            logArray("LutBitmap alpha", newAlpha, 256)
+            //logArray("LutBitmap in           ", inputArray, 80)
+            logArray("LutBitmap reference out", referenceOutArray)
+            logArray("LutBitmap intrinsic out", intrinsicOutArray)
+            logArray("LutBitmap toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Exercises Lut3d for several cube sizes, on both test bitmaps and on every layout in
+     * `commonLayoutsToTry`. Identity cubes use a tolerance of 1, random cubes a tolerance of 3.
+     *
+     * @param timer handed to every test case, which uses it to time each implementation
+     * @return true if every combination passed validation
+     */
+    @ExperimentalUnsignedTypes
+    private fun testLut3d(timer: TimingTracker): Boolean {
+        val cubeSizesToTry =
+            listOf(Dimension(2, 2, 2), Dimension(32, 32, 16), Dimension(256, 256, 256))
+
+        return cubeSizesToTry.all { cubeSize ->
+            val identity = identityCube(cubeSize)
+            val random = randomCube(0x23424, cubeSize)
+            // Evaluate every bitmap variant and the layouts even if an earlier one failed.
+            val identityOk = testOneBitmapLut3d(timer, testImage1, cubeSize, identity, 1, null)
+            val randomOk = testOneBitmapLut3d(timer, testImage2, cubeSize, random, 3, null)
+            val restrictedOk =
+                testOneBitmapLut3d(timer, testImage2, cubeSize, random, 3, Range2d(6, 23, 2, 4))
+            val layoutsOk = commonLayoutsToTry.all { (sizeX, sizeY, restriction) ->
+                testOneRandomLut3d(timer, sizeX, sizeY, cubeSize, identity, 1, restriction) &&
+                    testOneRandomLut3d(timer, sizeX, sizeY, cubeSize, random, 3, restriction)
+            }
+            identityOk and randomOk and restrictedOk and layoutsOk
+        }
+    }
+
+    /**
+     * Runs Lut3d on a random array of 4-byte cells with the Intrinsic and the Toolkit, timing
+     * both, and optionally validates the two results against the reference implementation.
+     *
+     * @param allowedIntError largest per-byte difference from the reference that is accepted
+     * @return true when validation is skipped or both outputs are within tolerance
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomLut3d(
+        timer: TimingTracker,
+        sizeX: Int, sizeY: Int,
+        cubeSize: Dimension, cubeArray: ByteArray,
+        allowedIntError: Int, restriction: Range2d?
+    ): Boolean {
+        val inputArray = randomByteArray(0x50521f0, sizeX, sizeY, 4)
+        // Built on demand so the wrapper's construction stays inside each timed section.
+        val makeCube = { Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ) }
+
+        val intrinsicOutArray = timer.measure("IntrinsicLut3d") {
+            intrinsicLut3d(
+                renderscriptContext, inputArray, sizeX, sizeY, cubeArray, cubeSize, restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitLut3d") {
+            toolkit.lut3d(inputArray, sizeX, sizeY, makeCube(), restriction)
+        }
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceLut3d") {
+            referenceLut3d(inputArray, sizeX, sizeY, makeCube(), restriction)
+        }
+
+        return validateSame(
+            "lut3d", intrinsicOutArray, referenceOutArray, toolkitOutArray, false, allowedIntError
+        ) {
+            println("lut3d ($sizeX, $sizeY) $restriction")
+            logArray("lut3d cube", cubeArray, 256)
+            logArray("lut3d in           ", inputArray, 64)
+            logArray("lut3d reference out", referenceOutArray, 64)
+            logArray("lut3d intrinsic out", intrinsicOutArray, 64)
+            logArray("lut3d toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Bitmap flavor of the Lut3d check: runs the Intrinsic and the Toolkit on [bitmap], timing
+     * both, and optionally validates the two results against the reference implementation.
+     * @param allowedIntError largest per-byte difference from the reference that is accepted
+     * @return true when validation is skipped or both outputs are within tolerance
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapLut3d(
+        timer: TimingTracker, bitmap: Bitmap, cubeSize: Dimension, cubeArray: ByteArray,
+        allowedIntError: Int, restriction: Range2d?
+    ): Boolean {
+        // Built on demand so the wrapper's construction stays inside each timed section.
+        val makeCube = { Rgba3dArray(cubeArray, cubeSize.sizeX, cubeSize.sizeY, cubeSize.sizeZ) }
+
+        val intrinsicOutArray = timer.measure("IntrinsicLut3d") {
+            intrinsicLut3d(renderscriptContext, bitmap, cubeArray, cubeSize, restriction)
+        }
+        val toolkitOutBitmap = timer.measure("ToolkitLut3d") {
+            toolkit.lut3d(bitmap, makeCube(), restriction)
+        }
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceLut3d") {
+            val cube = makeCube()
+            referenceLut3d(getBitmapBytes(bitmap), bitmap.width, bitmap.height, cube, restriction)
+        }
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+
+        return validateSame(
+            "Lut3dBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray,
+            false, allowedIntError
+        ) {
+            println("Lut3dBitmap $restriction")
+            logArray("Lut3dBitmap cube", cubeArray, 256)
+            logArray("Lut3dBitmap reference out", referenceOutArray, 64)
+            logArray("Lut3dBitmap intrinsic out", intrinsicOutArray, 64)
+            logArray("Lut3dBitmap toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Exercises Resize for several scale factors, on a test bitmap and on random arrays with
+     * vector sizes 1 to 4, with and without a restriction.
+     * @return true if every combination passed validation
+     */
+    @ExperimentalUnsignedTypes
+    private fun testResize(timer: TimingTracker): Boolean {
+        val factorsToTry = listOf(
+            Pair(1f, 1f),
+            Pair(0.5f, 1f),
+            Pair(2f, 2f),
+            Pair(0.5f, 2f),
+            Pair(2f, 0.5f),
+            // The RenderScript Intrinsic tests used the above factors. It's tempting to use
+            // less regular ones like Pair(6.37f, 0.17f) however this creates small offset
+            // errors between the result provided by the C++ code and the SIMD code. This is
+            // due to the SIMD code using a scaled integer to increment going from one pixel to the
+            // next, while the C++ code uses float operations.
+        )
+        val layoutsToTry = listOf(
+            TestLayout(37, 47, null),
+            TestLayout(60, 10, null),
+            TestLayout(6, 4, Range2d(1, 3, 0, 2)),
+            TestLayout(10, 14, Range2d(2, 3, 3, 7)),
+        )
+
+        // Do one resize that's greater than 4x, as that's used in the code but don't do it
+        // for everything, as some images will get very large. Its arguments do not depend on
+        // the scale factors, so it runs once here rather than once per factor.
+        val largeScaleOk = testOneRandomResize(timer, 1, 25, 30, 6f, 6f, null)
+        return largeScaleOk and factorsToTry.all { (scaleX, scaleY) ->
+            testOneBitmapResize(timer, testImage1, scaleX, scaleY, null) and
+                testOneBitmapResize(timer, testImage1, scaleX, scaleY, Range2d(6, 23, 2, 4)) and
+                layoutsToTry.all { (sizeX, sizeY, restriction) ->
+                    (1..4).all { vectorSize ->
+                        testOneRandomResize(
+                            timer, vectorSize, sizeX, sizeY, scaleX, scaleY, restriction
+                        )
+                    }
+                }
+        }
+    }
+
+    /**
+     * Resizes a random array with the Intrinsic and the Toolkit, timing both, and optionally
+     * validates the two results against the reference. The output size is the input size
+     * multiplied by the scale factors and truncated to Int.
+     *
+     * @return true when validation is skipped or both outputs are close enough to the reference
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomResize(
+        timer: TimingTracker, vectorSize: Int, inSizeX: Int, inSizeY: Int,
+        scaleX: Float, scaleY: Float, restriction: Range2d?
+    ): Boolean {
+        val inputArray = randomByteArray(0x50521f0, inSizeX, inSizeY, paddedSize(vectorSize))
+        val outSizeX = (inSizeX * scaleX).toInt()
+        val outSizeY = (inSizeY * scaleY).toInt()
+
+        val intrinsicOutArray = timer.measure("IntrinsicResize") {
+            intrinsicResize(
+                renderscriptContext, inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY,
+                restriction
+            )
+        }
+        val toolkitOutArray = timer.measure("ToolkitResize") {
+            toolkit.resize(inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY, restriction)
+        }
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceResize") {
+            referenceResize(
+                inputArray, vectorSize, inSizeX, inSizeY, outSizeX, outSizeY, restriction
+            )
+        }
+
+        return validateSame("resize", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("resize $vectorSize ($inSizeX, $inSizeY) by ($scaleX, $scaleY) to ($outSizeX, $outSizeY), $restriction")
+            logArray("resize in           ", inputArray)
+            logArray("resize reference out", referenceOutArray)
+            logArray("resize intrinsic out", intrinsicOutArray)
+            logArray("resize toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Resizes [bitmap] with the Intrinsic and the Toolkit, timing both, and optionally validates
+     * the two results against the reference implementation.
+     *
+     * @param scaleX horizontal factor applied to the bitmap width
+     * @param scaleY vertical factor applied to the bitmap height
+     * @return true when validation is skipped or both outputs are close enough to the reference
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneBitmapResize(
+        timer: TimingTracker, bitmap: Bitmap,
+        scaleX: Float, scaleY: Float, restriction: Range2d?
+    ): Boolean {
+        // The scaled dimensions are truncated, not rounded.
+        val outSizeX = (bitmap.width * scaleX).toInt()
+        val outSizeY = (bitmap.height * scaleY).toInt()
+
+        val intrinsicOutArray = timer.measure("IntrinsicResize") {
+            intrinsicResize(renderscriptContext, bitmap, outSizeX, outSizeY, restriction)
+        }
+        val toolkitOutBitmap = timer.measure("ToolkitResize") {
+            toolkit.resize(bitmap, outSizeX, outSizeY, restriction)
+        }
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceResize") {
+            referenceResize(
+                getBitmapBytes(bitmap), vectorSizeOfBitmap(bitmap), bitmap.width, bitmap.height,
+                outSizeX, outSizeY, restriction
+            )
+        }
+
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+
+        return validateSame("ResizeBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("ResizeBitmap by ($scaleX, $scaleY) to ($outSizeX, $outSizeY), $restriction")
+            logArray("ResizeBitmap reference out", referenceOutArray)
+            logArray("ResizeBitmap intrinsic out", intrinsicOutArray)
+            logArray("ResizeBitmap toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Exercises YuvToRgb for every [YuvFormat] on a few layouts, through both the array and the
+     * bitmap flavor of the test.
+     */
+    @ExperimentalUnsignedTypes
+    private fun testYuvToRgb(timer: TimingTracker): Boolean {
+        // sizeX must be even: odd values are not allowed by definition of some video formats.
+        val sizesToTry = listOf(Pair(10, 14), Pair(64, 40), Pair(96, 94))
+        return sizesToTry.all { (sizeX, sizeY) ->
+            YuvFormat.values().all { format ->
+                val arrayOk = testOneRandomYuvToRgb(timer, sizeX, sizeY, format)
+                val bitmapOk = testOneRandomYuvToRgbBitmap(timer, sizeX, sizeY, format)
+                arrayOk and bitmapOk
+            }
+        }
+    }
+
+    /**
+     * Converts a random YUV buffer to RGB with the Intrinsic and the Toolkit, timing both, and
+     * optionally validates the two results against the reference implementation.
+     * @return true when the case is skipped, validation is off, or both outputs match
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomYuvToRgb(
+        timer: TimingTracker, sizeX: Int, sizeY: Int, format: YuvFormat
+    ): Boolean {
+        // The RenderScript Intrinsic does not handle this combination correctly.
+        if (format == YuvFormat.YV12 && sizeX % 32 != 0) return true
+        val inputArray = randomYuvArray(0x50521f0, sizeX, sizeY, format)
+
+        val intrinsicOutArray = timer.measure("IntrinsicYuvToRgb") {
+            intrinsicYuvToRgb(renderscriptContext, inputArray, sizeX, sizeY, format)
+        }
+        val toolkitOutArray = timer.measure("ToolkitYuvToRgb") {
+            toolkit.yuvToRgb(inputArray, sizeX, sizeY, format)
+        }
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceYuvToRgb") {
+            referenceYuvToRgb(inputArray, sizeX, sizeY, format)
+        }
+
+        return validateSame("yuvToRgb", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("yuvToRgb ($sizeX, $sizeY) $format")
+            logArray("yuvToRgb in           ", inputArray)
+            logArray("yuvToRgb reference out", referenceOutArray)
+            logArray("yuvToRgb intrinsic out", intrinsicOutArray)
+            logArray("yuvToRgb toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Bitmap flavor of the YuvToRgb check: the Toolkit result is produced as a Bitmap, whose bytes
+     * are compared, together with the Intrinsic output, against the reference implementation.
+     * @return true when the case is skipped, validation is off, or both outputs match
+     */
+    @ExperimentalUnsignedTypes
+    private fun testOneRandomYuvToRgbBitmap(
+        timer: TimingTracker, sizeX: Int, sizeY: Int, format: YuvFormat
+    ): Boolean {
+        // The RenderScript Intrinsic does not handle this combination correctly.
+        if (format == YuvFormat.YV12 && sizeX % 32 != 0) return true
+        val inputArray = randomYuvArray(0x50521f0, sizeX, sizeY, format)
+
+        val intrinsicOutArray = timer.measure("IntrinsicYuvToRgb") {
+            intrinsicYuvToRgb(renderscriptContext, inputArray, sizeX, sizeY, format)
+        }
+        val toolkitOutBitmap = timer.measure("ToolkitYuvToRgb") {
+            toolkit.yuvToRgbBitmap(inputArray, sizeX, sizeY, format)
+        }
+        if (!validate) return true
+
+        val referenceOutArray = timer.measure("ReferenceYuvToRgb") {
+            referenceYuvToRgb(inputArray, sizeX, sizeY, format)
+        }
+        val toolkitOutArray = getBitmapBytes(toolkitOutBitmap)
+        // Use a label of its own so failures are not mistaken for the array variant's.
+        return validateSame("YuvToRgbBitmap", intrinsicOutArray, referenceOutArray, toolkitOutArray) {
+            println("YuvToRgbBitmap ($sizeX, $sizeY) $format")
+            logArray("YuvToRgbBitmap in           ", inputArray)
+            logArray("YuvToRgbBitmap reference out", referenceOutArray)
+            logArray("YuvToRgbBitmap intrinsic out", intrinsicOutArray)
+            logArray("YuvToRgbBitmap toolkit   out", toolkitOutArray)
+        }
+    }
+
+    /**
+     * Verifies that the arrays returned by the Intrinsic and by the Toolkit are both within a
+     * margin of error of the array returned by the reference code.
+     *
+     * RenderScript Intrinsic test (rc/android/cts/rscpp/RSCppTest.java) used 3 for ints.
+     * For floats, rc/android/cts/rscpp/verify.rscript uses 0.0001f.
+     *
+     * @param skipFourth if true, every fourth byte is excluded from the comparison
+     * @param allowedIntDelta largest accepted absolute difference between corresponding bytes
+     * @param errorLogging invoked after the failure banner when either comparison fails
+     * @return true if both arrays are within the margin of error
+     */
+    @ExperimentalUnsignedTypes
+    private fun validateSame(
+        task: String, intrinsic: ByteArray, reference: ByteArray, toolkit: ByteArray,
+        skipFourth: Boolean = false, allowedIntDelta: Int = 3, errorLogging: () -> Unit
+    ): Boolean {
+        // Both comparisons always run, so each one gets to print its own mismatches.
+        fun matchesReference(name: String, candidate: ByteArray) =
+            validateAgainstReference(task, reference, name, candidate, skipFourth, allowedIntDelta)
+        val intrinsicOk = matchesReference("Intrinsic", intrinsic)
+        val toolkitOk = matchesReference("Toolkit", toolkit)
+
+        if (intrinsicOk && toolkitOk) return true
+        println("$task FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!")
+        errorLogging()
+        return false
+    }
+
+    /**
+     * IntArray flavor: verifies that the Intrinsic and the Toolkit results are both within
+     * [allowedIntDelta] of the reference. On failure, prints a banner and calls [errorLogging].
+     *
+     * @return true if both arrays are within the margin of error
+     */
+    private fun validateSame(
+        task: String, intrinsic: IntArray, reference: IntArray, toolkit: IntArray,
+        allowedIntDelta: Int = 3, errorLogging: () -> Unit
+    ): Boolean {
+        fun matchesReference(name: String, candidate: IntArray) =
+            validateAgainstReference(task, reference, name, candidate, allowedIntDelta)
+        val intrinsicOk = matchesReference("Intrinsic", intrinsic)
+        val toolkitOk = matchesReference("Toolkit", toolkit)
+        if (intrinsicOk && toolkitOk) return true
+        println("$task FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!FAIL!")
+        errorLogging()
+        return false
+    }
+
+    /**
+     * Compares [in2] to the reference [in1] byte by byte, treating both as unsigned values.
+     * Prints the first mismatch and, if there was one, a map of up to the first 80 bytes in
+     * which 'X' marks each byte that differs by more than [allowedIntDelta].
+     *
+     * @param skipFourth if true, bytes at indices 3, 7, 11, ... are not compared
+     * @return true if the sizes match and no compared byte differs by more than allowed
+     */
+    @ExperimentalUnsignedTypes
+    private fun validateAgainstReference(
+        task: String, in1: ByteArray, name2: String, in2: ByteArray,
+        skipFourth: Boolean, allowedIntDelta: Int
+    ): Boolean {
+        if (in1.size != in2.size) {
+            println("$task. Sizes don't match: Reference ${in1.size}, $name2 ${in2.size}")
+            return false
+        }
+        val maxDetails = 80
+        val detailCount = min(in1.size, maxDetails)
+        val diffs = CharArray(detailCount) { '.' }
+        var mismatchSeen = false
+        for (i in in1.indices) {
+            if (skipFourth && i % 4 == 3) continue
+            val expected = in1[i].toUByte()
+            val actual = in2[i].toUByte()
+            if (abs(expected.toInt() - actual.toInt()) <= allowedIntDelta) continue
+            // Only the first mismatch is reported in detail.
+            if (!mismatchSeen) println("$task. At $i, Reference is $expected, $name2 is $actual")
+            if (i < maxDetails) diffs[i] = 'X'
+            mismatchSeen = true
+        }
+        if (mismatchSeen) {
+            for (i in 0 until detailCount / 4) print("%-3d|".format(i))
+            println()
+            println(diffs)
+        }
+        return !mismatchSeen
+    }
+
+    /**
+     * Returns true if [in2] has the size of the reference [in1] and no element differs by more
+     * than [allowedIntDelta]; the first mismatch is printed. Deltas use Long so they cannot wrap.
+     */
+    private fun validateAgainstReference(
+        task: String, in1: IntArray, name2: String, in2: IntArray, allowedIntDelta: Int
+    ): Boolean {
+        if (in1.size != in2.size) {
+            println("$task. Sizes don't match: Reference ${in1.size}, $name2 ${in2.size}")
+            return false
+        }
+        for (i in in1.indices) {
+            val delta = abs(in1[i].toLong() - in2[i].toLong())
+            if (delta > allowedIntDelta) {
+                println("$task. At $i, Reference is ${in1[i]}, $name2 is ${in2[i]}")
+                return false
+            }
+        }
+        return true
+    }
+}
diff --git a/toolkit/test/Android.bp b/toolkit/test/Android.bp
new file mode 100644
index 0000000..abeace1
--- /dev/null
+++ b/toolkit/test/Android.bp
@@ -0,0 +1,35 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package {
+    default_applicable_licenses: ["Android-Apache-2.0"],
+}
+
+android_app {
+    name: "RenderScriptToolkitTest", // NOTE(review): srcs and jni_libs below are commented out; confirm this is intended
+//    srcs: ["src/**/*.kt"],
+    sdk_version: "current",
+    resource_dirs: ["res"],
+//    jni_libs: [ "librenderscripttoolkit"],
+//    certificate: "platform",
+//    //product_specific: true,
+//    //optimize: {
+//    //    proguard_flags_files: ["proguard.flags"],
+//    //},
+//    shared_libs: ["librenderscripttoolkit",
+//
+//    ]
+}
diff --git a/tests/lldb/java/Reduction/AndroidManifest.xml b/toolkit/test/AndroidManifest.xml
similarity index 61%
rename from tests/lldb/java/Reduction/AndroidManifest.xml
rename to toolkit/test/AndroidManifest.xml
index 61177d9..f709790 100644
--- a/tests/lldb/java/Reduction/AndroidManifest.xml
+++ b/toolkit/test/AndroidManifest.xml
@@ -1,11 +1,15 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="com.android.rs.lldbreductiontest">
-    <uses-sdk android:minSdkVersion="21" />
-    <application android:label="Reduction">
-        <activity android:name="MainActivity" android:screenOrientation="portrait">
+        package="com.example.testapp">
+
+    <application
+            android:allowBackup="true"
+            android:label="Toolkit Test"
+            android:supportsRtl="true">
+        <activity android:name=".MainActivity">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
+
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
diff --git a/toolkit/test/BufferUtils.kt b/toolkit/test/BufferUtils.kt
new file mode 100644
index 0000000..f2197b0
--- /dev/null
+++ b/toolkit/test/BufferUtils.kt
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.graphics.Canvas
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.toolkit.Range2d
+import android.renderscript.toolkit.Rgba3dArray
+import android.renderscript.toolkit.YuvFormat
+import java.nio.ByteBuffer
+import java.util.Random
+import kotlin.math.floor
+import kotlin.math.max
+import kotlin.math.min
+
+/**
+ * A vector of 4 integers.
+ *
+ * The arithmetic operators work component by component and return a new vector. The
+ * overloads taking a single Int apply that value to all four components.
+ */
+class Int4(var x: Int = 0, var y: Int = 0, var z: Int = 0, var w: Int = 0) {
+    operator fun plus(other: Int4) = Int4(x + other.x, y + other.y, z + other.z, w + other.w)
+    operator fun plus(n: Int): Int4 = plus(Int4(n, n, n, n))
+
+    operator fun minus(other: Int4) = Int4(x - other.x, y - other.y, z - other.z, w - other.w)
+    operator fun minus(n: Int): Int4 = minus(Int4(n, n, n, n))
+
+    operator fun times(other: Int4) = Int4(x * other.x, y * other.y, z * other.z, w * other.w)
+    operator fun times(n: Int): Int4 = times(Int4(n, n, n, n))
+
+    /** Converts every component with [Int.toFloat]. */
+    fun toFloat4() = Float4(x.toFloat(), y.toFloat(), z.toFloat(), w.toFloat())
+}
+
+/** Returns the component-wise minimum of [a] and [b]. */
+fun min(a: Int4, b: Int4) = Int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w))
+
+/**
+ * A vector of 4 floats.
+ *
+ * The arithmetic operators work component by component and return a new vector; neither
+ * operand is modified. The overloads taking a single Float apply that value to all four
+ * components.
+ */
+data class Float4(var x: Float = 0f, var y: Float = 0f, var z: Float = 0f, var w: Float = 0f) {
+    operator fun plus(other: Float4) = Float4(x + other.x, y + other.y, z + other.z, w + other.w)
+    operator fun plus(f: Float): Float4 = plus(Float4(f, f, f, f))
+
+    operator fun minus(other: Float4) = Float4(x - other.x, y - other.y, z - other.z, w - other.w)
+    operator fun minus(f: Float): Float4 = minus(Float4(f, f, f, f))
+
+    operator fun times(other: Float4) = Float4(x * other.x, y * other.y, z * other.z, w * other.w)
+    operator fun times(f: Float): Float4 = times(Float4(f, f, f, f))
+
+    operator fun div(other: Float4) = Float4(x / other.x, y / other.y, z / other.z, w / other.w)
+    operator fun div(f: Float): Float4 = div(Float4(f, f, f, f))
+
+    /** Rounds every component down with [floor] and converts the result to Int. */
+    fun intFloor() = Int4(floor(x).toInt(), floor(y).toInt(), floor(z).toInt(), floor(w).toInt())
+}
+
+/**
+ * Convert a UByteArray holding exactly 4 values to a Float4 vector, keeping their order.
+ */
+@ExperimentalUnsignedTypes
+fun UByteArray.toFloat4(): Float4 {
+    require(size == 4)
+    return map { it.toFloat() }.let { (x, y, z, w) -> Float4(x, y, z, w) }
+}
+
+/**
+ * Convert a ByteArray holding exactly 4 values to a Float4 vector, keeping their order.
+ *
+ * Each byte is read as an unsigned value, so every component is in 0f..255f.
+ *
+ * @throws IllegalArgumentException if the array does not hold exactly 4 bytes
+ */
+@ExperimentalUnsignedTypes
+fun ByteArray.toFloat4(): Float4 {
+    require(size == 4)
+    val (x, y, z, w) = map { it.toUByte().toFloat() }
+    return Float4(x, y, z, w)
+}
+
+data class Dimension(val sizeX: Int, val sizeY: Int, val sizeZ: Int) // sizes along x, y and z
+
+/**
+ * An RGBA value represented by 4 Int.
+ *
+ * Note that the arithmetical operations consider a 0..255 value the equivalent of 0f..1f.
+ * After adding or subtracting, the value is clamped. After multiplying, the value is shifted
+ * right by 8 bits to stay in the 0..255 range. This is useful for the Blend operation.
+ */
+@ExperimentalUnsignedTypes
+data class Rgba(var r: Int = 0, var g: Int = 0, var b: Int = 0, var a: Int = 0) {
+    /** Channel-wise sum, clamped to 0..255. */
+    operator fun plus(other: Rgba) =
+        Rgba(r + other.r, g + other.g, b + other.b, a + other.a).clampToUByteRange()
+
+    /** Channel-wise difference, clamped to 0..255. */
+    operator fun minus(other: Rgba) =
+        Rgba(r - other.r, g - other.g, b - other.b, a - other.a).clampToUByteRange()
+
+    /** Channel-wise product, shifted right by 8 bits. */
+    operator fun times(other: Rgba) = Rgba(r * other.r, g * other.g, b * other.b, a * other.a) shr 8
+
+    /** Every channel multiplied by [scalar], then shifted right by 8 bits. */
+    operator fun times(scalar: Int): Rgba = times(Rgba(scalar, scalar, scalar, scalar))
+
+    infix fun xor(other: Rgba) = Rgba(r xor other.r, g xor other.g, b xor other.b, a xor other.a)
+
+    infix fun shr(other: Int) = Rgba(r shr other, g shr other, b shr other, a shr other)
+
+    private fun clampToUByteRange() = Rgba(
+        r.clampToUByteRange(),
+        g.clampToUByteRange(),
+        b.clampToUByteRange(),
+        a.clampToUByteRange()
+    )
+}
+
+/**
+ * A 2D array of UByte vectors, stored in row-major format.
+ *
+ * Arrays of vectorSize == 3 are padded to 4. Each cell therefore occupies
+ * paddedSize(vectorSize) consecutive entries of [values], and [values] is expected to hold
+ * sizeX * sizeY such cells.
+ */
+@ExperimentalUnsignedTypes
+class Vector2dArray(
+    val values: UByteArray, val vectorSize: Int, val sizeX: Int, val sizeY: Int
+) {
+    /**
+     * If true, index access that would try to get a value that's out of bounds will simply
+     * return the border value instead. E.g. for [3, -3] would return the value for [3, 0],
+     * assuming that the sizeX > 3.
+     */
+    var clipReadToRange: Boolean = false
+
+    /** Returns a copy of the cell at ([x], [y]); see [clipReadToRange] for out of bounds reads. */
+    operator fun get(x: Int, y: Int): UByteArray {
+        if (!clipReadToRange) {
+            require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+        }
+        // Once the check above has passed, clamping leaves in-range coordinates untouched.
+        val column = min(max(x, 0), sizeX - 1)
+        val row = min(max(y, 0), sizeY - 1)
+        val start = indexOfVector(column, row)
+        return UByteArray(paddedSize(vectorSize)) { values[start + it] }
+    }
+
+    /** Copies [value] into the cell at ([x], [y]); writes are never clipped. */
+    operator fun set(x: Int, y: Int, value: UByteArray) {
+        require(value.size == paddedSize(vectorSize)) { "Not the expected vector size" }
+        require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+        val start = indexOfVector(x, y)
+        value.forEachIndexed { i, component -> values[start + i] = component }
+    }
+
+    /** Index in [values] of the first component of the cell at ([x], [y]). */
+    private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * paddedSize(vectorSize)
+
+    /** Creates a zero-filled array with the same dimensions and vector size. */
+    fun createSameSized() = Vector2dArray(UByteArray(values.size), vectorSize, sizeX, sizeY)
+
+    /** Forwards to forEachCell with this array's dimensions. */
+    fun forEach(restriction: Range2d?, work: (Int, Int) -> (Unit)) {
+        forEachCell(sizeX, sizeY, restriction, work)
+    }
+}
+
+/**
+ * A 2D array of float vectors, stored in row-major format.
+ *
+ * Arrays of vectorSize == 3 are padded to 4. Each cell therefore occupies
+ * paddedSize(vectorSize) consecutive entries of [values], although reads only return the
+ * first vectorSize of them.
+ */
+class FloatVector2dArray(
+    val values: FloatArray, val vectorSize: Int, val sizeX: Int, val sizeY: Int
+) {
+    /**
+     * If true, index access that would try to get a value that's out of bounds will simply
+     * return the border value instead. E.g. for [3, -3] would return the value for [3, 0],
+     * assuming that the sizeX > 3.
+     */
+    var clipAccessToRange: Boolean = false
+
+    /** Returns a copy of the cell at ([x], [y]); see [clipAccessToRange] for out of bounds reads. */
+    operator fun get(x: Int, y: Int): FloatArray {
+        if (!clipAccessToRange) {
+            require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+        }
+        // Once the check above has passed, clamping leaves in-range coordinates untouched.
+        val column = min(max(x, 0), sizeX - 1)
+        val row = min(max(y, 0), sizeY - 1)
+        val start = indexOfVector(column, row)
+        return FloatArray(vectorSize) { values[start + it] }
+    }
+
+    /** Copies [value] into [values] starting at the cell at ([x], [y]); never clipped. */
+    operator fun set(x: Int, y: Int, value: FloatArray) {
+        require(x in 0 until sizeX && y in 0 until sizeY) { "Out of bounds" }
+        val start = indexOfVector(x, y)
+        value.forEachIndexed { i, component -> values[start + i] = component }
+    }
+
+    /** Index in [values] of the first component of the cell at ([x], [y]). */
+    private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * paddedSize(vectorSize)
+
+    /** Creates a zero-filled array with the same dimensions and vector size. */
+    fun createSameSized() = FloatVector2dArray(FloatArray(values.size), vectorSize, sizeX, sizeY)
+
+    /** Forwards to forEachCell with this array's dimensions. */
+    fun forEach(restriction: Range2d?, work: (Int, Int) -> (Unit)) {
+        forEachCell(sizeX, sizeY, restriction, work)
+    }
+}
+
+/**
+ * A 2D array of RGBA data.
+ *
+ * Backed by a ByteArray holding 4 bytes per cell in row-major order. Each stored byte is
+ * treated as unsigned, so channels are exposed as Int values in the 0..255 range.
+ * Cells are addressed by their x and y coordinates.
+ */
+@ExperimentalUnsignedTypes
+class Rgba2dArray(
+    private val values: ByteArray, val sizeX: Int, val sizeY: Int
+) {
+    /** Reads the cell at ([x], [y]), interpreting each stored byte as unsigned. */
+    operator fun get(x: Int, y: Int): Rgba {
+        val i = indexOfVector(x, y)
+        fun channel(offset: Int) = values[i + offset].toUByte().toInt()
+        return Rgba(channel(0), channel(1), channel(2), channel(3))
+    }
+
+    /**
+     * Writes [value] to the cell at ([x], [y]).
+     * @throws IllegalArgumentException if a channel of [value] is outside 0..255
+     */
+    operator fun set(x: Int, y: Int, value: Rgba) {
+        // Verify that r, g, b, a are in the 0..255 range
+        val channels = intArrayOf(value.r, value.g, value.b, value.a)
+        channels.forEach { require(it in 0..255) }
+        val i = indexOfVector(x, y)
+        channels.forEachIndexed { offset, c -> values[i + offset] = c.toUByte().toByte() }
+    }
+
+    /** Index in [values] of the first byte of the cell at ([x], [y]). */
+    private fun indexOfVector(x: Int, y: Int) = ((y * sizeX) + x) * 4
+
+    /** Forwards to the top-level forEachCell with this array's dimensions. */
+    fun forEachCell(restriction: Range2d?, work: (Int, Int) -> (Unit)) =
+        forEachCell(sizeX, sizeY, restriction, work)
+}
+
+/**
+ * Returns the value that lies [fraction] of the way from [start] to [end].
+ */
+fun mix(start: Float, end: Float, fraction: Float): Float = start + (end - start) * fraction
+
+/** Interpolates each of the four components of [a] and [b] with the same [fraction]. */
+fun mix(a: Float4, b: Float4, fraction: Float): Float4 {
+    // Delegates to the scalar overload, one component at a time.
+    val blend = { from: Float, to: Float -> mix(from, to, fraction) }
+    return Float4(blend(a.x, b.x), blend(a.y, b.y), blend(a.z, b.z), blend(a.w, b.w))
+}
+
+/**
+ * For vectors of size 3, the original RenderScript has them occupy the same space as a size 4.
+ * While RenderScript had a method to avoid this padding, it did not apply to Intrinsics.
+ *
+ * To preserve compatibility, the Toolkit does the same.
+ */
+fun paddedSize(vectorSize: Int): Int = if (vectorSize != 3) vectorSize else 4
+
+/**
+ * Create a ByteArray of the specified size filled with random data.
+ * The same seed always yields the same bytes, and every byte value (-128..127) can occur.
+ */
+fun randomByteArray(seed: Long, sizeX: Int, sizeY: Int, elementSize: Int): ByteArray =
+    // nextInt's bound is exclusive: 256 is required for the byte value 127 to be reachable.
+    Random(seed).run { ByteArray(sizeX * sizeY * elementSize) { (nextInt(256) - 128).toByte() } }
+
+/**
+ * Create a FloatArray of the specified size filled with random data.
+ *
+ * By default, the random data is between 0f and 1f. The factor can be used to scale that.
+ *
+ * @param seed seed of the generator, so a given seed always yields the same values
+ * @param factor every random value is multiplied by this
+ * @return an array of sizeX * sizeY * elementSize floats
+ */
+fun randomFloatArray(
+    seed: Long, sizeX: Int, sizeY: Int, elementSize: Int, factor: Float = 1f
+): FloatArray {
+    val generator = Random(seed)
+    return FloatArray(sizeX * sizeY * elementSize) { generator.nextFloat() * factor }
+}
+
+/**
+ * Create a cube of the specified size filled with random data, 4 bytes per cell. The bound of
+ * 256 passed to nextInt is exclusive, so every byte value (-128..127) can occur.
+ */
+fun randomCube(seed: Long, cubeSize: Dimension): ByteArray {
+    val r = Random(seed)
+    val byteCount = cubeSize.sizeX * cubeSize.sizeY * cubeSize.sizeZ * 4
+    return ByteArray(byteCount) { (r.nextInt(256) - 128).toByte() }
+}
+
+/**
+ * Create the identity cube, i.e. one that if used in Lut3d, the output is the same as the input
+ *
+ * Along each axis, the matching byte of a cell ramps from 0 at index 0 to 255 at the last index;
+ * the fourth byte is always 255. A dimension of 1 would make the ramp divide by zero.
+ */
+@ExperimentalUnsignedTypes
+fun identityCube(cubeSize: Dimension): ByteArray {
+    val (sizeX, sizeY, sizeZ) = cubeSize
+    fun ramp(index: Int, size: Int) = (index * 255 / (size - 1)).toByte()
+    val data = ByteArray(sizeX * sizeY * sizeZ * 4)
+    val cube = Rgba3dArray(data, sizeX, sizeY, sizeZ)
+    for (z in 0 until sizeZ) {
+        for (y in 0 until sizeY) {
+            for (x in 0 until sizeX) {
+                cube[x, y, z] =
+                    byteArrayOf(ramp(x, sizeX), ramp(y, sizeY), ramp(z, sizeZ), (255).toByte())
+            }
+        }
+    }
+    return data
+}
+
+/** Creates seeded random bytes sized for a sizeX by sizeY image in the given YUV [format]. */
+fun randomYuvArray(seed: Long, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
+    // YUV formats are not well defined for odd dimensions
+    require(sizeX % 2 == 0 && sizeY % 2 == 0)
+    val byteCount = when (format) {
+        YuvFormat.YV12 -> {
+            // Both the full-width and the half-width strides are rounded up to 16.
+            val rowStride = roundUpTo16(sizeX)
+            val halfRowStride = roundUpTo16(rowStride / 2)
+            rowStride * sizeY + halfRowStride * (sizeY / 2) * 2
+        }
+        // No stride rounding: one byte per cell plus two bytes per 2x2 group of cells.
+        YuvFormat.NV21 -> sizeX * sizeY + (sizeX / 2) * (sizeY / 2) * 2
+        else -> throw IllegalArgumentException("Unknown YUV format $format")
+    }
+    return randomByteArray(seed, byteCount, 1, 1)
+}
+
+/**
+ * Rounds this Float by adding 0.5f and truncating, then clamps the result into 0..255.
+ */
+@ExperimentalUnsignedTypes
+fun Float.clampToUByte(): UByte = (this + 0.5f).toInt().coerceIn(0, 255).toUByte()
+
+/**
+ * Returns a UByteArray of the same size, each element converted with Float.clampToUByte.
+ */
+@ExperimentalUnsignedTypes
+fun FloatArray.clampToUByte(): UByteArray = UByteArray(size) { i -> this[i].clampToUByte() }
+
+/**
+ * Clamps this Int into 0..255, the range a UByte can hold.
+ */
+fun Int.clampToUByteRange(): Int = coerceIn(0, 255)
+
+/**
+ * Clamps this Int into 0..255 with clampToUByteRange and returns the result as a UByte.
+ */
+@ExperimentalUnsignedTypes
+fun Int.clampToUByte(): UByte = clampToUByteRange().toUByte()
+
+/** Scales a float (0f .. 1f) by 255 and converts it to a clamped byte (0 .. 255). */
+@ExperimentalUnsignedTypes
+fun unitFloatClampedToUByte(num: Float): UByte {
+    return (255f * num).clampToUByte()
+}
+
+/**
+ * Maps a byte (0 .. 255) to a float (0f .. 1f) by multiplying by 0.003921569f (about 1/255).
+ */
+@ExperimentalUnsignedTypes
+fun byteToUnitFloat(num: UByte): Float = 0.003921569f * num.toFloat()
+
+@ExperimentalUnsignedTypes
+fun UByteArray.toFloatArray(): FloatArray = FloatArray(size) { i -> this[i].toFloat() } // Unsigned: 0f..255f.
+
+/**
+ * Invokes [work] with (x, y) for every cell, row by row with x varying fastest.
+ *
+ * Without a restriction all cells of the sizeX by sizeY array are visited. With one, the
+ * restriction's start (inclusive) and end (exclusive) bounds are used instead.
+ */
+fun forEachCell(sizeX: Int, sizeY: Int, restriction: Range2d?, work: (Int, Int) -> (Unit)) {
+    val columns = (restriction?.startX ?: 0) until (restriction?.endX ?: sizeX)
+    val rows = (restriction?.startY ?: 0) until (restriction?.endY ?: sizeY)
+    for (y in rows) {
+        for (x in columns) {
+            work(x, y)
+        }
+    }
+}
+
+operator fun FloatArray.times(other: FloatArray) = FloatArray(size) { i -> this[i] * other[i] } // Element-wise product.
+operator fun FloatArray.times(other: Float) = FloatArray(size) { i -> this[i] * other } // Scales every element.
+operator fun FloatArray.plus(other: FloatArray) = FloatArray(size) { i -> this[i] + other[i] } // Element-wise sum.
+operator fun FloatArray.minus(other: FloatArray) = FloatArray(size) { i -> this[i] - other[i] } // Element-wise difference.
+
+/** Returns the RenderScript U8 element with [vectorSize] components; throws unless 1..4. */
+fun renderScriptVectorElementForU8(rs: RenderScript?, vectorSize: Int): Element =
+    when (vectorSize) {
+        1 -> Element.U8(rs)
+        2 -> Element.U8_2(rs)
+        3 -> Element.U8_3(rs)
+        4 -> Element.U8_4(rs)
+        else -> throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
+    }
+
+/** Returns the RenderScript I32 element with [vectorSize] components; throws unless 1..4. */
+fun renderScriptVectorElementForI32(rs: RenderScript?, vectorSize: Int): Element =
+    when (vectorSize) {
+        1 -> Element.I32(rs)
+        2 -> Element.I32_2(rs)
+        3 -> Element.I32_3(rs)
+        4 -> Element.I32_4(rs)
+        else -> throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
+    }
+
+/* When we'll handle floats
+fun renderScriptVectorElementForF32(rs: RenderScript?, vectorSize: Int): Element {
+    when (vectorSize) {
+        1 -> return Element.F32(rs)
+        2 -> return Element.F32_2(rs)
+        3 -> return Element.F32_3(rs)
+        4 -> return Element.F32_4(rs)
+    }
+    throw java.lang.IllegalArgumentException("RenderScriptToolkit tests. Only vectors of size 1-4 are supported. $vectorSize provided.")
+}*/
+
+/** Returns the element for the bitmap's config: A_8 for ALPHA_8, RGBA_8888 for ARGB_8888. */
+fun renderScriptElementForBitmap(context: RenderScript, bitmap: Bitmap): Element {
+    val config = bitmap.config
+    if (config == Bitmap.Config.ALPHA_8) return Element.A_8(context)
+    if (config == Bitmap.Config.ARGB_8888) return Element.RGBA_8888(context)
+    throw IllegalArgumentException("RenderScript Toolkit can't support bitmaps with config $config.")
+}
+
+/** Copies the bitmap's pixels into a new ByteBuffer of byteCount bytes; returns its array. */
+fun getBitmapBytes(bitmap: Bitmap): ByteArray {
+    val pixels = ByteBuffer.allocate(bitmap.byteCount).also { bitmap.copyPixelsToBuffer(it) }
+    return pixels.array()
+}
+
+/** Returns the vector size for the bitmap's config: 1 for ALPHA_8, 4 for ARGB_8888. */
+fun vectorSizeOfBitmap(bitmap: Bitmap): Int {
+    val config = bitmap.config
+    if (config == Bitmap.Config.ALPHA_8) return 1
+    if (config == Bitmap.Config.ARGB_8888) return 4
+    throw IllegalArgumentException("RenderScript Toolkit can't support bitmaps with config $config.")
+}
+
+/** Creates a bitmap with the original's width, height and config and draws the original on it. */
+fun duplicateBitmap(original: Bitmap): Bitmap {
+    val duplicate = Bitmap.createBitmap(original.width, original.height, original.config)
+    Canvas(duplicate).drawBitmap(original, 0f, 0f, null)
+    return duplicate
+}
+
+/** Prints [prefix], the array size and up to [number] elements shown as unsigned values. */
+@ExperimentalUnsignedTypes
+fun logArray(prefix: String, array: ByteArray, number: Int = 20) =
+    println("$prefix[${array.size}] ${array.joinToString(limit = number) { it.toUByte().toString() }}\n")
+
+/** Prints [prefix], the array size and up to [number] elements. */
+fun logArray(prefix: String, array: IntArray, number: Int = 20) {
+    println("$prefix[${array.size}] ${array.joinToString(limit = number)}\n")
+}
+
+/** Prints [prefix], the size and up to [number] elements as "%.2f"; null prints "(null)". */
+fun logArray(prefix: String, array: FloatArray?, number: Int = 20) {
+    val values = array?.joinToString(limit = number) { "%.2f".format(it) } ?: "(null)"
+    println("$prefix[${array?.size}] $values\n")
+}
+
+fun roundUpTo16(value: Int): Int { // Rounds a non-negative value up to a multiple of 16.
+    require(value >= 0)
+    return (value + 15) shr 4 shl 4 // Dropping the low four bits leaves a multiple of 16.
+}
diff --git a/toolkit/test/IntrinsicBlend.kt b/toolkit/test/IntrinsicBlend.kt
new file mode 100644
index 0000000..873cb15
--- /dev/null
+++ b/toolkit/test/IntrinsicBlend.kt
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicBlend
+import android.renderscript.Type
+import android.renderscript.toolkit.BlendingMode
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Blend operation on byte arrays using the RenderScript Intrinsics.
+ *
+ * Both arrays are copied into sizeX by sizeY allocations of U8_4 elements. The blended
+ * result is copied back into [destArray]; a non-null [restriction] limits the launch range.
+ */
+fun intrinsicBlend(
+    context: RenderScript,
+    mode: BlendingMode,
+    sourceArray: ByteArray,
+    destArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    restriction: Range2d?
+) {
+    val blendScript = ScriptIntrinsicBlend.create(context, Element.U8_4(context))
+    val bufferType = Type.Builder(context, Element.U8_4(context)).setX(sizeX).setY(sizeY).create()
+    val source = Allocation.createTyped(context, bufferType)
+    val dest = Allocation.createTyped(context, bufferType)
+    source.copyFrom(sourceArray)
+    dest.copyFrom(destArray)
+
+    callBlendForEach(blendScript, source, dest, mode, restriction)
+    dest.copyTo(destArray)
+
+    source.destroy()
+    dest.destroy()
+    bufferType.destroy()
+    blendScript.destroy()
+}
+
+/**
+ * Does a Blend operation on bitmaps using the RenderScript Intrinsics. The result is copied
+ * back into [destBitmap]; a non-null [restriction] limits the range that is launched.
+ */
+fun intrinsicBlend(
+    context: RenderScript,
+    mode: BlendingMode,
+    sourceBitmap: Bitmap,
+    destBitmap: Bitmap,
+    restriction: Range2d?
+) {
+    val blendScript = ScriptIntrinsicBlend.create(context, Element.U8_4(context))
+    val source = Allocation.createFromBitmap(context, sourceBitmap)
+    val dest = Allocation.createFromBitmap(context, destBitmap)
+    source.copyFrom(sourceBitmap)
+    dest.copyFrom(destBitmap)
+    callBlendForEach(blendScript, source, dest, mode, restriction)
+    dest.copyTo(destBitmap)
+    listOf(source, dest, blendScript).forEach { it.destroy() }
+}
+
+private fun callBlendForEach(
+    scriptBlend: ScriptIntrinsicBlend,
+    sourceAllocation: Allocation,
+    destAllocation: Allocation,
+    mode: BlendingMode,
+    restriction: Range2d?
+) { // Calls the scriptBlend.forEach* method matching mode on the two allocations.
+    if (restriction != null) { // Restricted launch: the range is passed as LaunchOptions.
+        val options = Script.LaunchOptions()
+        options.setX(restriction.startX, restriction.endX)
+        options.setY(restriction.startY, restriction.endY)
+        when (mode) {
+            BlendingMode.CLEAR -> scriptBlend.forEachClear(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.SRC -> scriptBlend.forEachSrc(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.DST -> scriptBlend.forEachDst(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.SRC_OVER -> scriptBlend.forEachSrcOver(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.DST_OVER -> scriptBlend.forEachDstOver(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.SRC_IN -> scriptBlend.forEachSrcIn(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.DST_IN -> scriptBlend.forEachDstIn(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.SRC_OUT -> scriptBlend.forEachSrcOut(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.DST_OUT -> scriptBlend.forEachDstOut(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.SRC_ATOP -> scriptBlend.forEachSrcAtop(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.DST_ATOP -> scriptBlend.forEachDstAtop(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.XOR -> scriptBlend.forEachXor(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.MULTIPLY -> scriptBlend.forEachMultiply(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.ADD -> scriptBlend.forEachAdd(
+                sourceAllocation, destAllocation, options
+            )
+            BlendingMode.SUBTRACT -> scriptBlend.forEachSubtract(
+                sourceAllocation, destAllocation, options
+            )
+        }
+    } else { // No restriction: use the overloads that take no LaunchOptions.
+        when (mode) {
+            BlendingMode.CLEAR -> scriptBlend.forEachClear(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.SRC -> scriptBlend.forEachSrc(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.DST -> scriptBlend.forEachDst(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.SRC_OVER -> scriptBlend.forEachSrcOver(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.DST_OVER -> scriptBlend.forEachDstOver(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.SRC_IN -> scriptBlend.forEachSrcIn(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.DST_IN -> scriptBlend.forEachDstIn(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.SRC_OUT -> scriptBlend.forEachSrcOut(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.DST_OUT -> scriptBlend.forEachDstOut(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.SRC_ATOP -> scriptBlend.forEachSrcAtop(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.DST_ATOP -> scriptBlend.forEachDstAtop(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.XOR -> scriptBlend.forEachXor(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.MULTIPLY -> scriptBlend.forEachMultiply(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.ADD -> scriptBlend.forEachAdd(
+                sourceAllocation, destAllocation
+            )
+            BlendingMode.SUBTRACT -> scriptBlend.forEachSubtract(
+                sourceAllocation, destAllocation
+            )
+        }
+    }
+}
diff --git a/toolkit/test/IntrinsicBlur.kt b/toolkit/test/IntrinsicBlur.kt
new file mode 100644
index 0000000..be09094
--- /dev/null
+++ b/toolkit/test/IntrinsicBlur.kt
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicBlur
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Blur operation on a byte array using the RenderScript Intrinsics.
+ *
+ * The script is created for RGBA_8888 when vectorSize is 4 and for U8 otherwise. When a
+ * restriction is given, the output is zero-initialized and only that range is launched.
+ *
+ * @return A new array of sizeX * sizeY * vectorSize bytes copied from the output allocation.
+ */
+fun intrinsicBlur(
+    context: RenderScript,
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    radius: Int,
+    restriction: Range2d?
+): ByteArray {
+    val scriptElement = if (vectorSize == 4) Element.RGBA_8888(context) else Element.U8(context)
+    val blurScript = ScriptIntrinsicBlur.create(context, scriptElement)
+    val bufferType = Type.Builder(context, renderScriptVectorElementForU8(context, vectorSize))
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val input = Allocation.createTyped(context, bufferType)
+    input.copyFrom(inputArray)
+    val output = Allocation.createTyped(context, bufferType)
+
+    val result = ByteArray(sizeX * sizeY * vectorSize)
+    blurScript.setRadius(radius.toFloat())
+    blurScript.setInput(input)
+
+    if (restriction == null) {
+        blurScript.forEach(output)
+    } else {
+        // The freshly created result array is all zeros, so this clears the output.
+        output.copyFrom(result)
+        val launchOptions = Script.LaunchOptions()
+            .setX(restriction.startX, restriction.endX)
+            .setY(restriction.startY, restriction.endY)
+        blurScript.forEach(output, launchOptions)
+    }
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    bufferType.destroy()
+    blurScript.destroy()
+    return result
+}
+
+/**
+ * Does a Blur operation on a bitmap using the RenderScript Intrinsics and returns the output
+ * as bitmap.byteCount bytes. A non-null [restriction] limits the range that is launched.
+ */
+fun intrinsicBlur(
+    context: RenderScript,
+    bitmap: Bitmap,
+    radius: Int,
+    restriction: Range2d?
+): ByteArray {
+    val element = renderScriptElementForBitmap(context, bitmap)
+    val blurScript = ScriptIntrinsicBlur.create(context, element)
+    val input = Allocation.createFromBitmap(context, bitmap)
+    input.copyFrom(bitmap)
+    val output = Allocation.createTyped(context, input.type)
+    val result = ByteArray(bitmap.byteCount)
+
+    blurScript.setRadius(radius.toFloat())
+    blurScript.setInput(input)
+    if (restriction == null) {
+        blurScript.forEach(output)
+    } else {
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions()
+            .setX(restriction.startX, restriction.endX)
+            .setY(restriction.startY, restriction.endY)
+        blurScript.forEach(output, launchOptions)
+    }
+    output.copyTo(result)
+    listOf(input, output, blurScript).forEach { it.destroy() }
+    return result
+}
diff --git a/toolkit/test/IntrinsicColorMatrix.kt b/toolkit/test/IntrinsicColorMatrix.kt
new file mode 100644
index 0000000..c0ccc67
--- /dev/null
+++ b/toolkit/test/IntrinsicColorMatrix.kt
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Matrix4f
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicColorMatrix
+import android.renderscript.Type
+import android.renderscript.Float4
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a ColorMatrix operation on a byte array using the RenderScript Intrinsics.
+ *
+ * The input allocation is sizeX by sizeY U8 vectors of inputVectorSize components and the
+ * output one uses outputVectorSize components. When a restriction is given, the output
+ * allocation is zeroed first and only that range is launched.
+ *
+ * @param conversion Selects a predefined matrix of the intrinsic, or RANDOM to use [matrix].
+ * @param matrix The 16 matrix values; only read for the RANDOM conversion.
+ * @param addVector The four values given to the intrinsic's setAdd.
+ * @param restriction Optional range limiting the cells that are processed.
+ * @return A new array of sizeX * sizeY * paddedSize(outputVectorSize) bytes with the output.
+ */
+fun intrinsicColorMatrix(
+    context: RenderScript,
+    conversion: Tester.ColorMatrixConversionType,
+    inputArray: ByteArray,
+    inputVectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    outputVectorSize: Int,
+    matrix: FloatArray,
+    addVector: FloatArray,
+    restriction: Range2d?
+): ByteArray {
+    val colorMatrixScript = ScriptIntrinsicColorMatrix.create(context)
+    val inputElement = renderScriptVectorElementForU8(context, inputVectorSize)
+    val inputType = Type.Builder(context, inputElement)
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val input = Allocation.createTyped(context, inputType)
+    val outputElement = renderScriptVectorElementForU8(context, outputVectorSize)
+    val outputType = Type.Builder(context, outputElement)
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val output = Allocation.createTyped(context, outputType)
+
+    input.copyFrom(inputArray)
+    val result = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
+
+    // Configure the matrix selected by the conversion type.
+    when (conversion) {
+        Tester.ColorMatrixConversionType.RGB_TO_YUV -> colorMatrixScript.setRGBtoYUV()
+        Tester.ColorMatrixConversionType.YUV_TO_RGB -> colorMatrixScript.setYUVtoRGB()
+        Tester.ColorMatrixConversionType.GREYSCALE -> colorMatrixScript.setGreyscale()
+        Tester.ColorMatrixConversionType.RANDOM -> {
+            // RS is column major
+            val m = Matrix4f()
+            for (i in 0 until 16) {
+                m.set(i / 4, i % 4, matrix[i])
+            }
+            colorMatrixScript.setColorMatrix(m)
+        }
+    }
+
+    // setAdd takes a Float4, so repack the first four values of addVector.
+    val addend = Float4(addVector[0], addVector[1], addVector[2], addVector[3])
+    colorMatrixScript.setAdd(addend)
+
+    if (restriction == null) {
+        colorMatrixScript.forEach(input, output)
+    } else {
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions()
+            .setX(restriction.startX, restriction.endX)
+            .setY(restriction.startY, restriction.endY)
+        colorMatrixScript.forEach(input, output, launchOptions)
+    }
+    output.copyTo(result)
+
+    // Release the RenderScript objects created above.
+    input.destroy()
+    output.destroy()
+    inputType.destroy()
+    outputType.destroy()
+    colorMatrixScript.destroy()
+    return result
+}
+
+/**
+ * Does a ColorMatrix operation on a bitmap using the RenderScript Intrinsics.
+ *
+ * [conversion] selects a predefined matrix of the intrinsic, or RANDOM to use the 16 values of
+ * [matrix]; the first four values of [addVector] are given to setAdd. When a restriction is
+ * given, the output allocation is zeroed first and only that range is launched.
+ *
+ * @return A new array of bitmap.byteCount bytes copied from the output allocation.
+ */
+fun intrinsicColorMatrix(
+    context: RenderScript,
+    conversion: Tester.ColorMatrixConversionType,
+    bitmap: Bitmap,
+    matrix: FloatArray,
+    addVector: FloatArray,
+    restriction: Range2d?
+): ByteArray {
+    val colorMatrixScript = ScriptIntrinsicColorMatrix.create(context)
+    val input = Allocation.createFromBitmap(context, bitmap)
+    input.copyFrom(bitmap)
+    val output = Allocation.createTyped(context, input.type)
+    val result = ByteArray(bitmap.byteCount)
+
+    when (conversion) {
+        Tester.ColorMatrixConversionType.RGB_TO_YUV -> colorMatrixScript.setRGBtoYUV()
+        Tester.ColorMatrixConversionType.YUV_TO_RGB -> colorMatrixScript.setYUVtoRGB()
+        Tester.ColorMatrixConversionType.GREYSCALE -> colorMatrixScript.setGreyscale()
+        Tester.ColorMatrixConversionType.RANDOM -> {
+            // RS is column major
+            val m = Matrix4f()
+            for (i in 0 until 16) {
+                m.set(i / 4, i % 4, matrix[i])
+            }
+            colorMatrixScript.setColorMatrix(m)
+        }
+    }
+    val addend = Float4(addVector[0], addVector[1], addVector[2], addVector[3])
+    colorMatrixScript.setAdd(addend)
+    if (restriction == null) {
+        colorMatrixScript.forEach(input, output)
+    } else {
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions()
+            .setX(restriction.startX, restriction.endX)
+            .setY(restriction.startY, restriction.endY)
+        colorMatrixScript.forEach(input, output, launchOptions)
+    }
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    colorMatrixScript.destroy()
+    return result
+}
diff --git a/toolkit/test/IntrinsicConvolve.kt b/toolkit/test/IntrinsicConvolve.kt
new file mode 100644
index 0000000..0c9e4f0
--- /dev/null
+++ b/toolkit/test/IntrinsicConvolve.kt
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicConvolve3x3
+import android.renderscript.ScriptIntrinsicConvolve5x5
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Does a Convolve operation on a byte array using the RenderScript Intrinsics.
+ *
+ * The number of coefficients selects the kernel: 9 for 3x3 and 25 for 5x5. When a
+ * restriction is given, the output is zero-initialized and only that range is launched.
+ *
+ * @return A new array of sizeX * sizeY * paddedSize(vectorSize) bytes with the output.
+ */
+fun intrinsicConvolve(
+    context: RenderScript,
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    coefficients: FloatArray,
+    restriction: Range2d?
+): ByteArray {
+    val element = renderScriptVectorElementForU8(context, vectorSize)
+    val bufferType = Type.Builder(context, element)
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val input = Allocation.createTyped(context, bufferType)
+    val output = Allocation.createTyped(context, bufferType)
+    input.copyFrom(inputArray)
+    val result = ByteArray(sizeX * sizeY * paddedSize(vectorSize))
+    if (restriction != null) {
+        output.copyFrom(result) // To initialize to zero
+    }
+
+    invokeConvolveKernel(coefficients, context, element, input, restriction, output)
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    bufferType.destroy()
+    return result
+}
+
+/**
+ * Does a Convolve operation on a bitmap using the RenderScript Intrinsics.
+ *
+ * The number of coefficients selects the kernel: 9 for 3x3 and 25 for 5x5. The output is
+ * returned as bitmap.byteCount bytes; a non-null [restriction] limits the launched range.
+ */
+fun intrinsicConvolve(
+    context: RenderScript,
+    bitmap: Bitmap,
+    coefficients: FloatArray,
+    restriction: Range2d?
+): ByteArray {
+    val element = renderScriptElementForBitmap(context, bitmap)
+
+    val input = Allocation.createFromBitmap(context, bitmap)
+    val output = Allocation.createTyped(context, input.type)
+    val result = ByteArray(bitmap.byteCount)
+    input.copyFrom(bitmap)
+    if (restriction != null) {
+        output.copyFrom(result) // To initialize to zero
+    }
+
+    invokeConvolveKernel(coefficients, context, element, input, restriction, output)
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    return result
+}
+
+/**
+ * Creates the 3x3 or 5x5 convolve intrinsic matching the number of coefficients, runs it
+ * from inputAllocation into outAllocation (over the restriction only, when one is given)
+ * and destroys the script. Throws IllegalArgumentException for any other coefficient count.
+ */
+private fun invokeConvolveKernel(
+    coefficients: FloatArray,
+    context: RenderScript,
+    baseElement: Element,
+    inputAllocation: Allocation?,
+    restriction: Range2d?,
+    outAllocation: Allocation?
+) {
+    // Builds the launch options limiting a kernel to the given range.
+    fun launchOptionsFor(range: Range2d) = Script.LaunchOptions()
+        .setX(range.startX, range.endX)
+        .setY(range.startY, range.endY)
+
+    when (coefficients.size) {
+        9 -> ScriptIntrinsicConvolve3x3.create(context, baseElement).run {
+            setCoefficients(coefficients)
+            setInput(inputAllocation)
+            if (restriction == null) {
+                forEach(outAllocation)
+            } else {
+                forEach(outAllocation, launchOptionsFor(restriction))
+            }
+            destroy()
+        }
+        25 -> ScriptIntrinsicConvolve5x5.create(context, baseElement).run {
+            setCoefficients(coefficients)
+            setInput(inputAllocation)
+            if (restriction == null) {
+                forEach(outAllocation)
+            } else {
+                forEach(outAllocation, launchOptionsFor(restriction))
+            }
+            destroy()
+        }
+        else -> {
+            throw IllegalArgumentException("RenderScriptToolkit tests. Only 3x3 and 5x5 convolutions are supported. ${coefficients.size} coefficients provided.")
+        }
+    }
+}
diff --git a/toolkit/test/IntrinsicHistogram.kt b/toolkit/test/IntrinsicHistogram.kt
new file mode 100644
index 0000000..25cc55d
--- /dev/null
+++ b/toolkit/test/IntrinsicHistogram.kt
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicHistogram
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Runs the RenderScript Histogram intrinsic over a byte array and returns the bucket counts.
+ */
+fun intrinsicHistogram(
+    context: RenderScript,
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    restriction: Range2d?
+): IntArray {
+    val inputElement = renderScriptVectorElementForU8(context, vectorSize)
+    val histogram = ScriptIntrinsicHistogram.create(context, inputElement)
+    val inputType = Type.Builder(context, inputElement)
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val input = Allocation.createTyped(context, inputType)
+    // 256 buckets whose element comes from renderScriptVectorElementForI32 for this vector size.
+    val bucketElement = renderScriptVectorElementForI32(context, vectorSize)
+    val output = Allocation.createSized(context, bucketElement, 256)
+    input.copyFrom(inputArray)
+    histogram.setOutput(output)
+    if (restriction == null) {
+        histogram.forEach(input)
+    } else {
+        // Limit the launch to the restricted rectangle.
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        histogram.forEach(input, launchOptions)
+    }
+
+    // paddedSize() is defined elsewhere in the test app; it sets the ints reserved per bucket.
+    val counts = IntArray(256 * paddedSize(vectorSize))
+    output.copyTo(counts)
+    input.destroy()
+    output.destroy()
+    inputType.destroy()
+    histogram.destroy()
+    return counts
+}
+
+fun intrinsicHistogram(
+    context: RenderScript,
+    bitmap: Bitmap,
+    restriction: Range2d?
+): IntArray {
+    // Bitmap overload: the element and the vector size are both derived from the bitmap.
+    val bitmapElement = renderScriptElementForBitmap(context, bitmap)
+    val histogram = ScriptIntrinsicHistogram.create(context, bitmapElement)
+    val input = Allocation.createFromBitmap(context, bitmap)
+    input.copyFrom(bitmap)
+    val componentCount = vectorSizeOfBitmap(bitmap)
+    val bucketElement = renderScriptVectorElementForI32(context, componentCount)
+    val output = Allocation.createSized(context, bucketElement, 256)
+    histogram.setOutput(output)
+
+    if (restriction == null) {
+        histogram.forEach(input)
+    } else {
+        // Limit the launch to the restricted rectangle.
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        histogram.forEach(input, launchOptions)
+    }
+
+    val counts = IntArray(256 * componentCount)
+    output.copyTo(counts)
+    input.destroy()
+    output.destroy()
+    histogram.destroy()
+    return counts
+}
+
+fun intrinsicHistogramDot(
+    context: RenderScript,
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    coefficients: FloatArray?,
+    restriction: Range2d?
+): IntArray {
+    val inputElement = renderScriptVectorElementForU8(context, vectorSize)
+    val histogram = ScriptIntrinsicHistogram.create(context, inputElement)
+    val inputType = Type.Builder(context, inputElement).setX(sizeX).setY(sizeY).create()
+    val input = Allocation.createTyped(context, inputType)
+    // The dot variant writes one scalar I32 counter per bucket.
+    val output = Allocation.createSized(context, Element.I32(context), 256)
+    input.copyFrom(inputArray)
+
+    // When coefficients is null, setDotCoefficients is not called at all.
+    coefficients?.let { weights ->
+        require(weights.size == vectorSize) {
+            "RenderScriptToolkit tests. $vectorSize coefficients are required for histogram. " +
+                "${weights.size} provided."
+        }
+        // Components beyond the vector size are given a zero weight.
+        histogram.setDotCoefficients(
+            weights[0],
+            weights.getOrElse(1) { 0f },
+            weights.getOrElse(2) { 0f },
+            weights.getOrElse(3) { 0f }
+        )
+    }
+    histogram.setOutput(output)
+    if (restriction == null) {
+        histogram.forEach_Dot(input)
+    } else {
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        histogram.forEach_Dot(input, launchOptions)
+    }
+    val counts = IntArray(256)
+    output.copyTo(counts)
+    input.destroy()
+    output.destroy()
+    inputType.destroy()
+    histogram.destroy()
+    return counts
+}
+
+fun intrinsicHistogramDot(
+    context: RenderScript,
+    bitmap: Bitmap,
+    coefficients: FloatArray?,
+    restriction: Range2d?
+): IntArray {
+    val bitmapElement = renderScriptElementForBitmap(context, bitmap)
+    val histogram = ScriptIntrinsicHistogram.create(context, bitmapElement)
+    val input = Allocation.createFromBitmap(context, bitmap)
+    input.copyFrom(bitmap)
+    // The dot variant writes one scalar I32 counter per bucket.
+    val output = Allocation.createSized(context, Element.I32(context), 256)
+
+    coefficients?.let { weights ->
+        // This overload requires exactly four weights, whatever the bitmap's format.
+        require(weights.size == 4) {
+            "RenderScriptToolkit tests. Four coefficients are required for histogram. " +
+                "${weights.size} provided."
+        }
+        val (first, second, third, fourth) = weights
+        histogram.setDotCoefficients(first, second, third, fourth)
+    }
+    histogram.setOutput(output)
+    if (restriction == null) {
+        histogram.forEach_Dot(input)
+    } else {
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        histogram.forEach_Dot(input, launchOptions)
+    }
+
+    // Read the 256 counters back before releasing the RenderScript objects.
+    val counts = IntArray(256)
+    output.copyTo(counts)
+    input.destroy()
+    output.destroy()
+    histogram.destroy()
+    return counts
+}
diff --git a/toolkit/test/IntrinsicLut.kt b/toolkit/test/IntrinsicLut.kt
new file mode 100644
index 0000000..1ed03ac
--- /dev/null
+++ b/toolkit/test/IntrinsicLut.kt
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicLUT
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Runs the RenderScript LookUpTable intrinsic over a byte array of four-byte cells.
+ */
+@ExperimentalUnsignedTypes
+fun intrinsicLut(
+    context: RenderScript,
+    inputArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    newRed: ByteArray,
+    newGreen: ByteArray,
+    newBlue: ByteArray,
+    newAlpha: ByteArray,
+    restriction: Range2d?
+): ByteArray {
+    val lut = ScriptIntrinsicLUT.create(context, Element.U8_4(context))
+    val imageType = Type.Builder(context, Element.U8_4(context))
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val input = Allocation.createTyped(context, imageType)
+    val output = Allocation.createTyped(context, imageType)
+    input.copyFrom(inputArray)
+    // Freshly allocated, hence all zeros; also used below to clear the output allocation.
+    val result = ByteArray(sizeX * sizeY * 4)
+
+    // Table entries are read as unsigned bytes: the mask yields a value in 0..255.
+    for (index in 0 until 256) {
+        lut.setRed(index, newRed[index].toInt() and 0xFF)
+        lut.setGreen(index, newGreen[index].toInt() and 0xFF)
+        lut.setBlue(index, newBlue[index].toInt() and 0xFF)
+        lut.setAlpha(index, newAlpha[index].toInt() and 0xFF)
+    }
+    if (restriction == null) {
+        lut.forEach(input, output)
+    } else {
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        lut.forEach(input, output, launchOptions)
+    }
+
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    imageType.destroy()
+    lut.destroy()
+    return result
+}
+
+@ExperimentalUnsignedTypes
+fun intrinsicLut(
+    context: RenderScript,
+    bitmap: Bitmap,
+    newRed: ByteArray,
+    newGreen: ByteArray,
+    newBlue: ByteArray,
+    newAlpha: ByteArray,
+    restriction: Range2d?
+): ByteArray {
+    val bitmapElement = renderScriptElementForBitmap(context, bitmap)
+    val lut = ScriptIntrinsicLUT.create(context, bitmapElement)
+    val input = Allocation.createFromBitmap(context, bitmap)
+    input.copyFrom(bitmap)
+    val output = Allocation.createTyped(context, input.type)
+    val result = ByteArray(bitmap.byteCount)
+    // Table entries are read as unsigned bytes: the mask yields a value in 0..255.
+    for (index in 0 until 256) {
+        lut.setRed(index, newRed[index].toInt() and 0xFF)
+        lut.setGreen(index, newGreen[index].toInt() and 0xFF)
+        lut.setBlue(index, newBlue[index].toInt() and 0xFF)
+        lut.setAlpha(index, newAlpha[index].toInt() and 0xFF)
+    }
+    if (restriction == null) {
+        lut.forEach(input, output)
+    } else {
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        lut.forEach(input, output, launchOptions)
+    }
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    lut.destroy()
+    return result
+}
diff --git a/toolkit/test/IntrinsicLut3d.kt b/toolkit/test/IntrinsicLut3d.kt
new file mode 100644
index 0000000..48e785e
--- /dev/null
+++ b/toolkit/test/IntrinsicLut3d.kt
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsic3DLUT
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Runs the RenderScript 3D LookUpTable intrinsic over a byte array of four-byte cells.
+ */
+fun intrinsicLut3d(
+    context: RenderScript,
+    inputArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    cubeArray: ByteArray,
+    cubeSize: Dimension,
+    restriction: Range2d?
+): ByteArray {
+    val lut3d = ScriptIntrinsic3DLUT.create(context, Element.U8_4(context))
+    val imageType = Type.Builder(context, Element.U8_4(context))
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val input = Allocation.createTyped(context, imageType)
+    val output = Allocation.createTyped(context, imageType)
+    input.copyFrom(inputArray)
+    val result = ByteArray(sizeX * sizeY * 4)
+
+    // The lookup cube is uploaded as a three-dimensional U8_4 allocation.
+    val cubeType = Type.Builder(context, Element.U8_4(context))
+        .setX(cubeSize.sizeX)
+        .setY(cubeSize.sizeY)
+        .setZ(cubeSize.sizeZ)
+        .create()
+    val cube = Allocation.createTyped(context, cubeType)
+    cube.copyFrom(cubeArray)
+    lut3d.setLUT(cube)
+
+    if (restriction == null) {
+        lut3d.forEach(input, output)
+    } else {
+        // result has not been written yet, so it is still the all-zero array it was created as.
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        lut3d.forEach(input, output, launchOptions)
+    }
+
+    output.copyTo(result)
+    // Release every RenderScript object created by this call.
+    input.destroy()
+    output.destroy()
+    cube.destroy()
+    imageType.destroy()
+    cubeType.destroy()
+    lut3d.destroy()
+    return result
+}
+
+fun intrinsicLut3d(
+    context: RenderScript,
+    bitmap: Bitmap,
+    cubeArray: ByteArray,
+    cubeSize: Dimension,
+    restriction: Range2d?
+): ByteArray {
+    val bitmapElement = renderScriptElementForBitmap(context, bitmap)
+    val lut3d = ScriptIntrinsic3DLUT.create(context, bitmapElement)
+    val input = Allocation.createFromBitmap(context, bitmap)
+    input.copyFrom(bitmap)
+    val output = Allocation.createTyped(context, input.type)
+    val result = ByteArray(bitmap.byteCount)
+
+    // The lookup cube is uploaded as a three-dimensional U8_4 allocation.
+    val cubeType = Type.Builder(context, Element.U8_4(context))
+        .setX(cubeSize.sizeX)
+        .setY(cubeSize.sizeY)
+        .setZ(cubeSize.sizeZ)
+        .create()
+    val cube = Allocation.createTyped(context, cubeType)
+    cube.copyFrom(cubeArray)
+    lut3d.setLUT(cube)
+    if (restriction == null) {
+        lut3d.forEach(input, output)
+    } else {
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        lut3d.forEach(input, output, launchOptions)
+    }
+    output.copyTo(result)
+    input.destroy()
+    output.destroy()
+    cube.destroy()
+    cubeType.destroy()
+    lut3d.destroy()
+    return result
+}
diff --git a/toolkit/test/IntrinsicResize.kt b/toolkit/test/IntrinsicResize.kt
new file mode 100644
index 0000000..5cdf89a
--- /dev/null
+++ b/toolkit/test/IntrinsicResize.kt
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.Bitmap
+import android.renderscript.Allocation
+import android.renderscript.RenderScript
+import android.renderscript.Script
+import android.renderscript.ScriptIntrinsicResize
+import android.renderscript.Type
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Resizes a byte array with the bicubic kernel of the RenderScript Resize intrinsic.
+ */
+fun intrinsicResize(
+    context: RenderScript,
+    inputArray: ByteArray,
+    vectorSize: Int,
+    inSizeX: Int,
+    inSizeY: Int,
+    outSizeX: Int,
+    outSizeY: Int,
+    restriction: Range2d?
+): ByteArray {
+    val resize = ScriptIntrinsicResize.create(context)
+    val cellElement = renderScriptVectorElementForU8(context, vectorSize)
+    // One builder is reused: first with the input dimensions, then with the output ones.
+    val typeBuilder = Type.Builder(context, cellElement)
+    val inputType =
+        typeBuilder.setX(inSizeX).setY(inSizeY).create()
+    val input = Allocation.createTyped(context, inputType)
+    val outputType =
+        typeBuilder.setX(outSizeX).setY(outSizeY).create()
+    val output = Allocation.createTyped(context, outputType)
+    // paddedSize() is defined elsewhere in the test app; it gives the bytes reserved per cell.
+    val result = ByteArray(outSizeX * outSizeY * paddedSize(vectorSize))
+
+    input.copyFrom(inputArray)
+    resize.setInput(input)
+    if (restriction == null) {
+        resize.forEach_bicubic(output)
+    } else {
+        // result has not been written yet, so it is still the all-zero array it was created as.
+        output.copyFrom(result) // To initialize to zero
+        val launchOptions = Script.LaunchOptions().apply {
+            setX(restriction.startX, restriction.endX)
+            setY(restriction.startY, restriction.endY)
+        }
+        resize.forEach_bicubic(output, launchOptions)
+    }
+    output.copyTo(result)
+
+    input.destroy()
+    output.destroy()
+    resize.destroy()
+    inputType.destroy()
+    outputType.destroy()
+    return result
+}
+
+fun intrinsicResize(
+    context: RenderScript,
+    bitmap: Bitmap,
+    outSizeX: Int,
+    outSizeY: Int,
+    restriction: Range2d?
+): ByteArray {
+    // Bitmap overload: bicubic-resizes bitmap to outSizeX x outSizeY and returns the raw bytes.
+    val scriptResize = ScriptIntrinsicResize.create(context)
+    val inputAllocation = Allocation.createFromBitmap(context, bitmap)
+    inputAllocation.copyFrom(bitmap)
+
+    val vectorSize = when (bitmap.config) {
+        Bitmap.Config.ARGB_8888 -> 4
+        Bitmap.Config.ALPHA_8 -> 1
+        // Braces are required: "$bitmap.config" would print the bitmap followed by ".config".
+        else -> error("Unrecognized bitmap config ${bitmap.config}")
+    }
+    val outputElement = renderScriptVectorElementForU8(context, vectorSize)
+    val builder = Type.Builder(context, outputElement)
+    builder.setX(outSizeX)
+    builder.setY(outSizeY)
+    val outputArrayType = builder.create()
+    val outAllocation = Allocation.createTyped(context, outputArrayType)
+    val intrinsicOutArray = ByteArray(outSizeX * outSizeY * vectorSize)
+
+    scriptResize.setInput(inputAllocation)
+    if (restriction != null) {
+        outAllocation.copyFrom(intrinsicOutArray) // To initialize to zero
+        val options = Script.LaunchOptions()
+        options.setX(restriction.startX, restriction.endX)
+        options.setY(restriction.startY, restriction.endY)
+        scriptResize.forEach_bicubic(outAllocation, options)
+    } else {
+        scriptResize.forEach_bicubic(outAllocation)
+    }
+    outAllocation.copyTo(intrinsicOutArray)
+
+    inputAllocation.destroy()
+    outAllocation.destroy()
+    outputArrayType.destroy()
+    scriptResize.destroy()
+    return intrinsicOutArray
+}
diff --git a/toolkit/test/IntrinsicYuvToRgb.kt b/toolkit/test/IntrinsicYuvToRgb.kt
new file mode 100644
index 0000000..5e46f2e
--- /dev/null
+++ b/toolkit/test/IntrinsicYuvToRgb.kt
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.graphics.ImageFormat
+import android.renderscript.Allocation
+import android.renderscript.Element
+import android.renderscript.RenderScript
+import android.renderscript.ScriptIntrinsicYuvToRGB
+import android.renderscript.Type
+import android.renderscript.toolkit.YuvFormat
+
+/**
+ * Runs the RenderScript YUV to RGB intrinsic, producing four output bytes per cell.
+ */
+fun intrinsicYuvToRgb(
+    context: RenderScript,
+    inputArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    format: YuvFormat
+): ByteArray {
+    val yuvToRgb = ScriptIntrinsicYuvToRGB.create(context, Element.YUV(context))
+    val inputTypeBuilder = Type.Builder(context, Element.YUV(context))
+        .setX(sizeX)
+        .setY(sizeY)
+    // Map the toolkit's format enum onto the matching android.graphics.ImageFormat constant.
+    val imageFormat = when (format) {
+        YuvFormat.NV21 -> ImageFormat.NV21
+        YuvFormat.YV12 -> ImageFormat.YV12
+        else -> throw IllegalArgumentException("Unknown YUV format $format")
+    }
+    val inputType = inputTypeBuilder.setYuvFormat(imageFormat).create()
+    val input = Allocation.createTyped(context, inputType)
+
+    val outputType = Type.Builder(context, Element.U8_4(context))
+        .setX(sizeX)
+        .setY(sizeY)
+        .create()
+    val output = Allocation.createTyped(context, outputType)
+    // Four bytes per cell, matching the U8_4 element of the output allocation.
+    val result = ByteArray(sizeX * sizeY * 4)
+
+    input.copyFrom(inputArray)
+    yuvToRgb.setInput(input)
+    yuvToRgb.forEach(output)
+    output.copyTo(result)
+
+    // Release every RenderScript object created by this call.
+    input.destroy()
+    output.destroy()
+    inputType.destroy()
+    outputType.destroy()
+    yuvToRgb.destroy()
+    return result
+}
diff --git a/toolkit/test/MainActivity.kt b/toolkit/test/MainActivity.kt
new file mode 100644
index 0000000..4092861
--- /dev/null
+++ b/toolkit/test/MainActivity.kt
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.os.Bundle
+import android.widget.TextView
+import androidx.appcompat.app.AppCompatActivity
+
+@ExperimentalUnsignedTypes
+class MainActivity : AppCompatActivity() {
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        setContentView(R.layout.activity_main)
+        enableCloseGuard()
+
+        // In validation mode a single pass is run; otherwise the tests are repeated 28 times.
+        val validate = true
+        val tester = Tester(this, validate)
+        val iterationCount = if (validate) 1 else 28
+        val timings = TimingTracker(iterationCount, 0)
+        for (iteration in 1..iterationCount) {
+            println("*** Iteration $iteration of $iterationCount ****")
+            // To profile, wrap testAll() in startMethodTracing(...) / stopMethodTracing().
+            val summary = tester.testAll(timings)
+            findViewById<TextView>(R.id.sample_text).text = "$summary\n\n${timings.report()}"
+            timings.nextIteration()
+        }
+        tester.destroy()
+    }
+
+    /** Enables CloseGuard, to debug "A resource failed to call destroy." reports. */
+    private fun enableCloseGuard() {
+        try {
+            val closeGuard = Class.forName("dalvik.system.CloseGuard")
+            val setEnabled = closeGuard.getMethod("setEnabled", Boolean::class.javaPrimitiveType)
+            setEnabled.invoke(null, true)
+        } catch (e: ReflectiveOperationException) {
+            throw RuntimeException(e)
+        }
+    }
+}
diff --git a/toolkit/test/ReferenceBlend.kt b/toolkit/test/ReferenceBlend.kt
new file mode 100644
index 0000000..ba60bc8
--- /dev/null
+++ b/toolkit/test/ReferenceBlend.kt
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.BlendingMode
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a Blend operation.
+ *
+ * Nothing is returned: results are stored through the Rgba2dArray that wraps destArray.
+ * See the class Rgba for details of arithmetic operation using that class.
+ */
+@ExperimentalUnsignedTypes
+fun referenceBlend(
+    mode: BlendingMode,
+    sourceArray: ByteArray,
+    destArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    restriction: Range2d?
+) {
+    val sourceCells = Rgba2dArray(sourceArray, sizeX, sizeY)
+    val destCells = Rgba2dArray(destArray, sizeX, sizeY)
+
+    // Calls combine(source, destination) for every cell selected by the restriction and
+    // stores the value it returns back into the destination cell.
+    fun blendAll(combine: (Rgba, Rgba) -> Rgba) {
+        destCells.forEachCell(restriction) { x, y ->
+            destCells[x, y] = combine(sourceCells[x, y], destCells[x, y])
+        }
+    }
+
+    // The DST_* modes reuse the helper of the matching SRC_* mode with the operands swapped.
+    when (mode) {
+        BlendingMode.CLEAR -> blendAll { _, _ -> Rgba(0, 0, 0, 0) }
+        BlendingMode.SRC -> blendAll { s, _ -> s }
+        BlendingMode.DST -> { /* The destination is left as it is. */ }
+        BlendingMode.SRC_OVER -> blendAll { s, d -> blendOver(s, d) }
+        BlendingMode.DST_OVER -> blendAll { s, d -> blendOver(d, s) }
+        BlendingMode.SRC_IN -> blendAll { s, d -> blendIn(s, d) }
+        BlendingMode.DST_IN -> blendAll { s, d -> blendIn(d, s) }
+        BlendingMode.SRC_OUT -> blendAll { s, d -> blendOut(s, d) }
+        BlendingMode.DST_OUT -> blendAll { s, d -> blendOut(d, s) }
+        BlendingMode.SRC_ATOP -> blendAll { s, d -> blendAtop(s, d) }
+        BlendingMode.DST_ATOP -> blendAll { s, d -> blendAtop(d, s) }
+        BlendingMode.XOR -> blendAll { s, d -> s xor d }
+        BlendingMode.MULTIPLY -> blendAll { s, d -> s * d }
+        BlendingMode.ADD -> blendAll { s, d -> d + s }
+        BlendingMode.SUBTRACT -> blendAll { s, d -> d - s }
+    }
+}
+
+// Per-cell helpers for referenceBlend; the operators used are defined by the class Rgba.
+@ExperimentalUnsignedTypes
+private fun blendOver(src: Rgba, dst: Rgba): Rgba = src + dst * (255 - src.a)
+
+@ExperimentalUnsignedTypes
+private fun blendIn(src: Rgba, dst: Rgba): Rgba = src * dst.a
+
+@ExperimentalUnsignedTypes
+private fun blendOut(src: Rgba, dst: Rgba): Rgba = src * (255 - dst.a)
+
+// Unlike the helpers above, the alpha of the result is overwritten with dst's alpha.
+@ExperimentalUnsignedTypes
+private fun blendAtop(src: Rgba, dst: Rgba): Rgba {
+    return (src * dst.a + dst * (255 - src.a)).also { it.a = dst.a }
+}
diff --git a/toolkit/test/ReferenceBlur.kt b/toolkit/test/ReferenceBlur.kt
new file mode 100644
index 0000000..66c2a05
--- /dev/null
+++ b/toolkit/test/ReferenceBlur.kt
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+import kotlin.math.max
+import kotlin.math.min
+import kotlin.math.pow
+import kotlin.math.sqrt
+
+/**
+ * Reference implementation of a Blur operation: a horizontal pass, then a vertical one.
+ */
+@ExperimentalUnsignedTypes
+fun referenceBlur(
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    radius: Int = 5,
+    restriction: Range2d?
+): ByteArray {
+    val maxRadius = 25
+    require(radius >= 1 && radius <= maxRadius) {
+        "RenderScriptToolkit blur. Radius should be between 1 and $maxRadius. $radius provided."
+    }
+    val weights = buildGaussian(radius)
+
+    // Convert input data to float so that the blurring goes faster; bytes are restored at the end.
+    val floats = FloatArray(inputArray.size) { index -> byteToUnitFloat(inputArray[index].toUByte()) }
+    val floatInput = FloatVector2dArray(floats, vectorSize, sizeX, sizeY)
+    val rowsBlurred = horizontalBlur(floatInput, weights, radius, restriction)
+    val blurred = verticalBlur(rowsBlurred, weights, radius, restriction)
+    return ByteArray(blurred.values.size) { index -> unitFloatClampedToUByte(blurred.values[index]).toByte() }
+}
+
+/**
+ * Blurs along the horizontal direction using the specified gaussian weights.
+ */
+private fun horizontalBlur(
+    input: FloatVector2dArray,
+    gaussian: FloatArray,
+    radius: Int,
+    restriction: Range2d?
+): FloatVector2dArray {
+    // Expand the restriction in the vertical direction so that the vertical pass
+    // will have all the data it needs. The added rows are clamped to 0..input.sizeY.
+    val rowExpanded: Range2d? = restriction?.let { range ->
+        Range2d(
+            range.startX,
+            range.endX,
+            max(range.startY - radius, 0),
+            min(range.endY + radius, input.sizeY)
+        )
+    }
+
+    input.clipAccessToRange = true
+    val result = input.createSameSized()
+    result.forEach(rowExpanded) { x, y ->
+        // Offset -radius pairs with gaussian[0], offset +radius with gaussian[2 * radius].
+        for (offset in -radius..radius) {
+            val weighted = input[x + offset, y] * gaussian[offset + radius]
+            result[x, y] += weighted
+        }
+    }
+    return result
+}
+
+/**
+ * Blurs along the vertical direction using the specified gaussian weights.
+ */
+private fun verticalBlur(
+    input: FloatVector2dArray,
+    gaussian: FloatArray,
+    radius: Int,
+    restriction: Range2d?
+): FloatVector2dArray {
+    input.clipAccessToRange = true
+    val result = input.createSameSized()
+    result.forEach(restriction) { x, y ->
+        for (offset in -radius..radius) {
+            val weighted = input[x, y + offset] * gaussian[offset + radius]
+            result[x, y] += weighted
+        }
+    }
+    return result
+}
+
+/**
+ * Builds an array of gaussian weights that will be used for doing the horizontal and vertical
+ * blur. The weights are normalized so that they add up to one.
+ *
+ * @return An array of (2 * radius + 1) floats.
+ */
+private fun buildGaussian(radius: Int): FloatArray {
+    val e: Float = kotlin.math.E.toFloat()
+    val pi: Float = kotlin.math.PI.toFloat()
+    val sigma: Float = 0.4f * radius.toFloat() + 0.6f
+    val scale: Float = 1.0f / (sqrt(2.0f * pi) * sigma)
+    val exponentFactor: Float = -1.0f / (2.0f * sigma * sigma)
+
+    // Entry i holds the weight for the offset (i - radius) from the center.
+    val weights = FloatArray(radius * 2 + 1) { index ->
+        val offset: Float = (index - radius).toFloat()
+        scale * e.pow(offset * offset * exponentFactor)
+    }
+
+    var total = 0.0f
+    for (weight in weights) total += weight
+
+    // Normalize so that the sum of the weights equal 1f.
+    val normalizeFactor: Float = 1.0f / total
+    for (index in weights.indices) {
+        weights[index] *= normalizeFactor
+    }
+    return weights
+}
diff --git a/toolkit/test/ReferenceColorMatrix.kt b/toolkit/test/ReferenceColorMatrix.kt
new file mode 100644
index 0000000..75f93af
--- /dev/null
+++ b/toolkit/test/ReferenceColorMatrix.kt
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a ColorMatrix operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceColorMatrix(inputArray: ByteArray,
+                         inputVectorSize: Int,
+                         sizeX: Int,
+                         sizeY: Int,
+                         outputVectorSize: Int,
+                         matrix: FloatArray, addVector: FloatArray,
+                         restriction: Range2d?): ByteArray {
+    require(matrix.size == 16) { "RenderScriptToolkit colorMatrix. Matrix should have 16 values. ${matrix.size} provided." }
+
+    val source = Vector2dArray(inputArray.asUByteArray(), inputVectorSize, sizeX, sizeY)
+    val resultBytes = ByteArray(sizeX * sizeY * paddedSize(outputVectorSize))
+    val destination = Vector2dArray(resultBytes.asUByteArray(), outputVectorSize, sizeX, sizeY)
+
+    destination.forEach(restriction) { x, y ->
+        val pixel = source[x, y]
+        // Channels the input does not have are fed to the matrix as 0f.
+        val unitPixel = FloatArray(4) { c -> if (c < inputVectorSize) byteToUnitFloat(pixel[c]) else 0f }
+        val transformed = multiplyAndAdd(matrix, unitPixel, addVector)
+        destination[x, y] = UByteArray(paddedSize(destination.vectorSize)) { c -> unitFloatClampedToUByte(transformed[c]) }
+    }
+    return resultBytes
+}
+
+private fun multiplyAndAdd(matrix: FloatArray, inVector: FloatArray, addVector: FloatArray): FloatArray {
+    // RenderScript matrices are column-major: element (row, column) is at [column * 4 + row].
+    val sums = addVector.copyOf()
+    for (row in 0 until 4) {
+        var acc = sums[row]
+        for (column in 0 until 4) acc += matrix[column * 4 + row] * inVector[column]
+        sums[row] = acc
+    }
+    return sums
+}
diff --git a/toolkit/test/ReferenceConvolve.kt b/toolkit/test/ReferenceConvolve.kt
new file mode 100644
index 0000000..b9181a9
--- /dev/null
+++ b/toolkit/test/ReferenceConvolve.kt
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a Convolve operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceConvolve(
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    coefficients: FloatArray,
+    restriction: Range2d?
+): ByteArray {
+    val source = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
+    // A kernel of 9 coefficients is 3x3 (radius 1); one of 25 is 5x5 (radius 2).
+    val kernelRadius = when (coefficients.size) {
+        9 -> 1
+        25 -> 2
+        else -> throw IllegalArgumentException("RenderScriptToolkit Convolve. Only 3x3 and 5x5 convolutions are supported. ${coefficients.size} coefficients provided.")
+    }
+
+    source.clipReadToRange = true
+    val result = source.createSameSized()
+    // convolveOne reads neighbouring cells of source, so results go to a separate array.
+    source.forEach(restriction) { x, y ->
+        result[x, y] = convolveOne(source, x, y, coefficients, kernelRadius)
+    }
+    return result.values.asByteArray()
+}
+
+@ExperimentalUnsignedTypes
+private fun convolveOne(
+    inputAlloc: Vector2dArray,
+    x: Int,
+    y: Int,
+    coefficients: FloatArray,
+    radius: Int
+): UByteArray {
+    // Coefficients are consumed row by row: deltaY is the outer loop, deltaX the inner.
+    val kernelWidth = 2 * radius + 1
+    var accumulator = FloatArray(paddedSize(inputAlloc.vectorSize))
+    for (deltaY in -radius..radius) {
+        for (deltaX in -radius..radius) {
+            val sample = inputAlloc[x + deltaX, y + deltaY].toFloatArray()
+            accumulator += sample * coefficients[(deltaY + radius) * kernelWidth + (deltaX + radius)]
+        }
+    }
+    return accumulator.clampToUByte()
+}
diff --git a/toolkit/test/ReferenceHistogram.kt b/toolkit/test/ReferenceHistogram.kt
new file mode 100644
index 0000000..6bd9167
--- /dev/null
+++ b/toolkit/test/ReferenceHistogram.kt
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a Histogram operation.
+ *
+ * Return an array of 4 * 256 ints.
+ * Position 0 is the number of R with a value of 0,
+ * Position 1 is the number of G with a value of 0,
+ * Position 2 is the number of B with a value of 0,
+ * Position 3 is the number of A with a value of 0,
+ * Position 4 is the number of R with a value of 1,
+ * etc.
+*/
+@ExperimentalUnsignedTypes
+fun referenceHistogram(
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    restriction: Range2d?
+): IntArray {
+    val image = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
+
+    // Bins are interleaved: the counters for one value occupy `stride` consecutive slots.
+    val stride = paddedSize(image.vectorSize)
+    val histogram = IntArray(stride * 256)
+    image.forEach(restriction) { x, y ->
+        val pixel = image[x, y]
+        for (channel in 0 until vectorSize) histogram[pixel[channel].toInt() * stride + channel] += 1
+    }
+    return histogram
+}
+
+/**
+ * Reference implementation of a HistogramDot operation.
+ *
+ * Each RGBA input value is dot-multiplied first by the specified coefficients.
+ * The resulting value is converted to an integer and used for the histogram.
+ */
+@ExperimentalUnsignedTypes
+fun referenceHistogramDot(
+    inputArray: ByteArray,
+    vectorSize: Int,
+    sizeX: Int,
+    sizeY: Int,
+    coefficients: FloatArray?,
+    restriction: Range2d?
+): IntArray {
+    val weights = coefficients ?: floatArrayOf(0.299f, 0.587f, 0.114f, 0f)
+    val image = Vector2dArray(inputArray.asUByteArray(), vectorSize, sizeX, sizeY)
+    var weightTotal = 0f
+    weights.forEach { c ->
+        require(c >= 0) {
+            "RenderScriptToolkit histogramDot. Coefficients must be positive. $c provided."
+        }
+        weightTotal += c
+    }
+    require(weightTotal <= 1f) { "RenderScriptToolkit histogramDot. Coefficients should " +
+            "add to 1.0 or less. $weightTotal provided." }
+
+    // Scale each weight by 256 and round to the nearest integer.
+    val fixedWeights = IntArray(image.vectorSize) { (weights[it] * 256f + 0.5f).toInt() }
+
+    val histogram = IntArray(256)
+    image.forEach(restriction) { x, y ->
+        val pixel = image[x, y]
+        // Integer arithmetic is used on purpose: a float computation would not reproduce
+        // the results of the existing intrinsics.
+        // Only the first vectorSize channels are accumulated (rather than pixel.indices),
+        // so that a pixel of vectorSize 3 contributes exactly 3 values.
+        var dot = 0
+        for (channel in 0 until vectorSize) {
+            dot += fixedWeights[channel] * pixel[channel].toInt()
+        }
+        // Add 0x7f, then drop the 8 fractional bits to obtain the bin.
+        val bin = (dot + 0x7f) shr 8
+        histogram[bin] += 1
+    }
+    return histogram
+}
diff --git a/toolkit/test/ReferenceLut.kt b/toolkit/test/ReferenceLut.kt
new file mode 100644
index 0000000..cd832f0
--- /dev/null
+++ b/toolkit/test/ReferenceLut.kt
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.LookupTable
+import android.renderscript.toolkit.Range2d
+
+/**
+ * Reference implementation of a LookUpTable operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceLut(
+    inputArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    table: LookupTable,
+    restriction: Range2d?
+): ByteArray {
+    val source = Vector2dArray(inputArray.asUByteArray(), 4, sizeX, sizeY)
+    val mapped = source.createSameSized()
+
+    source.forEach(restriction) { x, y ->
+        // Each of the four channels is translated independently through its own table,
+        // using the channel's unsigned value as the index.
+        val pixel = source[x, y]
+        val red = table.red[pixel[0].toInt()]
+        val green = table.green[pixel[1].toInt()]
+        val blue = table.blue[pixel[2].toInt()]
+        val alpha = table.alpha[pixel[3].toInt()]
+        mapped[x, y] = byteArrayOf(red, green, blue, alpha).asUByteArray()
+    }
+    return mapped.values.asByteArray()
+}
+
diff --git a/toolkit/test/ReferenceLut3d.kt b/toolkit/test/ReferenceLut3d.kt
new file mode 100644
index 0000000..afd977b
--- /dev/null
+++ b/toolkit/test/ReferenceLut3d.kt
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+import android.renderscript.toolkit.Rgba3dArray
+
+/**
+ * Reference implementation of a 3D LookUpTable operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceLut3d(
+    inputArray: ByteArray,
+    sizeX: Int,
+    sizeY: Int,
+    cube: Rgba3dArray,
+    restriction: Range2d?
+): ByteArray {
+    // The input is read as vectors of size 4.
+    val source = Vector2dArray(inputArray.asUByteArray(), 4, sizeX, sizeY)
+    val mapped = source.createSameSized()
+    // Each visited cell is replaced by the value that lookup() derives from the cube.
+    source.forEach(restriction) { x, y -> mapped[x, y] = lookup(source[x, y], cube) }
+    return mapped.values.asByteArray()
+}
+
+@ExperimentalUnsignedTypes
+private fun lookup(input: UByteArray, cube: Rgba3dArray): UByteArray {
+    // Scale the 0..255 input into cube coordinates, then find the two opposite
+    // corners (lower and upper) of the unit cell that contains that position.
+    val lastIndex = Int4(cube.sizeX - 1, cube.sizeY - 1, cube.sizeZ - 1, 0)
+    val position: Float4 = input.toFloat4() * lastIndex.toFloat4() / 255f
+    val lower: Int4 = position.intFloor()
+    val upper: Int4 = min(lower + 1, lastIndex)
+    val t: Float4 = position - lower.toFloat4()
+
+    // Fetch the values stored at each of the eight corners of that unit cell.
+    val c000 = cube[lower.x, lower.y, lower.z].toFloat4()
+    val c100 = cube[upper.x, lower.y, lower.z].toFloat4()
+    val c010 = cube[lower.x, upper.y, lower.z].toFloat4()
+    val c110 = cube[upper.x, upper.y, lower.z].toFloat4()
+    val c001 = cube[lower.x, lower.y, upper.z].toFloat4()
+    val c101 = cube[upper.x, lower.y, upper.z].toFloat4()
+    val c011 = cube[lower.x, upper.y, upper.z].toFloat4()
+    val c111 = cube[upper.x, upper.y, upper.z].toFloat4()
+
+    // Mix along x first (eight values become four), then along y, then along z.
+    val x00 = mix(c000, c100, t.x)
+    val x10 = mix(c010, c110, t.x)
+    val x01 = mix(c001, c101, t.x)
+    val x11 = mix(c011, c111, t.x)
+
+    val xy0 = mix(x00, x10, t.y)
+    val xy1 = mix(x01, x11, t.y)
+
+    val blended = mix(xy0, xy1, t.z)
+
+    // The alpha channel is copied from the input rather than taken from the cube.
+    return ubyteArrayOf(blended.x.clampToUByte(), blended.y.clampToUByte(), blended.z.clampToUByte(), input[3])
+}
diff --git a/toolkit/test/ReferenceResize.kt b/toolkit/test/ReferenceResize.kt
new file mode 100644
index 0000000..023825e
--- /dev/null
+++ b/toolkit/test/ReferenceResize.kt
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.Range2d
+import kotlin.math.floor
+import kotlin.math.max
+
+var trace = false // Debug flag: referenceResize sets it to true at one hard-coded output coordinate.
+
+/**
+ * Reference implementation of a Resize operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceResize(inputArray: ByteArray,
+                    vectorSize: Int,
+                    inSizeX: Int,
+                    inSizeY: Int,
+                    outSizeX: Int, outSizeY: Int,
+                    restriction: Range2d?): ByteArray {
+    val source = Vector2dArray(inputArray.asUByteArray(), vectorSize, inSizeX, inSizeY)
+    val scaleX: Float = source.sizeX.toFloat() / outSizeX.toFloat()
+    val scaleY: Float = source.sizeY.toFloat() / outSizeY.toFloat()
+    val resizedBytes = UByteArray(outSizeX * outSizeY * paddedSize(source.vectorSize))
+    val resized = Vector2dArray(resizedBytes, source.vectorSize, outSizeX, outSizeY)
+    resized.forEach(restriction) { x, y ->
+        // NOTE(review): leftover debugging hook for a single hard-coded output coordinate.
+        if (x == 1827 && y == 46) {
+            println("Found it")
+            trace = true
+        }
+        resized[x, y] = bicubicU4(x, y, source, scaleX, scaleY).clampToUByte()
+    }
+    return resized.values.asByteArray()
+}
+
+private fun cubicInterpolateF(p0: FloatArray, p1: FloatArray, p2: FloatArray, p3: FloatArray,
+                              x: Float): FloatArray { // Cubic through 4 samples: algebraically p1 at x = 0 and p2 at x = 1.
+    return p1 + (p2 - p0 + (p0 * 2f - p1 * 5f + p2 * 4f - p3 // +, -, * on FloatArray are declared elsewhere; presumably element-wise
+            + ((p1 - p2) * 3f + p3 - p0) * x) * x) * x * 0.5f
+}
+
+@ExperimentalUnsignedTypes
+private fun bicubicU4(x: Int, y: Int, gIn: Vector2dArray, scaleX: Float, scaleY: Float): FloatArray {
+    // Project the center of output cell (x, y) into input coordinates.
+    val sourceX: Float = (x + 0.5f) * scaleX - 0.5f
+    val sourceY: Float = (y + 0.5f) * scaleY - 0.5f
+    val firstColumn: Int = floor(sourceX - 1).toInt()
+    val firstRow: Int = floor(sourceY - 1).toInt()
+    val fractionX: Float = sourceX - floor(sourceX)
+    val fractionY: Float = sourceY - floor(sourceY)
+    val lastColumn: Int = gIn.sizeX - 1
+    val lastRow: Int = gIn.sizeY - 1
+
+    val column0: Int = firstColumn.coerceAtLeast(0)
+    val column1: Int = (firstColumn + 1).coerceAtLeast(0)
+    val column2: Int = (firstColumn + 2).coerceAtMost(lastColumn)
+    val column3: Int = (firstColumn + 3).coerceAtMost(lastColumn)
+
+    val rowIndex0: Int = firstRow.coerceAtLeast(0)
+    val rowIndex1: Int = (firstRow + 1).coerceAtLeast(0)
+    val rowIndex2: Int = (firstRow + 2).coerceAtMost(lastRow)
+    val rowIndex3: Int = (firstRow + 3).coerceAtMost(lastRow)
+    // Interpolate along x within each of the four sample rows...
+    val row0 = cubicInterpolateF(
+        gIn[column0, rowIndex0].toFloatArray(), gIn[column1, rowIndex0].toFloatArray(),
+        gIn[column2, rowIndex0].toFloatArray(), gIn[column3, rowIndex0].toFloatArray(),
+        fractionX
+    )
+
+    val row1 = cubicInterpolateF(
+        gIn[column0, rowIndex1].toFloatArray(), gIn[column1, rowIndex1].toFloatArray(),
+        gIn[column2, rowIndex1].toFloatArray(), gIn[column3, rowIndex1].toFloatArray(),
+        fractionX
+    )
+
+    val row2 = cubicInterpolateF(
+        gIn[column0, rowIndex2].toFloatArray(), gIn[column1, rowIndex2].toFloatArray(),
+        gIn[column2, rowIndex2].toFloatArray(), gIn[column3, rowIndex2].toFloatArray(),
+        fractionX
+    )
+
+    val row3 = cubicInterpolateF(
+        gIn[column0, rowIndex3].toFloatArray(), gIn[column1, rowIndex3].toFloatArray(),
+        gIn[column2, rowIndex3].toFloatArray(), gIn[column3, rowIndex3].toFloatArray(),
+        fractionX
+    )
+    // ...then interpolate the four row results along y.
+    return cubicInterpolateF(row0, row1, row2, row3, fractionY)
+}
+
+
+/* To be used if we implement Floats
+private fun bicubic_F4(x: Int, y: Int, gin: ByteArray, sizeX: Int, sizeY: Int, scaleX: Float, scaleY: Float): Float4 {
+    var xf: Float = (x + 0.5f) * scaleX - 0.5f
+    var yf: Float = (y + 0.5f) * scaleY - 0.5f
+
+    val startX: Int = floor(xf - 1).toInt()
+    val startY: Int = floor(yf - 1).toInt()
+    xf = xf - floor(xf)
+    yf = yf - floor(yf)
+    val maxX: Int = sizeX - 1
+    val maxY: Int = sizeY - 1
+
+    val xs0: Int = max(0, startX + 0)
+    val xs1: Int = max(0, startX + 1)
+    val xs2: Int = min(maxX, startX + 2)
+    val xs3: Int = min(maxX, startX + 3)
+
+    val ys0: Int = max(0, startY + 0)
+    val ys1: Int = max(0, startY + 1)
+    val ys2: Int = min(maxY, startY + 2)
+    val ys3: Int = min(maxY, startY + 3)
+
+    val p00: Float4 = rsGetElementAt_Float4(gIn, xs0, ys0)
+    val p01: Float4 = rsGetElementAt_Float4(gIn, xs1, ys0)
+    val p02: Float4 = rsGetElementAt_Float4(gIn, xs2, ys0)
+    val p03: Float4 = rsGetElementAt_Float4(gIn, xs3, ys0)
+    val p0: Float4  = cubicInterpolate_F4(p00, p01, p02, p03, xf)
+
+    val p10: Float4 = rsGetElementAt_Float4(gIn, xs0, ys1)
+    val p11: Float4 = rsGetElementAt_Float4(gIn, xs1, ys1)
+    val p12: Float4 = rsGetElementAt_Float4(gIn, xs2, ys1)
+    val p13: Float4 = rsGetElementAt_Float4(gIn, xs3, ys1)
+    val p1: Float4  = cubicInterpolate_F4(p10, p11, p12, p13, xf)
+
+    val p20: Float4 = rsGetElementAt_Float4(gIn, xs0, ys2)
+    val p21: Float4 = rsGetElementAt_Float4(gIn, xs1, ys2)
+    val p22: Float4 = rsGetElementAt_Float4(gIn, xs2, ys2)
+    val p23: Float4 = rsGetElementAt_Float4(gIn, xs3, ys2)
+    val p2: Float4  = cubicInterpolate_F4(p20, p21, p22, p23, xf)
+
+    val p30: Float4 = rsGetElementAt_Float4(gIn, xs0, ys3)
+    val p31: Float4 = rsGetElementAt_Float4(gIn, xs1, ys3)
+    val p32: Float4 = rsGetElementAt_Float4(gIn, xs2, ys3)
+    val p33: Float4 = rsGetElementAt_Float4(gIn, xs3, ys3)
+    val p3: Float4  = cubicInterpolate_F4(p30, p31, p32, p33, xf)
+
+    val p: Float4  = cubicInterpolate_F4(p0, p1, p2, p3, yf)
+
+    return p
+}
+*/
diff --git a/toolkit/test/ReferenceYuvToRgb.kt b/toolkit/test/ReferenceYuvToRgb.kt
new file mode 100644
index 0000000..4d91cf6
--- /dev/null
+++ b/toolkit/test/ReferenceYuvToRgb.kt
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+import android.renderscript.toolkit.YuvFormat
+import java.lang.IllegalArgumentException
+
+/**
+ * Reference implementation of a YUV to RGB operation.
+ */
+@ExperimentalUnsignedTypes
+fun referenceYuvToRgb(inputSignedArray: ByteArray, sizeX: Int, sizeY: Int, format: YuvFormat): ByteArray {
+    require(sizeX % 2 == 0) { "The width of the input should be even."}
+    val inputArray = inputSignedArray.asUByteArray()
+    // The output holds one vector of size 4 per cell, produced by yuvToRGBA4.
+    val outputArray = ByteArray(sizeX * sizeY * 4)
+    val output = Vector2dArray(outputArray.asUByteArray(), 4, sizeX, sizeY)
+
+    when (format) {
+        YuvFormat.NV21 -> {
+            val startY = 0
+            val startU = sizeX * sizeY + 1 // chroma bytes are interleaved after the Y plane: V first...
+            val startV = sizeX * sizeY     // ...then U, so U sits one byte after V.
+
+            for (y in 0 until sizeY) {
+                for (x in 0 until sizeX) {
+                    val offsetY = y * sizeX + x
+                    val offsetU = ((y shr 1) * sizeX + (x shr 1) * 2) // one V/U pair per 2x2 block
+                    val offsetV = ((y shr 1) * sizeX + (x shr 1) * 2)
+                    output[x, y] = yuvToRGBA4(
+                        inputArray[startY + offsetY],
+                        inputArray[startU + offsetU],
+                        inputArray[startV + offsetV]
+                    )
+                }
+            }
+        }
+
+        YuvFormat.YV12 -> {
+            /* Per https://developer.android.com/reference/kotlin/android/graphics/ImageFormat#yv12,
+             * the Y stride and the chroma stride are each aligned to 16 byte boundaries. All
+             * offsets must use these strides consistently: startU assumes each Y row occupies
+             * strideX bytes, so offsetY is computed with strideX too.
+             * NOTE(review): an earlier remark here said aligned strides may not give the same
+             * results as RenderScript; we may want to require sizeX to be a multiple of 16/32. */
+            val strideX = roundUpTo16(sizeX) // sizeX //
+            val strideUV = roundUpTo16(strideX / 2) // strideX / 2 //
+            val startY = 0
+            val startU = strideX * sizeY
+            val startV = startU + strideUV * sizeY / 2
+
+            for (y in 0 until sizeY) {
+                for (x in 0 until sizeX) {
+                    val offsetY = y * strideX + x
+                    val offsetUV = (y shr 1) * strideUV + (x shr 1)
+                    output[x, y] = yuvToRGBA4(
+                        inputArray[startY + offsetY],
+                        inputArray[startU + offsetUV],
+                        inputArray[startV + offsetUV],
+                    )
+                }
+            }
+        }
+        else -> throw IllegalArgumentException("Unknown YUV format $format")
+    }
+
+    return outputArray
+}
+
+@ExperimentalUnsignedTypes
+private fun yuvToRGBA4(y: UByte, u: UByte, v: UByte): UByteArray {
+    // Remove the offsets (16 from y, 128 from u and v) before applying the coefficients.
+    val luma = (y.toInt() - 16) * 298
+    val chromaU = u.toInt() - 128
+    val chromaV = v.toInt() - 128
+    // Integer math scaled by 256; adding 128 rounds before the shift drops the low 8 bits.
+    val red = (luma + chromaV * 409 + 128) shr 8
+    val green = (luma - chromaU * 100 - chromaV * 208 + 128) shr 8
+    val blue = (luma + chromaU * 516 + 128) shr 8
+    val channels = intArrayOf(red, green, blue, 255)
+    return UByteArray(4) { channels[it].clampToUByte() }
+}
+
+/* To be used if we support Float
+private fun yuvToRGBA_f4(y: UByte, u: UByte, v: UByte): UByteArray {
+    val yuv_U_values = floatArrayOf(0f, -0.392f * 0.003921569f, 2.02f * 0.003921569f, 0f)
+    val yuv_V_values = floatArrayOf(1.603f * 0.003921569f, -0.815f * 0.003921569f, 0f, 0f)
+
+    var color = FloatArray(4) {y.toFloat() * 0.003921569f}
+    val fU = FloatArray(4) {u.toFloat() - 128f}
+    val fV = FloatArray(4) {v.toFloat() - 128f}
+
+    color += fU * yuv_U_values;
+    color += fV * yuv_V_values;
+    //color = clamp(color, 0.f, 1.f);
+    return UByteArray(4) { unitFloatClampedToUByte(color[it]) }
+}
+*/
diff --git a/toolkit/test/TimingTracker.kt b/toolkit/test/TimingTracker.kt
new file mode 100644
index 0000000..81e90f2
--- /dev/null
+++ b/toolkit/test/TimingTracker.kt
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.testapp
+
+class TimingTracker(
+    private val numberOfIterations: Int = 1,
+    private var numberOfIterationsToIgnore: Int = 0
+) {
+    init {
+        require(numberOfIterations > numberOfIterationsToIgnore)
+    }
+    // Per label: one accumulated duration, in microseconds, for each recorded iteration.
+    private val timings = mutableMapOf<String, IntArray>()
+    private var currentIteration: Int = 0
+    /** Advances to the next iteration; later measurements land in the next slot. */
+    fun nextIteration() { currentIteration += 1 }
+    /** Runs [workToTime], returns its result, and records the elapsed time under [name]. */
+    fun <T> measure(name: String, workToTime: () -> T): T {
+        val startNanos = System.nanoTime()
+        val result = workToTime()
+        // Ignored iterations still run the work but are not recorded.
+        if (currentIteration < numberOfIterationsToIgnore) return result
+        val elapsedMicros: Int = ((System.nanoTime() - startNanos) / 1000).toInt()
+        val slots = timings.getOrPut(name) { IntArray(numberOfIterations - numberOfIterationsToIgnore) }
+        slots[currentIteration - numberOfIterationsToIgnore] += elapsedMicros
+        return result
+    }
+    /** Prints each label's fastest time, then returns raw, relative and fastest timings. */
+    fun report(): String {
+        // Fastest recorded slot across every label; the baseline for the relative figures.
+        var fastest: Int = Int.MAX_VALUE
+        for (slots in timings.values) {
+            val best = slots.minOrNull() ?: continue
+            if (best < fastest) fastest = best
+        }
+        val fastestPerLabel = timings.map { (name, slots) -> name + ": " + slots.minOrNull() }
+        println(fastestPerLabel.joinToString(separator = "\n"))
+
+        val raw = timings.map { (name, slots) -> name + ": " + slots.joinToString() }
+        val relative = timings.map { (name, slots) -> name + ": " + slots.joinToString { "%.2f".format(it.toFloat() / fastest) } }
+        return raw.joinToString() + "\n\n" + relative.joinToString() + "\n\n" + fastestPerLabel.joinToString()
+    }
+}
+
diff --git a/toolkit/test/res/drawable-nodpi/img800x450a.jpg b/toolkit/test/res/drawable-nodpi/img800x450a.jpg
new file mode 100644
index 0000000..6d5b623
--- /dev/null
+++ b/toolkit/test/res/drawable-nodpi/img800x450a.jpg
Binary files differ
diff --git a/toolkit/test/res/drawable-nodpi/img800x450b.jpg b/toolkit/test/res/drawable-nodpi/img800x450b.jpg
new file mode 100644
index 0000000..2013e07
--- /dev/null
+++ b/toolkit/test/res/drawable-nodpi/img800x450b.jpg
Binary files differ
diff --git a/toolkit/x86.cpp b/toolkit/x86.cpp
new file mode 100644
index 0000000..d25c3d7
--- /dev/null
+++ b/toolkit/x86.cpp
@@ -0,0 +1,1323 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+#include <x86intrin.h>
+
+namespace android {
+namespace renderscript {
+
+/* Zero-extend the four packed 8-bit integers in the LSBs of x into packed 32-bit integers */
+static inline __m128i cvtepu8_epi32(__m128i x) {
+#if defined(__SSE4_1__)
+    return _mm_cvtepu8_epi32(x);
+#elif defined(__SSSE3__)
+    /* Byte i moves to the bottom of dword i; the 0xff selectors yield zero bytes. */
+    const __m128i spread = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
+    return _mm_shuffle_epi8(x, spread);
+#else
+#   error "Require at least SSSE3"
+#endif
+}
+
+static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
+#if defined(__SSE4_1__)
+    return _mm_packus_epi32(lo, hi);
+#elif defined(__SSSE3__)
+    /* Emulate the unsigned-saturating 32->16 bit pack: clamp every lane, then gather the low words. */
+    const __m128i zero = _mm_set1_epi32(0x0000);
+    const __m128i max16 = _mm_set1_epi32(0xffff);
+    const __m128i gatherLo = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
+    const __m128i gatherHi = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
+    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, zero));   /* lanes <= 0 become 0 */
+    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, zero));
+    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, max16));   /* lanes > 0xffff become all ones */
+    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, max16));
+    return _mm_or_si128(_mm_shuffle_epi8(lo, gatherLo), _mm_shuffle_epi8(hi, gatherHi));
+#else
+#   error "Require at least SSSE3"
+#endif
+}
+
+static inline __m128i mullo_epi32(__m128i x, __m128i y) {
+#if defined(__SSE4_1__)
+    return _mm_mullo_epi32(x, y);
+#elif defined(__SSSE3__)
+    /* _mm_mul_epu32 multiplies lanes 0 and 2 only, so lanes 1 and 3 are shifted down first. */
+    const __m128i low32 = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
+    const __m128i xOdd = _mm_srli_si128(x, 4);
+    const __m128i yOdd = _mm_srli_si128(y, 4);
+    const __m128i prod02 = _mm_and_si128(_mm_mul_epu32(x, y), low32);
+    const __m128i prod13 = _mm_and_si128(_mm_mul_epu32(xOdd, yOdd), low32);
+    return _mm_or_si128(prod02, _mm_slli_si128(prod13, 4));
+#else
+#   error "Require at least SSSE3"
+#endif
+}
+
+/* 'mask' must be packed 8-bit values of 0x00 or 0xff: 0xff bytes take y, 0x00 bytes take x */
+static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
+#if defined(__SSE4_1__)
+    return _mm_blendv_epi8(x, y, mask);
+#elif defined(__SSSE3__)
+    return _mm_or_si128(_mm_and_si128(mask, y), _mm_andnot_si128(mask, x));
+#else
+#   error "Require at least SSSE3"
+#endif
+}
+
+extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,     /* dst: 8 output bytes per iteration */
+                                          const void *y1, const void *y2, /* y0/y1/y2: the three source rows */
+                                          const short *coef, uint32_t count) { /* count: number of iterations */
+    /* 3x3 convolution of 4x8-bit pixels; each iteration reads 4 pixels per row and writes 2. */
+    __m128i x;
+    __m128i c0, c2, c4, c6, c8;
+    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
+    __m128i o0, o1;
+    uint32_t i;
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+0));
+    c0 = _mm_shuffle_epi32(x, 0x00);    /* (coef[0], coef[1]) in every 32-bit lane */
+    c2 = _mm_shuffle_epi32(x, 0x55);    /* (coef[2], coef[3]) */
+    x = _mm_loadl_epi64((const __m128i *)(coef+4));
+    c4 = _mm_shuffle_epi32(x, 0x00);    /* (coef[4], coef[5]) */
+    c6 = _mm_shuffle_epi32(x, 0x55);    /* (coef[6], coef[7]) */
+    x = _mm_loadl_epi64((const __m128i *)(coef+8));    /* 8-byte load: touches coef[8..11] */
+    c8 = _mm_shuffle_epi32(x, 0x00);    /* (coef[8], coef[9]); coef[9] only ever multiplies zero */
+
+    for (i = 0; i < count; ++i) {
+
+        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0)), _mm_setzero_si128());   /* row 0, pixels 0..3 */
+        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+1)), _mm_setzero_si128());
+        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+2)), _mm_setzero_si128());
+        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+3)), _mm_setzero_si128());
+        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1)), _mm_setzero_si128());   /* row 1, pixels 0..3 */
+        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+1)), _mm_setzero_si128());
+        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+2)), _mm_setzero_si128());
+        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+3)), _mm_setzero_si128());
+        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2)), _mm_setzero_si128());   /* row 2, pixels 0..3 */
+        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+1)), _mm_setzero_si128());
+        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+2)), _mm_setzero_si128());
+        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+3)), _mm_setzero_si128());
+
+        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);   /* o0: first output pixel, window starts at pixel 0 */
+        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);   /* o1: second output pixel, window starts at pixel 1 */
+
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
+
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
+
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
+
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));   /* ninth tap, paired with zero */
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
+
+        o0 = _mm_srai_epi32(o0, 8);   /* drop the 8 fractional bits of the products */
+        o1 = _mm_srai_epi32(o1, 8);
+
+        o0 = packus_epi32(o0, o1);      /* saturate to unsigned 16 bit ... */
+        o0 = _mm_packus_epi16(o0, o0);  /* ... then to unsigned 8 bit */
+        _mm_storel_epi64((__m128i *)dst, o0);
+
+        y0 = (const char *)y0 + 8;   /* advance two pixels */
+        y1 = (const char *)y1 + 8;
+        y2 = (const char *)y2 + 8;
+        dst = (char *)dst + 8;
+    }
+}
+
+void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,          /* dst/src: packed 4x8-bit (x,y,z,w) pixels */
+                                  const short *coef, uint32_t count) { /* coef: 16 values; count: groups of 4 pixels */
+    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,     /* 4x4 byte transpose: planar xxxx yyyy zzzz wwww */
+                                      14, 10, 6, 2,     /* back to interleaved xyzw xyzw xyzw xyzw */
+                                      13,  9, 5, 1,
+                                      12,  8, 4, 0);
+
+    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);   /* bytes 0,1 of each pixel -> two 16-bit words */
+    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);   /* bytes 2,3 of each pixel -> two 16-bit words */
+    __m128i c0, c1, c2, c3;
+    __m128i i4, o4;
+    __m128i xy, zw;
+    __m128i x2, y2, z2, w2;
+    uint32_t i;
+
+    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
+    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
+    c0 = _mm_unpacklo_epi16(c0, c1);    /* (coef[k], coef[4+k]) pairs for k = 0..3 */
+
+    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
+    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
+    c2 = _mm_unpacklo_epi16(c2, c3);    /* (coef[8+k], coef[12+k]) pairs for k = 0..3 */
+
+    for (i = 0; i < count; ++i) {
+        i4 = _mm_loadu_si128((const __m128i *)src);   /* unaligned load: src is a plain void*, matching the unaligned store below */
+        xy = _mm_shuffle_epi8(i4, Mxy);
+        zw = _mm_shuffle_epi8(i4, Mzw);
+
+        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));   /* x*coef[0] + y*coef[4] */
+        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));   /* x*coef[1] + y*coef[5] */
+        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));   /* x*coef[2] + y*coef[6] */
+        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));   /* x*coef[3] + y*coef[7] */
+
+        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));   /* + z*coef[8]  + w*coef[12] */
+        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));   /* + z*coef[9]  + w*coef[13] */
+        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));   /* + z*coef[10] + w*coef[14] */
+        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));   /* + z*coef[11] + w*coef[15] */
+
+        x2 = _mm_srai_epi32(x2, 8);   /* drop the 8 fractional bits of the products */
+        y2 = _mm_srai_epi32(y2, 8);
+        z2 = _mm_srai_epi32(z2, 8);
+        w2 = _mm_srai_epi32(w2, 8);
+
+        x2 = packus_epi32(x2, y2);        /* saturate to unsigned 16 bit */
+        z2 = packus_epi32(z2, w2);
+        o4 = _mm_packus_epi16(x2, z2);    /* saturate to unsigned 8 bit, still planar */
+
+        o4 = _mm_shuffle_epi8(o4, T4x4);
+        _mm_storeu_si128((__m128i *)dst, o4);
+
+        src = (const char *)src + 16;   /* advance four pixels */
+        dst = (char *)dst + 16;
+    }
+}
+
+void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,          /* dst/src: packed 4x8-bit (x,y,z,w) pixels */
+                                  const short *coef, uint32_t count) { /* count: groups of 4 pixels (16 bytes per iteration) */
+    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,     /* 4x4 byte transpose: planar xxxx yyyy zzzz wwww */
+                                      14, 10, 6, 2,     /* back to interleaved xyzw xyzw xyzw xyzw */
+                                      13,  9, 5, 1,
+                                      12,  8, 4, 0);
+
+    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);   /* bytes 0,1 of each pixel -> two 16-bit words */
+    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);   /* bytes 2,3 of each pixel -> two 16-bit words */
+
+    __m128i c0, c1, c2, c3;
+    __m128i i4, o4;
+    __m128i xy, zw;
+    __m128i x2, y2, z2, w2;
+    uint32_t i;
+
+    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
+    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
+    c0 = _mm_unpacklo_epi16(c0, c1);    /* (coef[k], coef[4+k]) pairs for k = 0..3 */
+
+    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
+    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
+    c2 = _mm_unpacklo_epi16(c2, c3);    /* (coef[8+k], coef[12+k]) pairs for k = 0..3 */
+
+    for (i = 0; i < count; ++i) {
+        i4 = _mm_loadu_si128((const __m128i *)src);
+        xy = _mm_shuffle_epi8(i4, Mxy);
+        zw = _mm_shuffle_epi8(i4, Mzw);
+
+        x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));   /* x*coef[0] + y*coef[4] */
+        y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));   /* x*coef[1] + y*coef[5] */
+        z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));   /* x*coef[2] + y*coef[6] */
+
+        /* NOTE(review): w still contributes via coef[12..14]; presumably the caller zeroes them - confirm */
+        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));   /* + z*coef[8]  + w*coef[12] */
+        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));   /* + z*coef[9]  + w*coef[13] */
+        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));   /* + z*coef[10] + w*coef[14] */
+
+        x2 = _mm_srai_epi32(x2, 8);   /* drop the 8 fractional bits of the products */
+        y2 = _mm_srai_epi32(y2, 8);
+        z2 = _mm_srai_epi32(z2, 8);
+        w2 = _mm_srli_epi32(zw, 16);  /* w is copied through from the source unchanged */
+
+        x2 = packus_epi32(x2, y2);        /* saturate to unsigned 16 bit */
+        z2 = packus_epi32(z2, w2);
+        o4 = _mm_packus_epi16(x2, z2);    /* saturate to unsigned 8 bit, still planar */
+
+        o4 = _mm_shuffle_epi8(o4, T4x4);
+        _mm_storeu_si128((__m128i *)dst, o4);
+
+        src = (const char *)src + 16;   /* advance four pixels */
+        dst = (char *)dst + 16;
+    }
+}
+
+void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,          /* dst/src: packed 4x8-bit (x,y,z,w) pixels */
+                                  const short *coef, uint32_t count) { /* count: groups of 4 pixels (16 bytes per iteration) */
+    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,     /* 4x4 byte transpose: planar -> interleaved */
+                                      14, 10, 6, 2,
+                                      13,  9, 5, 1,
+                                      12,  8, 4, 0);
+    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);   /* bytes 0,1 of each pixel -> two 16-bit words */
+    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);   /* bytes 2,3 of each pixel -> two 16-bit words */
+    __m128i c0, c1, c2, c3;
+    __m128i i4, o4;
+    __m128i xy, zw;
+    __m128i x2, y2, z2, w2;
+    uint32_t i;
+
+    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
+    c0 = _mm_shufflelo_epi16(c0, 0);    /* coef[0] in the four low words */
+    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
+    c1 = _mm_shufflelo_epi16(c1, 0);    /* coef[4] in the four low words */
+    c0 = _mm_unpacklo_epi16(c0, c1);    /* (coef[0], coef[4]) in every 32-bit lane */
+
+    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
+    c2 = _mm_shufflelo_epi16(c2, 0);    /* coef[8] in the four low words */
+    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
+    c3 = _mm_shufflelo_epi16(c3, 0);    /* coef[12] in the four low words */
+    c2 = _mm_unpacklo_epi16(c2, c3);    /* (coef[8], coef[12]) in every 32-bit lane */
+
+    for (i = 0; i < count; ++i) {
+        i4 = _mm_loadu_si128((const __m128i *)src);
+
+        xy = _mm_shuffle_epi8(i4, Mxy);
+        zw = _mm_shuffle_epi8(i4, Mzw);
+
+        x2 =  _mm_madd_epi16(xy, c0);                     /* x*coef[0] + y*coef[4] */
+        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));   /* + z*coef[8] + w*coef[12] */
+
+        x2 = _mm_srai_epi32(x2, 8);   /* drop the 8 fractional bits of the products */
+        y2 = x2;                      /* the same dot product is written to x, y and z */
+        z2 = x2;
+        w2 = _mm_srli_epi32(zw, 16);  /* w is copied through from the source unchanged */
+
+        x2 = packus_epi32(x2, y2);        /* saturate to unsigned 16 bit */
+        z2 = packus_epi32(z2, w2);
+        o4 = _mm_packus_epi16(x2, z2);    /* saturate to unsigned 8 bit, still planar */
+
+        o4 = _mm_shuffle_epi8(o4, T4x4);
+        _mm_storeu_si128((__m128i *)dst, o4);
+
+        src = (const char *)src + 16;   /* advance four pixels */
+        dst = (char *)dst + 16;
+    }
+}
+
+void rsdIntrinsicBlurVFU4_K(void *dst,                                    /* dst: receives 8 floats (2 pixels x 4 channels) per step */
+                          const void *pin, int stride, const void *gptr,  /* pin: 4x8-bit pixels; stride: bytes between rows; gptr: float weights */
+                          int rct, int x1, int x2) {                      /* rct: rows/weights to sum; [x1, x2): pixel columns, two per step */
+    const char *pi;
+    __m128i pi0, pi1;
+    __m128 pf0, pf1;
+    __m128 bp0, bp1;
+    __m128 x;
+    int r;
+
+    for (; x1 < x2; x1 += 2) {
+        pi = (const char *)pin + (x1 << 2);   /* 4 bytes per pixel */
+        bp0 = _mm_setzero_ps();
+        bp1 = _mm_setzero_ps();
+
+        for (r = 0; r < rct; ++r) {
+            x = _mm_load_ss((const float *)gptr + r);
+            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));   /* broadcast weight r to all four lanes */
+
+            pi0 = _mm_cvtsi32_si128(*(const int *)pi);         /* pixel at column x1 */
+            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));   /* pixel at column x1 + 1 */
+
+            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));   /* 4 bytes -> 4 floats */
+            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
+
+            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
+            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
+
+            pi += stride;   /* next row */
+        }
+
+        _mm_storeu_ps((float *)dst, bp0);
+        _mm_storeu_ps((float *)dst + 4, bp1);
+        dst = (char *)dst + 32;   /* 8 floats written */
+    }
+}
+
+void rsdIntrinsicBlurHFU4_K(void *dst,                          /* dst: receives 4 bytes (one pixel) per column */
+                          const void *pin, const void *gptr,    /* pin: 4 floats per pixel, read with aligned loads; gptr: float weights */
+                          int rct, int x1, int x2) {            /* rct: number of weights; [x1, x2): pixel columns */
+    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);   /* gathers the low byte of each 32-bit lane */
+    const float *pi;
+    __m128 pf, x, y;
+    __m128i o;
+    int r;
+
+    for (; x1 < x2; ++x1) {
+        /* rct is defined as 2*r+1 by the caller, so the taps after the first are consumed in pairs */
+        x = _mm_load_ss((const float *)gptr);
+        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));   /* broadcast weight 0 */
+
+        pi = (const float *)pin + (x1 << 2);   /* 4 floats per pixel */
+        pf = _mm_mul_ps(x, _mm_load_ps(pi));   /* aligned load: needs a 16-byte aligned address */
+
+        for (r = 1; r < rct; r += 2) {
+            x = _mm_load_ss((const float *)gptr + r);
+            y = _mm_load_ss((const float *)gptr + r + 1);
+            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
+            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
+
+            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));       /* pixel r to the right */
+            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));   /* pixel r + 1 to the right */
+        }
+
+        o = _mm_cvtps_epi32(pf);
+        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));   /* keeps only the low byte of each lane, no saturation */
+        dst = (char *)dst + 4;
+    }
+}
+
+void rsdIntrinsicBlurHFU1_K(void *dst,                          /* dst: receives 4 bytes (four 1-byte samples) per step */
+                          const void *pin, const void *gptr,    /* pin: one float per sample; gptr: float weights */
+                          int rct, int x1, int x2) {            /* rct: number of weights; [x1, x2): columns, four per step */
+    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);   /* gathers the low byte of each 32-bit lane */
+    const float *pi;
+    __m128 pf, g0, g1, g2, g3, gx;
+    __m128i o, p0, p1;   /* p0/p1 hold float bits as integers because _mm_alignr_epi8 takes __m128i */
+    int r;
+
+    for (; x1 < x2; x1+=4) {
+        g0 = _mm_load_ss((const float *)gptr);
+        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));   /* broadcast weight 0 */
+
+        pi = (const float *)pin + x1;
+        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
+
+        for (r = 1; r < rct; r += 4) {   /* four weights per pass */
+            gx = _mm_loadu_ps((const float *)gptr + r);
+            p0 = _mm_castps_si128(_mm_loadu_ps(pi + r));       /* samples r .. r+3 */
+            p1 = _mm_castps_si128(_mm_loadu_ps(pi + r + 4));   /* samples r+4 .. r+7 */
+
+            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
+            pf = _mm_add_ps(pf, _mm_mul_ps(g0, _mm_castsi128_ps(p0)));
+            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
+            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(p1, p0, 4))));    /* window shifted by one sample */
+            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
+            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(p1, p0, 8))));    /* shifted by two */
+            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
+            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(p1, p0, 12))));   /* shifted by three */
+        }
+
+        o = _mm_cvtps_epi32(pf);
+        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));   /* keeps only the low byte of each lane, no saturation */
+        dst = (char *)dst + 4;
+    }
+}
+
+void rsdIntrinsicYuv_K(void *dst,                                          /* dst: receives 16 bytes (4 pixels of R,G,B,A) per iteration */
+                       const unsigned char *pY, const unsigned char *pUV,  /* pY: luma bytes; pUV: interleaved chroma bytes */
+                       uint32_t count, const short *param) {               /* the loop runs count*2 times; param: coefficients and biases */
+    __m128i biasY, biasUV;
+    __m128i c0, c1, c2, c3, c4;
+
+    biasY = _mm_set1_epi32(param[8]);   /*  16 */
+    biasUV = _mm_set1_epi32(param[16]); /* 128 */
+
+    c0 = _mm_set1_epi32(param[0]);  /*  298 */
+    c1 = _mm_set1_epi32(param[1]);  /*  409 */
+    c2 = _mm_set1_epi32(param[2]);  /* -100 */
+    c3 = _mm_set1_epi32(param[3]);  /*  516 */
+    c4 = _mm_set1_epi32(param[4]);  /* -208 */
+
+    __m128i Y, UV, U, V, R, G, B, A;
+
+    A = _mm_set1_epi32(255);   /* constant alpha */
+    uint32_t i;
+
+    for (i = 0; i < (count << 1); ++i) {
+        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));     /* 4 luma bytes -> 4 x 32 bit */
+        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));   /* 4 chroma bytes -> 4 x 32 bit */
+
+        Y = _mm_sub_epi32(Y, biasY);
+        UV = _mm_sub_epi32(UV, biasUV);
+
+        U = _mm_shuffle_epi32(UV, 0xf5);   /* lanes 1,1,3,3: odd chroma bytes, each shared by two pixels */
+        V = _mm_shuffle_epi32(UV, 0xa0);   /* lanes 0,0,2,2: even chroma bytes, each shared by two pixels */
+
+        Y = mullo_epi32(Y, c0);
+
+        R = _mm_add_epi32(Y, mullo_epi32(V, c1));   /* R = (Y*c0 + V*c1 + biasUV) >> 8 */
+        R = _mm_add_epi32(R, biasUV);               /* biasUV is reused as the addend before the shift */
+        R = _mm_srai_epi32(R, 8);
+
+        G = _mm_add_epi32(Y, mullo_epi32(U, c2));   /* G = (Y*c0 + U*c2 + V*c4 + biasUV) >> 8 */
+        G = _mm_add_epi32(G, mullo_epi32(V, c4));
+        G = _mm_add_epi32(G, biasUV);
+        G = _mm_srai_epi32(G, 8);
+
+        B = _mm_add_epi32(Y, mullo_epi32(U, c3));   /* B = (Y*c0 + U*c3 + biasUV) >> 8 */
+        B = _mm_add_epi32(B, biasUV);
+        B = _mm_srai_epi32(B, 8);
+
+        __m128i y1, y2, y3, y4;
+
+        y1 = packus_epi32(R, G);          /* saturate to unsigned 16 bit */
+        y2 = packus_epi32(B, A);
+        y3 = _mm_packus_epi16(y1, y2);    /* saturate to unsigned 8 bit: RRRR GGGG BBBB AAAA */
+        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,     /* 4x4 byte transpose: planar -> RGBA RGBA RGBA RGBA */
+                                          14, 10, 6, 2,
+                                          13,  9, 5, 1,
+                                          12,  8, 4, 0);
+        y4 = _mm_shuffle_epi8(y3, T4x4);
+        _mm_storeu_si128((__m128i *)dst, y4);
+        pY += 4;    /* four luma samples consumed */
+        pUV += 4;   /* two chroma pairs consumed */
+        dst = (__m128i *)dst + 1;
+    }
+}
+
+void rsdIntrinsicYuvR_K(void *dst,                                         /* dst: receives 16 bytes (4 pixels of R,G,B,A) per iteration */
+                       const unsigned char *pY, const unsigned char *pUV,  /* pY: luma bytes; pUV: interleaved chroma bytes */
+                       uint32_t count, const short *param) {               /* the loop runs count*2 times; param: coefficients and biases */
+    __m128i biasY, biasUV;
+    __m128i c0, c1, c2, c3, c4;
+
+    biasY = _mm_set1_epi32(param[8]);   /*  16 */
+    biasUV = _mm_set1_epi32(param[16]); /* 128 */
+
+    c0 = _mm_set1_epi32(param[0]);  /*  298 */
+    c1 = _mm_set1_epi32(param[1]);  /*  409 */
+    c2 = _mm_set1_epi32(param[2]);  /* -100 */
+    c3 = _mm_set1_epi32(param[3]);  /*  516 */
+    c4 = _mm_set1_epi32(param[4]);  /* -208 */
+
+    __m128i Y, UV, U, V, R, G, B, A;
+
+    A = _mm_set1_epi32(255);   /* constant alpha */
+    uint32_t i;
+
+    for (i = 0; i < (count << 1); ++i) {
+        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));     /* 4 luma bytes -> 4 x 32 bit */
+        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));   /* 4 chroma bytes -> 4 x 32 bit */
+
+        Y = _mm_sub_epi32(Y, biasY);
+        UV = _mm_sub_epi32(UV, biasUV);
+
+        V = _mm_shuffle_epi32(UV, 0xf5);   /* lanes 1,1,3,3: odd chroma bytes, each shared by two pixels */
+        U = _mm_shuffle_epi32(UV, 0xa0);   /* lanes 0,0,2,2: even chroma bytes, each shared by two pixels */
+
+        Y = mullo_epi32(Y, c0);
+
+        R = _mm_add_epi32(Y, mullo_epi32(V, c1));   /* R = (Y*c0 + V*c1 + biasUV) >> 8 */
+        R = _mm_add_epi32(R, biasUV);               /* biasUV is reused as the addend before the shift */
+        R = _mm_srai_epi32(R, 8);
+
+        G = _mm_add_epi32(Y, mullo_epi32(U, c2));   /* G = (Y*c0 + U*c2 + V*c4 + biasUV) >> 8 */
+        G = _mm_add_epi32(G, mullo_epi32(V, c4));
+        G = _mm_add_epi32(G, biasUV);
+        G = _mm_srai_epi32(G, 8);
+
+        B = _mm_add_epi32(Y, mullo_epi32(U, c3));   /* B = (Y*c0 + U*c3 + biasUV) >> 8 */
+        B = _mm_add_epi32(B, biasUV);
+        B = _mm_srai_epi32(B, 8);
+
+        __m128i y1, y2, y3, y4;
+
+        y1 = packus_epi32(R, G);          /* saturate to unsigned 16 bit */
+        y2 = packus_epi32(B, A);
+        y3 = _mm_packus_epi16(y1, y2);    /* saturate to unsigned 8 bit: RRRR GGGG BBBB AAAA */
+        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,     /* 4x4 byte transpose: planar -> RGBA RGBA RGBA RGBA */
+                                          14, 10, 6, 2,
+                                          13,  9, 5, 1,
+                                          12,  8, 4, 0);
+        y4 = _mm_shuffle_epi8(y3, T4x4);
+        _mm_storeu_si128((__m128i *)dst, y4);
+        pY += 4;    /* four luma samples consumed */
+        pUV += 4;   /* two chroma pairs consumed */
+        dst = (__m128i *)dst + 1;
+    }
+}
+
+void rsdIntrinsicYuv2_K(void *dst,                                         /* dst: receives 16 bytes (4 pixels of R,G,B,A) per iteration */
+                       const unsigned char *pY, const unsigned char *pU,   /* pY, pU, pV: separate luma and chroma byte streams */
+                       const unsigned char *pV, uint32_t count, const short *param) {   /* the loop runs count*2 times */
+    __m128i biasY, biasUV;
+    __m128i c0, c1, c2, c3, c4;
+
+    biasY = _mm_set1_epi32(param[8]);   /*  16 */
+    biasUV = _mm_set1_epi32(param[16]); /* 128 */
+
+    c0 = _mm_set1_epi32(param[0]);  /*  298 */
+    c1 = _mm_set1_epi32(param[1]);  /*  409 */
+    c2 = _mm_set1_epi32(param[2]);  /* -100 */
+    c3 = _mm_set1_epi32(param[3]);  /*  516 */
+    c4 = _mm_set1_epi32(param[4]);  /* -208 */
+
+    __m128i Y, U, V, R, G, B, A;
+
+    A = _mm_set1_epi32(255);   /* constant alpha */
+    uint32_t i;
+
+    for (i = 0; i < (count << 1); ++i) {
+        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));   /* lane k uses pY[k], pU[k] and pV[k] */
+        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
+		V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
+
+        Y = _mm_sub_epi32(Y, biasY);
+        U = _mm_sub_epi32(U, biasUV);
+		V = _mm_sub_epi32(V, biasUV);
+
+        Y = mullo_epi32(Y, c0);
+
+        R = _mm_add_epi32(Y, mullo_epi32(V, c1));   /* R = (Y*c0 + V*c1 + biasUV) >> 8 */
+        R = _mm_add_epi32(R, biasUV);               /* biasUV is reused as the addend before the shift */
+        R = _mm_srai_epi32(R, 8);
+
+        G = _mm_add_epi32(Y, mullo_epi32(U, c2));   /* G = (Y*c0 + U*c2 + V*c4 + biasUV) >> 8 */
+        G = _mm_add_epi32(G, mullo_epi32(V, c4));
+        G = _mm_add_epi32(G, biasUV);
+        G = _mm_srai_epi32(G, 8);
+
+        B = _mm_add_epi32(Y, mullo_epi32(U, c3));   /* B = (Y*c0 + U*c3 + biasUV) >> 8 */
+        B = _mm_add_epi32(B, biasUV);
+        B = _mm_srai_epi32(B, 8);
+
+        __m128i y1, y2, y3, y4;
+
+        y1 = packus_epi32(R, G);          /* saturate to unsigned 16 bit */
+        y2 = packus_epi32(B, A);
+        y3 = _mm_packus_epi16(y1, y2);    /* saturate to unsigned 8 bit: RRRR GGGG BBBB AAAA */
+        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,     /* 4x4 byte transpose: planar -> RGBA RGBA RGBA RGBA */
+                                          14, 10, 6, 2,
+                                          13,  9, 5, 1,
+                                          12,  8, 4, 0);
+        y4 = _mm_shuffle_epi8(y3, T4x4);
+        _mm_storeu_si128((__m128i *)dst, y4);
+        pY += 4;   /* all three streams advance four bytes per iteration */
+        pU += 4;
+		pV += 4;
+        dst = (__m128i *)dst + 1;
+    }
+}
+
+extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,     /* dst: 16 output bytes per iteration */
+                                          const void *y1, const void *y2, /* y0..y4: the five source rows */
+                                          const void *y3, const void *y4,
+                                          const short *coef, uint32_t count) { /* count: number of iterations */
+    /* 5x5 convolution of 4x8-bit pixels; each iteration reads 8 pixels per row and writes 4. */
+    __m128i x;
+    __m128i c0, c2, c4, c6, c8, c10, c12;
+    __m128i c14, c16, c18, c20, c22, c24;
+    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
+    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
+    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
+    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
+    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
+    __m128i o0, o1, o2, o3;
+    uint32_t i;
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+0));
+    c0  = _mm_shuffle_epi32(x, 0x00);    /* cN = (coef[N], coef[N+1]) in every 32-bit lane */
+    c2  = _mm_shuffle_epi32(x, 0x55);
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+4));
+    c4  = _mm_shuffle_epi32(x, 0x00);
+    c6  = _mm_shuffle_epi32(x, 0x55);
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+8));
+    c8  = _mm_shuffle_epi32(x, 0x00);
+    c10  = _mm_shuffle_epi32(x, 0x55);
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+12));
+    c12  = _mm_shuffle_epi32(x, 0x00);
+    c14  = _mm_shuffle_epi32(x, 0x55);
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+16));
+    c16  = _mm_shuffle_epi32(x, 0x00);
+    c18  = _mm_shuffle_epi32(x, 0x55);
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+20));
+    c20  = _mm_shuffle_epi32(x, 0x00);
+    c22  = _mm_shuffle_epi32(x, 0x55);
+
+    x = _mm_loadl_epi64((const __m128i *)(coef+24));    /* 8-byte load: touches coef[24..27] */
+    c24  = _mm_shuffle_epi32(x, 0x00);    /* (coef[24], coef[25]); coef[25] only ever multiplies zero */
+
+    for (i = 0; i < count; ++i) {
+
+        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int32_t *)y0), _mm_setzero_si128());   /* row 0, pixels 0..7 */
+        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+1)), _mm_setzero_si128());
+        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+2)), _mm_setzero_si128());
+        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+3)), _mm_setzero_si128());
+        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+4)), _mm_setzero_si128());
+        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+5)), _mm_setzero_si128());
+        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+6)), _mm_setzero_si128());
+        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y0+7)), _mm_setzero_si128());
+
+        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1)), _mm_setzero_si128());   /* row 1, pixels 0..7 */
+        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+1)), _mm_setzero_si128());
+        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+2)), _mm_setzero_si128());
+        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+3)), _mm_setzero_si128());
+        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+4)), _mm_setzero_si128());
+        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+5)), _mm_setzero_si128());
+        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+6)), _mm_setzero_si128());
+        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y1+7)), _mm_setzero_si128());
+
+        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2)), _mm_setzero_si128());   /* row 2, pixels 0..7 */
+        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+1)), _mm_setzero_si128());
+        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+2)), _mm_setzero_si128());
+        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+3)), _mm_setzero_si128());
+        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+4)), _mm_setzero_si128());
+        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+5)), _mm_setzero_si128());
+        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+6)), _mm_setzero_si128());
+        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y2+7)), _mm_setzero_si128());
+
+        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3)), _mm_setzero_si128());   /* row 3, pixels 0..7 */
+        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+1)), _mm_setzero_si128());
+        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+2)), _mm_setzero_si128());
+        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+3)), _mm_setzero_si128());
+        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+4)), _mm_setzero_si128());
+        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+5)), _mm_setzero_si128());
+        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+6)), _mm_setzero_si128());
+        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y3+7)), _mm_setzero_si128());
+
+        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4)), _mm_setzero_si128());   /* row 4, pixels 0..7 */
+        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+1)), _mm_setzero_si128());
+        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+2)), _mm_setzero_si128());
+        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+3)), _mm_setzero_si128());
+        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+4)), _mm_setzero_si128());
+        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+5)), _mm_setzero_si128());
+        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+6)), _mm_setzero_si128());
+        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((const int32_t *)y4+7)), _mm_setzero_si128());
+
+        o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);   /* o0: window starts at pixel 0 of each row */
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
+        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));   /* 25th tap, paired with zero */
+        o0 = _mm_srai_epi32(o0, 8);   /* drop the 8 fractional bits of the products */
+
+        o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);   /* o1: window starts at pixel 1 */
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
+        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
+        o1 = _mm_srai_epi32(o1, 8);
+
+        o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);   /* o2: window starts at pixel 2 */
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
+        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
+        o2 = _mm_srai_epi32(o2, 8);
+
+        o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);   /* o3: window starts at pixel 3 */
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
+        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
+        o3 = _mm_srai_epi32(o3, 8);
+
+        o0 = packus_epi32(o0, o1);        /* saturate to unsigned 16 bit ... */
+        o2 = packus_epi32(o2, o3);
+        o0 = _mm_packus_epi16(o0, o2);    /* ... then to unsigned 8 bit */
+        _mm_storeu_si128((__m128i *)dst, o0);
+
+        y0 = (const char *)y0 + 16;   /* advance four pixels */
+        y1 = (const char *)y1 + 16;
+        y2 = (const char *)y2 + 16;
+        y3 = (const char *)y3 + 16;
+        y4 = (const char *)y4 + 16;
+        dst = (char *)dst + 16;
+    }
+}
+// "Source over" blend: dst = src + ((dst * (255 - src.a)) >> 8); a is byte 3 of each 4-byte pixel.
+void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
+    __m128i all1s, ina, ins;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // 255 in every 16-bit lane, used below to form (255 - alpha).
+    all1s = _mm_set1_epi16(255);
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0): widen to 16 bits, broadcast src alpha per pixel.
+        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
+        t0 = _mm_srli_epi16(t0, 8);
+        t0 = _mm_add_epi16(t0, ins);
+        // Pixels 2-3 (high half of in0/out0).
+        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
+        t1 = _mm_srli_epi16(t1, 8);
+        t1 = _mm_add_epi16(t1, ins);
+        // Pixels 4-5 (low half of in1/out1).
+        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
+        t2 = _mm_srli_epi16(t2, 8);
+        t2 = _mm_add_epi16(t2, ins);
+        // Pixels 6-7 (high half of in1/out1).
+        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
+        t3 = _mm_srli_epi16(t3, 8);
+        t3 = _mm_add_epi16(t3, ins);
+        // Narrow back to 8 bits (sums above 255 saturate to 255) and store all 8 pixels.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Destination over" blend: dst = dst + ((src * (255 - dst.a)) >> 8); a is byte 3 of each pixel.
+void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
+    __m128i all1s, outa, outs;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // 255 in every 16-bit lane, used below to form (255 - alpha).
+    all1s = _mm_set1_epi16(255);
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+
+        // Pixels 0-1 (low half of in0/out0): widen to 16 bits, broadcast dst alpha per pixel.
+        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
+        t0 = _mm_srli_epi16(t0, 8);
+        t0 = _mm_add_epi16(t0, outs);
+        // Pixels 2-3 (high half of in0/out0).
+        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
+        t1 = _mm_srli_epi16(t1, 8);
+        t1 = _mm_add_epi16(t1, outs);
+        // Pixels 4-5 (low half of in1/out1).
+        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
+        t2 = _mm_srli_epi16(t2, 8);
+        t2 = _mm_add_epi16(t2, outs);
+        // Pixels 6-7 (high half of in1/out1).
+        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
+        t3 = _mm_srli_epi16(t3, 8);
+        t3 = _mm_add_epi16(t3, outs);
+        // Narrow back to 8 bits (sums above 255 saturate to 255) and store all 8 pixels.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Source in" blend: dst = (src * dst.a) >> 8 per byte; a is byte 3 of each 4-byte pixel.
+void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
+    __m128i outa;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0): broadcast dst alpha and scale the widened src.
+        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, outa);
+        t0 = _mm_srli_epi16(t0, 8);
+        // Pixels 2-3 (high half of in0/out0).
+        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, outa);
+        t1 = _mm_srli_epi16(t1, 8);
+        // Pixels 4-5 (low half of in1/out1).
+        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, outa);
+        t2 = _mm_srli_epi16(t2, 8);
+        // Pixels 6-7 (high half of in1/out1).
+        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, outa);
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow the 16-bit lanes back to bytes and store all 8 pixels.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Destination in" blend: dst = (dst * src.a) >> 8 per byte; a is byte 3 of each 4-byte pixel.
+void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
+    __m128i ina;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0): broadcast src alpha and scale the widened dst.
+        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, ina);
+        t0 = _mm_srli_epi16(t0, 8);
+        // Pixels 2-3 (high half of in0/out0).
+        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, ina);
+        t1 = _mm_srli_epi16(t1, 8);
+        // Pixels 4-5 (low half of in1/out1).
+        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, ina);
+        t2 = _mm_srli_epi16(t2, 8);
+        // Pixels 6-7 (high half of in1/out1).
+        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, ina);
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow the 16-bit lanes back to bytes and store all 8 pixels.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Source out" blend: dst = (src * (255 - dst.a)) >> 8 per byte; a is byte 3 of each pixel.
+void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
+    __m128i all1s, outa;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // 255 in every 16-bit lane, used below to form (255 - alpha).
+    all1s = _mm_set1_epi16(255);
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0): broadcast dst alpha, scale src by its complement.
+        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
+        t0 = _mm_srli_epi16(t0, 8);
+        // Pixels 2-3 (high half of in0/out0).
+        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
+        t1 = _mm_srli_epi16(t1, 8);
+        // Pixels 4-5 (low half of in1/out1).
+        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
+        t2 = _mm_srli_epi16(t2, 8);
+        // Pixels 6-7 (high half of in1/out1).
+        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outa, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow the 16-bit lanes back to bytes and store all 8 pixels.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Destination out" blend: dst = (dst * (255 - src.a)) >> 8 per byte; a is byte 3 of each pixel.
+void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
+    __m128i all1s, ina;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // 255 in every 16-bit lane, used below to form (255 - alpha).
+    all1s = _mm_set1_epi16(255);
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0): broadcast src alpha, scale dst by its complement.
+        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
+        t0 = _mm_srli_epi16(t0, 8);
+        // Pixels 2-3 (high half of in0/out0).
+        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
+        t1 = _mm_srli_epi16(t1, 8);
+        // Pixels 4-5 (low half of in1/out1).
+        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
+        t2 = _mm_srli_epi16(t2, 8);
+        // Pixels 6-7 (high half of in1/out1).
+        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ina, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow the 16-bit lanes back to bytes and store all 8 pixels.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Source atop" blend: dst = (src * dst.a + dst * (255 - src.a)) >> 8; a is byte 3 of each pixel.
+void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
+    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
+    __m128i all1s, ina, outa, ins, outs;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // M0001 has 0xff in byte 3 of every 32-bit lane; all1s holds 255 in every 16-bit lane.
+    all1s = _mm_set1_epi16(255);
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0); the 16-bit sum saturates at 0xFFFF via adds_epu16.
+        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t0 = _mm_sub_epi16(all1s, ina);
+        t0 = _mm_mullo_epi16(t0, outs);
+        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
+        t0 = _mm_srli_epi16(t0, 8);
+        // Pixels 2-3 (high half of in0/out0).
+        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t1 = _mm_sub_epi16(all1s, ina);
+        t1 = _mm_mullo_epi16(t1, outs);
+        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
+        t1 = _mm_srli_epi16(t1, 8);
+        // Pixels 4-5 (low half of in1/out1).
+        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t2 = _mm_sub_epi16(all1s, ina);
+        t2 = _mm_mullo_epi16(t2, outs);
+        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
+        t2 = _mm_srli_epi16(t2, 8);
+        // Pixels 6-7 (high half of in1/out1).
+        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t3 = _mm_sub_epi16(all1s, ina);
+        t3 = _mm_mullo_epi16(t3, outs);
+        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow to bytes, then pass through blendv_epi8 (helper defined elsewhere) and store.
+        t0 = _mm_packus_epi16(t0, t1);
+        t0 = blendv_epi8(t0, out0, M0001);  // presumably takes byte 3 from out0 - confirm
+        t2 = _mm_packus_epi16(t2, t3);
+        t2 = blendv_epi8(t2, out1, M0001);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// "Destination atop": dst = (dst * src.a + src * (255 - dst.a)) >> 8; a is byte 3 of each pixel.
+void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
+    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
+    __m128i all1s, ina, ins, outa, outs;
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // M0001 has 0xff in byte 3 of every 32-bit lane; all1s holds 255 in every 16-bit lane.
+    all1s = _mm_set1_epi16(255);
+    // Each pass handles 32 bytes (8 four-byte pixels); count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Pixels 0-1 (low half of in0/out0); the 16-bit sum saturates at 0xFFFF via adds_epu16.
+        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t0 = _mm_sub_epi16(all1s, outa);
+        t0 = _mm_mullo_epi16(t0, ins);
+        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
+        t0 = _mm_srli_epi16(t0, 8);
+        // Pixels 2-3 (high half of in0/out0).
+        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t1 = _mm_sub_epi16(all1s, outa);
+        t1 = _mm_mullo_epi16(t1, ins);
+        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
+        t1 = _mm_srli_epi16(t1, 8);
+        // Pixels 4-5 (low half of in1/out1).
+        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t2 = _mm_sub_epi16(all1s, outa);
+        t2 = _mm_mullo_epi16(t2, ins);
+        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
+        t2 = _mm_srli_epi16(t2, 8);
+        // Pixels 6-7 (high half of in1/out1).
+        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        ina = _mm_shufflelo_epi16(ins, 0xFF);
+        ina = _mm_shufflehi_epi16(ina, 0xFF);
+        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
+        outa = _mm_shufflelo_epi16(outs, 0xFF);
+        outa = _mm_shufflehi_epi16(outa, 0xFF);
+        t3 = _mm_sub_epi16(all1s, outa);
+        t3 = _mm_mullo_epi16(t3, ins);
+        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow to bytes, then pass through blendv_epi8 (helper defined elsewhere) and store.
+        t0 = _mm_packus_epi16(t0, t1);
+        t0 = blendv_epi8(t0, in0, M0001);  // presumably takes byte 3 from in0 - confirm
+        t2 = _mm_packus_epi16(t2, t3);
+        t2 = blendv_epi8(t2, in1, M0001);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// XOR blend: dst ^= src, bitwise over every byte of the buffers.
+void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
+    __m128i in0, in1, out0, out1;
+    uint32_t i;
+    // Each pass handles 32 bytes with unaligned loads/stores; count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Combine source and destination bit by bit.
+        out0 = _mm_xor_si128(out0, in0);
+        out1 = _mm_xor_si128(out1, in1);
+        // Write the result back over the destination.
+        _mm_storeu_si128((__m128i *)dst, out0);
+        _mm_storeu_si128((__m128i *)dst + 1, out1);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// Multiply blend: dst = (src * dst) >> 8 for every byte of the buffers.
+void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
+    __m128i in0, in1, out0, out1;
+    __m128i t0, t1, t2, t3;
+    uint32_t i;
+    // Each pass handles 32 bytes with unaligned loads/stores; count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Low 8 bytes of in0/out0: widen to 16 bits so the 8x8-bit product fits, then scale down.
+        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
+        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
+        t0 = _mm_srli_epi16(t0, 8);
+        // High 8 bytes of in0/out0.
+        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
+        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
+        t1 = _mm_srli_epi16(t1, 8);
+        // Low 8 bytes of in1/out1.
+        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
+        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
+        t2 = _mm_srli_epi16(t2, 8);
+        // High 8 bytes of in1/out1.
+        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
+        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
+        t3 = _mm_srli_epi16(t3, 8);
+        // Narrow the 16-bit lanes back to bytes and store all 32 bytes.
+        t0 = _mm_packus_epi16(t0, t1);
+        t2 = _mm_packus_epi16(t2, t3);
+        _mm_storeu_si128((__m128i *)dst, t0);
+        _mm_storeu_si128((__m128i *)dst + 1, t2);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// Additive blend: dst = min(dst + src, 255) for every unsigned byte of the buffers.
+void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
+    __m128i in0, in1, out0, out1;
+    uint32_t i;
+    // Each pass handles 32 bytes with unaligned loads/stores; count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Unsigned saturating add, so bright values clamp at 255 instead of wrapping.
+        out0 = _mm_adds_epu8(out0, in0);
+        out1 = _mm_adds_epu8(out1, in1);
+        // Write the result back over the destination.
+        _mm_storeu_si128((__m128i *)dst, out0);
+        _mm_storeu_si128((__m128i *)dst + 1, out1);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+// Subtractive blend: dst = max(dst - src, 0) for every unsigned byte of the buffers.
+void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
+    __m128i in0, in1, out0, out1;
+    uint32_t i;
+    // Each pass handles 32 bytes with unaligned loads/stores; count8 is the number of passes.
+    for (i = 0; i < count8; ++i) {
+        in0 = _mm_loadu_si128((const __m128i *)src);
+        in1 = _mm_loadu_si128((const __m128i *)src + 1);
+        out0 = _mm_loadu_si128((const __m128i *)dst);
+        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
+        // Unsigned saturating subtract (dst minus src), clamping at 0 instead of wrapping.
+        out0 = _mm_subs_epu8(out0, in0);
+        out1 = _mm_subs_epu8(out1, in1);
+        // Write the result back over the destination.
+        _mm_storeu_si128((__m128i *)dst, out0);
+        _mm_storeu_si128((__m128i *)dst + 1, out1);
+        // Advance both pointers by two __m128i (32 bytes).
+        src = (const __m128i *)src + 2;
+        dst = (__m128i *)dst + 2;
+    }
+}
+
+}  // namespace android
+}  // namespace renderscript