diff --git a/Android.mk b/Android.mk
old mode 100755
new mode 100644
index 13fe13e..c26a122
--- a/Android.mk
+++ b/Android.mk
@@ -49,11 +49,8 @@
   $(call intermediates-dir-for,SHARED_LIBRARIES,libclcore_debug.bc,,)/libclcore_debug.bc
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
-  # Disable NEON on cortex-a15 temporarily
-  ifneq ($(strip $(TARGET_CPU_VARIANT)), cortex-a15)
-    libbcc_SHA1_SRCS += \
-      $(call intermediates-dir-for,SHARED_LIBRARIES,libclcore_neon.bc,,)/libclcore_neon.bc
-  endif
+  libbcc_SHA1_SRCS += \
+    $(call intermediates-dir-for,SHARED_LIBRARIES,libclcore_neon.bc,,)/libclcore_neon.bc
 endif
 
 libbcc_GEN_SHA1_STAMP := $(LOCAL_PATH)/tools/build/gen-sha1-stamp.py
@@ -97,10 +94,7 @@
 endif
 
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
-  # Disable NEON on cortex-a15 temporarily
-  ifneq ($(strip $(TARGET_CPU_VARIANT)), cortex-a15)
-    LOCAL_REQUIRED_MODULES += libclcore_neon.bc
-  endif
+  LOCAL_REQUIRED_MODULES += libclcore_neon.bc
 endif
 
 # Generate build information (Build time + Build git revision + Build Semi SHA1)
diff --git a/CleanSpec.mk b/CleanSpec.mk
index 1d4b562..5e023f4 100644
--- a/CleanSpec.mk
+++ b/CleanSpec.mk
@@ -57,6 +57,7 @@
 $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libclcore_neon.bc_intermediates)
 $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libclcore*.bc_intermediates)
 $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libbcinfo_intermediates)
+$(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libclcore*.bc_intermediates)
 $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/SHARED_LIBRARIES/libbc*_intermediates)
 $(call add-clean-step, rm -rf $(HOST_OUT)/obj/STATIC_LIBRARIES/libbc*_intermediates)
 $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libbc*_intermediates)
diff --git a/bcinfo/Android.mk b/bcinfo/Android.mk
index 1da7d16..cb922ad 100644
--- a/bcinfo/Android.mk
+++ b/bcinfo/Android.mk
@@ -39,11 +39,15 @@
   BitcodeWrapper.cpp \
   MetadataExtractor.cpp
 
-libbcinfo_C_INCLUDES := $(LOCAL_PATH)/../include
+libbcinfo_C_INCLUDES := \
+  $(LOCAL_PATH)/../include \
+  $(LOCAL_PATH)/../../slang
+
 libbcinfo_STATIC_LIBRARIES := \
   libLLVMWrap \
   libLLVMBitReader_2_7 \
-  libLLVMBitReader_3_0
+  libLLVMBitReader_3_0 \
+  libLLVMBitWriter_3_2
 
 LLVM_ROOT_PATH := external/llvm
 
diff --git a/bcinfo/BitReader_2_7/BitcodeReader.cpp b/bcinfo/BitReader_2_7/BitcodeReader.cpp
index 40a6586..4d92715 100644
--- a/bcinfo/BitReader_2_7/BitcodeReader.cpp
+++ b/bcinfo/BitReader_2_7/BitcodeReader.cpp
@@ -2985,7 +2985,7 @@
   Stream.init(*StreamFile);
 
   unsigned char buf[16];
-  if (Bytes->readBytes(0, 16, buf, NULL) == -1)
+  if (Bytes->readBytes(0, 16, buf) == -1)
     return Error("Bitcode stream must be at least 16 bytes in length");
 
   if (!isBitcode(buf, buf + 16))
diff --git a/bcinfo/BitReader_3_0/BitcodeReader.cpp b/bcinfo/BitReader_3_0/BitcodeReader.cpp
index 243314b..5ac3ab9 100644
--- a/bcinfo/BitReader_3_0/BitcodeReader.cpp
+++ b/bcinfo/BitReader_3_0/BitcodeReader.cpp
@@ -3324,7 +3324,7 @@
   Stream.init(*StreamFile);
 
   unsigned char buf[16];
-  if (Bytes->readBytes(0, 16, buf, NULL) == -1)
+  if (Bytes->readBytes(0, 16, buf) == -1)
     return Error("Bitcode stream must be at least 16 bytes in length");
 
   if (!isBitcode(buf, buf + 16))
diff --git a/bcinfo/BitcodeTranslator.cpp b/bcinfo/BitcodeTranslator.cpp
index b4755c0..506a12a 100644
--- a/bcinfo/BitcodeTranslator.cpp
+++ b/bcinfo/BitcodeTranslator.cpp
@@ -21,6 +21,8 @@
 #include "BitReader_2_7/BitReader_2_7.h"
 #include "BitReader_3_0/BitReader_3_0.h"
 
+#include "BitWriter_3_2/ReaderWriter_3_2.h"
+
 #define LOG_TAG "bcinfo"
 #include <cutils/log.h>
 
@@ -141,7 +143,8 @@
   std::string Buffer;
 
   llvm::raw_string_ostream OS(Buffer);
-  llvm::WriteBitcodeToFile(module, OS);
+  // Use the LLVM 3.2 bitcode writer, instead of the top-of-tree version.
+  llvm_3_2::WriteBitcodeToFile(module, OS);
   OS.flush();
 
   AndroidBitcodeWrapper wrapper;
diff --git a/bcinfo/MetadataExtractor.cpp b/bcinfo/MetadataExtractor.cpp
index c4b2965..19262b1 100644
--- a/bcinfo/MetadataExtractor.cpp
+++ b/bcinfo/MetadataExtractor.cpp
@@ -20,7 +20,9 @@
 
 #define LOG_TAG "bcinfo"
 #include <cutils/log.h>
+#ifdef HAVE_ANDROID_OS
 #include <cutils/properties.h>
+#endif
 
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Bitcode/ReaderWriter.h"
@@ -248,6 +250,7 @@
     mRSFloatPrecision = RS_FP_Relaxed;
   }
 
+#ifdef HAVE_ANDROID_OS
   // Provide an override for precsion via adb shell setprop
   // adb shell setprop debug.rs.precision rs_fp_full
   // adb shell setprop debug.rs.precision rs_fp_relaxed
@@ -267,6 +270,7 @@
       mRSFloatPrecision = RS_FP_Full;
     }
   }
+#endif
 
   return;
 }
@@ -327,7 +331,7 @@
 bool MetadataExtractor::populateForEachMetadata(
     const llvm::NamedMDNode *Names,
     const llvm::NamedMDNode *Signatures) {
-  if (!Names && !Signatures) {
+  if (!Names && !Signatures && mCompilerVersion == 0) {
     // Handle legacy case for pre-ICS bitcode that doesn't contain a metadata
     // section for ForEach. We generate a full signature for a "root" function
     // which means that we need to set the bottom 5 bits in the mask.
diff --git a/bcinfo/tools/main.cpp b/bcinfo/tools/main.cpp
index 28d29aa..a2cc0ba 100644
--- a/bcinfo/tools/main.cpp
+++ b/bcinfo/tools/main.cpp
@@ -336,7 +336,7 @@
 
     llvm::OwningPtr<llvm::tool_output_file> tof(
         new llvm::tool_output_file(outFile.c_str(), errmsg,
-                                   llvm::raw_fd_ostream::F_Binary));
+                                   llvm::sys::fs::F_Binary));
     llvm::OwningPtr<llvm::AssemblyAnnotationWriter> ann;
     module->print(tof->os(), ann.get());
 
diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 265f1e3..5167b9a 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h
@@ -61,7 +61,6 @@
 
     kErrHookBeforeAddLTOPasses,
     kErrHookAfterAddLTOPasses,
-    kErrHookBeforeExecuteLTOPasses,
     kErrHookAfterExecuteLTOPasses,
 
     kErrHookBeforeAddCodeGenPasses,
@@ -69,7 +68,7 @@
     kErrHookBeforeExecuteCodeGenPasses,
     kErrHookAfterExecuteCodeGenPasses,
 
-    kMaxErrorCode,
+    kErrInvalidSource
   };
 
   static const char *GetErrorString(enum ErrorCode pErrCode);
@@ -89,10 +88,15 @@
   enum ErrorCode config(const CompilerConfig &pConfig);
 
   // Compile a script and output the result to a LLVM stream.
-  enum ErrorCode compile(Script &pScript, llvm::raw_ostream &pResult);
+  //
+  // @param IRStream If not NULL, the LLVM-IR that is fed to code generation
+  //                 will be written to IRStream.
+  enum ErrorCode compile(Script &pScript, llvm::raw_ostream &pResult,
+                         llvm::raw_ostream *IRStream);
 
   // Compile a script and output the result to a file.
-  enum ErrorCode compile(Script &pScript, OutputFile &pResult);
+  enum ErrorCode compile(Script &pScript, OutputFile &pResult,
+                         llvm::raw_ostream *IRStream = 0);
 
   const llvm::TargetMachine& getTargetMachine() const
   { return *mTarget; }
diff --git a/include/bcc/Renderscript/RSCompiler.h b/include/bcc/Renderscript/RSCompiler.h
index 537e749..a46d558 100644
--- a/include/bcc/Renderscript/RSCompiler.h
+++ b/include/bcc/Renderscript/RSCompiler.h
@@ -24,7 +24,8 @@
 class RSCompiler : public Compiler {
 private:
   virtual bool beforeAddLTOPasses(Script &pScript, llvm::PassManager &pPM);
-  virtual bool beforeExecuteLTOPasses(Script &pScript, llvm::PassManager &pPM);
+  bool addInternalizeSymbolsPass(Script &pScript, llvm::PassManager &pPM);
+  bool addExpandForEachPass(Script &pScript, llvm::PassManager &pPM);
 };
 
 } // end namespace bcc
diff --git a/include/bcc/Renderscript/RSCompilerDriver.h b/include/bcc/Renderscript/RSCompilerDriver.h
index 7b54a13..371014c 100644
--- a/include/bcc/Renderscript/RSCompilerDriver.h
+++ b/include/bcc/Renderscript/RSCompilerDriver.h
@@ -22,13 +22,13 @@
 #include "bcc/ExecutionEngine/SymbolResolverProxy.h"
 #include "bcc/Renderscript/RSInfo.h"
 #include "bcc/Renderscript/RSCompiler.h"
+#include "bcc/Renderscript/RSScript.h"
 
 namespace bcc {
 
 class BCCContext;
 class CompilerConfig;
 class RSExecutable;
-class RSScript;
 
 class RSCompilerDriver {
 private:
@@ -42,19 +42,23 @@
   // Are we compiling under an RS debug context with additional checks?
   bool mDebugContext;
 
-  RSExecutable *loadScriptCache(const char *pOutputPath,
-                                const RSInfo::DependencyTableTy &pDeps);
+  // Do we merge global variables on ARM using LLVM's optimization pass?
+  // Disabling LLVM's global merge pass allows static globals to be correctly
+  // emitted to ELF. This can result in decreased performance due to increased
+  // register pressure, but it does make the resulting code easier to debug
+  // and work with.
+  bool mEnableGlobalMerge;
 
   // Setup the compiler config for the given script. Return true if mConfig has
   // been changed and false if it remains unchanged.
   bool setupConfig(const RSScript &pScript);
 
-  RSExecutable *compileScript(RSScript &pScript,
-                              const char* pScriptName,
-                              const char *pOutputPath,
-                              const char *pRuntimePath,
-                              const RSInfo::DependencyTableTy &pDeps,
-                              bool pSkipLoad);
+  Compiler::ErrorCode compileScript(RSScript &pScript,
+                                    const char* pScriptName,
+                                    const char *pOutputPath,
+                                    const char *pRuntimePath,
+                                    const RSInfo::DependencyTableTy &pDeps,
+                                    bool pSkipLoad, bool pDumpIR = false);
 
 public:
   RSCompilerDriver(bool pUseCompilerRT = true);
@@ -78,16 +82,32 @@
     mDebugContext = v;
   }
 
-  // FIXME: This method accompany with loadScriptCache and compileScript should
+  // This function enables/disables merging of global static variables.
+  // Note that it only takes effect on ARM architectures (other architectures
+  // do not offer this option).
+  void setEnableGlobalMerge(bool v) {
+    mEnableGlobalMerge = v;  // The constructor initializes this flag to true.
+  }
+
+  bool getEnableGlobalMerge() const {
+    return mEnableGlobalMerge;  // Last value given to setEnableGlobalMerge().
+  }
+
+  // FIXME: This method, along with loadScript and compileScript, should
   //        all be const-methods. They're not now because the getAddress() in
   //        SymbolResolverInterface is not a const-method.
-  RSExecutable *build(BCCContext &pContext,
-                      const char *pCacheDir, const char *pResName,
-                      const char *pBitcode, size_t pBitcodeSize,
-                      const char *pRuntimePath,
-                      RSLinkRuntimeCallback pLinkRuntimeCallback = NULL);
-  RSExecutable *build(RSScript &pScript, const char *pOut,
-                      const char *pRuntimePath);
+  // Returns true if script is successfully compiled.
+  bool build(BCCContext &pContext, const char *pCacheDir, const char *pResName,
+             const char *pBitcode, size_t pBitcodeSize,
+             const char *pRuntimePath,
+             RSLinkRuntimeCallback pLinkRuntimeCallback = NULL,
+             bool pDumpIR = false);
+
+  // Returns true if script is successfully compiled.
+  bool build(RSScript &pScript, const char *pOut, const char *pRuntimePath);
+
+  RSExecutable *loadScript(const char *pCacheDir, const char *pResName,
+                           const char *pBitcode, size_t pBitcodeSize);
 };
 
 } // end namespace bcc
diff --git a/include/bcc/Renderscript/RSInfo.h b/include/bcc/Renderscript/RSInfo.h
index f76813d..a0a775d 100644
--- a/include/bcc/Renderscript/RSInfo.h
+++ b/include/bcc/Renderscript/RSInfo.h
@@ -21,13 +21,16 @@
 
 #include <utility>
 
-#include "bcc/Renderscript/RSScript.h"
 #include "bcc/Support/Log.h"
 #include "bcc/Support/Sha1Util.h"
 
 #include <utils/String8.h>
 #include <utils/Vector.h>
 
+namespace llvm {
+class Module;
+}
+
 namespace bcc {
 
 // Forward declarations
@@ -35,6 +38,9 @@
 class InputFile;
 class OutputFile;
 class Source;
+class RSScript;
+
+typedef llvm::Module* (*RSLinkRuntimeCallback) (bcc::RSScript *, llvm::Module *, llvm::Module *);
 
 namespace rsinfo {
 
@@ -157,7 +163,7 @@
 
   // Return the path of the RS info file corresponded to the given output
   // executable file.
-  static android::String8 GetPath(const FileBase &pFile);
+  static android::String8 GetPath(const char *pFilename);
 
   static const char LibBCCPath[];
   static const char LibCompilerRTPath[];
diff --git a/include/bcc/Renderscript/RSScript.h b/include/bcc/Renderscript/RSScript.h
index b6c19ef..7f927c7 100644
--- a/include/bcc/Renderscript/RSScript.h
+++ b/include/bcc/Renderscript/RSScript.h
@@ -18,6 +18,7 @@
 #define BCC_RS_SCRIPT_H
 
 #include "bcc/Script.h"
+#include "bcc/Renderscript/RSInfo.h"
 #include "bcc/Support/Sha1Util.h"
 
 namespace llvm {
@@ -26,12 +27,9 @@
 
 namespace bcc {
 
-class RSInfo;
 class RSScript;
 class Source;
 
-typedef llvm::Module* (*RSLinkRuntimeCallback) (bcc::RSScript *, llvm::Module *, llvm::Module *);
-
 class RSScript : public Script {
 public:
   // This is one-one mapping with the llvm::CodeGenOpt::Level in
@@ -64,6 +62,10 @@
 
   RSScript(Source &pSource);
 
+  virtual ~RSScript() {  // NOTE(review): setInfo() overwrites mInfo; confirm no leak.
+    delete mInfo;  // The script releases the RSInfo stored through setInfo().
+  }
+
   // Set the associated RSInfo of the script.
   void setInfo(const RSInfo *pInfo) {
     mInfo = pInfo;
diff --git a/include/bcc/Source.h b/include/bcc/Source.h
index 4aa76c1..9ba860b 100644
--- a/include/bcc/Source.h
+++ b/include/bcc/Source.h
@@ -47,8 +47,6 @@
   static Source *CreateFromFile(BCCContext &pContext,
                                 const std::string &pPath);
 
-  static Source *CreateFromFd(BCCContext &pContext, int pFd);
-
   // Create a Source object from an existing module. If pNoDelete
   // is true, destructor won't call delete on the given module.
   static Source *CreateFromModule(BCCContext &pContext,
diff --git a/include/bcc/Support/Properties.h b/include/bcc/Support/Properties.h
new file mode 100644
index 0000000..c82901c
--- /dev/null
+++ b/include/bcc/Support/Properties.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BCC_SUPPORT_PROPERTIES_H
+#define BCC_SUPPORT_PROPERTIES_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#include <cutils/properties.h>
+#endif
+
+// Read the system property named |str| and return it as an unsigned number.
+//
+// On device builds (HAVE_ANDROID_OS defined, RS_SERVER not defined) the value
+// is fetched with property_get(), using "0" when the property is unset, and
+// parsed as a base-10 integer. All other builds have no property service and
+// always yield 0.
+static inline uint32_t getProperty(const char *str) {
+#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+    char buf[PROPERTY_VALUE_MAX];
+    property_get(str, buf, "0");
+    // strtoul() has defined behavior for out-of-range input; atoi() does not.
+    return (uint32_t)strtoul(buf, NULL, 10);
+#else
+    (void)str;  // Unused in this configuration; avoids -Wunused-parameter.
+    return 0;
+#endif
+}
+
+#endif // BCC_SUPPORT_PROPERTIES_H
diff --git a/include/bcinfo/MetadataExtractor.h b/include/bcinfo/MetadataExtractor.h
index d1a88d9..dbfd5ae 100644
--- a/include/bcinfo/MetadataExtractor.h
+++ b/include/bcinfo/MetadataExtractor.h
@@ -197,6 +197,66 @@
   enum RSFloatPrecision getRSFloatPrecision() const {
     return mRSFloatPrecision;
   }
+
+  /**
+   * Tests the "In" parameter flag (mask 0x01) of a ForEach signature.
+   *
+   * \param sig - ForEach function signature bitmask to inspect.
+   * \return true when the flag is set in \p sig, false otherwise.
+   */
+  static bool hasForEachSignatureIn(uint32_t sig) {
+    return (sig & 0x01) != 0;
+  }
+
+  /**
+   * Tests the "Out" parameter flag (mask 0x02) of a ForEach signature.
+   *
+   * \param sig - ForEach function signature bitmask to inspect.
+   * \return true when the flag is set in \p sig, false otherwise.
+   */
+  static bool hasForEachSignatureOut(uint32_t sig) {
+    return (sig & 0x02) != 0;
+  }
+
+  /**
+   * Tests the "UsrData" parameter flag (mask 0x04) of a ForEach signature.
+   *
+   * \param sig - ForEach function signature bitmask to inspect.
+   * \return true when the flag is set in \p sig, false otherwise.
+   */
+  static bool hasForEachSignatureUsrData(uint32_t sig) {
+    return (sig & 0x04) != 0;
+  }
+
+  /**
+   * Tests the "X" parameter flag (mask 0x08) of a ForEach signature.
+   *
+   * \param sig - ForEach function signature bitmask to inspect.
+   * \return true when the flag is set in \p sig, false otherwise.
+   */
+  static bool hasForEachSignatureX(uint32_t sig) {
+    return (sig & 0x08) != 0;
+  }
+
+  /**
+   * Tests the "Y" parameter flag (mask 0x10) of a ForEach signature.
+   *
+   * \param sig - ForEach function signature bitmask to inspect.
+   * \return true when the flag is set in \p sig, false otherwise.
+   */
+  static bool hasForEachSignatureY(uint32_t sig) {
+    return (sig & 0x10) != 0;
+  }
+
+  /**
+   * Tests the pass-by-value "Kernel" flag (mask 0x20) of a ForEach signature.
+   *
+   * \param sig - ForEach function signature bitmask to inspect.
+   * \return true when the flag is set in \p sig, false otherwise.
+   */
+  static bool hasForEachSignatureKernel(uint32_t sig) {
+    return (sig & 0x20) != 0;
+  }
 };
 
 }  // namespace bcinfo
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index 458fbc0..3440570 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -25,6 +25,7 @@
 #include <llvm/IR/DataLayout.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/IPO/PassManagerBuilder.h>
 #include <llvm/Transforms/Scalar.h>
 
 #include "bcc/Script.h"
@@ -36,57 +37,51 @@
 using namespace bcc;
 
 const char *Compiler::GetErrorString(enum ErrorCode pErrCode) {
-  static const char *ErrorString[] = {
-    /* kSuccess */
-    "Successfully compiled.",
-    /* kInvalidConfigNoTarget */
-    "Invalid compiler config supplied (getTarget() returns NULL.) "
-    "(missing call to CompilerConfig::initialize()?)",
-    /* kErrCreateTargetMachine */
-    "Failed to create llvm::TargetMachine.",
-    /* kErrSwitchTargetMachine */
-    "Failed to switch llvm::TargetMachine.",
-    /* kErrNoTargetMachine */
-    "Failed to compile the script since there's no available TargetMachine."
-    " (missing call to Compiler::config()?)",
-    /* kErrDataLayoutNoMemory */
-    "Out of memory when create DataLayout during compilation.",
-    /* kErrMaterialization */
-    "Failed to materialize the module.",
-    /* kErrInvalidOutputFileState */
-    "Supplied output file was invalid (in the error state.)",
-    /* kErrPrepareOutput */
-    "Failed to prepare file for output.",
-    /* kPrepareCodeGenPass */
-    "Failed to construct pass list for code-generation.",
-
-    /* kErrHookBeforeAddLTOPasses */
-    "Error occurred during beforeAddLTOPasses() in subclass.",
-    /* kErrHookAfterAddLTOPasses */
-    "Error occurred during afterAddLTOPasses() in subclass.",
-    /* kErrHookBeforeExecuteLTOPasses */
-    "Error occurred during beforeExecuteLTOPasses() in subclass.",
-    /* kErrHookAfterExecuteLTOPasses */
-    "Error occurred during afterExecuteLTOPasses() in subclass.",
-
-    /* kErrHookBeforeAddCodeGenPasses */
-    "Error occurred during beforeAddCodeGenPasses() in subclass.",
-    /* kErrHookAfterAddCodeGenPasses */
-    "Error occurred during afterAddCodeGenPasses() in subclass.",
-    /* kErrHookBeforeExecuteCodeGenPasses */
-    "Error occurred during beforeExecuteCodeGenPasses() in subclass.",
-    /* kErrHookAfterExecuteCodeGenPasses */
-    "Error occurred during afterExecuteCodeGenPasses() in subclass.",
-
-    /* kMaxErrorCode */
-    "(Unknown error code)"
-  };
-
-  if (pErrCode > kMaxErrorCode) {
-    pErrCode = kMaxErrorCode;
+  switch (pErrCode) {
+  case kSuccess:
+    return "Successfully compiled.";
+  case kInvalidConfigNoTarget:
+    return "Invalid compiler config supplied (getTarget() returns NULL.) "
+           "(missing call to CompilerConfig::initialize()?)";
+  case kErrCreateTargetMachine:
+    return "Failed to create llvm::TargetMachine.";
+  case kErrSwitchTargetMachine:
+    return "Failed to switch llvm::TargetMachine.";
+  case kErrNoTargetMachine:
+    return "Failed to compile the script since there's no available "
+           "TargetMachine. (missing call to Compiler::config()?)";
+  case kErrDataLayoutNoMemory:
+    return "Out of memory when create DataLayout during compilation.";
+  case kErrMaterialization:
+    return "Failed to materialize the module.";
+  case kErrInvalidOutputFileState:
+    return "Supplied output file was invalid (in the error state.)";
+  case kErrPrepareOutput:
+    return "Failed to prepare file for output.";
+  case kPrepareCodeGenPass:
+    return "Failed to construct pass list for code-generation.";
+  case kErrHookBeforeAddLTOPasses:
+    return "Error occurred during beforeAddLTOPasses() in subclass.";
+  case kErrHookAfterAddLTOPasses:
+    return "Error occurred during afterAddLTOPasses() in subclass.";
+  case kErrHookAfterExecuteLTOPasses:
+    return "Error occurred during afterExecuteLTOPasses() in subclass.";
+  case kErrHookBeforeAddCodeGenPasses:
+    return "Error occurred during beforeAddCodeGenPasses() in subclass.";
+  case kErrHookAfterAddCodeGenPasses:
+    return "Error occurred during afterAddCodeGenPasses() in subclass.";
+  case kErrHookBeforeExecuteCodeGenPasses:
+    return "Error occurred during beforeExecuteCodeGenPasses() in subclass.";
+  case kErrHookAfterExecuteCodeGenPasses:
+    return "Error occurred during afterExecuteCodeGenPasses() in subclass.";
+  case kErrInvalidSource:
+    return "Error loading input bitcode";
   }
 
-  return ErrorString[ static_cast<size_t>(pErrCode) ];
+  // Only reachable for values outside the enum (the switch covers every
+  // enumerator); keep the legacy fallback text when assert() is compiled out.
+  assert(false && "Unknown error code encountered");
+  return "(Unknown error code)";
 }
 
 //===----------------------------------------------------------------------===//
@@ -167,108 +162,30 @@
   // Add DataLayout to the pass manager.
   lto_passes.add(data_layout);
 
-  // Invokde "beforeAddLTOPasses" before adding the first pass.
+  // Invoke "beforeAddLTOPasses" before adding the first pass.
   if (!beforeAddLTOPasses(pScript, lto_passes)) {
     return kErrHookBeforeAddLTOPasses;
   }
 
-  // We now create passes list performing LTO. These are copied from
-  // (including comments) llvm::PassManagerBuilder::populateLTOPassManager().
-  // Only a subset of these LTO passes are enabled in optimization level 0 as
-  // they interfere with interactive debugging.
-  //
-  // FIXME: Figure out which passes (if any) makes sense for levels 1 and 2.
-  //if ( != llvm::CodeGenOpt::None) {
   if (mTarget->getOptLevel() == llvm::CodeGenOpt::None) {
     lto_passes.add(llvm::createGlobalOptimizerPass());
     lto_passes.add(llvm::createConstantMergePass());
   } else {
-    // Propagate constants at call sites into the functions they call. This
-    // opens opportunities for globalopt (and inlining) by substituting
-    // function pointers passed as arguments to direct uses of functions.
-    lto_passes.add(llvm::createIPSCCPPass());
-
-    // Now that we internalized some globals, see if we can hack on them!
-    lto_passes.add(llvm::createGlobalOptimizerPass());
-
-    // Linking modules together can lead to duplicated global constants, only
-    // keep one copy of each constant...
-    lto_passes.add(llvm::createConstantMergePass());
-
-    // Remove unused arguments from functions...
-    lto_passes.add(llvm::createDeadArgEliminationPass());
-
-    // Reduce the code after globalopt and ipsccp. Both can open up
-    // significant simplification opportunities, and both can propagate
-    // functions through function pointers. When this happens, we often have
-    // to resolve varargs calls, etc, so let instcombine do this.
-    lto_passes.add(llvm::createInstructionCombiningPass());
-
-    // Inline small functions
-    lto_passes.add(llvm::createFunctionInliningPass());
-
-    // Remove dead EH info.
-    lto_passes.add(llvm::createPruneEHPass());
-
-    // Internalize the globals again after inlining
-    lto_passes.add(llvm::createGlobalOptimizerPass());
-
-    // Remove dead functions.
-    lto_passes.add(llvm::createGlobalDCEPass());
-
-    // If we didn't decide to inline a function, check to see if we can
-    // transform it to pass arguments by value instead of by reference.
-    lto_passes.add(llvm::createArgumentPromotionPass());
-
-    // The IPO passes may leave cruft around.  Clean up after them.
-    lto_passes.add(llvm::createInstructionCombiningPass());
-    lto_passes.add(llvm::createJumpThreadingPass());
-
-    // Break up allocas
-    lto_passes.add(llvm::createScalarReplAggregatesPass());
-
-    // Run a few AA driven optimizations here and now, to cleanup the code.
-    lto_passes.add(llvm::createFunctionAttrsPass());  // Add nocapture.
-    lto_passes.add(llvm::createGlobalsModRefPass());  // IP alias analysis.
-
-    // Hoist loop invariants.
-    lto_passes.add(llvm::createLICMPass());
-
-    // Remove redundancies.
-    lto_passes.add(llvm::createGVNPass());
-
-    // Remove dead memcpys.
-    lto_passes.add(llvm::createMemCpyOptPass());
-
-    // Nuke dead stores.
-    lto_passes.add(llvm::createDeadStoreEliminationPass());
-
-    // Cleanup and simplify the code after the scalar optimizations.
-    lto_passes.add(llvm::createInstructionCombiningPass());
-
-    lto_passes.add(llvm::createJumpThreadingPass());
-
-    // Delete basic blocks, which optimization passes may have killed.
-    lto_passes.add(llvm::createCFGSimplificationPass());
-
-    // Now that we have optimized the program, discard unreachable functions.
-    lto_passes.add(llvm::createGlobalDCEPass());
+    // FIXME: Figure out which passes should be executed.
+    llvm::PassManagerBuilder Builder;
+    Builder.populateLTOPassManager(lto_passes, /*Internalize*/false,
+                                   /*RunInliner*/true);
   }
 
-  // Invokde "afterAddLTOPasses" after pass manager finished its
+  // Invoke "afterAddLTOPasses" after pass manager finished its
   // construction.
   if (!afterAddLTOPasses(pScript, lto_passes)) {
     return kErrHookAfterAddLTOPasses;
   }
 
-  // Invokde "beforeExecuteLTOPasses" before executing the passes.
-  if (!beforeExecuteLTOPasses(pScript, lto_passes)) {
-    return kErrHookBeforeExecuteLTOPasses;
-  }
-
   lto_passes.run(pScript.getSource().getModule());
 
-  // Invokde "afterExecuteLTOPasses" before returning.
+  // Invoke "afterExecuteLTOPasses" before returning.
   if (!afterExecuteLTOPasses(pScript)) {
     return kErrHookAfterExecuteLTOPasses;
   }
@@ -327,7 +244,8 @@
 }
 
 enum Compiler::ErrorCode Compiler::compile(Script &pScript,
-                                           llvm::raw_ostream &pResult) {
+                                           llvm::raw_ostream &pResult,
+                                           llvm::raw_ostream *IRStream) {
   llvm::Module &module = pScript.getSource().getModule();
   enum ErrorCode err;
 
@@ -352,6 +270,9 @@
     return err;
   }
 
+  if (IRStream)
+    *IRStream << module;
+
   if ((err = runCodeGen(pScript, pResult)) != kSuccess) {
     return err;
   }
@@ -360,7 +281,8 @@
 }
 
 enum Compiler::ErrorCode Compiler::compile(Script &pScript,
-                                           OutputFile &pResult) {
+                                           OutputFile &pResult,
+                                           llvm::raw_ostream *IRStream) {
   // Check the state of the specified output file.
   if (pResult.hasError()) {
     return kErrInvalidOutputFileState;
@@ -373,7 +295,7 @@
   }
 
   // Delegate the request.
-  enum Compiler::ErrorCode err = compile(pScript, *out);
+  enum Compiler::ErrorCode err = compile(pScript, *out, IRStream);
 
   // Close the output before return.
   delete out;
diff --git a/lib/Core/Source.cpp b/lib/Core/Source.cpp
index e0e6886..41397b9 100644
--- a/lib/Core/Source.cpp
+++ b/lib/Core/Source.cpp
@@ -112,34 +112,6 @@
   return result;
 }
 
-Source *Source::CreateFromFd(BCCContext &pContext, int pFd) {
-  llvm::OwningPtr<llvm::MemoryBuffer> input_data;
-
-  llvm::error_code ec =
-      llvm::MemoryBuffer::getOpenFile(pFd, /* Filename */"", input_data);
-
-  if (ec != llvm::error_code::success()) {
-    ALOGE("Failed to load bitcode from file descriptor %d! (%s)",
-          pFd, ec.message().c_str());
-    return NULL;
-  }
-
-  llvm::MemoryBuffer *input_memory = input_data.take();
-  llvm::Module *module = helper_load_bitcode(pContext.mImpl->mLLVMContext,
-                                             input_memory);
-  if (module == NULL) {
-    delete input_memory;
-    return NULL;
-  }
-
-  Source *result = CreateFromModule(pContext, *module, /* pNoDelete */false);
-  if (result == NULL) {
-    delete module;
-  }
-
-  return result;
-}
-
 Source *Source::CreateFromModule(BCCContext &pContext, llvm::Module &pModule,
                                  bool pNoDelete) {
   Source *result = new (std::nothrow) Source(pContext, pModule, pNoDelete);
diff --git a/lib/ExecutionEngine/SymbolResolvers.cpp b/lib/ExecutionEngine/SymbolResolvers.cpp
index 3d21e7d..4194a6b 100644
--- a/lib/ExecutionEngine/SymbolResolvers.cpp
+++ b/lib/ExecutionEngine/SymbolResolvers.cpp
@@ -87,6 +87,9 @@
 }
 
 DyldSymbolResolver::~DyldSymbolResolver() {
-  ::dlclose(mHandle);
+  if (mHandle != NULL) {  // Nothing to close when no handle is held.
+    ::dlclose(mHandle);
+    mHandle = NULL;
+  }
   delete [] mError;
 }
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index bbd1e64..b0ad839 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -68,6 +68,3 @@
 include $(LIBBCC_GEN_CONFIG_MK)
 include $(LLVM_HOST_BUILD_MK)
 include $(BUILD_HOST_STATIC_LIBRARY)
-
-# Build Renderscript runtime (libclcore.bc)
-include $(LOCAL_PATH)/runtime/Android.mk
diff --git a/lib/Renderscript/RSCompiler.cpp b/lib/Renderscript/RSCompiler.cpp
index 618c1c3..9acc455 100644
--- a/lib/Renderscript/RSCompiler.cpp
+++ b/lib/Renderscript/RSCompiler.cpp
@@ -29,7 +29,7 @@
 
 using namespace bcc;
 
-bool RSCompiler::beforeAddLTOPasses(Script &pScript, llvm::PassManager &pPM) {
+bool RSCompiler::addInternalizeSymbolsPass(Script &pScript, llvm::PassManager &pPM) {
   // Add a pass to internalize the symbols that don't need to have global
   // visibility.
   RSScript &script = static_cast<RSScript &>(pScript);
@@ -87,11 +87,7 @@
   return true;
 }
 
-bool RSCompiler::beforeExecuteLTOPasses(Script &pScript,
-                                        llvm::PassManager &pPM) {
-  // Execute a pass to expand foreach-able functions
-  llvm::PassManager rs_passes;
-
+bool RSCompiler::addExpandForEachPass(Script &pScript, llvm::PassManager &pPM) {
   // Script passed to RSCompiler must be a RSScript.
   RSScript &script = static_cast<RSScript &>(pScript);
   const RSInfo *info = script.getInfo();
@@ -104,14 +100,21 @@
   }
 
   // Expand ForEach on CPU path to reduce launch overhead.
-  rs_passes.add(createRSForEachExpandPass(info->getExportForeachFuncs(),
-                                          /* pEnableStepOpt */ true));
-  if (script.getEmbedInfo()) {
-    rs_passes.add(createRSEmbedInfoPass(info));
-  }
+  bool pEnableStepOpt = true;
+  pPM.add(createRSForEachExpandPass(info->getExportForeachFuncs(),
+                                    pEnableStepOpt));
+  if (script.getEmbedInfo())
+    pPM.add(createRSEmbedInfoPass(info));
 
-  // Execute the pass.
-  rs_passes.run(module);
+  return true;
+}
+
+bool RSCompiler::beforeAddLTOPasses(Script &pScript, llvm::PassManager &pPM) {
+  if (!addExpandForEachPass(pScript, pPM))  // Queued first: ForEach expansion.
+    return false;
+  // Queued second: internalization of symbols that need no global visibility.
+  if (!addInternalizeSymbolsPass(pScript, pPM))
+    return false;
 
   return true;
 }
diff --git a/lib/Renderscript/RSCompilerDriver.cpp b/lib/Renderscript/RSCompilerDriver.cpp
index 3d8bc2c..253c4f3 100644
--- a/lib/Renderscript/RSCompilerDriver.cpp
+++ b/lib/Renderscript/RSCompilerDriver.cpp
@@ -16,10 +16,14 @@
 
 #include "bcc/Renderscript/RSCompilerDriver.h"
 
+#include <llvm/IR/Module.h>
+#include <llvm/Support/CommandLine.h>
 #include <llvm/Support/Path.h>
+#include <llvm/Support/raw_ostream.h>
 
 #include "bcinfo/BitcodeWrapper.h"
 
+#include "bcc/Compiler.h"
 #include "bcc/Renderscript/RSExecutable.h"
 #include "bcc/Renderscript/RSScript.h"
 #include "bcc/Support/CompilerConfig.h"
@@ -32,36 +36,17 @@
 #include "bcc/Support/Sha1Util.h"
 #include "bcc/Support/OutputFile.h"
 
+#ifdef HAVE_ANDROID_OS
 #include <cutils/properties.h>
+#endif
 #include <utils/String8.h>
 #include <utils/StopWatch.h>
 
 using namespace bcc;
 
-namespace {
-
-bool is_force_recompile() {
-  char buf[PROPERTY_VALUE_MAX];
-
-  // Re-compile if floating point precision has been overridden.
-  property_get("debug.rs.precision", buf, "");
-  if (buf[0] != '\0') {
-    return true;
-  }
-
-  // Re-compile if debug.rs.forcerecompile is set.
-  property_get("debug.rs.forcerecompile", buf, "0");
-  if ((::strcmp(buf, "1") == 0) || (::strcmp(buf, "true") == 0)) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-} // end anonymous namespace
-
 RSCompilerDriver::RSCompilerDriver(bool pUseCompilerRT) :
-    mConfig(NULL), mCompiler(), mCompilerRuntime(NULL), mDebugContext(false) {
+    mConfig(NULL), mCompiler(), mCompilerRuntime(NULL), mDebugContext(false),
+    mEnableGlobalMerge(true) {
   init::Initialize();
   // Chain the symbol resolvers for compiler_rt and RS runtimes.
   if (pUseCompilerRT) {
@@ -77,21 +62,38 @@
 }
 
 RSExecutable *
-RSCompilerDriver::loadScriptCache(const char *pOutputPath,
-                                  const RSInfo::DependencyTableTy &pDeps) {
-  //android::StopWatch load_time("bcc: RSCompilerDriver::loadScriptCache time");
-  RSExecutable *result = NULL;
-
-  if (is_force_recompile())
+RSCompilerDriver::loadScript(const char *pCacheDir, const char *pResName,
+                             const char *pBitcode, size_t pBitcodeSize) {
+  //android::StopWatch load_time("bcc: RSCompilerDriver::loadScript time");
+  if ((pCacheDir == NULL) || (pResName == NULL)) {
+    ALOGE("Missing pCacheDir and/or pResName");
     return NULL;
+  }
+
+  if ((pBitcode == NULL) || (pBitcodeSize <= 0)) {
+    ALOGE("No bitcode supplied! (bitcode: %p, size of bitcode: %zu)",
+          pBitcode, pBitcodeSize);
+    return NULL;
+  }
+
+  RSInfo::DependencyTableTy dep_info;
+  uint8_t bitcode_sha1[20];
+  Sha1Util::GetSHA1DigestFromBuffer(bitcode_sha1, pBitcode, pBitcodeSize);
+
+  // {pCacheDir}/{pResName}.o
+  llvm::SmallString<80> output_path(pCacheDir);
+  llvm::sys::path::append(output_path, pResName);
+  llvm::sys::path::replace_extension(output_path, ".o");
+
+  dep_info.push(std::make_pair(output_path.c_str(), bitcode_sha1));
 
   //===--------------------------------------------------------------------===//
-  // Acquire the read lock for reading output object file.
+  // Acquire the read lock for reading the Script object file.
   //===--------------------------------------------------------------------===//
-  FileMutex<FileBase::kReadLock> read_output_mutex(pOutputPath);
+  FileMutex<FileBase::kReadLock> read_output_mutex(output_path.c_str());
 
   if (read_output_mutex.hasError() || !read_output_mutex.lock()) {
-    ALOGE("Unable to acquire the read lock for %s! (%s)", pOutputPath,
+    ALOGE("Unable to acquire the read lock for %s! (%s)", output_path.c_str(),
           read_output_mutex.getErrorMessage().c_str());
     return NULL;
   }
@@ -99,25 +101,25 @@
   //===--------------------------------------------------------------------===//
   // Read the output object file.
   //===--------------------------------------------------------------------===//
-  InputFile *output_file = new (std::nothrow) InputFile(pOutputPath);
+  InputFile *object_file = new (std::nothrow) InputFile(output_path.c_str());
 
-  if ((output_file == NULL) || output_file->hasError()) {
-      //      ALOGE("Unable to open the %s for read! (%s)", pOutputPath,
-      //            output_file->getErrorMessage().c_str());
-    delete output_file;
+  if ((object_file == NULL) || object_file->hasError()) {
+      //      ALOGE("Unable to open the %s for read! (%s)", output_path.c_str(),
+      //            object_file->getErrorMessage().c_str());
+    delete object_file;
     return NULL;
   }
 
   //===--------------------------------------------------------------------===//
-  // Acquire the read lock on output_file for reading its RS info file.
+  // Acquire the read lock on object_file for reading its RS info file.
   //===--------------------------------------------------------------------===//
-  android::String8 info_path = RSInfo::GetPath(*output_file);
+  android::String8 info_path = RSInfo::GetPath(output_path.c_str());
 
-  if (!output_file->lock()) {
+  if (!object_file->lock()) {
     ALOGE("Unable to acquire the read lock on %s for reading %s! (%s)",
-          pOutputPath, info_path.string(),
-          output_file->getErrorMessage().c_str());
-    delete output_file;
+          output_path.c_str(), info_path.string(),
+          object_file->getErrorMessage().c_str());
+    delete object_file;
     return NULL;
   }
 
@@ -125,22 +127,22 @@
   // Open and load the RS info file.
   //===--------------------------------------------------------------------===//
   InputFile info_file(info_path.string());
-  RSInfo *info = RSInfo::ReadFromFile(info_file, pDeps);
+  RSInfo *info = RSInfo::ReadFromFile(info_file, dep_info);
 
-  // Release the lock on output_file.
-  output_file->unlock();
+  // Release the lock on object_file.
+  object_file->unlock();
 
   if (info == NULL) {
-    delete output_file;
+    delete object_file;
     return NULL;
   }
 
   //===--------------------------------------------------------------------===//
   // Create the RSExecutable.
   //===--------------------------------------------------------------------===//
-  result = RSExecutable::Create(*info, *output_file, mResolver);
+  RSExecutable *result = RSExecutable::Create(*info, *object_file, mResolver);
   if (result == NULL) {
-    delete output_file;
+    delete object_file;
     delete info;
     return NULL;
   }
@@ -148,6 +150,10 @@
   return result;
 }
 
+#if defined(DEFAULT_ARM_CODEGEN)
+extern llvm::cl::opt<bool> EnableGlobalMerge;
+#endif
+
 bool RSCompilerDriver::setupConfig(const RSScript &pScript) {
   bool changed = false;
 
@@ -169,6 +175,9 @@
       return false;
     }
     mConfig->setOptimizationLevel(script_opt_level);
+#if defined(DEFAULT_ARM_CODEGEN)
+    EnableGlobalMerge = mEnableGlobalMerge;
+#endif
     changed = true;
   }
 
@@ -185,15 +194,14 @@
   return changed;
 }
 
-RSExecutable *
+Compiler::ErrorCode
 RSCompilerDriver::compileScript(RSScript &pScript,
                                 const char* pScriptName,
                                 const char *pOutputPath,
                                 const char *pRuntimePath,
                                 const RSInfo::DependencyTableTy &pDeps,
-                                bool pSkipLoad) {
+                                bool pSkipLoad, bool pDumpIR) {
   //android::StopWatch compile_time("bcc: RSCompilerDriver::compileScript time");
-  RSExecutable *result = NULL;
   RSInfo *info = NULL;
 
   //===--------------------------------------------------------------------===//
@@ -203,7 +211,7 @@
   // compiler therefore it should be extracted before compilation.
   info = RSInfo::ExtractFromSource(pScript.getSource(), pDeps);
   if (info == NULL) {
-    return NULL;
+    return Compiler::kErrInvalidSource;
   }
 
   //===--------------------------------------------------------------------===//
@@ -218,129 +226,119 @@
   //===--------------------------------------------------------------------===//
   if (!RSScript::LinkRuntime(pScript, pRuntimePath)) {
     ALOGE("Failed to link script '%s' with Renderscript runtime!", pScriptName);
-    return NULL;
+    return Compiler::kErrInvalidSource;
   }
 
-  // FIXME(srhines): Windows compilation can't use locking like this, but
-  // we also don't need to worry about concurrent writers of the same file.
+  {
+    // FIXME(srhines): Windows compilation can't use locking like this, but
+    // we also don't need to worry about concurrent writers of the same file.
 #ifndef USE_MINGW
-  //===--------------------------------------------------------------------===//
-  // Acquire the write lock for writing output object file.
-  //===--------------------------------------------------------------------===//
-  FileMutex<FileBase::kWriteLock> write_output_mutex(pOutputPath);
+    //===------------------------------------------------------------------===//
+    // Acquire the write lock for writing output object file.
+    //===------------------------------------------------------------------===//
+    FileMutex<FileBase::kWriteLock> write_output_mutex(pOutputPath);
 
-  if (write_output_mutex.hasError() || !write_output_mutex.lock()) {
-    ALOGE("Unable to acquire the lock for writing %s! (%s)",
-          pOutputPath, write_output_mutex.getErrorMessage().c_str());
-    return NULL;
-  }
+    if (write_output_mutex.hasError() || !write_output_mutex.lock()) {
+      ALOGE("Unable to acquire the lock for writing %s! (%s)",
+            pOutputPath, write_output_mutex.getErrorMessage().c_str());
+      return Compiler::kErrInvalidSource;
+    }
 #endif
 
-  //===--------------------------------------------------------------------===//
-  // Open the output file for write.
-  //===--------------------------------------------------------------------===//
-  unsigned flags = FileBase::kTruncate | FileBase::kBinary;
-  if (mDebugContext) {
-    // Delete the cache file when we finish up under a debug context.
-    flags |= FileBase::kDeleteOnClose;
-  }
-  OutputFile *output_file = new (std::nothrow) OutputFile(pOutputPath, flags);
+    // Open the output file for write.
+    OutputFile output_file(pOutputPath,
+                           FileBase::kTruncate | FileBase::kBinary);
 
-  if ((output_file == NULL) || output_file->hasError()) {
-      ALOGE("Unable to open %s for write! (%s)", pOutputPath,
-            output_file->getErrorMessage().c_str());
-    delete info;
-    delete output_file;
-    return NULL;
-  }
-
-  //===--------------------------------------------------------------------===//
-  // Setup the config to the compiler.
-  //===--------------------------------------------------------------------===//
-  bool compiler_need_reconfigure = setupConfig(pScript);
-
-  if (mConfig == NULL) {
-    ALOGE("Failed to setup config for RS compiler to compile %s!", pOutputPath);
-    delete info;
-    delete output_file;
-    return NULL;
-  }
-
-  // Compiler need to re-config if it's haven't run the config() yet or the
-  // configuration it referenced is changed.
-  if (compiler_need_reconfigure) {
-    Compiler::ErrorCode err = mCompiler.config(*mConfig);
-    if (err != Compiler::kSuccess) {
-      ALOGE("Failed to config the RS compiler for %s! (%s)",pOutputPath,
-            Compiler::GetErrorString(err));
-      delete info;
-      delete output_file;
-      return NULL;
+    if (output_file.hasError()) {
+        ALOGE("Unable to open %s for write! (%s)", pOutputPath,
+              output_file.getErrorMessage().c_str());
+      return Compiler::kErrInvalidSource;
     }
-  }
 
-  //===--------------------------------------------------------------------===//
-  // Run the compiler.
-  //===--------------------------------------------------------------------===//
-  Compiler::ErrorCode compile_result = mCompiler.compile(pScript, *output_file);
-  if (compile_result != Compiler::kSuccess) {
-    ALOGE("Unable to compile the source to file %s! (%s)", pOutputPath,
-          Compiler::GetErrorString(compile_result));
-    delete info;
-    delete output_file;
-    return NULL;
+    // Setup the config to the compiler.
+    bool compiler_need_reconfigure = setupConfig(pScript);
+
+    if (mConfig == NULL) {
+      ALOGE("Failed to setup config for RS compiler to compile %s!",
+            pOutputPath);
+      return Compiler::kErrInvalidSource;
+    }
+
+    if (compiler_need_reconfigure) {
+      Compiler::ErrorCode err = mCompiler.config(*mConfig);
+      if (err != Compiler::kSuccess) {
+        ALOGE("Failed to config the RS compiler for %s! (%s)",pOutputPath,
+              Compiler::GetErrorString(err));
+        return Compiler::kErrInvalidSource;
+      }
+    }
+
+    OutputFile *ir_file = NULL;
+    llvm::raw_fd_ostream *IRStream = NULL;
+    if (pDumpIR) {
+      android::String8 path(pOutputPath);
+      path.append(".ll");
+      ir_file = new OutputFile(path.string(), FileBase::kTruncate);
+      IRStream = ir_file->dup();
+    }
+
+    // Run the compiler.
+    Compiler::ErrorCode compile_result = mCompiler.compile(pScript,
+                                                           output_file, IRStream);
+
+    if (ir_file) {
+      ir_file->close();
+      delete ir_file;
+    }
+
+    if (compile_result != Compiler::kSuccess) {
+      ALOGE("Unable to compile the source to file %s! (%s)", pOutputPath,
+            Compiler::GetErrorString(compile_result));
+      return Compiler::kErrInvalidSource;
+    }
   }
 
   // No need to produce an RSExecutable in this case.
   // TODO: Error handling in this case is nonexistent.
   if (pSkipLoad) {
-    return NULL;
+    return Compiler::kSuccess;
   }
 
-  //===--------------------------------------------------------------------===//
-  // Create the RSExecutable.
-  //===--------------------------------------------------------------------===//
-  result = RSExecutable::Create(*info, *output_file, mResolver);
-  if (result == NULL) {
-    delete info;
-    delete output_file;
-    return NULL;
+  {
+    android::String8 info_path = RSInfo::GetPath(pOutputPath);
+    OutputFile info_file(info_path.string(), FileBase::kTruncate);
+
+    if (info_file.hasError()) {
+      ALOGE("Failed to open the info file %s for write! (%s)",
+            info_path.string(), info_file.getErrorMessage().c_str());
+      return Compiler::kErrInvalidSource;
+    }
+
+    FileMutex<FileBase::kWriteLock> write_info_mutex(info_path.string());
+    if (write_info_mutex.hasError() || !write_info_mutex.lock()) {
+      ALOGE("Unable to acquire the lock for writing %s! (%s)",
+            info_path.string(), write_info_mutex.getErrorMessage().c_str());
+      return Compiler::kErrInvalidSource;
+    }
+
+    // Perform the write.
+    if (!info->write(info_file)) {
+      ALOGE("Failed to sync the RS info file %s!", info_path.string());
+      return Compiler::kErrInvalidSource;
+    }
   }
 
-  //===--------------------------------------------------------------------===//
-  // Dump the disassembly for debug when possible.
-  //===--------------------------------------------------------------------===//
-#if USE_DISASSEMBLER
-  OutputFile *disassembly_output =
-      new (std::nothrow) OutputFile(DEBUG_DISASSEMBLER_FILE,
-                                    FileBase::kAppend);
-
-  if (disassembly_output != NULL) {
-    result->dumpDisassembly(*disassembly_output);
-    delete disassembly_output;
-  }
-#endif
-
-  //===--------------------------------------------------------------------===//
-  // Write out the RS info file.
-  //===--------------------------------------------------------------------===//
-  // Note that write failure only results in a warning since the source is
-  // successfully compiled and loaded.
-  if (!result->syncInfo(/* pForce */true)) {
-    ALOGW("%s was successfully compiled and loaded but its RS info file failed "
-          "to write out!", pOutputPath);
-  }
-
-  return result;
+  return Compiler::kSuccess;
 }
 
-RSExecutable *RSCompilerDriver::build(BCCContext &pContext,
-                                      const char *pCacheDir,
-                                      const char *pResName,
-                                      const char *pBitcode,
-                                      size_t pBitcodeSize,
-                                      const char *pRuntimePath,
-                                      RSLinkRuntimeCallback pLinkRuntimeCallback) {
+bool RSCompilerDriver::build(BCCContext &pContext,
+                             const char *pCacheDir,
+                             const char *pResName,
+                             const char *pBitcode,
+                             size_t pBitcodeSize,
+                             const char *pRuntimePath,
+                             RSLinkRuntimeCallback pLinkRuntimeCallback,
+                             bool pDumpIR) {
     //  android::StopWatch build_time("bcc: RSCompilerDriver::build time");
   //===--------------------------------------------------------------------===//
   // Check parameters.
@@ -349,13 +347,13 @@
     ALOGE("Invalid parameter passed to RSCompilerDriver::build()! (cache dir: "
           "%s, resource name: %s)", ((pCacheDir) ? pCacheDir : "(null)"),
                                     ((pResName) ? pResName : "(null)"));
-    return NULL;
+    return false;
   }
 
   if ((pBitcode == NULL) || (pBitcodeSize <= 0)) {
     ALOGE("No bitcode supplied! (bitcode: %p, size of bitcode: %u)",
           pBitcode, static_cast<unsigned>(pBitcodeSize));
-    return NULL;
+    return false;
   }
 
   //===--------------------------------------------------------------------===//
@@ -364,36 +362,16 @@
   RSInfo::DependencyTableTy dep_info;
   uint8_t bitcode_sha1[20];
   Sha1Util::GetSHA1DigestFromBuffer(bitcode_sha1, pBitcode, pBitcodeSize);
-  dep_info.push(std::make_pair(pResName, bitcode_sha1));
 
   //===--------------------------------------------------------------------===//
   // Construct output path.
-  //===--------------------------------------------------------------------===//
-  llvm::sys::Path output_path(pCacheDir);
-
-  // {pCacheDir}/{pResName}
-  if (!output_path.appendComponent(pResName)) {
-    ALOGE("Failed to construct output path %s/%s!", pCacheDir, pResName);
-    return NULL;
-  }
-
   // {pCacheDir}/{pResName}.o
-  output_path.appendSuffix("o");
-
   //===--------------------------------------------------------------------===//
-  // Load cache.
-  //===--------------------------------------------------------------------===//
-  RSExecutable *result = NULL;
+  llvm::SmallString<80> output_path(pCacheDir);
+  llvm::sys::path::append(output_path, pResName);
+  llvm::sys::path::replace_extension(output_path, ".o");
 
-  // Skip loading from the cache if we are using a debug context.
-  if (!mDebugContext) {
-    result = loadScriptCache(output_path.c_str(), dep_info);
-
-    if (result != NULL) {
-      // Cache hit
-      return result;
-    }
-  }
+  dep_info.push(std::make_pair(output_path.c_str(), bitcode_sha1));
 
   //===--------------------------------------------------------------------===//
   // Load the bitcode and create script.
@@ -401,7 +379,7 @@
   Source *source = Source::CreateFromBuffer(pContext, pResName,
                                             pBitcode, pBitcodeSize);
   if (source == NULL) {
-    return NULL;
+    return false;
   }
 
   RSScript *script = new (std::nothrow) RSScript(*source);
@@ -409,7 +387,7 @@
     ALOGE("Out of memory when create Script object for '%s'! (output: %s)",
           pResName, output_path.c_str());
     delete source;
-    return NULL;
+    return false;
   }
 
   script->setLinkRuntimeCallback(pLinkRuntimeCallback);
@@ -423,26 +401,28 @@
   //===--------------------------------------------------------------------===//
   // Compile the script
   //===--------------------------------------------------------------------===//
-  result = compileScript(*script, pResName, output_path.c_str(), pRuntimePath,
-                         dep_info, false);
+  Compiler::ErrorCode status = compileScript(*script, pResName,
+                                             output_path.c_str(),
+                                             pRuntimePath, dep_info, false,
+                                             pDumpIR);
 
   // Script is no longer used. Free it to get more memory.
   delete script;
 
-  if (result == NULL) {
-    return NULL;
+  if (status != Compiler::kSuccess) {
+    return false;
   }
 
-  return result;
+  return true;
 }
 
 
-RSExecutable *RSCompilerDriver::build(RSScript &pScript, const char *pOut,
-                                      const char *pRuntimePath) {
+bool RSCompilerDriver::build(RSScript &pScript, const char *pOut,
+                             const char *pRuntimePath) {
   RSInfo::DependencyTableTy dep_info;
   RSInfo *info = RSInfo::ExtractFromSource(pScript.getSource(), dep_info);
   if (info == NULL) {
-    return NULL;
+    return false;
   }
   pScript.setInfo(info);
 
@@ -450,8 +430,12 @@
   // offline (host) compilation.
   pScript.setEmbedInfo(true);
 
-  RSExecutable *result = compileScript(pScript, pOut, pOut, pRuntimePath,
-                                       dep_info, true);
-  return result;
+  Compiler::ErrorCode status = compileScript(pScript, pOut, pOut, pRuntimePath,
+                                             dep_info, true);
+  if (status != Compiler::kSuccess) {
+    return false;
+  }
+
+  return true;
 }
 
diff --git a/lib/Renderscript/RSExecutable.cpp b/lib/Renderscript/RSExecutable.cpp
index be39f3c..ed06f30 100644
--- a/lib/Renderscript/RSExecutable.cpp
+++ b/lib/Renderscript/RSExecutable.cpp
@@ -129,7 +129,7 @@
     return true;
   }
 
-  android::String8 info_path = RSInfo::GetPath(*mObjFile);
+  android::String8 info_path = RSInfo::GetPath(mObjFile->getName().c_str());
   OutputFile info_file(info_path.string(), FileBase::kTruncate);
 
   if (info_file.hasError()) {
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSForEachExpand.cpp
index bf1a199..ca0bb1b 100644
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ b/lib/Renderscript/RSForEachExpand.cpp
@@ -23,16 +23,21 @@
 #include <llvm/IR/Function.h>
 #include <llvm/IR/Instructions.h>
 #include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/MDBuilder.h>
 #include <llvm/IR/Module.h>
 #include <llvm/Pass.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/IR/DataLayout.h>
+#include <llvm/IR/Function.h>
 #include <llvm/IR/Type.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
 
 #include "bcc/Config/Config.h"
 #include "bcc/Renderscript/RSInfo.h"
 #include "bcc/Support/Log.h"
 
+#include "bcinfo/MetadataExtractor.h"
+
 using namespace bcc;
 
 namespace {
@@ -103,6 +108,12 @@
   }
 
   // Get the actual value we should use to step through an allocation.
+  //
+  // Normally the value we use to step through an allocation is given to us by
+  // the driver. However, for certain primitive data types, we can derive an
+  // integer constant for the step value. We use this integer constant whenever
+  // possible to allow further compiler optimizations to take place.
+  //
   // DL - Target Data size/layout information.
   // T - Type of allocation (should be a pointer).
   // OrigStep - Original step increment (root.expand() input from driver).
@@ -123,30 +134,149 @@
     }
   }
 
-  static bool hasIn(uint32_t Signature) {
-    return Signature & 0x01;
+  /// @brief Returns the type of the ForEach stub parameter structure.
+  ///
+  /// Renderscript uses a single structure in which all parameters are passed
+  /// to keep the signature of the expanded function independent of the
+  /// parameters passed to it.
+  llvm::Type *getForeachStubTy() {
+    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*C);
+    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*C);
+    llvm::Type *SizeTy = Int32Ty;  // size_t is modeled as a 32-bit integer
+    /* Defined in frameworks/base/libs/rs/rs_hal.h:
+     *
+     * struct RsForEachStubParamStruct {
+     *   const void *in;
+     *   void *out;
+     *   const void *usr;
+     *   size_t usr_len;
+     *   uint32_t x;
+     *   uint32_t y;
+     *   uint32_t z;
+     *   uint32_t lod;
+     *   enum RsAllocationCubemapFace face;
+     *   uint32_t ar[16];
+     * };
+     */
+    llvm::SmallVector<llvm::Type*, 10> StructTys;  // one slot per field
+    StructTys.push_back(VoidPtrTy);  // const void *in
+    StructTys.push_back(VoidPtrTy);  // void *out
+    StructTys.push_back(VoidPtrTy);  // const void *usr
+    StructTys.push_back(SizeTy);     // size_t usr_len
+    StructTys.push_back(Int32Ty);    // uint32_t x
+    StructTys.push_back(Int32Ty);    // uint32_t y
+    StructTys.push_back(Int32Ty);    // uint32_t z
+    StructTys.push_back(Int32Ty);    // uint32_t lod
+    StructTys.push_back(Int32Ty);    // enum RsAllocationCubemapFace
+    StructTys.push_back(llvm::ArrayType::get(Int32Ty, 16));  // uint32_t ar[16]
+
+    return llvm::StructType::create(StructTys, "RsForEachStubParamStruct");
   }
 
-  static bool hasOut(uint32_t Signature) {
-    return Signature & 0x02;
+  /// @brief Create skeleton of the expanded function.
+  ///
+  /// This creates a function with the following signature:
+  ///
+  ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
+  ///         uint32_t instep, uint32_t outstep)
+  /// Named OldName + ".expand"; its single block "Begin" only returns void.
+  llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
+    llvm::Type *ForEachStubPtrTy = getForeachStubTy()->getPointerTo();
+    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*C);
+
+    llvm::SmallVector<llvm::Type*, 8> ParamTys;
+    ParamTys.push_back(ForEachStubPtrTy);  // const RsForEachStubParamStruct *p
+    ParamTys.push_back(Int32Ty);           // uint32_t x1
+    ParamTys.push_back(Int32Ty);           // uint32_t x2
+    ParamTys.push_back(Int32Ty);           // uint32_t instep
+    ParamTys.push_back(Int32Ty);           // uint32_t outstep
+
+    llvm::FunctionType *FT =
+        llvm::FunctionType::get(llvm::Type::getVoidTy(*C), ParamTys, false);
+    llvm::Function *F =
+        llvm::Function::Create(FT, llvm::GlobalValue::ExternalLinkage,
+                               OldName + ".expand", M);
+
+    llvm::Function::arg_iterator AI = F->arg_begin();
+
+    AI->setName("p");
+    AI++;
+    AI->setName("x1");
+    AI++;
+    AI->setName("x2");
+    AI++;
+    AI->setName("arg_instep");
+    AI++;
+    AI->setName("arg_outstep");
+    AI++;
+
+    assert(AI == F->arg_end());  // exactly five parameters were named
+
+    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*C, "Begin", F);
+    llvm::IRBuilder<> Builder(Begin);
+    Builder.CreateRetVoid();  // terminator of the otherwise empty body
+
+    return F;
   }
 
-  static bool hasUsrData(uint32_t Signature) {
-    return Signature & 0x04;
-  }
+  /// @brief Create an empty loop
+  ///
+  /// Create a loop of the form:
+  ///
+  /// for (i = LowerBound; i < UpperBound; i++)
+  ///   ;
+  ///
+  /// After the loop has been created, the builder is set such that
+  /// instructions can be added to the loop body.
+  ///
+  /// @param Builder The builder to use to build this loop. The current
+  ///                position of the builder is the position the loop
+  ///                will be inserted.
+  /// @param LowerBound The first value of the loop iterator
+  /// @param UpperBound The exclusive upper bound of the loop iterator
+  /// @param LoopIV Output pointer; set to the PHI node of the loop iterator.
+  /// @return The BasicBlock that will be executed after the loop.
+  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
+                               llvm::Value *LowerBound,
+                               llvm::Value *UpperBound,
+                               llvm::PHINode **LoopIV) {
+    assert(LowerBound->getType() == UpperBound->getType());
 
-  static bool hasX(uint32_t Signature) {
-    return Signature & 0x08;
-  }
+    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
+    llvm::Value *Cond, *IVNext;
+    llvm::PHINode *IV;
 
-  static bool hasY(uint32_t Signature) {
-    return Signature & 0x10;
-  }
+    CondBB = Builder.GetInsertBlock();
+    AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), this);
+    HeaderBB = llvm::BasicBlock::Create(*C, "Loop", CondBB->getParent());
 
-  static bool isKernel(uint32_t Signature) {
-    return Signature & 0x20;
-  }
+    // if (LowerBound < UpperBound)
+    //   goto LoopHeader
+    // else
+    //   goto AfterBB
+    CondBB->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(CondBB);
+    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
+    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
 
+    // iv = PHI [CondBB -> LowerBound], [LoopHeader -> iv.next]
+    // iv.next = iv + 1
+    // if (iv.next < UpperBound)
+    //   goto LoopHeader
+    // else
+    //   goto AfterBB
+    Builder.SetInsertPoint(HeaderBB);
+    IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
+    IV->addIncoming(LowerBound, CondBB);
+    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
+    IV->addIncoming(IVNext, HeaderBB);
+    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
+    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
+    AfterBB->setName("Exit");
+    Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
+    *LoopIV = IV;
+    return AfterBB;
+  }
 
 public:
   RSForEachExpandPass(const RSInfo::ExportForeachFuncListTy &pForeachFuncs,
@@ -173,57 +303,7 @@
 
     llvm::DataLayout DL(M);
 
-    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*C);
-    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*C);
-    llvm::Type *SizeTy = Int32Ty;
-
-    /* Defined in frameworks/base/libs/rs/rs_hal.h:
-     *
-     * struct RsForEachStubParamStruct {
-     *   const void *in;
-     *   void *out;
-     *   const void *usr;
-     *   size_t usr_len;
-     *   uint32_t x;
-     *   uint32_t y;
-     *   uint32_t z;
-     *   uint32_t lod;
-     *   enum RsAllocationCubemapFace face;
-     *   uint32_t ar[16];
-     * };
-     */
-    llvm::SmallVector<llvm::Type*, 9> StructTys;
-    StructTys.push_back(VoidPtrTy);  // const void *in
-    StructTys.push_back(VoidPtrTy);  // void *out
-    StructTys.push_back(VoidPtrTy);  // const void *usr
-    StructTys.push_back(SizeTy);     // size_t usr_len
-    StructTys.push_back(Int32Ty);    // uint32_t x
-    StructTys.push_back(Int32Ty);    // uint32_t y
-    StructTys.push_back(Int32Ty);    // uint32_t z
-    StructTys.push_back(Int32Ty);    // uint32_t lod
-    StructTys.push_back(Int32Ty);    // enum RsAllocationCubemapFace
-    StructTys.push_back(llvm::ArrayType::get(Int32Ty, 16));  // uint32_t ar[16]
-
-    llvm::Type *ForEachStubPtrTy = llvm::StructType::create(
-        StructTys, "RsForEachStubParamStruct")->getPointerTo();
-
-    /* Create the function signature for our expanded function.
-     * void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
-     *       uint32_t instep, uint32_t outstep)
-     */
-    llvm::SmallVector<llvm::Type*, 8> ParamTys;
-    ParamTys.push_back(ForEachStubPtrTy);  // const RsForEachStubParamStruct *p
-    ParamTys.push_back(Int32Ty);           // uint32_t x1
-    ParamTys.push_back(Int32Ty);           // uint32_t x2
-    ParamTys.push_back(Int32Ty);           // uint32_t instep
-    ParamTys.push_back(Int32Ty);           // uint32_t outstep
-
-    llvm::FunctionType *FT =
-        llvm::FunctionType::get(llvm::Type::getVoidTy(*C), ParamTys, false);
-    llvm::Function *ExpandedFunc =
-        llvm::Function::Create(FT,
-                               llvm::GlobalValue::ExternalLinkage,
-                               F->getName() + ".expand", M);
+    llvm::Function *ExpandedFunc = createEmptyExpandedFunction(F->getName());
 
     // Create and name the actual arguments to this expanded function.
     llvm::SmallVector<llvm::Argument*, 8> ArgVec;
@@ -245,54 +325,38 @@
     llvm::Value *Arg_instep = ArgVec[3];
     llvm::Value *Arg_outstep = ArgVec[4];
 
-    Arg_p->setName("p");
-    Arg_x1->setName("x1");
-    Arg_x2->setName("x2");
-    Arg_instep->setName("arg_instep");
-    Arg_outstep->setName("arg_outstep");
-
     llvm::Value *InStep = NULL;
     llvm::Value *OutStep = NULL;
 
     // Construct the actual function body.
-    llvm::BasicBlock *Begin =
-        llvm::BasicBlock::Create(*C, "Begin", ExpandedFunc);
-    llvm::IRBuilder<> Builder(Begin);
-
-    // uint32_t X = x1;
-    llvm::AllocaInst *AX = Builder.CreateAlloca(Int32Ty, 0, "AX");
-    Builder.CreateStore(Arg_x1, AX);
+    llvm::IRBuilder<> Builder(ExpandedFunc->getEntryBlock().begin());
 
     // Collect and construct the arguments for the kernel().
     // Note that we load any loop-invariant arguments before entering the Loop.
     llvm::Function::arg_iterator Args = F->arg_begin();
 
     llvm::Type *InTy = NULL;
-    llvm::AllocaInst *AIn = NULL;
-    if (hasIn(Signature)) {
+    llvm::Value *InBasePtr = NULL;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
       InTy = Args->getType();
-      AIn = Builder.CreateAlloca(InTy, 0, "AIn");
       InStep = getStepValue(&DL, InTy, Arg_instep);
       InStep->setName("instep");
-      Builder.CreateStore(Builder.CreatePointerCast(Builder.CreateLoad(
-          Builder.CreateStructGEP(Arg_p, 0)), InTy), AIn);
+      InBasePtr = Builder.CreateLoad(Builder.CreateStructGEP(Arg_p, 0));
       Args++;
     }
 
     llvm::Type *OutTy = NULL;
-    llvm::AllocaInst *AOut = NULL;
-    if (hasOut(Signature)) {
+    llvm::Value *OutBasePtr = NULL;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
       OutTy = Args->getType();
-      AOut = Builder.CreateAlloca(OutTy, 0, "AOut");
       OutStep = getStepValue(&DL, OutTy, Arg_outstep);
       OutStep->setName("outstep");
-      Builder.CreateStore(Builder.CreatePointerCast(Builder.CreateLoad(
-          Builder.CreateStructGEP(Arg_p, 1)), OutTy), AOut);
+      OutBasePtr = Builder.CreateLoad(Builder.CreateStructGEP(Arg_p, 1));
       Args++;
     }
 
     llvm::Value *UsrData = NULL;
-    if (hasUsrData(Signature)) {
+    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
       llvm::Type *UsrDataTy = Args->getType();
       UsrData = Builder.CreatePointerCast(Builder.CreateLoad(
           Builder.CreateStructGEP(Arg_p, 2)), UsrDataTy);
@@ -300,27 +364,20 @@
       Args++;
     }
 
-    if (hasX(Signature)) {
+    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
       Args++;
     }
 
     llvm::Value *Y = NULL;
-    if (hasY(Signature)) {
+    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
       Y = Builder.CreateLoad(Builder.CreateStructGEP(Arg_p, 5), "Y");
       Args++;
     }
 
     bccAssert(Args == F->arg_end());
 
-    llvm::BasicBlock *Loop = llvm::BasicBlock::Create(*C, "Loop", ExpandedFunc);
-    llvm::BasicBlock *Exit = llvm::BasicBlock::Create(*C, "Exit", ExpandedFunc);
-
-    // if (x1 < x2) goto Loop; else goto Exit;
-    llvm::Value *Cond = Builder.CreateICmpSLT(Arg_x1, Arg_x2);
-    Builder.CreateCondBr(Cond, Loop, Exit);
-
-    // Loop:
-    Builder.SetInsertPoint(Loop);
+    llvm::PHINode *IV;
+    createLoop(Builder, Arg_x1, Arg_x2, &IV);
 
     // Populate the actual call to kernel().
     llvm::SmallVector<llvm::Value*, 8> RootArgs;
@@ -328,13 +385,32 @@
     llvm::Value *InPtr = NULL;
     llvm::Value *OutPtr = NULL;
 
-    if (AIn) {
-      InPtr = Builder.CreateLoad(AIn, "InPtr");
+    // Calculate the current input and output pointers
+    //
+    // We always calculate the input/output pointers with a GEP operating on i8
+    // values and only cast at the very end to OutTy. This is because the step
+    // between two values is given in bytes.
+    //
+    // TODO: We could further optimize the output by using a GEP operation of
+    // type 'OutTy' in cases where the element type of the allocation allows.
+    if (OutBasePtr) {
+      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
+      OutOffset = Builder.CreateMul(OutOffset, OutStep);
+      OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
+      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
+    }
+    if (InBasePtr) {
+      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
+      InOffset = Builder.CreateMul(InOffset, InStep);
+      InPtr = Builder.CreateGEP(InBasePtr, InOffset);
+      InPtr = Builder.CreatePointerCast(InPtr, InTy);
+    }
+
+    if (InPtr) {
       RootArgs.push_back(InPtr);
     }
 
-    if (AOut) {
-      OutPtr = Builder.CreateLoad(AOut, "OutPtr");
+    if (OutPtr) {
       RootArgs.push_back(OutPtr);
     }
 
@@ -342,9 +418,8 @@
       RootArgs.push_back(UsrData);
     }
 
-    // We always have to load X, since it is used to iterate through the loop.
-    llvm::Value *X = Builder.CreateLoad(AX, "X");
-    if (hasX(Signature)) {
+    llvm::Value *X = IV;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
       RootArgs.push_back(X);
     }
 
@@ -354,96 +429,19 @@
 
     Builder.CreateCall(F, RootArgs);
 
-    if (InPtr) {
-      // InPtr += instep
-      llvm::Value *NewIn = Builder.CreateIntToPtr(Builder.CreateNUWAdd(
-          Builder.CreatePtrToInt(InPtr, Int32Ty), InStep), InTy);
-      Builder.CreateStore(NewIn, AIn);
-    }
-
-    if (OutPtr) {
-      // OutPtr += outstep
-      llvm::Value *NewOut = Builder.CreateIntToPtr(Builder.CreateNUWAdd(
-          Builder.CreatePtrToInt(OutPtr, Int32Ty), OutStep), OutTy);
-      Builder.CreateStore(NewOut, AOut);
-    }
-
-    // X++;
-    llvm::Value *XPlusOne =
-        Builder.CreateNUWAdd(X, llvm::ConstantInt::get(Int32Ty, 1));
-    Builder.CreateStore(XPlusOne, AX);
-
-    // If (X < x2) goto Loop; else goto Exit;
-    Cond = Builder.CreateICmpSLT(XPlusOne, Arg_x2);
-    Builder.CreateCondBr(Cond, Loop, Exit);
-
-    // Exit:
-    Builder.SetInsertPoint(Exit);
-    Builder.CreateRetVoid();
-
     return true;
   }
 
   /* Expand a pass-by-value kernel.
    */
   bool ExpandKernel(llvm::Function *F, uint32_t Signature) {
-    bccAssert(isKernel(Signature));
+    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
     ALOGV("Expanding kernel Function %s", F->getName().str().c_str());
 
     // TODO: Refactor this to share functionality with ExpandFunction.
     llvm::DataLayout DL(M);
 
-    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*C);
-    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*C);
-    llvm::Type *SizeTy = Int32Ty;
-
-    /* Defined in frameworks/base/libs/rs/rs_hal.h:
-     *
-     * struct RsForEachStubParamStruct {
-     *   const void *in;
-     *   void *out;
-     *   const void *usr;
-     *   size_t usr_len;
-     *   uint32_t x;
-     *   uint32_t y;
-     *   uint32_t z;
-     *   uint32_t lod;
-     *   enum RsAllocationCubemapFace face;
-     *   uint32_t ar[16];
-     * };
-     */
-    llvm::SmallVector<llvm::Type*, 9> StructTys;
-    StructTys.push_back(VoidPtrTy);  // const void *in
-    StructTys.push_back(VoidPtrTy);  // void *out
-    StructTys.push_back(VoidPtrTy);  // const void *usr
-    StructTys.push_back(SizeTy);     // size_t usr_len
-    StructTys.push_back(Int32Ty);    // uint32_t x
-    StructTys.push_back(Int32Ty);    // uint32_t y
-    StructTys.push_back(Int32Ty);    // uint32_t z
-    StructTys.push_back(Int32Ty);    // uint32_t lod
-    StructTys.push_back(Int32Ty);    // enum RsAllocationCubemapFace
-    StructTys.push_back(llvm::ArrayType::get(Int32Ty, 16));  // uint32_t ar[16]
-
-    llvm::Type *ForEachStubPtrTy = llvm::StructType::create(
-        StructTys, "RsForEachStubParamStruct")->getPointerTo();
-
-    /* Create the function signature for our expanded function.
-     * void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
-     *       uint32_t instep, uint32_t outstep)
-     */
-    llvm::SmallVector<llvm::Type*, 8> ParamTys;
-    ParamTys.push_back(ForEachStubPtrTy);  // const RsForEachStubParamStruct *p
-    ParamTys.push_back(Int32Ty);           // uint32_t x1
-    ParamTys.push_back(Int32Ty);           // uint32_t x2
-    ParamTys.push_back(Int32Ty);           // uint32_t instep
-    ParamTys.push_back(Int32Ty);           // uint32_t outstep
-
-    llvm::FunctionType *FT =
-        llvm::FunctionType::get(llvm::Type::getVoidTy(*C), ParamTys, false);
-    llvm::Function *ExpandedFunc =
-        llvm::Function::Create(FT,
-                               llvm::GlobalValue::ExternalLinkage,
-                               F->getName() + ".expand", M);
+    llvm::Function *ExpandedFunc = createEmptyExpandedFunction(F->getName());
 
     // Create and name the actual arguments to this expanded function.
     llvm::SmallVector<llvm::Argument*, 8> ArgVec;
@@ -465,32 +463,28 @@
     llvm::Value *Arg_instep = ArgVec[3];
     llvm::Value *Arg_outstep = ArgVec[4];
 
-    Arg_p->setName("p");
-    Arg_x1->setName("x1");
-    Arg_x2->setName("x2");
-    Arg_instep->setName("arg_instep");
-    Arg_outstep->setName("arg_outstep");
-
     llvm::Value *InStep = NULL;
     llvm::Value *OutStep = NULL;
 
     // Construct the actual function body.
-    llvm::BasicBlock *Begin =
-        llvm::BasicBlock::Create(*C, "Begin", ExpandedFunc);
-    llvm::IRBuilder<> Builder(Begin);
+    llvm::IRBuilder<> Builder(ExpandedFunc->getEntryBlock().begin());
 
-    // uint32_t X = x1;
-    llvm::AllocaInst *AX = Builder.CreateAlloca(Int32Ty, 0, "AX");
-    Builder.CreateStore(Arg_x1, AX);
+    // Create TBAA meta-data.
+    llvm::MDNode *TBAARenderScript, *TBAAAllocation, *TBAAPointer;
+
+    llvm::MDBuilder MDHelper(*C);
+    TBAARenderScript = MDHelper.createTBAARoot("RenderScript TBAA");
+    TBAAAllocation = MDHelper.createTBAANode("allocation", TBAARenderScript);
+    TBAAPointer = MDHelper.createTBAANode("pointer", TBAARenderScript);
 
     // Collect and construct the arguments for the kernel().
     // Note that we load any loop-invariant arguments before entering the Loop.
     llvm::Function::arg_iterator Args = F->arg_begin();
 
     llvm::Type *OutTy = NULL;
-    llvm::AllocaInst *AOut = NULL;
     bool PassOutByReference = false;
-    if (hasOut(Signature)) {
+    llvm::LoadInst *OutBasePtr = NULL;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
       llvm::Type *OutBaseTy = F->getReturnType();
       if (OutBaseTy->isVoidTy()) {
         PassOutByReference = true;
@@ -500,73 +494,83 @@
         OutTy = OutBaseTy->getPointerTo();
         // We don't increment Args, since we are using the actual return type.
       }
-      AOut = Builder.CreateAlloca(OutTy, 0, "AOut");
       OutStep = getStepValue(&DL, OutTy, Arg_outstep);
       OutStep->setName("outstep");
-      Builder.CreateStore(Builder.CreatePointerCast(Builder.CreateLoad(
-          Builder.CreateStructGEP(Arg_p, 1)), OutTy), AOut);
+      OutBasePtr = Builder.CreateLoad(Builder.CreateStructGEP(Arg_p, 1));
+      OutBasePtr->setMetadata("tbaa", TBAAPointer);
     }
 
     llvm::Type *InBaseTy = NULL;
     llvm::Type *InTy = NULL;
-    llvm::AllocaInst *AIn = NULL;
-    if (hasIn(Signature)) {
+    llvm::LoadInst *InBasePtr = NULL;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
       InBaseTy = Args->getType();
       InTy =InBaseTy->getPointerTo();
-      AIn = Builder.CreateAlloca(InTy, 0, "AIn");
       InStep = getStepValue(&DL, InTy, Arg_instep);
       InStep->setName("instep");
-      Builder.CreateStore(Builder.CreatePointerCast(Builder.CreateLoad(
-          Builder.CreateStructGEP(Arg_p, 0)), InTy), AIn);
+      InBasePtr = Builder.CreateLoad(Builder.CreateStructGEP(Arg_p, 0));
+      InBasePtr->setMetadata("tbaa", TBAAPointer);
       Args++;
     }
 
     // No usrData parameter on kernels.
-    bccAssert(!hasUsrData(Signature));
+    bccAssert(
+        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
 
-    if (hasX(Signature)) {
+    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
       Args++;
     }
 
     llvm::Value *Y = NULL;
-    if (hasY(Signature)) {
+    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
       Y = Builder.CreateLoad(Builder.CreateStructGEP(Arg_p, 5), "Y");
       Args++;
     }
 
     bccAssert(Args == F->arg_end());
 
-    llvm::BasicBlock *Loop = llvm::BasicBlock::Create(*C, "Loop", ExpandedFunc);
-    llvm::BasicBlock *Exit = llvm::BasicBlock::Create(*C, "Exit", ExpandedFunc);
-
-    // if (x1 < x2) goto Loop; else goto Exit;
-    llvm::Value *Cond = Builder.CreateICmpSLT(Arg_x1, Arg_x2);
-    Builder.CreateCondBr(Cond, Loop, Exit);
-
-    // Loop:
-    Builder.SetInsertPoint(Loop);
+    llvm::PHINode *IV;
+    createLoop(Builder, Arg_x1, Arg_x2, &IV);
 
     // Populate the actual call to kernel().
     llvm::SmallVector<llvm::Value*, 8> RootArgs;
 
     llvm::Value *InPtr = NULL;
-    llvm::Value *In = NULL;
     llvm::Value *OutPtr = NULL;
 
+    // Calculate the current input and output pointers
+    //
+    // We always calculate the input/output pointers with a GEP operating on i8
+    // values and only cast at the very end to OutTy. This is because the step
+    // between two values is given in bytes.
+    //
+    // TODO: We could further optimize the output by using a GEP operation of
+    // type 'OutTy' in cases where the element type of the allocation allows.
+    if (OutBasePtr) {
+      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
+      OutOffset = Builder.CreateMul(OutOffset, OutStep);
+      OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
+      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
+    }
+    if (InBasePtr) {
+      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
+      InOffset = Builder.CreateMul(InOffset, InStep);
+      InPtr = Builder.CreateGEP(InBasePtr, InOffset);
+      InPtr = Builder.CreatePointerCast(InPtr, InTy);
+    }
+
     if (PassOutByReference) {
-      OutPtr = Builder.CreateLoad(AOut, "OutPtr");
       RootArgs.push_back(OutPtr);
     }
 
-    if (AIn) {
-      InPtr = Builder.CreateLoad(AIn, "InPtr");
-      In = Builder.CreateLoad(InPtr, "In");
+    if (InPtr) {
+      llvm::LoadInst *In = Builder.CreateLoad(InPtr, "In");
+      In->setMetadata("tbaa", TBAAAllocation);
       RootArgs.push_back(In);
     }
 
-    // We always have to load X, since it is used to iterate through the loop.
-    llvm::Value *X = Builder.CreateLoad(AX, "X");
-    if (hasX(Signature)) {
+    llvm::Value *X = IV;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
       RootArgs.push_back(X);
     }
 
@@ -576,58 +580,125 @@
 
     llvm::Value *RetVal = Builder.CreateCall(F, RootArgs);
 
-    if (AOut && !PassOutByReference) {
-      OutPtr = Builder.CreateLoad(AOut, "OutPtr");
-      Builder.CreateStore(RetVal, OutPtr);
+    if (OutPtr && !PassOutByReference) {
+      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
+      Store->setMetadata("tbaa", TBAAAllocation);
     }
 
-    if (InPtr) {
-      // InPtr += instep
-      llvm::Value *NewIn = Builder.CreateIntToPtr(Builder.CreateNUWAdd(
-          Builder.CreatePtrToInt(InPtr, Int32Ty), InStep), InTy);
-      Builder.CreateStore(NewIn, AIn);
-    }
-
-    if (OutPtr) {
-      // OutPtr += outstep
-      llvm::Value *NewOut = Builder.CreateIntToPtr(Builder.CreateNUWAdd(
-          Builder.CreatePtrToInt(OutPtr, Int32Ty), OutStep), OutTy);
-      Builder.CreateStore(NewOut, AOut);
-    }
-
-    // X++;
-    llvm::Value *XPlusOne =
-        Builder.CreateNUWAdd(X, llvm::ConstantInt::get(Int32Ty, 1));
-    Builder.CreateStore(XPlusOne, AX);
-
-    // If (X < x2) goto Loop; else goto Exit;
-    Cond = Builder.CreateICmpSLT(XPlusOne, Arg_x2);
-    Builder.CreateCondBr(Cond, Loop, Exit);
-
-    // Exit:
-    Builder.SetInsertPoint(Exit);
-    Builder.CreateRetVoid();
-
     return true;
   }
 
+  /// @brief Checks if pointers to allocation internals are exposed
+  ///
+  /// Verifies whether the script can gain access to pointers into a
+  /// RenderScript Allocation, either through the parameters of an old-style
+  /// foreach function or through calls to the run-time library. Only if all
+  /// loads from and stores to allocation data are under our control, and the
+  /// run-time internal accesses are annotated with RenderScript TBAA metadata,
+  /// can TBAA safely distinguish generic from from-allocation pointers.
+  ///
+  /// @param M Module to inspect.
+  /// @return true if exposure cannot be ruled out, false otherwise.
+  bool allocPointersExposed(llvm::Module &M) {
+    // Old style kernel function can expose pointers to elements within
+    // allocations.
+    // TODO: Extend analysis to allow simple cases of old-style kernels.
+    for (RSInfo::ExportForeachFuncListTy::const_iterator
+             func_iter = mFuncs.begin(), func_end = mFuncs.end();
+         func_iter != func_end; ++func_iter) {
+      const char *Name = func_iter->first;
+      uint32_t Signature = func_iter->second;
+      if (M.getFunction(Name) &&
+          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
+        return true;
+      }
+    }
+
+    // Check for library functions that expose a pointer to an Allocation or
+    // that are not yet annotated with RenderScript-specific tbaa information.
+    // Funcs must not be static: the push_backs below run on every call and
+    // would keep appending duplicate entries to a static vector.
+    std::vector<std::string> Funcs;
+
+    // rsGetElementAt(...)
+    Funcs.push_back("_Z14rsGetElementAt13rs_allocationj");
+    Funcs.push_back("_Z14rsGetElementAt13rs_allocationjj");
+    Funcs.push_back("_Z14rsGetElementAt13rs_allocationjjj");
+    // rsSetElementAt()
+    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvj");
+    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvjj");
+    Funcs.push_back("_Z14rsSetElementAt13rs_allocationPvjjj");
+    // rsGetElementAtYuv_uchar_Y(), rsGetElementAtYuv_uchar_U() and _V()
+    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj");
+    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj");
+    Funcs.push_back("_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj");
+
+    for (std::vector<std::string>::iterator FI = Funcs.begin(),
+                                            FE = Funcs.end();
+         FI != FE; ++FI) {
+      llvm::Function *F = M.getFunction(*FI);
+      if (!F) {
+        ALOGE("Missing run-time function '%s'", FI->c_str());
+        return true;
+      }
+
+      if (F->getNumUses() > 0) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
+  ///
+  /// The TBAA metadata used to annotate loads/stores from RenderScript
+  /// Allocations is generated in a separate TBAA tree with a "RenderScript TBAA"
+  /// root node. LLVM assumes may-alias for all nodes in unrelated alias
+  /// analysis trees. This function makes the RenderScript TBAA a subtree of the
+  /// normal C/C++ TBAA tree, next to the normal C/C++ types. With the trees
+  /// connected, an access to an Allocation can be resolved to no-alias when
+  /// compared to a normal C/C++ access.
+  void connectRenderScriptTBAAMetadata(llvm::Module &M) {
+    llvm::MDBuilder MDHelper(*C);
+    llvm::MDNode *TBAARenderScript = MDHelper.createTBAARoot("RenderScript TBAA");
+
+    llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA");
+    llvm::MDNode *TBAAMergedRS = MDHelper.createTBAANode("RenderScript", TBAARoot);
+
+    TBAARenderScript->replaceAllUsesWith(TBAAMergedRS);
+  }
+
   virtual bool runOnModule(llvm::Module &M) {
     bool Changed = false;
     this->M = &M;
     C = &M.getContext();
 
+    bool AllocsExposed = allocPointersExposed(M);
+
     for (RSInfo::ExportForeachFuncListTy::const_iterator
              func_iter = mFuncs.begin(), func_end = mFuncs.end();
          func_iter != func_end; func_iter++) {
       const char *name = func_iter->first;
       uint32_t signature = func_iter->second;
       llvm::Function *kernel = M.getFunction(name);
-      if (kernel && isKernel(signature)) {
-        Changed |= ExpandKernel(kernel, signature);
+      if (kernel) {
+        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
+          Changed |= ExpandKernel(kernel, signature);
+          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
+        } else if (kernel->getReturnType()->isVoidTy()) {
+          Changed |= ExpandFunction(kernel, signature);
+          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
+        } else {
+          // There are some graphics root functions that are not
+          // expanded, but that will be called directly. For those
+          // functions, we can not set the linkage to internal.
+        }
       }
-      else if (kernel && kernel->getReturnType()->isVoidTy()) {
-        Changed |= ExpandFunction(kernel, signature);
-      }
+    }
+
+    if (!AllocsExposed) {
+      connectRenderScriptTBAAMetadata(M);
     }
 
     return Changed;
diff --git a/lib/Renderscript/RSInfo.cpp b/lib/Renderscript/RSInfo.cpp
index 496f739..cbb4af9 100644
--- a/lib/Renderscript/RSInfo.cpp
+++ b/lib/Renderscript/RSInfo.cpp
@@ -28,7 +28,9 @@
 #include "bcc/Support/FileBase.h"
 #include "bcc/Support/Log.h"
 
+#ifdef HAVE_ANDROID_OS
 #include <cutils/properties.h>
+#endif
 
 using namespace bcc;
 
@@ -86,8 +88,8 @@
 #endif  // TARGET_BUILD
 }
 
-android::String8 RSInfo::GetPath(const FileBase &pFile) {
-  android::String8 result(pFile.getName().c_str());
+android::String8 RSInfo::GetPath(const char *pFilename) {
+  android::String8 result(pFilename);
   result.append(".info");
   return result;
 }
@@ -391,6 +393,7 @@
     result = FP_Relaxed;
   }
 
+#ifdef HAVE_ANDROID_OS
   // Provide an override for precsion via adb shell setprop
   // adb shell setprop debug.rs.precision rs_fp_full
   // adb shell setprop debug.rs.precision rs_fp_relaxed
@@ -410,6 +413,7 @@
       result = FP_Full;
     }
   }
+#endif
 
   return result;
 }
diff --git a/lib/Renderscript/runtime/Android.mk b/lib/Renderscript/runtime/Android.mk
deleted file mode 100755
index 08cefb6..0000000
--- a/lib/Renderscript/runtime/Android.mk
+++ /dev/null
@@ -1,115 +0,0 @@
-#
-# Copyright (C) 2011-2012 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH := $(call my-dir)
-
-# C/LLVM-IR source files for the library
-clcore_base_files := \
-    rs_allocation.c \
-    rs_cl.c \
-    rs_core.c \
-    rs_element.c \
-    rs_mesh.c \
-    rs_matrix.c \
-    rs_program.c \
-    rs_sample.c \
-    rs_sampler.c \
-    convert.ll \
-    rsClamp.ll
-
-clcore_files := \
-    $(clcore_base_files) \
-    math.ll \
-    arch/generic.c \
-    arch/sqrt.c \
-    arch/dot_length.c
-
-clcore_neon_files := \
-    $(clcore_base_files) \
-    math.ll \
-    arch/neon.ll \
-    arch/sqrt.c \
-    arch/dot_length.c
-
-ifeq ($(ARCH_X86_HAVE_SSE2), true)
-    clcore_x86_files := \
-    $(clcore_base_files) \
-    arch/x86_generic.c \
-    arch/x86_clamp.ll \
-    arch/x86_math.ll
-
-    ifeq ($(ARCH_X86_HAVE_SSE3), true)
-        clcore_x86_files += arch/x86_dot_length.ll
-    else
-        # FIXME: without SSE3, it is still able to get better code through PSHUFD. But,
-        # so far, there is no such device with SSE2 only.
-        clcore_x86_files += arch/dot_length.c
-    endif
-endif
-
-ifeq "REL" "$(PLATFORM_VERSION_CODENAME)"
-  RS_VERSION := $(PLATFORM_SDK_VERSION)
-else
-  # Increment by 1 whenever this is not a final release build, since we want to
-  # be able to see the RS version number change during development.
-  # See build/core/version_defaults.mk for more information about this.
-  RS_VERSION := "(1 + $(PLATFORM_SDK_VERSION))"
-endif
-
-# Build the base version of the library
-include $(CLEAR_VARS)
-LOCAL_MODULE := libclcore.bc
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE_CLASS := SHARED_LIBRARIES
-LOCAL_SRC_FILES := $(clcore_files)
-
-include $(LOCAL_PATH)/build_bc_lib.mk
-
-# Build a debug version of the library
-include $(CLEAR_VARS)
-LOCAL_MODULE := libclcore_debug.bc
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE_CLASS := SHARED_LIBRARIES
-rs_debug_runtime := 1
-LOCAL_SRC_FILES := $(clcore_files)
-
-include $(LOCAL_PATH)/build_bc_lib.mk
-
-# Build an optimized version of the library if the device is SSE2- or above
-# capable.
-ifeq ($(ARCH_X86_HAVE_SSE2),true)
-include $(CLEAR_VARS)
-LOCAL_MODULE := libclcore_x86.bc
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE_CLASS := SHARED_LIBRARIES
-LOCAL_SRC_FILES := $(clcore_x86_files)
-
-include $(LOCAL_PATH)/build_bc_lib.mk
-endif
-
-# Build a NEON-enabled version of the library (if possible)
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
-# Disable NEON on cortex-a15 temporarily
-ifneq ($(strip $(TARGET_CPU_VARIANT)), cortex-a15)
-  include $(CLEAR_VARS)
-  LOCAL_MODULE := libclcore_neon.bc
-  LOCAL_MODULE_TAGS := optional
-  LOCAL_MODULE_CLASS := SHARED_LIBRARIES
-  LOCAL_SRC_FILES := $(clcore_neon_files)
-
-  include $(LOCAL_PATH)/build_bc_lib.mk
-endif
-endif
diff --git a/lib/Renderscript/runtime/arch/dot_length.c b/lib/Renderscript/runtime/arch/dot_length.c
deleted file mode 100644
index 94c99b6..0000000
--- a/lib/Renderscript/runtime/arch/dot_length.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "rs_types.rsh"
-
-extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
-    return lhs * rhs;
-}
-extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y;
-}
-extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
-}
-extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
-}
-
-extern float __attribute__((overloadable)) fabs(float);
-extern float __attribute__((overloadable)) sqrt(float);
-
-extern float __attribute__((overloadable)) length(float v) {
-    return fabs(v);
-}
-extern float __attribute__((overloadable)) length(float2 v) {
-    return sqrt(v.x*v.x + v.y*v.y);
-}
-extern float __attribute__((overloadable)) length(float3 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-}
-extern float __attribute__((overloadable)) length(float4 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
-}
-
diff --git a/lib/Renderscript/runtime/arch/generic.c b/lib/Renderscript/runtime/arch/generic.c
deleted file mode 100644
index 3724e22..0000000
--- a/lib/Renderscript/runtime/arch/generic.c
+++ /dev/null
@@ -1,936 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-extern short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high);
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
-extern uchar4 __attribute__((overloadable)) convert_uchar4(short4);
-extern uchar4 __attribute__((overloadable)) convert_uchar4(float4);
-extern float4 __attribute__((overloadable)) convert_float4(uchar4);
-extern float __attribute__((overloadable)) sqrt(float);
-
-/*
- * CLAMP
- */
-extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-
-extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high) {
-    float2 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high) {
-    float3 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high) {
-    float4 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    r.w = amount.w < low.w ? low.w : (amount.w > high.w ? high.w : amount.w);
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high) {
-    float2 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high) {
-    float3 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high) {
-    float4 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
-    return r;
-}
-
-
-/*
- * FMAX
- */
-
-extern float __attribute__((overloadable)) fmax(float v1, float v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    r.w = v1.w > v2 ? v1.w : v2;
-    return r;
-}
-
-extern float __attribute__((overloadable)) fmin(float v1, float v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-
-/*
- * FMIN
- */
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    r.w = v1.w < v2 ? v1.w : v2;
-    return r;
-}
-
-
-/*
- * MAX
- */
-
-extern char __attribute__((overloadable)) max(char v1, char v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) max(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) max(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) max(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern short __attribute__((overloadable)) max(short v1, short v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) max(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) max(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) max(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int __attribute__((overloadable)) max(int v1, int v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) max(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) max(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) max(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) max(int64_t v1, int64_t v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) max(uchar v1, uchar v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) max(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) max(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) max(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) max(ushort v1, ushort v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) max(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) max(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) max(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) max(uint v1, uint v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) max(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) max(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) max(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) max(float v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float2 v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float3 v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float4 v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-
-/*
- * MIN
- */
-
-extern int8_t __attribute__((overloadable)) min(int8_t v1, int8_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) min(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) min(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) min(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int16_t __attribute__((overloadable)) min(int16_t v1, int16_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) min(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) min(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) min(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int32_t __attribute__((overloadable)) min(int32_t v1, int32_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) min(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) min(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) min(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) min(int64_t v1, int64_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) min(uchar v1, uchar v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) min(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) min(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) min(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) min(ushort v1, ushort v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) min(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) min(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) min(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) min(uint v1, uint v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) min(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) min(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) min(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) min(float v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float2 v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float3 v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float4 v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-/*
- * YUV
- */
-
-extern uchar4 __attribute__((overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
-    short Y = ((short)y) - 16;
-    short U = ((short)u) - 128;
-    short V = ((short)v) - 128;
-
-    short4 p;
-    p.r = (Y * 298 + V * 409 + 128) >> 8;
-    p.g = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.b = (Y * 298 + U * 516 + 128) >> 8;
-    p.a = 255;
-    p.r = rsClamp(p.r, (short)0, (short)255);
-    p.g = rsClamp(p.g, (short)0, (short)255);
-    p.b = rsClamp(p.b, (short)0, (short)255);
-
-    return convert_uchar4(p);
-}
-
-static float4 yuv_U_values = {0.f, -0.392f * 0.003921569f, +2.02 * 0.003921569f, 0.f};
-static float4 yuv_V_values = {1.603f * 0.003921569f, -0.815f * 0.003921569f, 0.f, 0.f};
-
-extern float4 __attribute__((overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v) {
-    float4 color = (float)y * 0.003921569f;
-    float4 fU = ((float)u) - 128.f;
-    float4 fV = ((float)v) - 128.f;
-
-    color += fU * yuv_U_values;
-    color += fV * yuv_V_values;
-    color = clamp(color, 0.f, 1.f);
-    return color;
-}
-
-
-/*
- * half_RECIP
- */
-
-extern float __attribute__((overloadable)) half_recip(float v) {
-    // FIXME:  actual algorithm for generic approximate reciprocal
-    return 1.f / v;
-}
-
-extern float2 __attribute__((overloadable)) half_recip(float2 v) {
-    float2 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_recip(float3 v) {
-    float3 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_recip(float4 v) {
-    float4 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    r.w = half_recip(r.w);
-    return r;
-}
-
-
-/*
- * half_SQRT
- */
-
-extern float __attribute__((overloadable)) half_sqrt(float v) {
-    return sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_sqrt(float2 v) {
-    float2 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_sqrt(float3 v) {
-    float3 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_sqrt(float4 v) {
-    float4 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    r.w = half_sqrt(v.w);
-    return r;
-}
-
-
-/*
- * half_rsqrt
- */
-
-extern float __attribute__((overloadable)) half_rsqrt(float v) {
-    return 1.f / sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_rsqrt(float2 v) {
-    float2 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_rsqrt(float3 v) {
-    float3 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_rsqrt(float4 v) {
-    float4 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    r.w = half_rsqrt(v.w);
-    return r;
-}
-
-/**
- * matrix ops
- */
-
-extern float4 __attribute__((overloadable))
-rsMatrixMultiply(const rs_matrix4x4 *m, float4 in) {
-    float4 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[4] * in.y) + (m->m[8] * in.z) + (m->m[12] * in.w);
-    ret.y = (m->m[1] * in.x) + (m->m[5] * in.y) + (m->m[9] * in.z) + (m->m[13] * in.w);
-    ret.z = (m->m[2] * in.x) + (m->m[6] * in.y) + (m->m[10] * in.z) + (m->m[14] * in.w);
-    ret.w = (m->m[3] * in.x) + (m->m[7] * in.y) + (m->m[11] * in.z) + (m->m[15] * in.w);
-    return ret;
-}
-
-extern float4 __attribute__((overloadable))
-rsMatrixMultiply(const rs_matrix4x4 *m, float3 in) {
-    float4 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[4] * in.y) + (m->m[8] * in.z) + m->m[12];
-    ret.y = (m->m[1] * in.x) + (m->m[5] * in.y) + (m->m[9] * in.z) + m->m[13];
-    ret.z = (m->m[2] * in.x) + (m->m[6] * in.y) + (m->m[10] * in.z) + m->m[14];
-    ret.w = (m->m[3] * in.x) + (m->m[7] * in.y) + (m->m[11] * in.z) + m->m[15];
-    return ret;
-}
-
-extern float4 __attribute__((overloadable))
-rsMatrixMultiply(const rs_matrix4x4 *m, float2 in) {
-    float4 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[4] * in.y) + m->m[12];
-    ret.y = (m->m[1] * in.x) + (m->m[5] * in.y) + m->m[13];
-    ret.z = (m->m[2] * in.x) + (m->m[6] * in.y) + m->m[14];
-    ret.w = (m->m[3] * in.x) + (m->m[7] * in.y) + m->m[15];
-    return ret;
-}
-
-extern float3 __attribute__((overloadable))
-rsMatrixMultiply(const rs_matrix3x3 *m, float3 in) {
-    float3 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[3] * in.y) + (m->m[6] * in.z);
-    ret.y = (m->m[1] * in.x) + (m->m[4] * in.y) + (m->m[7] * in.z);
-    ret.z = (m->m[2] * in.x) + (m->m[5] * in.y) + (m->m[8] * in.z);
-    return ret;
-}
-
-extern float3 __attribute__((overloadable))
-rsMatrixMultiply(const rs_matrix3x3 *m, float2 in) {
-    float3 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[3] * in.y);
-    ret.y = (m->m[1] * in.x) + (m->m[4] * in.y);
-    ret.z = (m->m[2] * in.x) + (m->m[5] * in.y);
-    return ret;
-}
-
-/**
- * Pixel Ops
- */
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
-{
-    uchar4 c;
-    c.x = (uchar)clamp((r * 255.f + 0.5f), 0.f, 255.f);
-    c.y = (uchar)clamp((g * 255.f + 0.5f), 0.f, 255.f);
-    c.z = (uchar)clamp((b * 255.f + 0.5f), 0.f, 255.f);
-    c.w = 255;
-    return c;
-}
-
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
-{
-    uchar4 c;
-    c.x = (uchar)clamp((r * 255.f + 0.5f), 0.f, 255.f);
-    c.y = (uchar)clamp((g * 255.f + 0.5f), 0.f, 255.f);
-    c.z = (uchar)clamp((b * 255.f + 0.5f), 0.f, 255.f);
-    c.w = (uchar)clamp((a * 255.f + 0.5f), 0.f, 255.f);
-    return c;
-}
-
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
-{
-    color *= 255.f;
-    color += 0.5f;
-    color = clamp(color, 0.f, 255.f);
-    uchar4 c = {color.x, color.y, color.z, 255};
-    return c;
-}
-
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
-{
-    color *= 255.f;
-    color += 0.5f;
-    color = clamp(color, 0.f, 255.f);
-    uchar4 c = {color.x, color.y, color.z, color.w};
-    return c;
-}
-
diff --git a/lib/Renderscript/runtime/arch/neon.ll b/lib/Renderscript/runtime/arch/neon.ll
deleted file mode 100644
index cc63631..0000000
--- a/lib/Renderscript/runtime/arch/neon.ll
+++ /dev/null
@@ -1,1037 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
-declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
-
-declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                HELPERS                 ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
-  %1 = insertelement <4 x float> undef, float %in, i32 0
-  %2 = insertelement <4 x float> %1, float %in, i32 1
-  %3 = insertelement <4 x float> %2, float %in, i32 2
-  %4 = insertelement <4 x float> %3, float %in, i32 3
-  ret <4 x float> %4
-}
-
-define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
-  %1 = insertelement <2 x float> undef, float %in, i32 0
-  %2 = insertelement <2 x float> %1, float %in, i32 1
-  ret <2 x float> %2
-}
-
-define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
-  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
-  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
-  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
-  %4 = insertelement <4 x i32> %3, i32 %in, i32 3
-  ret <4 x i32> %4
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
-  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
-  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
-  ret <4 x float> %2
-}
-
-define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
-  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
-  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
-  %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readonly
-  ret <4 x float> %out
-}
-
-define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
-  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
-  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
-  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %c
-}
-
-define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
-  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
-  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
-  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
-  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
-  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %c
-}
-
-define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
-  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
-  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
-  ret <2 x float> %2
-}
-
-define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
-  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
-  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
-  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
-  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
-  ret <2 x float> %b
-}
-
-define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
-  %1 = fcmp olt float %value, %high
-  %2 = select i1 %1, float %value, float %high
-  %3 = fcmp ogt float %2, %low
-  %4 = select i1 %3, float %2, float %low
-  ret float %4
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  FMAX                  ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
-  %1 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
-  ret <4 x float> %1
-}
-
-define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
-  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
-  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
-  ret <4 x float> %2
-}
-
-define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
-  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
-  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %4
-}
-
-define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
-  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
-  %3 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
-  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %c
-}
-
-define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
-  %1 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
-  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
-  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
-  ret <2 x float> %2
-}
-
-define float @_Z4fmaxff(float %v1, float %v2) nounwind readonly {
-  %1 = fcmp ogt float %v1, %v2
-  %2 = select i1 %1, float %v1, float %v2
-  ret float %2
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  FMIN                  ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
-  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
-  ret <4 x float> %1
-}
-
-define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
-  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
-  %2 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
-  ret <4 x float> %2
-}
-
-define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
-  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
-  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %4
-}
-
-define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
-  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
-  %3 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
-  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %c
-}
-
-define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
-  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
-  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
-  %2 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
-  ret <2 x float> %2
-}
-
-define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
-  %1 = fcmp olt float %v1, %v2
-  %2 = select i1 %1, float %v1, float %v2
-  ret float %2
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  MAX                   ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
-  %1 = icmp sgt i8 %v1, %v2
-  %2 = select i1 %1, i8 %v1, i8 %v2
-  ret i8 %2
-}
-
-define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-  %1 = sext <2 x i8> %v1 to <2 x i32>
-  %2 = sext <2 x i8> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i8>
-  ret <2 x i8> %4
-}
-
-define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
-  %1 = sext <3 x i8> %v1 to <3 x i32>
-  %2 = sext <3 x i8> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i8>
-  ret <3 x i8> %7
-}
-
-define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-  %1 = sext <4 x i8> %v1 to <4 x i32>
-  %2 = sext <4 x i8> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i8>
-  ret <4 x i8> %4
-}
-
-define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
-  %1 = icmp sgt i16 %v1, %v2
-  %2 = select i1 %1, i16 %v1, i16 %v2
-  ret i16 %2
-}
-
-define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-  %1 = sext <2 x i16> %v1 to <2 x i32>
-  %2 = sext <2 x i16> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i16>
-  ret <2 x i16> %4
-}
-
-define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
-  %1 = sext <3 x i16> %v1 to <3 x i32>
-  %2 = sext <3 x i16> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i16>
-  ret <3 x i16> %7
-}
-
-define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-  %1 = sext <4 x i16> %v1 to <4 x i32>
-  %2 = sext <4 x i16> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i16>
-  ret <4 x i16> %4
-}
-
-define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
-  %1 = icmp sgt i32 %v1, %v2
-  %2 = select i1 %1, i32 %v1, i32 %v2
-  ret i32 %2
-}
-
-define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-  %1 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
-  ret <2 x i32> %1
-}
-
-define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
-  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x i32   > @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i32> %4
-}
-
-define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
-  %1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
-  ret <4 x i32> %1
-}
-
-define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
-  %1 = icmp sgt i64 %v1, %v2
-  %2 = select i1 %1, i64 %v1, i64 %v2
-  ret i64 %2
-}
-
-; TODO:  long vector types
-
-define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
-  %1 = icmp ugt i8 %v1, %v2
-  %2 = select i1 %1, i8 %v1, i8 %v2
-  ret i8 %2
-}
-
-define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-  %1 = zext <2 x i8> %v1 to <2 x i32>
-  %2 = zext <2 x i8> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i8>
-  ret <2 x i8> %4
-}
-
-define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
-  %1 = zext <3 x i8> %v1 to <3 x i32>
-  %2 = zext <3 x i8> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i8>
-  ret <3 x i8> %7
-}
-
-define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-  %1 = zext <4 x i8> %v1 to <4 x i32>
-  %2 = zext <4 x i8> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i8>
-  ret <4 x i8> %4
-}
-
-define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
-  %1 = icmp ugt i16 %v1, %v2
-  %2 = select i1 %1, i16 %v1, i16 %v2
-  ret i16 %2
-}
-
-define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-  %1 = zext <2 x i16> %v1 to <2 x i32>
-  %2 = zext <2 x i16> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i16>
-  ret <2 x i16> %4
-}
-
-define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
-  %1 = zext <3 x i16> %v1 to <3 x i32>
-  %2 = zext <3 x i16> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i16>
-  ret <3 x i16> %7
-}
-
-define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-  %1 = zext <4 x i16> %v1 to <4 x i32>
-  %2 = zext <4 x i16> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i16>
-  ret <4 x i16> %4
-}
-
-define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
-  %1 = icmp ugt i32 %v1, %v2
-  %2 = select i1 %1, i32 %v1, i32 %v2
-  ret i32 %2
-}
-
-define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-  %1 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
-  ret <2 x i32> %1
-}
-
-define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
-  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x i32   > @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i32> %4
-}
-
-define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
-  %1 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
-  ret <4 x i32> %1
-}
-
-define i64 @_Z3maxyy(i64 %v1, i64 %v2) nounwind readnone {
-  %1 = icmp ugt i64 %v1, %v2
-  %2 = select i1 %1, i64 %v1, i64 %v2
-  ret i64 %2
-}
-
-; TODO:  long vector types
-
-define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
-  %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
-  ret float %1
-}
-
-define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
-  %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
-  ret <2 x float> %1
-}
-
-define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
-  %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
-  %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
-  ret <3 x float> %1
-}
-
-define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
-  %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
-  %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
-  ret <4 x float> %1
-}
-
-define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
-  %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
-  ret <4 x float> %1
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  MIN                   ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
-  %1 = icmp slt i8 %v1, %v2
-  %2 = select i1 %1, i8 %v1, i8 %v2
-  ret i8 %2
-}
-
-define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-  %1 = sext <2 x i8> %v1 to <2 x i32>
-  %2 = sext <2 x i8> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i8>
-  ret <2 x i8> %4
-}
-
-define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
-  %1 = sext <3 x i8> %v1 to <3 x i32>
-  %2 = sext <3 x i8> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i8>
-  ret <3 x i8> %7
-}
-
-define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-  %1 = sext <4 x i8> %v1 to <4 x i32>
-  %2 = sext <4 x i8> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i8>
-  ret <4 x i8> %4
-}
-
-define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
-  %1 = icmp slt i16 %v1, %v2
-  %2 = select i1 %1, i16 %v1, i16 %v2
-  ret i16 %2
-}
-
-define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-  %1 = sext <2 x i16> %v1 to <2 x i32>
-  %2 = sext <2 x i16> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i16>
-  ret <2 x i16> %4
-}
-
-define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
-  %1 = sext <3 x i16> %v1 to <3 x i32>
-  %2 = sext <3 x i16> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i16>
-  ret <3 x i16> %7
-}
-
-define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-  %1 = sext <4 x i16> %v1 to <4 x i32>
-  %2 = sext <4 x i16> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i16>
-  ret <4 x i16> %4
-}
-
-define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
-  %1 = icmp slt i32 %v1, %v2
-  %2 = select i1 %1, i32 %v1, i32 %v2
-  ret i32 %2
-}
-
-define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-  %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
-  ret <2 x i32> %1
-}
-
-define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
-  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x i32   > @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i32> %4
-}
-
-define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
-  %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
-  ret <4 x i32> %1
-}
-
-define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
-  %1 = icmp slt i64 %v1, %v2
-  %2 = select i1 %1, i64 %v1, i64 %v2
-  ret i64 %2
-}
-
-; TODO:  long vector types
-
-define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
-  %1 = icmp ult i8 %v1, %v2
-  %2 = select i1 %1, i8 %v1, i8 %v2
-  ret i8 %2
-}
-
-define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-  %1 = zext <2 x i8> %v1 to <2 x i32>
-  %2 = zext <2 x i8> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i8>
-  ret <2 x i8> %4
-}
-
-define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
-  %1 = zext <3 x i8> %v1 to <3 x i32>
-  %2 = zext <3 x i8> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i8>
-  ret <3 x i8> %7
-}
-
-define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-  %1 = zext <4 x i8> %v1 to <4 x i32>
-  %2 = zext <4 x i8> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i8>
-  ret <4 x i8> %4
-}
-
-define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
-  %1 = icmp ult i16 %v1, %v2
-  %2 = select i1 %1, i16 %v1, i16 %v2
-  ret i16 %2
-}
-
-define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-  %1 = zext <2 x i16> %v1 to <2 x i32>
-  %2 = zext <2 x i16> %v2 to <2 x i32>
-  %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
-  %4 = trunc <2 x i32> %3 to <2 x i16>
-  ret <2 x i16> %4
-}
-
-define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
-  %1 = zext <3 x i16> %v1 to <3 x i32>
-  %2 = zext <3 x i16> %v2 to <3 x i32>
-  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
-  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %7 = trunc <3 x i32> %6 to <3 x i16>
-  ret <3 x i16> %7
-}
-
-define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-  %1 = zext <4 x i16> %v1 to <4 x i32>
-  %2 = zext <4 x i16> %v2 to <4 x i32>
-  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = trunc <4 x i32> %3 to <4 x i16>
-  ret <4 x i16> %4
-}
-
-define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
-  %1 = icmp ult i32 %v1, %v2
-  %2 = select i1 %1, i32 %v1, i32 %v2
-  ret i32 %2
-}
-
-define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-  %1 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
-  ret <2 x i32> %1
-}
-
-define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
-  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x i32   > @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
-  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i32> %4
-}
-
-define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
-  %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
-  ret <4 x i32> %1
-}
-
-define i64 @_Z3minyy(i64 %v1, i64 %v2) nounwind readnone {
-  %1 = icmp ult i64 %v1, %v2
-  %2 = select i1 %1, i64 %v1, i64 %v2
-  ret i64 %2
-}
-
-; TODO:  long vector types
-
-define float @_Z3minff(float %v1, float %v2) nounwind readnone {
-  %1 = tail call float @_Z4fminff(float %v1, float %v2)
-  ret float %1
-}
-
-define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
-  %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
-  ret <2 x float> %1
-}
-
-define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
-  %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
-  %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
-  ret <3 x float> %1
-}
-
-define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
-  %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
-  %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
-  ret <4 x float> %1
-}
-
-define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
-  %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
-  ret <4 x float> %1
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  YUV                   ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
-@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
-@yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
-@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16
-
-
-define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
-  %_sy = zext i8 %pY to i32
-  %_su = zext i8 %pU to i32
-  %_sv = zext i8 %pV to i32
-
-  %_sy2 = add i32 -16, %_sy
-  %_sy3 = mul i32 298, %_sy2
-  %_su2 = add i32 -128, %_su
-  %_sv2 = add i32 -128, %_sv
-  %_y = tail call <4 x i32> @smear_4i32(i32 %_sy3) nounwind readnone
-  %_u = tail call <4 x i32> @smear_4i32(i32 %_su2) nounwind readnone
-  %_v = tail call <4 x i32> @smear_4i32(i32 %_sv2) nounwind readnone
-
-  %mu = load <4 x i32>* @yuv_U, align 8
-  %mv = load <4 x i32>* @yuv_V, align 8
-  %_u2 = mul <4 x i32> %_u, %mu
-  %_v2 = mul <4 x i32> %_v, %mv
-  %_y2 = add <4 x i32> %_y, %_u2
-  %_y3 = add <4 x i32> %_y2, %_v2
-
- ; %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
-;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
-;  ret <4 x i8> %r2
-
-  %c0 = load <4 x i32>* @yuv_0, align 8
-  %c255 = load <4 x i32>* @yuv_255, align 8
-  %r1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %_y3, <4 x i32> %c0) nounwind readnone
-  %r2 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %r1, <4 x i32> %c255) nounwind readnone
-  %r3 = lshr <4 x i32> %r2, <i32 8, i32 8, i32 8, i32 8>
-  %r4 = trunc <4 x i32> %r3 to <4 x i8>
-  ret <4 x i8> %r4
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;              half_RECIP              ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define float @_Z10half_recipf(float %v) {
-  %1 = insertelement <2 x float> undef, float %v, i32 0
-  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
-  %3 = extractelement <2 x float> %2, i32 0
-  ret float %3
-}
-
-define <2 x float> @_Z10half_recip2Dv2_h(<2 x float> %v) nounwind readnone {
-  %1 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z10half_recip3Dv3_h(<3 x float> %v) nounwind readnone {
-  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
-  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %3
-}
-
-define <4 x float> @_Z10half_recip4Dv4_h(<4 x float> %v) nounwind readnone {
-  %1 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
-  ret <4 x float> %1
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;              half_SQRT               ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define float @_Z9half_sqrtf(float %v) {
-  %1 = insertelement <2 x float> undef, float %v, i32 0
-  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
-  %3 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %2) nounwind readnone
-  %4 = extractelement <2 x float> %3, i32 0
-  ret float %4
-}
-
-define <2 x float> @_Z9half_sqrt2Dv2_h(<2 x float> %v) nounwind readnone {
-  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
-  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
-  ret <2 x float> %2
-}
-
-define <3 x float> @_Z9half_sqrt3Dv3_h(<3 x float> %v) nounwind readnone {
-  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
-  %3 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %2) nounwind readnone
-  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %4
-}
-
-define <4 x float> @_Z9half_sqrt4Dv4_h(<4 x float> %v) nounwind readnone {
-  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
-  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
-  ret <4 x float> %2
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define float @_Z10half_rsqrtf(float %v) {
-  %1 = insertelement <2 x float> undef, float %v, i32 0
-  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
-  %3 = extractelement <2 x float> %2, i32 0
-  ret float %3
-}
-
-define <2 x float> @_Z10half_rsqrt2Dv2_h(<2 x float> %v) nounwind readnone {
-  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z10half_rsqrt3Dv3_h(<3 x float> %v) nounwind readnone {
-  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
-  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %3
-}
-
-define <4 x float> @_Z10half_rsqrt4Dv4_h(<4 x float> %v) nounwind readnone {
-  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
-  ret <4 x float> %1
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;              matrix                    ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
-
-%struct.rs_matrix4x4 = type { [16 x float] }
-%struct.rs_matrix3x3 = type { [9 x float] }
-%struct.rs_matrix2x2 = type { [4 x float] }
-
-define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
-  %1 = insertelement <4 x float> undef, float %in, i32 0
-  %2 = insertelement <4 x float> %1, float %in, i32 1
-  %3 = insertelement <4 x float> %2, float %in, i32 2
-  %4 = insertelement <4 x float> %3, float %in, i32 3
-  ret <4 x float> %4
-}
-
-
-define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
-  %x0 = extractelement <3 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <3 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-  %z0 = extractelement <3 x float> %in, i32 2
-  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to i8*
-  %xm = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %px2, i32 4) nounwind
-
-  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
-  %py2 = bitcast float* %py to i8*
-  %ym = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %py2, i32 4) nounwind
-
-  %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
-  %pz2 = bitcast float* %pz to i8*
-  %zm2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %pz2, i32 4) nounwind
-  %zm = shufflevector <4 x float> %zm2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fmul <4 x float> %y, %ym
-  %a3 = fadd <4 x float> %a1, %a2
-  %a4 = fmul <4 x float> %z, %zm
-  %a5 = fadd <4 x float> %a4, %a3
-  %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %a6
-}
-
-define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
-  %x0 = extractelement <2 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <2 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fmul <4 x float> %y, %ym
-  %a3 = fadd <4 x float> %a1, %a2
-  %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %a4
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
-  %x0 = extractelement <4 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <4 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-  %z0 = extractelement <4 x float> %in, i32 2
-  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
-  %w0 = extractelement <4 x float> %in, i32 3
-  %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
-  %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2, align 4
-  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
-  %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fmul <4 x float> %y, %ym
-  %a3 = fadd <4 x float> %a1, %a2
-  %a4 = fmul <4 x float> %z, %zm
-  %a5 = fadd <4 x float> %a3, %a4
-  %a6 = fmul <4 x float> %w, %wm
-  %a7 = fadd <4 x float> %a5, %a6
-  ret <4 x float> %a7
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
-  %x0 = extractelement <3 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <3 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-  %z0 = extractelement <3 x float> %in, i32 2
-  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
-  %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2, align 4
-  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
-  %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fadd <4 x float> %wm, %a1
-  %a3 = fmul <4 x float> %y, %ym
-  %a4 = fadd <4 x float> %a2, %a3
-  %a5 = fmul <4 x float> %z, %zm
-  %a6 = fadd <4 x float> %a4, %a5
-  ret <4 x float> %a6
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
-  %x0 = extractelement <2 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <2 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
-  %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fadd <4 x float> %wm, %a1
-  %a3 = fmul <4 x float> %y, %ym
-  %a4 = fadd <4 x float> %a2, %a3
-  ret <4 x float> %a4
-}
-
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;              pixel ops                 ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-
-@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16
-@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16
-@fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16
-
-declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone
-declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
-define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
-    %f255 = load <4 x float>* @fc_255.0, align 16
-    %f05 = load <4 x float>* @fc_0.5, align 16
-    %f0 = load <4 x float>* @fc_0, align 16
-    %v1 = fmul <4 x float> %f255, %color
-    %v2 = fadd <4 x float> %f05, %v1
-    %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone
-    %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone
-    ret <4 x i8> %v4
-}
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
-define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
-    %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-    %2 = insertelement <4 x float> %1, float 1.0, i32 3
-    %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone
-    ret <4 x i8> %3
-}
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
-define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
-    %1 = insertelement <4 x float> undef, float %r, i32 0
-    %2 = insertelement <4 x float> %1, float %g, i32 1
-    %3 = insertelement <4 x float> %2, float %b, i32 2
-    %4 = insertelement <4 x float> %3, float 1.0, i32 3
-    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
-    ret <4 x i8> %5
-}
-
-; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
-define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
-    %1 = insertelement <4 x float> undef, float %r, i32 0
-    %2 = insertelement <4 x float> %1, float %g, i32 1
-    %3 = insertelement <4 x float> %2, float %b, i32 2
-    %4 = insertelement <4 x float> %3, float %a, i32 3
-    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
-    ret <4 x i8> %5
-}
-
diff --git a/lib/Renderscript/runtime/arch/sqrt.c b/lib/Renderscript/runtime/arch/sqrt.c
deleted file mode 100755
index f1dac5f..0000000
--- a/lib/Renderscript/runtime/arch/sqrt.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-#define FN_FUNC_FN(fnc)                                         \
-extern float2 __attribute__((overloadable)) fnc(float2 v) { \
-    float2 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-extern float3 __attribute__((overloadable)) fnc(float3 v) { \
-    float3 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-extern float4 __attribute__((overloadable)) fnc(float4 v) { \
-    float4 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
-
-extern float __attribute__((overloadable)) sqrt(float);
-
-FN_FUNC_FN(sqrt)
diff --git a/lib/Renderscript/runtime/arch/x86_clamp.ll b/lib/Renderscript/runtime/arch/x86_clamp.ll
deleted file mode 100755
index 422e9f6..0000000
--- a/lib/Renderscript/runtime/arch/x86_clamp.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
-
-define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %in, <4 x float> %low, <4 x float> %high) nounwind readnone alwaysinline {
-  %1 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %in, <4 x float> %high) nounwind readnone
-  %2 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %low) nounwind readnone
-  ret <4 x float> %2
-}
-
-define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %in, <3 x float> %low, <3 x float> %high) nounwind readnone alwaysinline {
-  %1 = shufflevector <3 x float> %in, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %1, <4 x float> %2, <4 x float> %3) nounwind readnone
-  %5 = shufflevector <4 x float> %4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %5
-}
-
-define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %in, <2 x float> %low, <2 x float> %high) nounwind readnone alwaysinline {
-  %1 = shufflevector <2 x float> %in, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = shufflevector <2 x float> %low, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = shufflevector <2 x float> %high, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %4 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %1, <4 x float> %2, <4 x float> %3) nounwind readnone
-  %5 = shufflevector <4 x float> %4, <4 x float> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x float> %5
-}
-
-define float @_Z5clampfff(float %in, float %low, float %high) nounwind readnone alwaysinline {
-  %1 = insertelement <4 x float> undef, float %in, i32 0
-  %2 = insertelement <4 x float> undef, float %low, i32 0
-  %3 = insertelement <4 x float> undef, float %high, i32 0
-  %4 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %3) nounwind readnone
-  %5 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %4, <4 x float> %2) nounwind readnone
-  %6 = extractelement <4 x float> %5, i32 0
-  ret float %6
-}
-
-define <4 x float> @_Z5clampDv4_fff(<4 x float> %in, float %low, float %high) nounwind readonly {
-  %1 = insertelement <4 x float> undef, float %low, i32 0
-  %2 = insertelement <4 x float> %1, float %low, i32 1
-  %3 = insertelement <4 x float> %2, float %low, i32 2
-  %4 = insertelement <4 x float> %3, float %low, i32 3
-  %5 = insertelement <4 x float> undef, float %high, i32 0
-  %6 = insertelement <4 x float> %5, float %high, i32 1
-  %7 = insertelement <4 x float> %6, float %high, i32 2
-  %8 = insertelement <4 x float> %7, float %high, i32 3
-  %9 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %in, <4 x float> %4, <4 x float> %8) nounwind readnone
-  ret <4 x float> %9
-}
-
-define <3 x float> @_Z5clampDv3_fff(<3 x float> %in, float %low, float %high) nounwind readonly {
-  %1 = insertelement <3 x float> undef, float %low, i32 0
-  %2 = insertelement <3 x float> %1, float %low, i32 1
-  %3 = insertelement <3 x float> %2, float %low, i32 2
-  %4 = insertelement <3 x float> undef, float %high, i32 0
-  %5 = insertelement <3 x float> %4, float %high, i32 1
-  %6 = insertelement <3 x float> %5, float %high, i32 2
-  %7 = tail call <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %in, <3 x float> %3, <3 x float> %6) nounwind readnone
-  ret <3 x float> %7
-}
-
-define <2 x float> @_Z5clampDv2_fff(<2 x float> %in, float %low, float %high) nounwind readonly {
-  %1 = insertelement <2 x float> undef, float %low, i32 0
-  %2 = insertelement <2 x float> %1, float %low, i32 1
-  %3 = insertelement <2 x float> undef, float %high, i32 0
-  %4 = insertelement <2 x float> %3, float %high, i32 1
-  %5 = tail call <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %in, <2 x float> %2, <2 x float> %4) nounwind readnone
-  ret <2 x float> %5
-}
diff --git a/lib/Renderscript/runtime/arch/x86_dot_length.ll b/lib/Renderscript/runtime/arch/x86_dot_length.ll
deleted file mode 100644
index 21f2f3e..0000000
--- a/lib/Renderscript/runtime/arch/x86_dot_length.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
-declare float @llvm.sqrt.f32(float) nounwind readnone
-
-define float @_Z3dotDv4_fS_(<4 x float> %lhs, <4 x float> %rhs) nounwind readnone {
-  %1 = fmul <4 x float> %lhs, %rhs
-  %2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %1) nounwind readnone
-  %3 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %2) nounwind readnone
-  %4 = extractelement <4 x float> %3, i32 0
-  ret float %4
-}
-
-define float @_Z3dotDv3_fS_(<3 x float> %lhs, <3 x float> %rhs) nounwind readnone {
-  %1 = fmul <3 x float> %lhs, %rhs
-  %2 = shufflevector <3 x float> %1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = bitcast <4 x float> %2 to <2 x i64>
-  %4 = tail call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %3, i32 32)
-  %5 = bitcast <2 x i64> %4 to <4 x float>
-  %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %5, <4 x float> %5) nounwind readnone
-  %7 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %6, <4 x float> %6) nounwind readnone
-  %8 = extractelement <4 x float> %7, i32 0
-  ret float %8
-}
-
-define float @_Z3dotDv2_fS_(<2 x float> %lhs, <2 x float> %rhs) nounwind readnone {
-  %1 = fmul <2 x float> %lhs, %rhs
-  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %2) nounwind readnone
-  %4 = extractelement <4 x float> %3, i32 0
-  ret float %4
-}
-
-define float @_Z3dotff(float %lhs, float %rhs) nounwind readnone {
-  %1 = fmul float %lhs, %rhs
-  ret float %1
-}
-
-define float @_Z6lengthDv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fmul <4 x float> %in, %in
-  %2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %1) nounwind readnone
-  %3 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %2) nounwind readnone
-  %4 = extractelement <4 x float> %3, i32 0
-  %5 = tail call float @llvm.sqrt.f32(float %4) nounwind readnone
-  ret float %5
-}
-
-define float @_Z6lengthDv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fmul <3 x float> %in, %in
-  %2 = shufflevector <3 x float> %1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = bitcast <4 x float> %2 to <2 x i64>
-  %4 = tail call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %3, i32 32)
-  %5 = bitcast <2 x i64> %4 to <4 x float>
-  %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %5, <4 x float> %5) nounwind readnone
-  %7 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %6, <4 x float> %6) nounwind readnone
-  %8 = extractelement <4 x float> %7, i32 0
-  %9 = tail call float @llvm.sqrt.f32(float %8) nounwind readnone
-  ret float %9
-}
-
-define float @_Z6lengthDv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fmul <2 x float> %in, %in
-  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %2) nounwind readnone
-  %4 = extractelement <4 x float> %3, i32 0
-  %5 = tail call float @llvm.sqrt.f32(float %4) nounwind readnone
-  ret float %5
-}
-
-define float @_Z6lengthf(float %in) nounwind readnone alwaysinline {
-  ret float %in
-}
-
diff --git a/lib/Renderscript/runtime/arch/x86_generic.c b/lib/Renderscript/runtime/arch/x86_generic.c
deleted file mode 100644
index c46c54a..0000000
--- a/lib/Renderscript/runtime/arch/x86_generic.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-extern short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high);
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
-extern uchar4 __attribute__((overloadable)) convert_uchar4(short4);
-extern float __attribute__((overloadable)) sqrt(float);
-
-/*
- * FMAX
- */
-
-extern float __attribute__((overloadable)) fmax(float v1, float v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    r.w = v1.w > v2 ? v1.w : v2;
-    return r;
-}
-
-extern float __attribute__((overloadable)) fmin(float v1, float v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-
-/*
- * FMIN
- */
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    r.w = v1.w < v2 ? v1.w : v2;
-    return r;
-}
-
-
-/*
- * MAX
- */
-
-extern char __attribute__((overloadable)) max(char v1, char v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) max(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) max(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) max(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern short __attribute__((overloadable)) max(short v1, short v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) max(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) max(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) max(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int __attribute__((overloadable)) max(int v1, int v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) max(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) max(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) max(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) max(int64_t v1, int64_t v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) max(uchar v1, uchar v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) max(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) max(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) max(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) max(ushort v1, ushort v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) max(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) max(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) max(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) max(uint v1, uint v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) max(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) max(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) max(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) max(float v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float2 v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float3 v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float4 v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-
-/*
- * MIN
- */
-
-extern int8_t __attribute__((overloadable)) min(int8_t v1, int8_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) min(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) min(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) min(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int16_t __attribute__((overloadable)) min(int16_t v1, int16_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) min(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) min(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) min(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int32_t __attribute__((overloadable)) min(int32_t v1, int32_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) min(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) min(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) min(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) min(int64_t v1, int64_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) min(uchar v1, uchar v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) min(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) min(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) min(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) min(ushort v1, ushort v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) min(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) min(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) min(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) min(uint v1, uint v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) min(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) min(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) min(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) min(float v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float2 v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float3 v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float4 v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-
-/*
- * YUV
- */
-
-extern uchar4 __attribute__((overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
-    short Y = ((short)y) - 16;
-    short U = ((short)u) - 128;
-    short V = ((short)v) - 128;
-
-    short4 p;
-    p.r = (Y * 298 + V * 409 + 128) >> 8;
-    p.g = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.b = (Y * 298 + U * 516 + 128) >> 8;
-    p.a = 255;
-    p.r = rsClamp(p.r, (short)0, (short)255);
-    p.g = rsClamp(p.g, (short)0, (short)255);
-    p.b = rsClamp(p.b, (short)0, (short)255);
-
-    return convert_uchar4(p);
-}
-
-static float4 yuv_U_values = {0.f, -0.392f * 0.003921569f, +2.02 * 0.003921569f, 0.f};
-static float4 yuv_V_values = {1.603f * 0.003921569f, -0.815f * 0.003921569f, 0.f, 0.f};
-
-extern float4 __attribute__((overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v) {
-    float4 color = (float)y * 0.003921569f;
-    float4 fU = ((float)u) - 128.f;
-    float4 fV = ((float)v) - 128.f;
-
-    color += fU * yuv_U_values;
-    color += fV * yuv_V_values;
-    color = clamp(color, 0.f, 1.f);
-    return color;
-}
-
-
-/*
- * half_RECIP
- */
-
-extern float __attribute__((overloadable)) half_recip(float v) {
-    // FIXME:  actual algorithm for generic approximate reciprocal
-    return 1.f / v;
-}
-
-extern float2 __attribute__((overloadable)) half_recip(float2 v) {
-    float2 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_recip(float3 v) {
-    float3 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_recip(float4 v) {
-    float4 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    r.w = half_recip(r.w);
-    return r;
-}
-
-
-/*
- * half_SQRT
- */
-
-extern float __attribute__((overloadable)) half_sqrt(float v) {
-    return sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_sqrt(float2 v) {
-    float2 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_sqrt(float3 v) {
-    float3 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_sqrt(float4 v) {
-    float4 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    r.w = half_sqrt(v.w);
-    return r;
-}
-
-
-/*
- * half_rsqrt
- */
-
-extern float __attribute__((overloadable)) half_rsqrt(float v) {
-    return 1.f / sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_rsqrt(float2 v) {
-    float2 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_rsqrt(float3 v) {
-    float3 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_rsqrt(float4 v) {
-    float4 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    r.w = half_rsqrt(v.w);
-    return r;
-}
-
diff --git a/lib/Renderscript/runtime/arch/x86_math.ll b/lib/Renderscript/runtime/arch/x86_math.ll
deleted file mode 100755
index 60add80..0000000
--- a/lib/Renderscript/runtime/arch/x86_math.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-declare float @llvm.sqrt.f32(float) nounwind readnone
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
-declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) nounwind readnone
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
-declare float @llvm.exp.f32(float) nounwind readonly
-declare float @llvm.pow.f32(float, float) nounwind readonly
-
-define float @_Z4sqrtf(float %in) nounwind readnone alwaysinline {
-  %1 = tail call float @llvm.sqrt.f32(float %in) nounwind readnone
-  ret float %1
-}
-
-define <2 x float> @_Z4sqrtDv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z4sqrtDv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <3 x float> @llvm.sqrt.v3f32(<3 x float> %in) nounwind readnone
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z4sqrtDv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) nounwind readnone
-  ret <4 x float> %1
-}
-
-define float @_Z3expf(float %in) nounwind readnone {
-  %1 = tail call float @llvm.exp.f32(float %in) nounwind readnone
-  ret float %1
-}
-
-define float @_Z3powff(float %v1, float %v2) nounwind readnone {
-  %1 = tail call float @llvm.pow.f32(float %v1, float %v2) nounwind readnone
-  ret float %1
-}
-
diff --git a/lib/Renderscript/runtime/build_bc_lib.mk b/lib/Renderscript/runtime/build_bc_lib.mk
deleted file mode 100644
index 1d20b7a..0000000
--- a/lib/Renderscript/runtime/build_bc_lib.mk
+++ /dev/null
@@ -1,74 +0,0 @@
-#
-# Copyright (C) 2012 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-include $(BUILD_SYSTEM)/base_rules.mk
-
-BCC_STRIP_ATTR := $(BUILD_OUT_EXECUTABLES)/bcc_strip_attr$(BUILD_EXECUTABLE_SUFFIX)
-
-# We need to pass the +long64 flag to the underlying version of Clang, since
-# we are generating a library for use with Renderscript (64-bit long type,
-# not 32-bit).
-bc_clang_cc1_cflags := -target-feature +long64
-bc_translated_clang_cc1_cflags := $(addprefix -Xclang , $(bc_clang_cc1_cflags))
-
-bc_cflags := -MD \
-             -DRS_VERSION=$(RS_VERSION) \
-             -std=c99 \
-             -c \
-             -O3 \
-             -fno-builtin \
-             -emit-llvm \
-             -target armv7-none-linux-gnueabi \
-             -fsigned-char \
-             $(bc_translated_clang_cc1_cflags)
-
-ifeq ($(rs_debug_runtime),1)
-bc_cflags += -DRS_DEBUG_RUNTIME
-endif
-rs_debug_runtime:=
-
-c_sources := $(filter %.c,$(LOCAL_SRC_FILES))
-ll_sources := $(filter %.ll,$(LOCAL_SRC_FILES))
-
-c_bc_files := $(patsubst %.c,%.bc, \
-    $(addprefix $(intermediates)/, $(c_sources)))
-
-ll_bc_files := $(patsubst %.ll,%.bc, \
-    $(addprefix $(intermediates)/, $(ll_sources)))
-
-$(c_bc_files): PRIVATE_INCLUDES := \
-    frameworks/rs/scriptc \
-    external/clang/lib/Headers
-$(c_bc_files): PRIVATE_CFLAGS := $(bc_cflags)
-
-$(c_bc_files): $(intermediates)/%.bc: $(LOCAL_PATH)/%.c  $(CLANG)
-	@mkdir -p $(dir $@)
-	$(hide) $(CLANG) $(addprefix -I, $(PRIVATE_INCLUDES)) $(PRIVATE_CFLAGS) $< -o $@
-
-$(ll_bc_files): $(intermediates)/%.bc: $(LOCAL_PATH)/%.ll $(LLVM_AS)
-	@mkdir -p $(dir $@)
-	$(hide) $(LLVM_AS) $< -o $@
-
--include $(c_bc_files:%.bc=%.d)
--include $(ll_bc_files:%.bc=%.d)
-
-$(LOCAL_BUILT_MODULE): PRIVATE_BC_FILES := $(c_bc_files) $(ll_bc_files)
-$(LOCAL_BUILT_MODULE): $(c_bc_files) $(ll_bc_files)
-$(LOCAL_BUILT_MODULE): $(LLVM_LINK) $(clcore_LLVM_LD)
-$(LOCAL_BUILT_MODULE): $(LLVM_AS) $(BCC_STRIP_ATTR)
-	@mkdir -p $(dir $@)
-	$(hide) $(LLVM_LINK) $(PRIVATE_BC_FILES) -o $@.unstripped
-	$(hide) $(BCC_STRIP_ATTR) -o $@ $@.unstripped
diff --git a/lib/Renderscript/runtime/build_clcore.sh b/lib/Renderscript/runtime/build_clcore.sh
deleted file mode 100755
index 842245c..0000000
--- a/lib/Renderscript/runtime/build_clcore.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-
-# Usually, manually running build_clcore.sh shouldn't be needed. build_clcore.mk should
-# kick in automatically during Android build process. 
-
-# Generate rs_cl.bc
-# =================
-
-scriptc_path=../../../../base/libs/rs/scriptc
-clang_header_path=../../../../../external/clang/lib/Headers
-
-clang -target armv7-none-linux-gnueabi -I${scriptc_path} -I${clang_header_path} -c -std=c99 -O3 rs_cl.c -emit-llvm -o rs_cl.bc
-
-# Generate rs_core.bc
-# ===================
-
-clang -target armv7-none-linux-gnueabi -I${scriptc_path} -I${clang_header_path} -c -std=c99 -O3 rs_core.c -emit-llvm -o rs_core.bc
-
-# Link everything together
-# ========================
-
-llvm-link rs_cl.bc rs_core.bc -o libclcore.bc
diff --git a/lib/Renderscript/runtime/convert.ll b/lib/Renderscript/runtime/convert.ll
deleted file mode 100644
index f45850d..0000000
--- a/lib/Renderscript/runtime/convert.ll
+++ /dev/null
@@ -1,731 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  FLOAT                 ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <2 x float> @_Z14convert_float2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <2 x i8> %in to <2 x float>
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z14convert_float3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <3 x i8> %in to <3 x float>
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <4 x i8> %in to <4 x float>
-  ret <4 x float> %1
-}
-
-define <2 x float> @_Z14convert_float2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <2 x i8> %in to <2 x float>
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z14convert_float3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <3 x i8> %in to <3 x float>
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z14convert_float4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <4 x i8> %in to <4 x float>
-  ret <4 x float> %1
-}
-
-define <2 x float> @_Z14convert_float2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <2 x i16> %in to <2 x float>
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z14convert_float3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <3 x i16> %in to <3 x float>
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z14convert_float4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <4 x i16> %in to <4 x float>
-  ret <4 x float> %1
-}
-
-define <2 x float> @_Z14convert_float2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <2 x i16> %in to <2 x float>
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z14convert_float3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <3 x i16> %in to <3 x float>
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z14convert_float4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <4 x i16> %in to <4 x float>
-  ret <4 x float> %1
-}
-
-define <2 x float> @_Z14convert_float2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <2 x i32> %in to <2 x float>
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z14convert_float3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <3 x i32> %in to <3 x float>
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z14convert_float4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = uitofp <4 x i32> %in to <4 x float>
-  ret <4 x float> %1
-}
-
-define <2 x float> @_Z14convert_float2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <2 x i32> %in to <2 x float>
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z14convert_float3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <3 x i32> %in to <3 x float>
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = sitofp <4 x i32> %in to <4 x float>
-  ret <4 x float> %1
-}
-
-define <2 x float> @_Z14convert_float2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  ret <2 x float> %in
-}
-
-define <3 x float> @_Z14convert_float3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  ret <3 x float> %in
-}
-
-define <4 x float> @_Z14convert_float4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  ret <4 x float> %in
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  CHAR                  ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-define <4 x i8> @_Z13convert_char4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <4 x float> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <3 x float> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <2 x float> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z13convert_char4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  ret <4 x i8> %in
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  ret <3 x i8> %in
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  ret <2 x i8> %in
-}
-
-define <4 x i8> @_Z13convert_char4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  ret <4 x i8> %in
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  ret <3 x i8> %in
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  ret <2 x i8> %in
-}
-
-define <4 x i8> @_Z13convert_char4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i16> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i16> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i16> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z13convert_char4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i16> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i16> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i16> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z13convert_char4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z13convert_char4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z13convert_char3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z13convert_char2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  UCHAR                 ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <4 x float> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <3 x float> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <2 x float> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z14convert_uchar4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  ret <4 x i8> %in
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  ret <3 x i8> %in
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  ret <2 x i8> %in
-}
-
-define <4 x i8> @_Z14convert_uchar4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  ret <4 x i8> %in
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  ret <3 x i8> %in
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  ret <2 x i8> %in
-}
-
-define <4 x i8> @_Z14convert_uchar4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i16> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i16> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i16> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z14convert_uchar4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i16> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i16> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i16> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z14convert_uchar4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-define <4 x i8> @_Z14convert_uchar4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i8>
-  ret <4 x i8> %1
-}
-
-define <3 x i8> @_Z14convert_uchar3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i8>
-  ret <3 x i8> %1
-}
-
-define <2 x i8> @_Z14convert_uchar2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i8>
-  ret <2 x i8> %1
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  SHORT                 ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x i16> @_Z14convert_short4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <4 x float> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <3 x float> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <2 x float> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z14convert_short4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i8> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i8> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i8> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z14convert_short4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sext <4 x i8> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sext <3 x i8> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sext <2 x i8> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z14convert_short4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  ret <4 x i16> %in
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  ret <3 x i16> %in
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  ret <2 x i16> %in
-}
-
-define <4 x i16> @_Z14convert_short4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  ret <4 x i16> %in
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  ret <3 x i16> %in
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  ret <2 x i16> %in
-}
-
-define <4 x i16> @_Z14convert_short4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z14convert_short4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z14convert_short3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z14convert_short2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                 USHORT                 ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x i16> @_Z15convert_ushort4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <4 x float> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <3 x float> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <2 x float> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z15convert_ushort4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i8> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i8> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i8> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z15convert_ushort4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i8> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i8> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i8> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z15convert_ushort4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  ret <4 x i16> %in
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  ret <3 x i16> %in
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  ret <2 x i16> %in
-}
-
-define <4 x i16> @_Z15convert_ushort4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  ret <4 x i16> %in
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  ret <3 x i16> %in
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  ret <2 x i16> %in
-}
-
-define <4 x i16> @_Z15convert_ushort4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-define <4 x i16> @_Z15convert_ushort4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i16>
-  ret <4 x i16> %1
-}
-
-define <3 x i16> @_Z15convert_ushort3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <3 x i32> %in to <3 x i16>
-  ret <3 x i16> %1
-}
-
-define <2 x i16> @_Z15convert_ushort2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <2 x i32> %in to <2 x i16>
-  ret <2 x i16> %1
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                   INT                  ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x i32> @_Z12convert_int4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <4 x float> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <3 x float> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptosi <2 x float> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z12convert_int4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i8> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i8> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i8> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z12convert_int4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sext <4 x i8> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sext <3 x i8> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = sext <2 x i8> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z12convert_int4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i16> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i16> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i16> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z12convert_int4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = sext <4 x i16> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = sext <3 x i16> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = sext <2 x i16> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z12convert_int4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  ret <4 x i32> %in
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  ret <3 x i32> %in
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  ret <2 x i32> %in
-}
-
-define <4 x i32> @_Z12convert_int4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  ret <4 x i32> %in
-}
-
-define <3 x i32> @_Z12convert_int3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  ret <3 x i32> %in
-}
-
-define <2 x i32> @_Z12convert_int2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  ret <2 x i32> %in
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;                  UINT                  ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define <4 x i32> @_Z13convert_uint4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <4 x float> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <3 x float> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <2 x float> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z13convert_uint4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i8> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i8> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i8> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z13convert_uint4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i8> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i8> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i8> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z13convert_uint4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i16> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i16> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i16> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z13convert_uint4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <4 x i16> %in to <4 x i32>
-  ret <4 x i32> %1
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <3 x i16> %in to <3 x i32>
-  ret <3 x i32> %1
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = zext <2 x i16> %in to <2 x i32>
-  ret <2 x i32> %1
-}
-
-define <4 x i32> @_Z13convert_uint4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  ret <4 x i32> %in
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  ret <3 x i32> %in
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  ret <2 x i32> %in
-}
-
-define <4 x i32> @_Z13convert_uint4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  ret <4 x i32> %in
-}
-
-define <3 x i32> @_Z13convert_uint3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  ret <3 x i32> %in
-}
-
-define <2 x i32> @_Z13convert_uint2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  ret <2 x i32> %in
-}
diff --git a/lib/Renderscript/runtime/math.ll b/lib/Renderscript/runtime/math.ll
deleted file mode 100644
index f026d15..0000000
--- a/lib/Renderscript/runtime/math.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-declare float @llvm.sqrt.f32(float)
-declare float @llvm.pow.f32(float, float)
-declare float @llvm.fabs.f32(float)
-declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
-declare <3 x float> @llvm.fabs.v3f32(<3 x float>)
-declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
-
-define float @_Z4sqrtf(float %v) nounwind readnone alwaysinline {
-  %1 = tail call float @llvm.sqrt.f32(float %v)
-  ret float %1
-}
-
-define float @_Z3powf(float %v1, float %v2) nounwind readnone alwaysinline {
-  %1 = tail call float @llvm.pow.f32(float  %v1, float %v2)
-  ret float %1
-}
diff --git a/lib/Renderscript/runtime/matrix.ll b/lib/Renderscript/runtime/matrix.ll
deleted file mode 100644
index c56405d..0000000
--- a/lib/Renderscript/runtime/matrix.ll
+++ /dev/null
@@ -1,176 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-
-%struct.rs_matrix4x4 = type { [16 x float] }
-%struct.rs_matrix3x3 = type { [9 x float] }
-%struct.rs_matrix2x2 = type { [4 x float] }
-
-define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
-  %1 = insertelement <4 x float> undef, float %in, i32 0
-  %2 = insertelement <4 x float> %1, float %in, i32 1
-  %3 = insertelement <4 x float> %2, float %in, i32 2
-  %4 = insertelement <4 x float> %3, float %in, i32 3
-  ret <4 x float> %4
-}
-
-
-define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
-  %x0 = extractelement <3 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <3 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-  %z0 = extractelement <3 x float> %in, i32 2
-  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 6
-  %pz2 = bitcast float* %pz to <3 x float>*
-  %zm2 = load <3 x float>* %pz2, align 4
-  %zm = shufflevector <3 x float> %zm2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fmul <4 x float> %y, %ym
-  %a3 = fadd <4 x float> %a1, %a2
-  %a4 = fmul <4 x float> %z, %zm
-  %a5 = fadd <4 x float> %a4, %a3
-  %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %a6
-}
-
-define <3 x float> @_Z16rsMatrixMultiplyP12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
-  %r = tail call <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind
-  ret <3 x float> %r
-}
-
-define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
-  %x0 = extractelement <2 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <2 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fmul <4 x float> %y, %ym
-  %a3 = fadd <4 x float> %a1, %a2
-  %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %a4
-}
-
-define <3 x float> @_Z16rsMatrixMultiplyP12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
-  %r = tail call <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind
-  ret <3 x float> %r
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
-  %x0 = extractelement <4 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <4 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-  %z0 = extractelement <4 x float> %in, i32 2
-  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
-  %w0 = extractelement <4 x float> %in, i32 3
-  %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
-  %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2, align 4
-  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
-  %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fmul <4 x float> %y, %ym
-  %a3 = fadd <4 x float> %a1, %a2
-  %a4 = fmul <4 x float> %z, %zm
-  %a5 = fadd <4 x float> %a3, %a4
-  %a6 = fmul <4 x float> %w, %wm
-  %a7 = fadd <4 x float> %a5, %a6
-  ret <4 x float> %a7
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyP12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
-  %r = tail call <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind
-  ret <4 x float> %r
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
-  %x0 = extractelement <3 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <3 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-  %z0 = extractelement <3 x float> %in, i32 2
-  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
-  %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2, align 4
-  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
-  %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fadd <4 x float> %wm, %a1
-  %a3 = fmul <4 x float> %y, %ym
-  %a4 = fadd <4 x float> %a2, %a3
-  %a5 = fmul <4 x float> %z, %zm
-  %a6 = fadd <4 x float> %a4, %a5
-  ret <4 x float> %a6
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyP12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
-  %r = tail call <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind
-  ret <4 x float> %r
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
-  %x0 = extractelement <2 x float> %in, i32 0
-  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
-  %y0 = extractelement <2 x float> %in, i32 1
-  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
-
-  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
-  %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2, align 4
-  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
-  %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2, align 4
-  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
-  %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2, align 4
-
-  %a1 = fmul <4 x float> %x, %xm
-  %a2 = fadd <4 x float> %wm, %a1
-  %a3 = fmul <4 x float> %y, %ym
-  %a4 = fadd <4 x float> %a2, %a3
-  ret <4 x float> %a4
-}
-
-define <4 x float> @_Z16rsMatrixMultiplyP12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
-  %r = tail call <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind
-  ret <4 x float> %r
-}
-
diff --git a/lib/Renderscript/runtime/rsClamp.ll b/lib/Renderscript/runtime/rsClamp.ll
deleted file mode 100644
index eba678a..0000000
--- a/lib/Renderscript/runtime/rsClamp.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-
-define float @_Z7rsClampfff(float %value, float %low, float %high) nounwind readonly {
-  %1 = fcmp olt float %value, %high
-  %2 = select i1 %1, float %value, float %high
-  %3 = fcmp ogt float %2, %low
-  %4 = select i1 %3, float %2, float %low
-  ret float %4
-}
-
-define signext i8 @_Z7rsClampccc(i8 signext %value, i8 signext %low, i8 signext %high) nounwind readonly {
-  %1 = icmp slt i8 %value, %high
-  %2 = select i1 %1, i8 %value, i8 %high
-  %3 = icmp sgt i8 %2, %low
-  %4 = select i1 %3, i8 %2, i8 %low
-  ret i8 %4
-}
-
-define zeroext i8 @_Z7rsClamphhh(i8 zeroext %value, i8 zeroext %low, i8 zeroext %high) nounwind readonly {
-  %1 = icmp ult i8 %value, %high
-  %2 = select i1 %1, i8 %value, i8 %high
-  %3 = icmp ugt i8 %2, %low
-  %4 = select i1 %3, i8 %2, i8 %low
-  ret i8 %4
-}
-
-define signext i16 @_Z7rsClampsss(i16 signext %value, i16 signext %low, i16 signext %high) nounwind readonly {
-  %1 = icmp slt i16 %value, %high
-  %2 = select i1 %1, i16 %value, i16 %high
-  %3 = icmp sgt i16 %2, %low
-  %4 = select i1 %3, i16 %2, i16 %low
-  ret i16 %4
-}
-
-define zeroext i16 @_Z7rsClampttt(i16 zeroext %value, i16 zeroext %low, i16 zeroext %high) nounwind readonly {
-  %1 = icmp ult i16 %value, %high
-  %2 = select i1 %1, i16 %value, i16 %high
-  %3 = icmp ugt i16 %2, %low
-  %4 = select i1 %3, i16 %2, i16 %low
-  ret i16 %4
-}
-
-define i32 @_Z7rsClampiii(i32 %value, i32 %low, i32 %high) nounwind readonly {
-  %1 = icmp slt i32 %value, %high
-  %2 = select i1 %1, i32 %value, i32 %high
-  %3 = icmp sgt i32 %2, %low
-  %4 = select i1 %3, i32 %2, i32 %low
-  ret i32 %4
-}
-
-define i32 @_Z7rsClampjjj(i32 %value, i32 %low, i32 %high) nounwind readonly {
-  %1 = icmp ult i32 %value, %high
-  %2 = select i1 %1, i32 %value, i32 %high
-  %3 = icmp ugt i32 %2, %low
-  %4 = select i1 %3, i32 %2, i32 %low
-  ret i32 %4
-}
-
diff --git a/lib/Renderscript/runtime/rs_allocation.c b/lib/Renderscript/runtime/rs_allocation.c
deleted file mode 100644
index 1d0f5b6..0000000
--- a/lib/Renderscript/runtime/rs_allocation.c
+++ /dev/null
@@ -1,310 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-// Opaque Allocation type operations
-extern uint32_t __attribute__((overloadable))
-    rsAllocationGetDimX(rs_allocation a) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    return alloc->mHal.drvState.lod[0].dimX;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsAllocationGetDimY(rs_allocation a) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    return alloc->mHal.drvState.lod[0].dimY;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsAllocationGetDimZ(rs_allocation a) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    return alloc->mHal.drvState.lod[0].dimZ;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsAllocationGetDimLOD(rs_allocation a) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    return alloc->mHal.state.hasMipmaps;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsAllocationGetDimFaces(rs_allocation a) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    return alloc->mHal.state.hasFaces;
-}
-
-
-extern rs_element __attribute__((overloadable))
-        rsAllocationGetElement(rs_allocation a) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    if (alloc == NULL) {
-        rs_element nullElem = {0};
-        return nullElem;
-    }
-    Type_t *type = (Type_t *)alloc->mHal.state.type;
-    rs_element returnElem = {type->mHal.state.element};
-    return returnElem;
-}
-
-// TODO: this needs to be optimized, obviously
-static void memcpy(void* dst, void* src, size_t size) {
-    char* dst_c = (char*) dst, *src_c = (char*) src;
-    for (; size > 0; size--) {
-        *dst_c++ = *src_c++;
-    }
-}
-
-#ifdef RS_DEBUG_RUNTIME
-#define ELEMENT_AT(T)                                                   \
-    extern void __attribute__((overloadable))                           \
-        rsSetElementAt_##T(rs_allocation a, const T *val, uint32_t x);  \
-    extern void __attribute__((overloadable))                           \
-        rsSetElementAt_##T(rs_allocation a, const T *val, uint32_t x, uint32_t y); \
-    extern void __attribute__((overloadable))                           \
-        rsSetElementAt_##T(rs_allocation a, const T *val, uint32_t x, uint32_t y, uint32_t z); \
-    extern void __attribute__((overloadable))                           \
-        rsGetElementAt_##T(rs_allocation a, T *val, uint32_t x);  \
-    extern void __attribute__((overloadable))                           \
-        rsGetElementAt_##T(rs_allocation a, T *val, uint32_t x, uint32_t y); \
-    extern void __attribute__((overloadable))                           \
-        rsGetElementAt_##T(rs_allocation a, T *val, uint32_t x, uint32_t y, uint32_t z); \
-                                                                        \
-    extern void __attribute__((overloadable))                           \
-    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x) {            \
-        rsSetElementAt_##T(a, &val, x);                                 \
-    }                                                                   \
-    extern void __attribute__((overloadable))                           \
-    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
-        rsSetElementAt_##T(a, &val, x, y);                              \
-    }                                                                   \
-    extern void __attribute__((overloadable))                           \
-    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z) { \
-        rsSetElementAt_##T(a, &val, x, y, z);                           \
-    }                                                                   \
-    extern T __attribute__((overloadable))                              \
-    rsGetElementAt_##T(rs_allocation a, uint32_t x) {                   \
-        T tmp;                                                          \
-        rsGetElementAt_##T(a, &tmp, x);                                 \
-        return tmp;                                                     \
-    }                                                                   \
-    extern T __attribute__((overloadable))                              \
-    rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y) {       \
-        T tmp;                                                          \
-        rsGetElementAt_##T(a, &tmp, x, y);                              \
-        return tmp;                                                     \
-    }                                                                   \
-    extern T __attribute__((overloadable))                              \
-    rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) { \
-        T tmp;                                                          \
-        rsGetElementAt_##T(a, &tmp, x, y, z);                           \
-        return tmp;                                                     \
-    }
-
-#else
-#define ELEMENT_AT(T)                                                   \
-    extern void __attribute__((overloadable))                           \
-    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x) {            \
-        Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
-        const uint32_t eSize = sizeof(T);                               \
-        *((T*)&p[(eSize * x)]) = val;                                   \
-    }                                                                   \
-    extern void __attribute__((overloadable))                           \
-    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y) { \
-        Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
-        const uint32_t eSize = sizeof(T);                               \
-        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
-        *((T*)&p[(eSize * x) + (y * stride)]) = val;                    \
-    }                                                                   \
-    extern void __attribute__((overloadable))                           \
-    rsSetElementAt_##T(rs_allocation a, T val, uint32_t x, uint32_t y, uint32_t z) { \
-        Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        uint8_t *p = (uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
-        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
-        const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;         \
-        uint8_t *dp = &p[(sizeof(T) * x) + (y * stride) + (z * stride * dimY)]; \
-        ((T*)dp)[0] = val;                                        \
-    }                                                                   \
-    extern T __attribute__((overloadable))                              \
-    rsGetElementAt_##T(rs_allocation a, uint32_t x) {                   \
-        Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
-        return *((T*)&p[(sizeof(T) * x)]);                              \
-    }                                                                   \
-    extern T __attribute__((overloadable))                              \
-    rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y) {       \
-        Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
-        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
-        return *((T*)&p[(sizeof(T) * x) + (y * stride)]);               \
-    }                                                                   \
-    extern T __attribute__((overloadable))                              \
-    rsGetElementAt_##T(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) { \
-        Allocation_t *alloc = (Allocation_t *)a.p;                      \
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr; \
-        const uint32_t stride = alloc->mHal.drvState.lod[0].stride;     \
-        const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;         \
-        const uint8_t *dp = &p[(sizeof(T) * x) + (y * stride) + (z * stride * dimY)]; \
-        return ((const T*)dp)[0];                                       \
-    }
-
-
-
-extern const void * __attribute__((overloadable))
-        rsGetElementAt(rs_allocation a, uint32_t x) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    return &p[eSize * x];
-}
-
-extern const void * __attribute__((overloadable))
-        rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
-    return &p[(eSize * x) + (y * stride)];
-}
-
-extern const void * __attribute__((overloadable))
-        rsGetElementAt(rs_allocation a, uint32_t x, uint32_t y, uint32_t z) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
-    const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;
-    return &p[(eSize * x) + (y * stride) + (z * stride * dimY)];
-}
-extern void __attribute__((overloadable))
-        rsSetElementAt(rs_allocation a, void* ptr, uint32_t x) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    memcpy((void*)&p[eSize * x], ptr, eSize);
-}
-
-extern void __attribute__((overloadable))
-        rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
-    memcpy((void*)&p[(eSize * x) + (y * stride)], ptr, eSize);
-}
-
-extern void __attribute__((overloadable))
-        rsSetElementAt(rs_allocation a, void* ptr, uint32_t x, uint32_t y, uint32_t z) {
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr;
-    const uint32_t eSize = alloc->mHal.state.elementSizeBytes;
-    const uint32_t stride = alloc->mHal.drvState.lod[0].stride;
-    const uint32_t dimY = alloc->mHal.drvState.lod[0].dimY;
-    memcpy((void*)&p[(eSize * x) + (y * stride) + (z * stride * dimY)], ptr, eSize);
-}
-#endif
-
-ELEMENT_AT(char)
-ELEMENT_AT(char2)
-ELEMENT_AT(char3)
-ELEMENT_AT(char4)
-ELEMENT_AT(uchar)
-ELEMENT_AT(uchar2)
-ELEMENT_AT(uchar3)
-ELEMENT_AT(uchar4)
-ELEMENT_AT(short)
-ELEMENT_AT(short2)
-ELEMENT_AT(short3)
-ELEMENT_AT(short4)
-ELEMENT_AT(ushort)
-ELEMENT_AT(ushort2)
-ELEMENT_AT(ushort3)
-ELEMENT_AT(ushort4)
-ELEMENT_AT(int)
-ELEMENT_AT(int2)
-ELEMENT_AT(int3)
-ELEMENT_AT(int4)
-ELEMENT_AT(uint)
-ELEMENT_AT(uint2)
-ELEMENT_AT(uint3)
-ELEMENT_AT(uint4)
-ELEMENT_AT(long)
-ELEMENT_AT(long2)
-ELEMENT_AT(long3)
-ELEMENT_AT(long4)
-ELEMENT_AT(ulong)
-ELEMENT_AT(ulong2)
-ELEMENT_AT(ulong3)
-ELEMENT_AT(ulong4)
-ELEMENT_AT(float)
-ELEMENT_AT(float2)
-ELEMENT_AT(float3)
-ELEMENT_AT(float4)
-ELEMENT_AT(double)
-ELEMENT_AT(double2)
-ELEMENT_AT(double3)
-ELEMENT_AT(double4)
-
-#undef ELEMENT_AT
-
-
-extern const uchar __attribute__((overloadable))
-        rsGetElementAtYuv_uchar_Y(rs_allocation a, uint32_t x, uint32_t y) {
-    return rsGetElementAt_uchar(a, x, y);
-}
-
-extern const uchar __attribute__((overloadable))
-        rsGetElementAtYuv_uchar_U(rs_allocation a, uint32_t x, uint32_t y) {
-
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint32_t yuvID = alloc->mHal.state.yuv;
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[1].mallocPtr;
-    const uint32_t stride = alloc->mHal.drvState.lod[1].stride;
-
-    switch(yuvID) {
-    case 0x32315659: //HAL_PIXEL_FORMAT_YV12:
-        x >>= 1;
-        y >>= 1;
-        return p[x + (y * stride)];
-    case 11: //HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-        x >>= 1;
-        y >>= 1;
-        return p[(x<<1) + (y * stride)];
-    default:
-        break;
-    }
-
-    return 0;
-}
-
-extern const uchar __attribute__((overloadable))
-        rsGetElementAtYuv_uchar_V(rs_allocation a, uint32_t x, uint32_t y) {
-
-    Allocation_t *alloc = (Allocation_t *)a.p;
-    const uint32_t yuvID = alloc->mHal.state.yuv;
-
-    switch(yuvID) {
-    case 0x32315659: //HAL_PIXEL_FORMAT_YV12:
-        {
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[2].mallocPtr;
-        const uint32_t stride = alloc->mHal.drvState.lod[2].stride;
-        x >>= 1;
-        y >>= 1;
-        return p[x + (y * stride)];
-        }
-    case 11: //HAL_PIXEL_FORMAT_YCrCb_420_SP:  // NV21
-        {
-        const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[1].mallocPtr;
-        const uint32_t stride = alloc->mHal.drvState.lod[1].stride;
-        x >>= 1;
-        y >>= 1;
-        return p[(x<<1) + (y * stride) + 1];
-        }
-    default:
-            break;
-    }
-
-    return 0;
-}
-
diff --git a/lib/Renderscript/runtime/rs_cl.c b/lib/Renderscript/runtime/rs_cl.c
deleted file mode 100755
index b7f9158..0000000
--- a/lib/Renderscript/runtime/rs_cl.c
+++ /dev/null
@@ -1,1194 +0,0 @@
-#include "rs_types.rsh"
-
-extern float2 __attribute__((overloadable)) convert_float2(int2 c);
-extern float3 __attribute__((overloadable)) convert_float3(int3 c);
-extern float4 __attribute__((overloadable)) convert_float4(int4 c);
-
-extern int2 __attribute__((overloadable)) convert_int2(float2 c);
-extern int3 __attribute__((overloadable)) convert_int3(float3 c);
-extern int4 __attribute__((overloadable)) convert_int4(float4 c);
-
-
-extern float __attribute__((overloadable)) fmin(float v, float v2);
-extern float2 __attribute__((overloadable)) fmin(float2 v, float v2);
-extern float3 __attribute__((overloadable)) fmin(float3 v, float v2);
-extern float4 __attribute__((overloadable)) fmin(float4 v, float v2);
-
-extern float __attribute__((overloadable)) fmax(float v, float v2);
-extern float2 __attribute__((overloadable)) fmax(float2 v, float v2);
-extern float3 __attribute__((overloadable)) fmax(float3 v, float v2);
-extern float4 __attribute__((overloadable)) fmax(float4 v, float v2);
-
-// Float ops, 6.11.2
-
-#define FN_FUNC_FN(fnc)                                         \
-extern float2 __attribute__((overloadable)) fnc(float2 v) { \
-    float2 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-extern float3 __attribute__((overloadable)) fnc(float3 v) { \
-    float3 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-extern float4 __attribute__((overloadable)) fnc(float4 v) { \
-    float4 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
-
-#define IN_FUNC_FN(fnc)                                         \
-extern int2 __attribute__((overloadable)) fnc(float2 v) {   \
-    int2 r;                                                     \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-extern int3 __attribute__((overloadable)) fnc(float3 v) {   \
-    int3 r;                                                     \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-extern int4 __attribute__((overloadable)) fnc(float4 v) {   \
-    int4 r;                                                     \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
-
-#define FN_FUNC_FN_FN(fnc)                                                  \
-extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    return r;                                                               \
-}                                                                           \
-extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    return r;                                                               \
-}                                                                           \
-extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    r.w = fnc(v1.w, v2.w);                                                  \
-    return r;                                                               \
-}
-
-#define FN_FUNC_FN_F(fnc)                                                   \
-extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) {  \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) {  \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) {  \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    r.w = fnc(v1.w, v2);                                                    \
-    return r;                                                               \
-}
-
-#define FN_FUNC_FN_IN(fnc)                                                  \
-extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) {   \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    return r;                                                               \
-}                                                                           \
-extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) {   \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    return r;                                                               \
-}                                                                           \
-extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) {   \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2.x);                                                  \
-    r.y = fnc(v1.y, v2.y);                                                  \
-    r.z = fnc(v1.z, v2.z);                                                  \
-    r.w = fnc(v1.w, v2.w);                                                  \
-    return r;                                                               \
-}
-
-#define FN_FUNC_FN_I(fnc)                                                   \
-extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) {    \
-    float2 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) {    \
-    float3 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    return r;                                                               \
-}                                                                           \
-extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) {    \
-    float4 r;                                                               \
-    r.x = fnc(v1.x, v2);                                                    \
-    r.y = fnc(v1.y, v2);                                                    \
-    r.z = fnc(v1.z, v2);                                                    \
-    r.w = fnc(v1.w, v2);                                                    \
-    return r;                                                               \
-}
-
-#define FN_FUNC_FN_PFN(fnc)                     \
-extern float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 *v2) {            \
-    float2 r;                                   \
-    float t[2];                                 \
-    r.x = fnc(v1.x, &t[0]);                     \
-    r.y = fnc(v1.y, &t[1]);                     \
-    v2->x = t[0];                               \
-    v2->y = t[1];                               \
-    return r;                                   \
-}                                               \
-extern float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 *v2) {            \
-    float3 r;                                   \
-    float t[3];                                 \
-    r.x = fnc(v1.x, &t[0]);                     \
-    r.y = fnc(v1.y, &t[1]);                     \
-    r.z = fnc(v1.z, &t[2]);                     \
-    v2->x = t[0];                               \
-    v2->y = t[1];                               \
-    v2->z = t[2];                               \
-    return r;                                   \
-}                                               \
-extern float4 __attribute__((overloadable)) \
-        fnc(float4 v1, float4 *v2) {            \
-    float4 r;                                   \
-    float t[4];                                 \
-    r.x = fnc(v1.x, &t[0]);                     \
-    r.y = fnc(v1.y, &t[1]);                     \
-    r.z = fnc(v1.z, &t[2]);                     \
-    r.w = fnc(v1.w, &t[3]);                     \
-    v2->x = t[0];                               \
-    v2->y = t[1];                               \
-    v2->z = t[2];                               \
-    v2->w = t[3];                               \
-    return r;                                   \
-}
-
-#define FN_FUNC_FN_PIN(fnc)                                                 \
-extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) {  \
-    float2 r;                                                               \
-    int t[2];                                                               \
-    r.x = fnc(v1.x, &t[0]);                                                 \
-    r.y = fnc(v1.y, &t[1]);                                                 \
-    v2->x = t[0];                                                           \
-    v2->y = t[1];                                                           \
-    return r;                                                               \
-}                                                                           \
-extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) {  \
-    float3 r;                                                               \
-    int t[3];                                                               \
-    r.x = fnc(v1.x, &t[0]);                                                 \
-    r.y = fnc(v1.y, &t[1]);                                                 \
-    r.z = fnc(v1.z, &t[2]);                                                 \
-    v2->x = t[0];                                                           \
-    v2->y = t[1];                                                           \
-    v2->z = t[2];                                                           \
-    return r;                                                               \
-}                                                                           \
-extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) {  \
-    float4 r;                                                               \
-    int t[4];                                                               \
-    r.x = fnc(v1.x, &t[0]);                                                 \
-    r.y = fnc(v1.y, &t[1]);                                                 \
-    r.z = fnc(v1.z, &t[2]);                                                 \
-    r.w = fnc(v1.w, &t[3]);                                                 \
-    v2->x = t[0];                                                           \
-    v2->y = t[1];                                                           \
-    v2->z = t[2];                                                           \
-    v2->w = t[3];                                                           \
-    return r;                                                               \
-}
-
-#define FN_FUNC_FN_FN_FN(fnc)                   \
-extern float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 v2, float2 v3) {  \
-    float2 r;                                   \
-    r.x = fnc(v1.x, v2.x, v3.x);                \
-    r.y = fnc(v1.y, v2.y, v3.y);                \
-    return r;                                   \
-}                                               \
-extern float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 v2, float3 v3) {  \
-    float3 r;                                   \
-    r.x = fnc(v1.x, v2.x, v3.x);                \
-    r.y = fnc(v1.y, v2.y, v3.y);                \
-    r.z = fnc(v1.z, v2.z, v3.z);                \
-    return r;                                   \
-}                                               \
-extern float4 __attribute__((overloadable)) \
-        fnc(float4 v1, float4 v2, float4 v3) {  \
-    float4 r;                                   \
-    r.x = fnc(v1.x, v2.x, v3.x);                \
-    r.y = fnc(v1.y, v2.y, v3.y);                \
-    r.z = fnc(v1.z, v2.z, v3.z);                \
-    r.w = fnc(v1.w, v2.w, v3.w);                \
-    return r;                                   \
-}
-
-#define FN_FUNC_FN_FN_PIN(fnc)                  \
-extern float2 __attribute__((overloadable)) \
-        fnc(float2 v1, float2 v2, int2 *v3) {   \
-    float2 r;                                   \
-    int t[2];                                   \
-    r.x = fnc(v1.x, v2.x, &t[0]);               \
-    r.y = fnc(v1.y, v2.y, &t[1]);               \
-    v3->x = t[0];                               \
-    v3->y = t[1];                               \
-    return r;                                   \
-}                                               \
-extern float3 __attribute__((overloadable)) \
-        fnc(float3 v1, float3 v2, int3 *v3) {   \
-    float3 r;                                   \
-    int t[3];                                   \
-    r.x = fnc(v1.x, v2.x, &t[0]);               \
-    r.y = fnc(v1.y, v2.y, &t[1]);               \
-    r.z = fnc(v1.z, v2.z, &t[2]);               \
-    v3->x = t[0];                               \
-    v3->y = t[1];                               \
-    v3->z = t[2];                               \
-    return r;                                   \
-}                                               \
-extern float4 __attribute__((overloadable)) \
-        fnc(float4 v1, float4 v2, int4 *v3) {   \
-    float4 r;                                   \
-    int t[4];                                   \
-    r.x = fnc(v1.x, v2.x, &t[0]);               \
-    r.y = fnc(v1.y, v2.y, &t[1]);               \
-    r.z = fnc(v1.z, v2.z, &t[2]);               \
-    r.w = fnc(v1.w, v2.w, &t[3]);               \
-    v3->x = t[0];                               \
-    v3->y = t[1];                               \
-    v3->z = t[2];                               \
-    v3->w = t[3];                               \
-    return r;                                   \
-}
-
-static const int iposinf = 0x7f800000;
-static const int ineginf = 0xff800000;
-
-static const float posinf() {
-    float f = *((float*)&iposinf);
-    return f;
-}
-
-static const float neginf() {
-    float f = *((float*)&ineginf);
-    return f;
-}
-
-static bool isinf(float f) {
-    int i = *((int*)(void*)&f);
-    return (i == iposinf) || (i == ineginf);
-}
-
-static bool isnan(float f) {
-    int i = *((int*)(void*)&f);
-    return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff));
-}
-
-static bool isposzero(float f) {
-    int i = *((int*)(void*)&f);
-    return (i == 0x00000000);
-}
-
-static bool isnegzero(float f) {
-    int i = *((int*)(void*)&f);
-    return (i == 0x80000000);
-}
-
-static bool iszero(float f) {
-    return isposzero(f) || isnegzero(f);
-}
-
-
-extern float __attribute__((overloadable)) acos(float);
-FN_FUNC_FN(acos)
-
-extern float __attribute__((overloadable)) acosh(float);
-FN_FUNC_FN(acosh)
-
-
-extern float __attribute__((overloadable)) acospi(float v) {
-    return acos(v) / M_PI;
-}
-FN_FUNC_FN(acospi)
-
-extern float __attribute__((overloadable)) asin(float);
-FN_FUNC_FN(asin)
-
-extern float __attribute__((overloadable)) asinh(float);
-FN_FUNC_FN(asinh)
-
-extern float __attribute__((overloadable)) asinpi(float v) {
-    return asin(v) / M_PI;
-}
-FN_FUNC_FN(asinpi)
-
-extern float __attribute__((overloadable)) atan(float);
-FN_FUNC_FN(atan)
-
-extern float __attribute__((overloadable)) atan2(float, float);
-FN_FUNC_FN_FN(atan2)
-
-extern float __attribute__((overloadable)) atanh(float);
-FN_FUNC_FN(atanh)
-
-extern float __attribute__((overloadable)) atanpi(float v) {
-    return atan(v) / M_PI;
-}
-FN_FUNC_FN(atanpi)
-
-
-extern float __attribute__((overloadable)) atan2pi(float y, float x) {
-    return atan2(y, x) / M_PI;
-}
-FN_FUNC_FN_FN(atan2pi)
-
-extern float __attribute__((overloadable)) cbrt(float);
-FN_FUNC_FN(cbrt)
-
-extern float __attribute__((overloadable)) ceil(float);
-FN_FUNC_FN(ceil)
-
-extern float __attribute__((overloadable)) copysign(float, float);
-FN_FUNC_FN_FN(copysign)
-
-extern float __attribute__((overloadable)) cos(float);
-FN_FUNC_FN(cos)
-
-extern float __attribute__((overloadable)) cosh(float);
-FN_FUNC_FN(cosh)
-
-extern float __attribute__((overloadable)) cospi(float v) {
-    return cos(v * M_PI);
-}
-FN_FUNC_FN(cospi)
-
-extern float __attribute__((overloadable)) erfc(float);
-FN_FUNC_FN(erfc)
-
-extern float __attribute__((overloadable)) erf(float);
-FN_FUNC_FN(erf)
-
-extern float __attribute__((overloadable)) exp(float);
-FN_FUNC_FN(exp)
-
-extern float __attribute__((overloadable)) exp2(float);
-FN_FUNC_FN(exp2)
-
-extern float __attribute__((overloadable)) pow(float, float);
-
-extern float __attribute__((overloadable)) exp10(float v) {
-    return exp2(v * 3.321928095f);
-}
-FN_FUNC_FN(exp10)
-
-extern float __attribute__((overloadable)) expm1(float);
-FN_FUNC_FN(expm1)
-
-extern float __attribute__((overloadable)) fabs(float v) {
-    int i = *((int*)(void*)&v) & 0x7fffffff;
-    return  *((float*)(void*)&i);
-}
-FN_FUNC_FN(fabs)
-
-extern float __attribute__((overloadable)) fdim(float, float);
-FN_FUNC_FN_FN(fdim)
-
-extern float __attribute__((overloadable)) floor(float);
-FN_FUNC_FN(floor)
-
-extern float __attribute__((overloadable)) fma(float, float, float);
-FN_FUNC_FN_FN_FN(fma)
-
-extern float __attribute__((overloadable)) fmin(float, float);
-
-extern float __attribute__((overloadable)) fmod(float, float);
-FN_FUNC_FN_FN(fmod)
-
-extern float __attribute__((overloadable)) fract(float v, float *iptr) {
-    int i = (int)floor(v);
-    if (iptr) {
-        iptr[0] = i;
-    }
-    return fmin(v - i, 0x1.fffffep-1f);
-}
-FN_FUNC_FN_PFN(fract)
-
-extern float __attribute__((overloadable)) frexp(float, int *);
-FN_FUNC_FN_PIN(frexp)
-
-extern float __attribute__((overloadable)) hypot(float, float);
-FN_FUNC_FN_FN(hypot)
-
-extern int __attribute__((overloadable)) ilogb(float);
-IN_FUNC_FN(ilogb)
-
-extern float __attribute__((overloadable)) ldexp(float, int);
-FN_FUNC_FN_IN(ldexp)
-FN_FUNC_FN_I(ldexp)
-
-extern float __attribute__((overloadable)) lgamma(float);
-FN_FUNC_FN(lgamma)
-extern float __attribute__((overloadable)) lgamma(float, int*);
-FN_FUNC_FN_PIN(lgamma)
-
-extern float __attribute__((overloadable)) log(float);
-FN_FUNC_FN(log)
-
-extern float __attribute__((overloadable)) log10(float);
-FN_FUNC_FN(log10)
-
-
-extern float __attribute__((overloadable)) log2(float v) {
-    return log10(v) * 3.321928095f;
-}
-FN_FUNC_FN(log2)
-
-extern float __attribute__((overloadable)) log1p(float);
-FN_FUNC_FN(log1p)
-
-extern float __attribute__((overloadable)) logb(float);
-FN_FUNC_FN(logb)
-
-extern float __attribute__((overloadable)) mad(float a, float b, float c) {
-    return a * b + c;
-}
-extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) {
-    return a * b + c;
-}
-extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) {
-    return a * b + c;
-}
-extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) {
-    return a * b + c;
-}
-
-extern float __attribute__((overloadable)) modf(float, float *);
-FN_FUNC_FN_PFN(modf);
-
-extern float __attribute__((overloadable)) nan(uint v) {
-    float f[1];
-    uint32_t *ip = (uint32_t *)f;
-    *ip = v | 0x7fc00000;
-    return f[0];
-}
-
-extern float __attribute__((overloadable)) nextafter(float, float);
-FN_FUNC_FN_FN(nextafter)
-
-FN_FUNC_FN_FN(pow)
-
-extern float __attribute__((overloadable)) pown(float v, int p) {
-    return pow(v, (float)p);
-}
-extern float2 __attribute__((overloadable)) pown(float2 v, int2 p) {
-    float2 f2 = convert_float2(p);
-    return pow(v, f2);
-}
-extern float3 __attribute__((overloadable)) pown(float3 v, int3 p) {
-    float3 f3 = convert_float3(p);
-    return pow(v, f3);
-}
-extern float4 __attribute__((overloadable)) pown(float4 v, int4 p) {
-    float4 f4 = convert_float4(p);
-    return pow(v, f4);
-}
-
-extern float __attribute__((overloadable)) powr(float v, float p) {
-    return pow(v, p);
-}
-extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
-    return pow(v, p);
-}
-extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
-    return pow(v, p);
-}
-extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
-    return pow(v, p);
-}
-
-extern float __attribute__((overloadable)) remainder(float, float);
-FN_FUNC_FN_FN(remainder)
-
-extern float __attribute__((overloadable)) remquo(float, float, int *);
-FN_FUNC_FN_FN_PIN(remquo)
-
-extern float __attribute__((overloadable)) rint(float);
-FN_FUNC_FN(rint)
-
-extern float __attribute__((overloadable)) rootn(float v, int r) {
-    if (r == 0) {
-        return nan(0);
-    }
-
-    if (iszero(v)) {
-        if (r < 0) {
-            if (r & 1) {
-                return copysign(posinf(), v);
-            } else {
-                return posinf();
-            }
-        } else {
-            if (r & 1) {
-                return copysign(0.f, v);
-            } else {
-                return 0.f;
-            }
-        }
-    }
-
-    if (!isinf(v) && !isnan(v) && (v < 0.f)) {
-        if (r & 1) {
-            return (-1.f * pow(-1.f * v, 1.f / r));
-        } else {
-            return nan(0);
-        }
-    }
-
-    return pow(v, 1.f / r);
-}
-FN_FUNC_FN_IN(rootn);
-
-extern float __attribute__((overloadable)) round(float);
-FN_FUNC_FN(round)
-
-
-extern float __attribute__((overloadable)) sqrt(float);
-extern float __attribute__((overloadable)) rsqrt(float v) {
-    return 1.f / sqrt(v);
-}
-FN_FUNC_FN(rsqrt)
-
-extern float __attribute__((overloadable)) sin(float);
-FN_FUNC_FN(sin)
-
-extern float __attribute__((overloadable)) sincos(float v, float *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
-    *cosptr = cos(v);
-    return sin(v);
-}
-
-extern float __attribute__((overloadable)) sinh(float);
-FN_FUNC_FN(sinh)
-
-extern float __attribute__((overloadable)) sinpi(float v) {
-    return sin(v * M_PI);
-}
-FN_FUNC_FN(sinpi)
-
-extern float __attribute__((overloadable)) tan(float);
-FN_FUNC_FN(tan)
-
-extern float __attribute__((overloadable)) tanh(float);
-FN_FUNC_FN(tanh)
-
-extern float __attribute__((overloadable)) tanpi(float v) {
-    return tan(v * M_PI);
-}
-FN_FUNC_FN(tanpi)
-
-
-extern float __attribute__((overloadable)) tgamma(float);
-FN_FUNC_FN(tgamma)
-
-extern float __attribute__((overloadable)) trunc(float);
-FN_FUNC_FN(trunc)
-
-// Int ops (partial), 6.11.3
-
-#define XN_FUNC_YN(typeout, fnc, typein)                                \
-extern typeout __attribute__((overloadable)) fnc(typein);               \
-extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) {  \
-    typeout##2 r;                                                       \
-    r.x = fnc(v.x);                                                     \
-    r.y = fnc(v.y);                                                     \
-    return r;                                                           \
-}                                                                       \
-extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) {  \
-    typeout##3 r;                                                       \
-    r.x = fnc(v.x);                                                     \
-    r.y = fnc(v.y);                                                     \
-    r.z = fnc(v.z);                                                     \
-    return r;                                                           \
-}                                                                       \
-extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) {  \
-    typeout##4 r;                                                       \
-    r.x = fnc(v.x);                                                     \
-    r.y = fnc(v.y);                                                     \
-    r.z = fnc(v.z);                                                     \
-    r.w = fnc(v.w);                                                     \
-    return r;                                                           \
-}
-
-
-#define UIN_FUNC_IN(fnc)          \
-XN_FUNC_YN(uchar, fnc, char)      \
-XN_FUNC_YN(ushort, fnc, short)    \
-XN_FUNC_YN(uint, fnc, int)
-
-#define IN_FUNC_IN(fnc)           \
-XN_FUNC_YN(uchar, fnc, uchar)     \
-XN_FUNC_YN(char, fnc, char)       \
-XN_FUNC_YN(ushort, fnc, ushort)   \
-XN_FUNC_YN(short, fnc, short)     \
-XN_FUNC_YN(uint, fnc, uint)       \
-XN_FUNC_YN(int, fnc, int)
-
-
-#define XN_FUNC_XN_XN_BODY(type, fnc, body)         \
-extern type __attribute__((overloadable))       \
-        fnc(type v1, type v2) {                     \
-    return body;                                    \
-}                                                   \
-extern type##2 __attribute__((overloadable))    \
-        fnc(type##2 v1, type##2 v2) {               \
-    type##2 r;                                      \
-    r.x = fnc(v1.x, v2.x);                          \
-    r.y = fnc(v1.y, v2.y);                          \
-    return r;                                       \
-}                                                   \
-extern type##3 __attribute__((overloadable))    \
-        fnc(type##3 v1, type##3 v2) {               \
-    type##3 r;                                      \
-    r.x = fnc(v1.x, v2.x);                          \
-    r.y = fnc(v1.y, v2.y);                          \
-    r.z = fnc(v1.z, v2.z);                          \
-    return r;                                       \
-}                                                   \
-extern type##4 __attribute__((overloadable))    \
-        fnc(type##4 v1, type##4 v2) {               \
-    type##4 r;                                      \
-    r.x = fnc(v1.x, v2.x);                          \
-    r.y = fnc(v1.y, v2.y);                          \
-    r.z = fnc(v1.z, v2.z);                          \
-    r.w = fnc(v1.w, v2.w);                          \
-    return r;                                       \
-}
-
-#define IN_FUNC_IN_IN_BODY(fnc, body) \
-XN_FUNC_XN_XN_BODY(uchar, fnc, body)  \
-XN_FUNC_XN_XN_BODY(char, fnc, body)   \
-XN_FUNC_XN_XN_BODY(ushort, fnc, body) \
-XN_FUNC_XN_XN_BODY(short, fnc, body)  \
-XN_FUNC_XN_XN_BODY(uint, fnc, body)   \
-XN_FUNC_XN_XN_BODY(int, fnc, body)    \
-XN_FUNC_XN_XN_BODY(float, fnc, body)
-
-
-/**
- * abs
- */
-extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
-    if (v < 0)
-        return -v;
-    return v;
-}
-extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
-    if (v < 0)
-        return -v;
-    return v;
-}
-extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
-    if (v < 0)
-        return -v;
-    return v;
-}
-
-/**
- * clz
- */
-extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
-    return __builtin_clz(v);
-}
-extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
-    return (uint16_t)__builtin_clz(v);
-}
-extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
-    return (uint8_t)__builtin_clz(v);
-}
-extern int32_t __attribute__((overloadable)) clz(int32_t v) {
-    return (int32_t)__builtin_clz((uint32_t)v);
-}
-extern int16_t __attribute__((overloadable)) clz(int16_t v) {
-    return (int16_t)__builtin_clz(v);
-}
-extern int8_t __attribute__((overloadable)) clz(int8_t v) {
-    return (int8_t)__builtin_clz(v);
-}
-
-
-UIN_FUNC_IN(abs)
-IN_FUNC_IN(clz)
-
-
-// 6.11.4
-
-
-extern float __attribute__((overloadable)) degrees(float radians) {
-    return radians * (180.f / M_PI);
-}
-extern float2 __attribute__((overloadable)) degrees(float2 radians) {
-    return radians * (180.f / M_PI);
-}
-extern float3 __attribute__((overloadable)) degrees(float3 radians) {
-    return radians * (180.f / M_PI);
-}
-extern float4 __attribute__((overloadable)) degrees(float4 radians) {
-    return radians * (180.f / M_PI);
-}
-
-extern float __attribute__((overloadable)) mix(float start, float stop, float amount) {
-    return start + (stop - start) * amount;
-}
-extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
-    return start + (stop - start) * amount;
-}
-extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
-    return start + (stop - start) * amount;
-}
-extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
-    return start + (stop - start) * amount;
-}
-extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
-    return start + (stop - start) * amount;
-}
-extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
-    return start + (stop - start) * amount;
-}
-extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
-    return start + (stop - start) * amount;
-}
-
-extern float __attribute__((overloadable)) radians(float degrees) {
-    return degrees * (M_PI / 180.f);
-}
-extern float2 __attribute__((overloadable)) radians(float2 degrees) {
-    return degrees * (M_PI / 180.f);
-}
-extern float3 __attribute__((overloadable)) radians(float3 degrees) {
-    return degrees * (M_PI / 180.f);
-}
-extern float4 __attribute__((overloadable)) radians(float4 degrees) {
-    return degrees * (M_PI / 180.f);
-}
-
-extern float __attribute__((overloadable)) step(float edge, float v) {
-    return (v < edge) ? 0.f : 1.f;
-}
-extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
-    float2 r;
-    r.x = (v.x < edge.x) ? 0.f : 1.f;
-    r.y = (v.y < edge.y) ? 0.f : 1.f;
-    return r;
-}
-extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
-    float3 r;
-    r.x = (v.x < edge.x) ? 0.f : 1.f;
-    r.y = (v.y < edge.y) ? 0.f : 1.f;
-    r.z = (v.z < edge.z) ? 0.f : 1.f;
-    return r;
-}
-extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
-    float4 r;
-    r.x = (v.x < edge.x) ? 0.f : 1.f;
-    r.y = (v.y < edge.y) ? 0.f : 1.f;
-    r.z = (v.z < edge.z) ? 0.f : 1.f;
-    r.w = (v.w < edge.w) ? 0.f : 1.f;
-    return r;
-}
-extern float2 __attribute__((overloadable)) step(float2 edge, float v) {
-    float2 r;
-    r.x = (v < edge.x) ? 0.f : 1.f;
-    r.y = (v < edge.y) ? 0.f : 1.f;
-    return r;
-}
-extern float3 __attribute__((overloadable)) step(float3 edge, float v) {
-    float3 r;
-    r.x = (v < edge.x) ? 0.f : 1.f;
-    r.y = (v < edge.y) ? 0.f : 1.f;
-    r.z = (v < edge.z) ? 0.f : 1.f;
-    return r;
-}
-extern float4 __attribute__((overloadable)) step(float4 edge, float v) {
-    float4 r;
-    r.x = (v < edge.x) ? 0.f : 1.f;
-    r.y = (v < edge.y) ? 0.f : 1.f;
-    r.z = (v < edge.z) ? 0.f : 1.f;
-    r.w = (v < edge.w) ? 0.f : 1.f;
-    return r;
-}
-
-extern float __attribute__((overloadable)) smoothstep(float, float, float);
-extern float2 __attribute__((overloadable)) smoothstep(float2, float2, float2);
-extern float3 __attribute__((overloadable)) smoothstep(float3, float3, float3);
-extern float4 __attribute__((overloadable)) smoothstep(float4, float4, float4);
-extern float2 __attribute__((overloadable)) smoothstep(float, float, float2);
-extern float3 __attribute__((overloadable)) smoothstep(float, float, float3);
-extern float4 __attribute__((overloadable)) smoothstep(float, float, float4);
-
-extern float __attribute__((overloadable)) sign(float v) {
-    if (v > 0) return 1.f;
-    if (v < 0) return -1.f;
-    return v;
-}
-FN_FUNC_FN(sign)
-
-
-// 6.11.5
-extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
-    float3 r;
-    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
-    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
-    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
-    float4 r;
-    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
-    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
-    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
-    r.w = 0.f;
-    return r;
-}
-
-extern float __attribute__((overloadable)) length(float v);
-extern float __attribute__((overloadable)) length(float2 v);
-extern float __attribute__((overloadable)) length(float3 v);
-extern float __attribute__((overloadable)) length(float4 v);
-
-extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
-    return length(lhs - rhs);
-}
-extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
-    return length(lhs - rhs);
-}
-extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
-    return length(lhs - rhs);
-}
-extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
-    return length(lhs - rhs);
-}
-
-extern float __attribute__((overloadable)) normalize(float v) {
-    return 1.f;
-}
-extern float2 __attribute__((overloadable)) normalize(float2 v) {
-    return v / length(v);
-}
-extern float3 __attribute__((overloadable)) normalize(float3 v) {
-    return v / length(v);
-}
-extern float4 __attribute__((overloadable)) normalize(float4 v) {
-    return v / length(v);
-}
-
-extern float __attribute__((overloadable)) half_sqrt(float);
-
-extern float __attribute__((overloadable)) fast_length(float v) {
-    return fabs(v);
-}
-extern float __attribute__((overloadable)) fast_length(float2 v) {
-    return half_sqrt(v.x*v.x + v.y*v.y);
-}
-extern float __attribute__((overloadable)) fast_length(float3 v) {
-    return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-}
-extern float __attribute__((overloadable)) fast_length(float4 v) {
-    return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
-}
-
-extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) {
-    return fast_length(lhs - rhs);
-}
-extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) {
-    return fast_length(lhs - rhs);
-}
-extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) {
-    return fast_length(lhs - rhs);
-}
-extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) {
-    return fast_length(lhs - rhs);
-}
-
-extern float __attribute__((overloadable)) half_rsqrt(float);
-
-extern float __attribute__((overloadable)) fast_normalize(float v) {
-    return 1.f;
-}
-extern float2 __attribute__((overloadable)) fast_normalize(float2 v) {
-    return v * half_rsqrt(v.x*v.x + v.y*v.y);
-}
-extern float3 __attribute__((overloadable)) fast_normalize(float3 v) {
-    return v * half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-}
-extern float4 __attribute__((overloadable)) fast_normalize(float4 v) {
-    return v * half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
-}
-
-extern float __attribute__((overloadable)) half_recip(float);
-
-/*
-extern float __attribute__((overloadable)) approx_atan(float x) {
-    if (x == 0.f)
-        return 0.f;
-    if (x < 0.f)
-        return -1.f * approx_atan(-1.f * x);
-    if (x > 1.f)
-        return M_PI_2 - approx_atan(approx_recip(x));
-    return x * approx_recip(1.f + 0.28f * x*x);
-}
-FN_FUNC_FN(approx_atan)
-*/
-
-typedef union
-{
-  float fv;
-  int32_t iv;
-} ieee_float_shape_type;
-
-/* Get a 32 bit int from a float.  */
-
-#define GET_FLOAT_WORD(i,d)                 \
-do {                                \
-  ieee_float_shape_type gf_u;                   \
-  gf_u.fv = (d);                     \
-  (i) = gf_u.iv;                      \
-} while (0)
-
-/* Set a float from a 32 bit int.  */
-
-#define SET_FLOAT_WORD(d,i)                 \
-do {                                \
-  ieee_float_shape_type sf_u;                   \
-  sf_u.iv = (i);                      \
-  (d) = sf_u.fv;                     \
-} while (0)
-
-
-
-// Valid -125 to 125
-extern float __attribute__((overloadable)) native_exp2(float v) {
-    int32_t iv = (int)v;
-    int32_t x = iv + (iv >> 31); // ~floor(v)
-    float r = (v - x);
-
-    float fo;
-    SET_FLOAT_WORD(fo, (x + 127) << 23);
-
-    r *= 0.694f; // ~ log(e) / log(2)
-    float r2 = r*r;
-    float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
-    return fo * adj;
-}
-
-extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
-    int2 iv = convert_int2(v);
-    int2 x = iv + (iv >> (int2)31);//floor(v);
-    float2 r = (v - convert_float2(x));
-
-    x += 127;
-
-    float2 fo = (float2)(x << (int2)23);
-
-    r *= 0.694f; // ~ log(e) / log(2)
-    float2 r2 = r*r;
-    float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
-    return fo * adj;
-}
-
-extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
-    int4 iv = convert_int4(v);
-    int4 x = iv + (iv >> (int4)31);//floor(v);
-    float4 r = (v - convert_float4(x));
-
-    x += 127;
-
-    float4 fo = (float4)(x << (int4)23);
-
-    r *= 0.694f; // ~ log(e) / log(2)
-    float4 r2 = r*r;
-    float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
-    return fo * adj;
-}
-
-extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
-    float4 t = 1.f;
-    t.xyz = v;
-    return native_exp2(t).xyz;
-}
-
-
-extern float __attribute__((overloadable)) native_exp(float v) {
-    return native_exp2(v * 1.442695041f);
-}
-extern float2 __attribute__((overloadable)) native_exp(float2 v) {
-    return native_exp2(v * 1.442695041f);
-}
-extern float3 __attribute__((overloadable)) native_exp(float3 v) {
-    return native_exp2(v * 1.442695041f);
-}
-extern float4 __attribute__((overloadable)) native_exp(float4 v) {
-    return native_exp2(v * 1.442695041f);
-}
-
-extern float __attribute__((overloadable)) native_exp10(float v) {
-    return native_exp2(v * 3.321928095f);
-}
-extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
-    return native_exp2(v * 3.321928095f);
-}
-extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
-    return native_exp2(v * 3.321928095f);
-}
-extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
-    return native_exp2(v * 3.321928095f);
-}
-
-extern float __attribute__((overloadable)) native_log2(float v) {
-    int32_t ibits;
-    GET_FLOAT_WORD(ibits, v);
-
-    int32_t e = (ibits >> 23) & 0xff;
-
-    ibits &= 0x7fffff;
-    ibits |= 127 << 23;
-
-    float ir;
-    SET_FLOAT_WORD(ir, ibits);
-
-    ir -= 1.5f;
-    float ir2 = ir*ir;
-    float adj2 = 0.405465108f + // -0.00009f +
-                 (0.666666667f * ir) -
-                 (0.222222222f * ir2) +
-                 (0.098765432f * ir*ir2) -
-                 (0.049382716f * ir2*ir2) +
-                 (0.026337449f * ir*ir2*ir2) -
-                 (0.014631916f * ir2*ir2*ir2);
-    adj2 *= (1.f / 0.693147181f);
-
-    return (float)(e - 127) + adj2;
-}
-extern float2 __attribute__((overloadable)) native_log2(float2 v) {
-    float2 v2 = {native_log2(v.x), native_log2(v.y)};
-    return v2;
-}
-extern float3 __attribute__((overloadable)) native_log2(float3 v) {
-    float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)};
-    return v2;
-}
-extern float4 __attribute__((overloadable)) native_log2(float4 v) {
-    float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)};
-    return v2;
-}
-
-extern float __attribute__((overloadable)) native_log(float v) {
-    return native_log2(v) * (1.f / 1.442695041f);
-}
-extern float2 __attribute__((overloadable)) native_log(float2 v) {
-    return native_log2(v) * (1.f / 1.442695041f);
-}
-extern float3 __attribute__((overloadable)) native_log(float3 v) {
-    return native_log2(v) * (1.f / 1.442695041f);
-}
-extern float4 __attribute__((overloadable)) native_log(float4 v) {
-    return native_log2(v) * (1.f / 1.442695041f);
-}
-
-extern float __attribute__((overloadable)) native_log10(float v) {
-    return native_log2(v) * (1.f / 3.321928095f);
-}
-extern float2 __attribute__((overloadable)) native_log10(float2 v) {
-    return native_log2(v) * (1.f / 3.321928095f);
-}
-extern float3 __attribute__((overloadable)) native_log10(float3 v) {
-    return native_log2(v) * (1.f / 3.321928095f);
-}
-extern float4 __attribute__((overloadable)) native_log10(float4 v) {
-    return native_log2(v) * (1.f / 3.321928095f);
-}
-
-
-extern float __attribute__((overloadable)) native_powr(float v, float y) {
-    float v2 = native_log2(v);
-    v2 = fmax(v2, -125.f);
-    return native_exp2(v2 * y);
-}
-extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) {
-    float2 v2 = native_log2(v);
-    v2 = fmax(v2, -125.f);
-    return native_exp2(v2 * y);
-}
-extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) {
-    float3 v2 = native_log2(v);
-    v2 = fmax(v2, -125.f);
-    return native_exp2(v2 * y);
-}
-extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) {
-    float4 v2 = native_log2(v);
-    v2 = fmax(v2, -125.f);
-    return native_exp2(v2 * y);
-}
-
-
-#undef FN_FUNC_FN
-#undef IN_FUNC_FN
-#undef FN_FUNC_FN_FN
-#undef FN_FUNC_FN_F
-#undef FN_FUNC_FN_IN
-#undef FN_FUNC_FN_I
-#undef FN_FUNC_FN_PFN
-#undef FN_FUNC_FN_PIN
-#undef FN_FUNC_FN_FN_FN
-#undef FN_FUNC_FN_FN_PIN
-#undef XN_FUNC_YN
-#undef UIN_FUNC_IN
-#undef IN_FUNC_IN
-#undef XN_FUNC_XN_XN_BODY
-#undef IN_FUNC_IN_IN_BODY
diff --git a/lib/Renderscript/runtime/rs_core.c b/lib/Renderscript/runtime/rs_core.c
deleted file mode 100644
index 54fcccb..0000000
--- a/lib/Renderscript/runtime/rs_core.c
+++ /dev/null
@@ -1,204 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-/* Function declarations from libRS */
-extern float4 __attribute__((overloadable)) convert_float4(uchar4 c);
-
-/* Implementation of Core Runtime */
-
-/*
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
-{
-    uchar4 c;
-    c.x = (uchar)(r * 255.f + 0.5f);
-    c.y = (uchar)(g * 255.f + 0.5f);
-    c.z = (uchar)(b * 255.f + 0.5f);
-    c.w = 255;
-    return c;
-}
-
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
-{
-    uchar4 c;
-    c.x = (uchar)(r * 255.f + 0.5f);
-    c.y = (uchar)(g * 255.f + 0.5f);
-    c.z = (uchar)(b * 255.f + 0.5f);
-    c.w = (uchar)(a * 255.f + 0.5f);
-    return c;
-}
-
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
-{
-    color *= 255.f;
-    color += 0.5f;
-    uchar4 c = {color.x, color.y, color.z, 255};
-    return c;
-}
-
-extern uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
-{
-    color *= 255.f;
-    color += 0.5f;
-    uchar4 c = {color.x, color.y, color.z, color.w};
-    return c;
-}
-*/
-
-extern float4 rsUnpackColor8888(uchar4 c)
-{
-    return convert_float4(c) * 0.003921569f;
-}
-
-
-extern int32_t __attribute__((overloadable)) rsAtomicCas(volatile int32_t *ptr, int32_t expectedValue, int32_t newValue) {
-    return __sync_val_compare_and_swap(ptr, expectedValue, newValue);
-}
-
-extern uint32_t __attribute__((overloadable)) rsAtomicCas(volatile uint32_t *ptr, uint32_t expectedValue, uint32_t newValue) {
-    return __sync_val_compare_and_swap((volatile int32_t *)ptr, (int32_t)expectedValue, (int32_t)newValue);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicInc(volatile int32_t *ptr) {
-    return __sync_fetch_and_add(ptr, 1);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicDec(volatile int32_t *ptr) {
-    return __sync_fetch_and_sub(ptr, 1);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicAdd(volatile int32_t *ptr, int32_t value) {
-    return __sync_fetch_and_add(ptr, value);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicSub(volatile int32_t *ptr, int32_t value) {
-    return __sync_fetch_and_sub(ptr, value);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicAnd(volatile int32_t *ptr, int32_t value) {
-    return __sync_fetch_and_and(ptr, value);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicOr(volatile int32_t *ptr, int32_t value) {
-    return __sync_fetch_and_or(ptr, value);
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicXor(volatile int32_t *ptr, int32_t value) {
-    return __sync_fetch_and_xor(ptr, value);
-}
-
-extern uint32_t __attribute__((overloadable)) min(uint32_t, uint32_t);
-extern int32_t __attribute__((overloadable)) min(int32_t, int32_t);
-extern uint32_t __attribute__((overloadable)) max(uint32_t, uint32_t);
-extern int32_t __attribute__((overloadable)) max(int32_t, int32_t);
-
-extern uint32_t __attribute__((overloadable)) rsAtomicMin(volatile uint32_t *ptr, uint32_t value) {
-    uint32_t prev, status;
-    do {
-        prev = *ptr;
-        uint32_t n = min(value, prev);
-        status = rsAtomicCas((volatile int32_t*) ptr, (int32_t) prev, (int32_t)n);
-    } while (status != prev);
-    return prev;
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicMin(volatile int32_t *ptr, int32_t value) {
-    int32_t prev, status;
-    do {
-        prev = *ptr;
-        int32_t n = min(value, prev);
-        status = rsAtomicCas(ptr, prev, n);
-    } while (status != prev);
-    return prev;
-}
-
-extern uint32_t __attribute__((overloadable)) rsAtomicMax(volatile uint32_t *ptr, uint32_t value) {
-    uint32_t prev, status;
-    do {
-        prev = *ptr;
-        uint32_t n = max(value, prev);
-        status = rsAtomicCas((volatile int32_t*) ptr, (int32_t) prev, (int32_t) n);
-    } while (status != prev);
-    return prev;
-}
-
-extern int32_t __attribute__((overloadable)) rsAtomicMax(volatile int32_t *ptr, int32_t value) {
-    int32_t prev, status;
-    do {
-        prev = *ptr;
-        int32_t n = max(value, prev);
-        status = rsAtomicCas(ptr, prev, n);
-    } while (status != prev);
-    return prev;
-}
-
-
-
-extern int32_t rand();
-#define RAND_MAX 0x7fffffff
-
-
-
-extern float __attribute__((overloadable)) rsRand(float min, float max);/* {
-    float r = (float)rand();
-    r /= RAND_MAX;
-    r = r * (max - min) + min;
-    return r;
-}
-*/
-
-extern float __attribute__((overloadable)) rsRand(float max) {
-    return rsRand(0.f, max);
-    //float r = (float)rand();
-    //r *= max;
-    //r /= RAND_MAX;
-    //return r;
-}
-
-extern int __attribute__((overloadable)) rsRand(int max) {
-    return (int)rsRand((float)max);
-}
-
-extern int __attribute__((overloadable)) rsRand(int min, int max) {
-    return (int)rsRand((float)min, (float)max);
-}
-
-#define PRIM_DEBUG(T)                               \
-extern void __attribute__((overloadable)) rsDebug(const char *, const T *);     \
-void __attribute__((overloadable)) rsDebug(const char *txt, T val) {            \
-    rsDebug(txt, &val);                                                         \
-}
-
-PRIM_DEBUG(char2)
-PRIM_DEBUG(char3)
-PRIM_DEBUG(char4)
-PRIM_DEBUG(uchar2)
-PRIM_DEBUG(uchar3)
-PRIM_DEBUG(uchar4)
-PRIM_DEBUG(short2)
-PRIM_DEBUG(short3)
-PRIM_DEBUG(short4)
-PRIM_DEBUG(ushort2)
-PRIM_DEBUG(ushort3)
-PRIM_DEBUG(ushort4)
-PRIM_DEBUG(int2)
-PRIM_DEBUG(int3)
-PRIM_DEBUG(int4)
-PRIM_DEBUG(uint2)
-PRIM_DEBUG(uint3)
-PRIM_DEBUG(uint4)
-PRIM_DEBUG(long2)
-PRIM_DEBUG(long3)
-PRIM_DEBUG(long4)
-PRIM_DEBUG(ulong2)
-PRIM_DEBUG(ulong3)
-PRIM_DEBUG(ulong4)
-PRIM_DEBUG(float2)
-PRIM_DEBUG(float3)
-PRIM_DEBUG(float4)
-PRIM_DEBUG(double2)
-PRIM_DEBUG(double3)
-PRIM_DEBUG(double4)
-
-#undef PRIM_DEBUG
-
diff --git a/lib/Renderscript/runtime/rs_element.c b/lib/Renderscript/runtime/rs_element.c
deleted file mode 100644
index 4db5883..0000000
--- a/lib/Renderscript/runtime/rs_element.c
+++ /dev/null
@@ -1,111 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-/**
-* Element
-*/
-extern uint32_t __attribute__((overloadable))
-        rsElementGetSubElementCount(rs_element e) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL) {
-        return 0;
-    }
-    return element->mHal.state.fieldsCount;
-}
-
-extern rs_element __attribute__((overloadable))
-        rsElementGetSubElement(rs_element e, uint32_t index) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL || index >= element->mHal.state.fieldsCount) {
-        rs_element nullElem = {0};
-        return nullElem;
-    }
-    rs_element returnElem = {element->mHal.state.fields[index]};
-    return returnElem;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsElementGetSubElementNameLength(rs_element e, uint32_t index) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL || index >= element->mHal.state.fieldsCount) {
-        return 0;
-    }
-    return element->mHal.state.fieldNameLengths[index];
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsElementGetSubElementName(rs_element e, uint32_t index, char *name, uint32_t nameLength) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL || index >= element->mHal.state.fieldsCount ||
-        nameLength == 0 || name == 0) {
-        return 0;
-    }
-
-    uint32_t numToCopy = element->mHal.state.fieldNameLengths[index];
-    if (nameLength < numToCopy) {
-        numToCopy = nameLength;
-    }
-    // Place the null terminator manually, in case of partial string
-    numToCopy --;
-    name[numToCopy] = '\0';
-    const char *nameSource = element->mHal.state.fieldNames[index];
-    for (uint32_t i = 0; i < numToCopy; i ++) {
-        name[i] = nameSource[i];
-    }
-    return numToCopy;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsElementGetSubElementArraySize(rs_element e, uint32_t index) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL || index >= element->mHal.state.fieldsCount) {
-        return 0;
-    }
-    return element->mHal.state.fieldArraySizes[index];
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsElementGetSubElementOffsetBytes(rs_element e, uint32_t index) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL || index >= element->mHal.state.fieldsCount) {
-        return 0;
-    }
-    return element->mHal.state.fieldOffsetBytes[index];
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsElementGetBytesSize(rs_element e) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL) {
-        return 0;
-    }
-    return element->mHal.state.elementSizeBytes;
-}
-
-extern rs_data_type __attribute__((overloadable))
-        rsElementGetDataType(rs_element e) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL) {
-        return RS_TYPE_INVALID;
-    }
-    return element->mHal.state.dataType;
-}
-
-extern rs_data_kind __attribute__((overloadable))
-        rsElementGetDataKind(rs_element e) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL) {
-        return RS_KIND_INVALID;
-    }
-    return element->mHal.state.dataKind;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsElementGetVectorSize(rs_element e) {
-    Element_t *element = (Element_t *)e.p;
-    if (element == NULL) {
-        return 0;
-    }
-    return element->mHal.state.vectorSize;
-}
diff --git a/lib/Renderscript/runtime/rs_matrix.c b/lib/Renderscript/runtime/rs_matrix.c
deleted file mode 100644
index 3afccc1..0000000
--- a/lib/Renderscript/runtime/rs_matrix.c
+++ /dev/null
@@ -1,314 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-/* Function declarations from libRS */
-extern float4 __attribute__((overloadable)) convert_float4(uchar4 c);
-
-/* Implementation of Core Runtime */
-
-
-/////////////////////////////////////////////////////
-// Matrix ops
-/////////////////////////////////////////////////////
-
-
-extern void __attribute__((overloadable))
-rsMatrixLoadIdentity(rs_matrix4x4 *m) {
-    m->m[0] = 1.f;
-    m->m[1] = 0.f;
-    m->m[2] = 0.f;
-    m->m[3] = 0.f;
-    m->m[4] = 0.f;
-    m->m[5] = 1.f;
-    m->m[6] = 0.f;
-    m->m[7] = 0.f;
-    m->m[8] = 0.f;
-    m->m[9] = 0.f;
-    m->m[10] = 1.f;
-    m->m[11] = 0.f;
-    m->m[12] = 0.f;
-    m->m[13] = 0.f;
-    m->m[14] = 0.f;
-    m->m[15] = 1.f;
-}
-
-extern void __attribute__((overloadable))
-rsMatrixLoadIdentity(rs_matrix3x3 *m) {
-    m->m[0] = 1.f;
-    m->m[1] = 0.f;
-    m->m[2] = 0.f;
-    m->m[3] = 0.f;
-    m->m[4] = 1.f;
-    m->m[5] = 0.f;
-    m->m[6] = 0.f;
-    m->m[7] = 0.f;
-    m->m[8] = 1.f;
-}
-extern void __attribute__((overloadable))
-rsMatrixLoadIdentity(rs_matrix2x2 *m) {
-    m->m[0] = 1.f;
-    m->m[1] = 0.f;
-    m->m[2] = 0.f;
-    m->m[3] = 1.f;
-}
-
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const float *f) {
-    m->m[0] = f[0];
-    m->m[1] = f[1];
-    m->m[2] = f[2];
-    m->m[3] = f[3];
-    m->m[4] = f[4];
-    m->m[5] = f[5];
-    m->m[6] = f[6];
-    m->m[7] = f[7];
-    m->m[8] = f[8];
-    m->m[9] = f[9];
-    m->m[10] = f[10];
-    m->m[11] = f[11];
-    m->m[12] = f[12];
-    m->m[13] = f[13];
-    m->m[14] = f[14];
-    m->m[15] = f[15];
-}
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix3x3 *m, const float *f) {
-    m->m[0] = f[0];
-    m->m[1] = f[1];
-    m->m[2] = f[2];
-    m->m[3] = f[3];
-    m->m[4] = f[4];
-    m->m[5] = f[5];
-    m->m[6] = f[6];
-    m->m[7] = f[7];
-    m->m[8] = f[8];
-}
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix2x2 *m, const float *f) {
-    m->m[0] = f[0];
-    m->m[1] = f[1];
-    m->m[2] = f[2];
-    m->m[3] = f[3];
-}
-
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix4x4 *s) {
-    m->m[0] = s->m[0];
-    m->m[1] = s->m[1];
-    m->m[2] = s->m[2];
-    m->m[3] = s->m[3];
-    m->m[4] = s->m[4];
-    m->m[5] = s->m[5];
-    m->m[6] = s->m[6];
-    m->m[7] = s->m[7];
-    m->m[8] = s->m[8];
-    m->m[9] = s->m[9];
-    m->m[10] = s->m[10];
-    m->m[11] = s->m[11];
-    m->m[12] = s->m[12];
-    m->m[13] = s->m[13];
-    m->m[14] = s->m[14];
-    m->m[15] = s->m[15];
-}
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix3x3 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = v->m[2];
-    m->m[3] = 0.f;
-    m->m[4] = v->m[3];
-    m->m[5] = v->m[4];
-    m->m[6] = v->m[5];
-    m->m[7] = 0.f;
-    m->m[8] = v->m[6];
-    m->m[9] = v->m[7];
-    m->m[10] = v->m[8];
-    m->m[11] = 0.f;
-    m->m[12] = 0.f;
-    m->m[13] = 0.f;
-    m->m[14] = 0.f;
-    m->m[15] = 1.f;
-}
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix4x4 *m, const rs_matrix2x2 *v) {
-    m->m[0] = v->m[0];
-    m->m[1] = v->m[1];
-    m->m[2] = 0.f;
-    m->m[3] = 0.f;
-    m->m[4] = v->m[2];
-    m->m[5] = v->m[3];
-    m->m[6] = 0.f;
-    m->m[7] = 0.f;
-    m->m[8] = 0.f;
-    m->m[9] = 0.f;
-    m->m[10] = 1.f;
-    m->m[11] = 0.f;
-    m->m[12] = 0.f;
-    m->m[13] = 0.f;
-    m->m[14] = 0.f;
-    m->m[15] = 1.f;
-}
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix3x3 *m, const rs_matrix3x3 *s) {
-    m->m[0] = s->m[0];
-    m->m[1] = s->m[1];
-    m->m[2] = s->m[2];
-    m->m[3] = s->m[3];
-    m->m[4] = s->m[4];
-    m->m[5] = s->m[5];
-    m->m[6] = s->m[6];
-    m->m[7] = s->m[7];
-    m->m[8] = s->m[8];
-}
-extern void __attribute__((overloadable))
-rsMatrixLoad(rs_matrix2x2 *m, const rs_matrix2x2 *s) {
-    m->m[0] = s->m[0];
-    m->m[1] = s->m[1];
-    m->m[2] = s->m[2];
-    m->m[3] = s->m[3];
-}
-
-
-extern void __attribute__((overloadable))
-rsMatrixSet(rs_matrix4x4 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 4 + col] = v;
-}
-
-extern float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix4x4 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 4 + col];
-}
-
-extern void __attribute__((overloadable))
-rsMatrixSet(rs_matrix3x3 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 3 + col] = v;
-}
-
-extern float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix3x3 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 3 + col];
-}
-
-extern void __attribute__((overloadable))
-rsMatrixSet(rs_matrix2x2 *m, uint32_t row, uint32_t col, float v) {
-    m->m[row * 2 + col] = v;
-}
-
-extern float __attribute__((overloadable))
-rsMatrixGet(const rs_matrix2x2 *m, uint32_t row, uint32_t col) {
-    return m->m[row * 2 + col];
-}
-
-extern float2 __attribute__((overloadable))
-rsMatrixMultiply(const rs_matrix2x2 *m, float2 in) {
-    float2 ret;
-    ret.x = (m->m[0] * in.x) + (m->m[2] * in.y);
-    ret.y = (m->m[1] * in.x) + (m->m[3] * in.y);
-    return ret;
-}
-extern float2 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix2x2 *m, float2 in) {
-    return rsMatrixMultiply((const rs_matrix2x2 *)m, in);
-}
-
-extern float4 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, float4 in) {
-    return rsMatrixMultiply((const rs_matrix4x4 *)m, in);
-}
-
-extern float4 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, float3 in) {
-    return rsMatrixMultiply((const rs_matrix4x4 *)m, in);
-}
-
-extern float4 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *m, float2 in) {
-    return rsMatrixMultiply((const rs_matrix4x4 *)m, in);
-}
-
-extern float3 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix3x3 *m, float3 in) {
-    return rsMatrixMultiply((const rs_matrix3x3 *)m, in);
-}
-
-extern float3 __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix3x3 *m, float2 in) {
-    return rsMatrixMultiply((const rs_matrix3x3 *)m, in);
-}
-
-extern void __attribute__((overloadable))
-rsMatrixLoadMultiply(rs_matrix4x4 *ret, const rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
-    for (int i=0 ; i<4 ; i++) {
-        float ri0 = 0;
-        float ri1 = 0;
-        float ri2 = 0;
-        float ri3 = 0;
-        for (int j=0 ; j<4 ; j++) {
-            const float rhs_ij = rsMatrixGet(rhs, i, j);
-            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
-            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
-            ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
-            ri3 += rsMatrixGet(lhs, j, 3) * rhs_ij;
-        }
-        rsMatrixSet(ret, i, 0, ri0);
-        rsMatrixSet(ret, i, 1, ri1);
-        rsMatrixSet(ret, i, 2, ri2);
-        rsMatrixSet(ret, i, 3, ri3);
-    }
-}
-
-extern void __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix4x4 *lhs, const rs_matrix4x4 *rhs) {
-    rs_matrix4x4 r;
-    rsMatrixLoadMultiply(&r, lhs, rhs);
-    rsMatrixLoad(lhs, &r);
-}
-
-extern void __attribute__((overloadable))
-rsMatrixLoadMultiply(rs_matrix3x3 *ret, const rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
-    for (int i=0 ; i<3 ; i++) {
-        float ri0 = 0;
-        float ri1 = 0;
-        float ri2 = 0;
-        for (int j=0 ; j<3 ; j++) {
-            const float rhs_ij = rsMatrixGet(rhs, i, j);
-            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
-            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
-            ri2 += rsMatrixGet(lhs, j, 2) * rhs_ij;
-        }
-        rsMatrixSet(ret, i, 0, ri0);
-        rsMatrixSet(ret, i, 1, ri1);
-        rsMatrixSet(ret, i, 2, ri2);
-    }
-}
-
-extern void __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix3x3 *lhs, const rs_matrix3x3 *rhs) {
-    rs_matrix3x3 r;
-    rsMatrixLoadMultiply(&r, lhs, rhs);
-    rsMatrixLoad(lhs, &r);
-}
-
-extern void __attribute__((overloadable))
-rsMatrixLoadMultiply(rs_matrix2x2 *ret, const rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
-    for (int i=0 ; i<2 ; i++) {
-        float ri0 = 0;
-        float ri1 = 0;
-        for (int j=0 ; j<2 ; j++) {
-            const float rhs_ij = rsMatrixGet(rhs, i, j);
-            ri0 += rsMatrixGet(lhs, j, 0) * rhs_ij;
-            ri1 += rsMatrixGet(lhs, j, 1) * rhs_ij;
-        }
-        rsMatrixSet(ret, i, 0, ri0);
-        rsMatrixSet(ret, i, 1, ri1);
-    }
-}
-
-extern void __attribute__((overloadable))
-rsMatrixMultiply(rs_matrix2x2 *lhs, const rs_matrix2x2 *rhs) {
-    rs_matrix2x2 r;
-    rsMatrixLoadMultiply(&r, lhs, rhs);
-    rsMatrixLoad(lhs, &r);
-}
-
diff --git a/lib/Renderscript/runtime/rs_mesh.c b/lib/Renderscript/runtime/rs_mesh.c
deleted file mode 100644
index bb533bc..0000000
--- a/lib/Renderscript/runtime/rs_mesh.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-/**
-* Mesh
-*/
-extern uint32_t __attribute__((overloadable))
-        rsgMeshGetVertexAllocationCount(rs_mesh m) {
-    Mesh_t *mesh = (Mesh_t *)m.p;
-    if (mesh == NULL) {
-        return 0;
-    }
-    return mesh->mHal.state.vertexBuffersCount;
-}
-
-extern uint32_t __attribute__((overloadable))
-        rsgMeshGetPrimitiveCount(rs_mesh m) {
-    Mesh_t *mesh = (Mesh_t *)m.p;
-    if (mesh == NULL) {
-        return 0;
-    }
-    return mesh->mHal.state.primitivesCount;
-}
-
-extern rs_allocation __attribute__((overloadable))
-        rsgMeshGetVertexAllocation(rs_mesh m, uint32_t index) {
-    Mesh_t *mesh = (Mesh_t *)m.p;
-    if (mesh == NULL || index >= mesh->mHal.state.vertexBuffersCount) {
-        rs_allocation nullAlloc = {0};
-        return nullAlloc;
-    }
-    rs_allocation returnAlloc = {mesh->mHal.state.vertexBuffers[index]};
-    return returnAlloc;
-}
-
-extern rs_allocation __attribute__((overloadable))
-        rsgMeshGetIndexAllocation(rs_mesh m, uint32_t index) {
-    Mesh_t *mesh = (Mesh_t *)m.p;
-    if (mesh == NULL || index >= mesh->mHal.state.primitivesCount) {
-        rs_allocation nullAlloc = {0};
-        return nullAlloc;
-    }
-    rs_allocation returnAlloc = {mesh->mHal.state.indexBuffers[index]};
-    return returnAlloc;
-}
-
-extern rs_primitive __attribute__((overloadable))
-        rsgMeshGetPrimitive(rs_mesh m, uint32_t index) {
-    Mesh_t *mesh = (Mesh_t *)m.p;
-    if (mesh == NULL || index >= mesh->mHal.state.primitivesCount) {
-        return RS_PRIMITIVE_INVALID;
-    }
-    return mesh->mHal.state.primitives[index];
-}
diff --git a/lib/Renderscript/runtime/rs_program.c b/lib/Renderscript/runtime/rs_program.c
deleted file mode 100644
index 64c656f..0000000
--- a/lib/Renderscript/runtime/rs_program.c
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-/**
-* Program Store
-*/
-extern rs_depth_func __attribute__((overloadable))
-        rsgProgramStoreGetDepthFunc(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return RS_DEPTH_FUNC_INVALID;
-    }
-    return prog->mHal.state.depthFunc;
-}
-
-extern bool __attribute__((overloadable))
-        rsgProgramStoreIsDepthMaskEnabled(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.depthWriteEnable;
-}
-
-extern bool __attribute__((overloadable))
-        rsgProgramStoreIsColorMaskRedEnabled(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.colorRWriteEnable;
-}
-
-extern bool __attribute__((overloadable))
-        rsgProgramStoreIsColorMaskGreenEnabled(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.colorGWriteEnable;
-}
-
-extern bool __attribute__((overloadable))
-        rsgProgramStoreIsColorMaskBlueEnabled(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.colorBWriteEnable;
-}
-
-extern bool __attribute__((overloadable))
-        rsgProgramStoreIsColorMaskAlphaEnabled(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.colorAWriteEnable;
-}
-
-extern rs_blend_src_func __attribute__((overloadable))
-        rsgProgramStoreGetBlendSrcFunc(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return RS_BLEND_SRC_INVALID;
-    }
-    return prog->mHal.state.blendSrc;
-}
-
-extern rs_blend_dst_func __attribute__((overloadable))
-        rsgProgramStoreGetBlendDstFunc(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return RS_BLEND_DST_INVALID;
-    }
-    return prog->mHal.state.blendDst;
-}
-
-extern bool __attribute__((overloadable))
-        rsgProgramStoreIsDitherEnabled(rs_program_store ps) {
-    ProgramStore_t *prog = (ProgramStore_t *)ps.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.ditherEnable;
-}
-
-/**
-* Program Raster
-*/
-extern bool __attribute__((overloadable))
-        rsgProgramRasterIsPointSpriteEnabled(rs_program_raster pr) {
-    ProgramRaster_t *prog = (ProgramRaster_t *)pr.p;
-    if (prog == NULL) {
-        return false;
-    }
-    return prog->mHal.state.pointSprite;
-}
-
-extern rs_cull_mode __attribute__((overloadable))
-        rsgProgramRasterGetCullMode(rs_program_raster pr) {
-    ProgramRaster_t *prog = (ProgramRaster_t *)pr.p;
-    if (prog == NULL) {
-        return RS_CULL_INVALID;
-    }
-    return prog->mHal.state.cull;
-}
diff --git a/lib/Renderscript/runtime/rs_sample.c b/lib/Renderscript/runtime/rs_sample.c
deleted file mode 100644
index 8bc6966..0000000
--- a/lib/Renderscript/runtime/rs_sample.c
+++ /dev/null
@@ -1,662 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-
-// 565 Conversion bits taken from SkBitmap
-#define SK_R16_BITS     5
-#define SK_G16_BITS     6
-#define SK_B16_BITS     5
-
-#define SK_R16_SHIFT    (SK_B16_BITS + SK_G16_BITS)
-#define SK_G16_SHIFT    (SK_B16_BITS)
-#define SK_B16_SHIFT    0
-
-#define SK_R16_MASK     ((1 << SK_R16_BITS) - 1)
-#define SK_G16_MASK     ((1 << SK_G16_BITS) - 1)
-#define SK_B16_MASK     ((1 << SK_B16_BITS) - 1)
-
-#define SkGetPackedR16(color)   (((unsigned)(color) >> SK_R16_SHIFT) & SK_R16_MASK)
-#define SkGetPackedG16(color)   (((unsigned)(color) >> SK_G16_SHIFT) & SK_G16_MASK)
-#define SkGetPackedB16(color)   (((unsigned)(color) >> SK_B16_SHIFT) & SK_B16_MASK)
-
-static inline unsigned SkR16ToR32(unsigned r) {
-    return (r << (8 - SK_R16_BITS)) | (r >> (2 * SK_R16_BITS - 8));
-}
-
-static inline unsigned SkG16ToG32(unsigned g) {
-    return (g << (8 - SK_G16_BITS)) | (g >> (2 * SK_G16_BITS - 8));
-}
-
-static inline unsigned SkB16ToB32(unsigned b) {
-    return (b << (8 - SK_B16_BITS)) | (b >> (2 * SK_B16_BITS - 8));
-}
-
-#define SkPacked16ToR32(c)      SkR16ToR32(SkGetPackedR16(c))
-#define SkPacked16ToG32(c)      SkG16ToG32(SkGetPackedG16(c))
-#define SkPacked16ToB32(c)      SkB16ToB32(SkGetPackedB16(c))
-
-static float3 getFrom565(uint16_t color) {
-    float3 result;
-    result.x = (float)SkPacked16ToR32(color);
-    result.y = (float)SkPacked16ToG32(color);
-    result.z = (float)SkPacked16ToB32(color);
-    return result;
-}
-
-/**
-* Allocation sampling
-*/
-static inline float __attribute__((overloadable))
-        getElementAt1(const uint8_t *p, int32_t x) {
-    float r = p[x];
-    return r;
-}
-
-static inline float2 __attribute__((overloadable))
-        getElementAt2(const uint8_t *p, int32_t x) {
-    x *= 2;
-    float2 r = {p[x], p[x+1]};
-    return r;
-}
-
-static inline float3 __attribute__((overloadable))
-        getElementAt3(const uint8_t *p, int32_t x) {
-    x *= 4;
-    float3 r = {p[x], p[x+1], p[x+2]};
-    return r;
-}
-
-static inline float4 __attribute__((overloadable))
-        getElementAt4(const uint8_t *p, int32_t x) {
-    x *= 4;
-    const uchar4 *p2 = (const uchar4 *)&p[x];
-    return convert_float4(p2[0]);
-}
-
-static inline float3 __attribute__((overloadable))
-        getElementAt565(const uint8_t *p, int32_t x) {
-    x *= 2;
-    float3 r = getFrom565(((const uint16_t *)p)[0]);
-    return r;
-}
-
-static inline float __attribute__((overloadable))
-        getElementAt1(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
-    p += y * stride;
-    float r = p[x];
-    return r;
-}
-
-static inline float2 __attribute__((overloadable))
-        getElementAt2(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
-    p += y * stride;
-    x *= 2;
-    float2 r = {p[x], p[x+1]};
-    return r;
-}
-
-static inline float3 __attribute__((overloadable))
-        getElementAt3(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
-    p += y * stride;
-    x *= 4;
-    float3 r = {p[x], p[x+1], p[x+2]};
-    return r;
-}
-
-static inline float4 __attribute__((overloadable))
-        getElementAt4(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
-    p += y * stride;
-    x *= 4;
-    float4 r = {p[x], p[x+1], p[x+2], p[x+3]};
-    return r;
-}
-
-static inline float3 __attribute__((overloadable))
-        getElementAt565(const uint8_t *p, size_t stride, int32_t x, int32_t y) {
-    p += y * stride;
-    x *= 2;
-    float3 r = getFrom565(((const uint16_t *)p)[0]);
-    return r;
-}
-
-
-
-
-
-static float4 __attribute__((overloadable))
-            getSample_A(const uint8_t *p, int32_t iPixel,
-                          int32_t next, float w0, float w1) {
-    float p0 = getElementAt1(p, iPixel);
-    float p1 = getElementAt1(p, next);
-    float r = p0 * w0 + p1 * w1;
-    r *= (1.f / 255.f);
-    float4 ret = {0.f, 0.f, 0.f, r};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_L(const uint8_t *p, int32_t iPixel,
-                          int32_t next, float w0, float w1) {
-    float p0 = getElementAt1(p, iPixel);
-    float p1 = getElementAt1(p, next);
-    float r = p0 * w0 + p1 * w1;
-    r *= (1.f / 255.f);
-    float4 ret = {r, r, r, 1.f};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_LA(const uint8_t *p, int32_t iPixel,
-                           int32_t next, float w0, float w1) {
-    float2 p0 = getElementAt2(p, iPixel);
-    float2 p1 = getElementAt2(p, next);
-    float2 r = p0 * w0 + p1 * w1;
-    r *= (1.f / 255.f);
-    float4 ret = {r.x, r.x, r.x, r.y};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_RGB(const uint8_t *p, int32_t iPixel,
-                            int32_t next, float w0, float w1) {
-    float3 p0 = getElementAt3(p, iPixel);
-    float3 p1 = getElementAt3(p, next);
-    float3 r = p0 * w0 + p1 * w1;
-    r *= (1.f / 255.f);
-    float4 ret = {r.x, r.x, r.z, 1.f};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_565(const uint8_t *p, int32_t iPixel,
-                           int32_t next, float w0, float w1) {
-    float3 p0 = getElementAt565(p, iPixel);
-    float3 p1 = getElementAt565(p, next);
-    float3 r = p0 * w0 + p1 * w1;
-    r *= (1.f / 255.f);
-    float4 ret = {r.x, r.x, r.z, 1.f};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_RGBA(const uint8_t *p, int32_t iPixel,
-                             int32_t next, float w0, float w1) {
-    float4 p0 = getElementAt4(p, iPixel);
-    float4 p1 = getElementAt4(p, next);
-    float4 r = p0 * w0 + p1 * w1;
-    r *= (1.f / 255.f);
-    return r;
-}
-
-
-static float4 __attribute__((overloadable))
-            getSample_A(const uint8_t *p, size_t stride,
-                          int locX, int locY, int nextX, int nextY,
-                          float w0, float w1, float w2, float w3) {
-    float p0 = getElementAt1(p, stride, locX, locY);
-    float p1 = getElementAt1(p, stride, nextX, locY);
-    float p2 = getElementAt1(p, stride, locX, nextY);
-    float p3 = getElementAt1(p, stride, nextX, nextY);
-    float r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
-    r *= (1.f / 255.f);
-    float4 ret = {0.f, 0.f, 0.f, r};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_L(const uint8_t *p, size_t stride,
-                         int locX, int locY, int nextX, int nextY,
-                         float w0, float w1, float w2, float w3) {
-    float p0 = getElementAt1(p, stride, locX, locY);
-    float p1 = getElementAt1(p, stride, nextX, locY);
-    float p2 = getElementAt1(p, stride, locX, nextY);
-    float p3 = getElementAt1(p, stride, nextX, nextY);
-    float r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
-    r *= (1.f / 255.f);
-    float4 ret = {r, r, r, 1.f};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_LA(const uint8_t *p, size_t stride,
-                         int locX, int locY, int nextX, int nextY,
-                         float w0, float w1, float w2, float w3) {
-    float2 p0 = getElementAt2(p, stride, locX, locY);
-    float2 p1 = getElementAt2(p, stride, nextX, locY);
-    float2 p2 = getElementAt2(p, stride, locX, nextY);
-    float2 p3 = getElementAt2(p, stride, nextX, nextY);
-    float2 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
-    r *= (1.f / 255.f);
-    float4 ret = {r.x, r.x, r.x, r.y};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_RGB(const uint8_t *p, size_t stride,
-                         int locX, int locY, int nextX, int nextY,
-                         float w0, float w1, float w2, float w3) {
-    float4 p0 = getElementAt4(p, stride, locX, locY);
-    float4 p1 = getElementAt4(p, stride, nextX, locY);
-    float4 p2 = getElementAt4(p, stride, locX, nextY);
-    float4 p3 = getElementAt4(p, stride, nextX, nextY);
-    float4 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
-    r *= (1.f / 255.f);
-    float4 ret = {r.x, r.y, r.z, 1.f};
-    return ret;
-}
-static float4 __attribute__((overloadable))
-            getSample_RGBA(const uint8_t *p, size_t stride,
-                         int locX, int locY, int nextX, int nextY,
-                         float w0, float w1, float w2, float w3) {
-    float4 p0 = getElementAt4(p, stride, locX, locY);
-    float4 p1 = getElementAt4(p, stride, nextX, locY);
-    float4 p2 = getElementAt4(p, stride, locX, nextY);
-    float4 p3 = getElementAt4(p, stride, nextX, nextY);
-    float4 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
-    r *= (1.f / 255.f);
-    return r;
-}
-static float4 __attribute__((overloadable))
-            getSample_565(const uint8_t *p, size_t stride,
-                         int locX, int locY, int nextX, int nextY,
-                         float w0, float w1, float w2, float w3) {
-    float3 p0 = getElementAt565(p, stride, locX, locY);
-    float3 p1 = getElementAt565(p, stride, nextX, locY);
-    float3 p2 = getElementAt565(p, stride, locX, nextY);
-    float3 p3 = getElementAt565(p, stride, nextX, nextY);
-    float3 r = p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
-    r *= (1.f / 255.f);
-    float4 ret;
-    ret.rgb = r;
-    ret.w = 1.f;
-    return ret;
-}
-
-static float4 __attribute__((overloadable))
-        getBilinearSample1D(const Allocation_t *alloc, float2 weights,
-                          uint32_t iPixel, uint32_t next,
-                          rs_data_kind dk, rs_data_type dt, uint32_t lod) {
-
-     const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
-
-     switch(dk) {
-     case RS_KIND_PIXEL_RGBA:
-         return getSample_RGBA(p, iPixel, next, weights.x, weights.y);
-     case RS_KIND_PIXEL_A:
-         return getSample_A(p, iPixel, next, weights.x, weights.y);
-     case RS_KIND_PIXEL_RGB:
-         if (dt == RS_TYPE_UNSIGNED_5_6_5) {
-             return getSample_565(p, iPixel, next, weights.x, weights.y);
-         }
-         return getSample_RGB(p, iPixel, next, weights.x, weights.y);
-     case RS_KIND_PIXEL_L:
-         return getSample_L(p, iPixel, next, weights.x, weights.y);
-     case RS_KIND_PIXEL_LA:
-         return getSample_LA(p, iPixel, next, weights.x, weights.y);
-
-     default:
-         //__builtin_unreachable();
-         break;
-     }
-
-     //__builtin_unreachable();
-     return 0.f;
-}
-
-static uint32_t wrapI(rs_sampler_value wrap, int32_t coord, int32_t size) {
-    if (wrap == RS_SAMPLER_WRAP) {
-        coord = coord % size;
-        if (coord < 0) {
-            coord += size;
-        }
-    }
-    if (wrap == RS_SAMPLER_MIRRORED_REPEAT) {
-        coord = coord % (size * 2);
-        if (coord < 0) {
-            coord = (size * 2) + coord;
-        }
-        if (coord >= size) {
-            coord = (size * 2) - coord;
-        }
-    }
-    return (uint32_t)max(0, min(coord, size - 1));
-}
-
-static float4 __attribute__((overloadable))
-        getBilinearSample2D(const Allocation_t *alloc, float w0, float w1, float w2, float w3,
-                          int lx, int ly, int nx, int ny,
-                          rs_data_kind dk, rs_data_type dt, uint32_t lod) {
-
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
-    size_t stride = alloc->mHal.drvState.lod[lod].stride;
-
-    switch(dk) {
-    case RS_KIND_PIXEL_RGBA:
-        return getSample_RGBA(p, stride, lx, ly, nx, ny, w0, w1, w2, w3);
-    case RS_KIND_PIXEL_A:
-        return getSample_A(p, stride, lx, ly, nx, ny, w0, w1, w2, w3);
-    case RS_KIND_PIXEL_LA:
-        return getSample_LA(p, stride, lx, ly, nx, ny, w0, w1, w2, w3);
-    case RS_KIND_PIXEL_RGB:
-        if (dt == RS_TYPE_UNSIGNED_5_6_5) {
-            return getSample_565(p, stride, lx, ly, nx, ny, w0, w1, w2, w3);
-        }
-        return getSample_RGB(p, stride, lx, ly, nx, ny, w0, w1, w2, w3);
-    case RS_KIND_PIXEL_L:
-        return getSample_L(p, stride, lx, ly, nx, ny, w0, w1, w2, w3);
-
-    default:
-        //__builtin_unreachable();
-        break;
-    }
-
-    //__builtin_unreachable();
-    return 0.f;
-}
-
-static float4  __attribute__((overloadable))
-        getNearestSample(const Allocation_t *alloc, uint32_t iPixel, rs_data_kind dk,
-                         rs_data_type dt, uint32_t lod) {
-
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
-
-    float4 result = {0.f, 0.f, 0.f, 255.f};
-
-    switch(dk) {
-    case RS_KIND_PIXEL_RGBA:
-        result = getElementAt4(p, iPixel);
-        break;
-    case RS_KIND_PIXEL_A:
-        result.w = getElementAt1(p, iPixel);
-        break;
-    case RS_KIND_PIXEL_LA:
-        result.zw = getElementAt2(p, iPixel);
-        result.xy = result.z;
-        break;
-    case RS_KIND_PIXEL_RGB:
-        if (dt == RS_TYPE_UNSIGNED_5_6_5) {
-            result.xyz = getElementAt565(p, iPixel);
-        } else {
-            result.xyz = getElementAt3(p, iPixel);
-        }
-        break;
-    case RS_KIND_PIXEL_L:
-        result.xyz = getElementAt1(p, iPixel);
-
-    default:
-        //__builtin_unreachable();
-        break;
-    }
-
-    return result * 0.003921569f;
-}
-
-static float4  __attribute__((overloadable))
-        getNearestSample(const Allocation_t *alloc, uint2 iPixel, rs_data_kind dk,
-                         rs_data_type dt, uint32_t lod) {
-
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
-    size_t stride = alloc->mHal.drvState.lod[lod].stride;
-
-    float4 result = {0.f, 0.f, 0.f, 255.f};
-
-    switch(dk) {
-    case RS_KIND_PIXEL_RGBA:
-        result = getElementAt4(p, stride, iPixel.x, iPixel.y);
-        break;
-    case RS_KIND_PIXEL_A:
-        result.w = getElementAt1(p, stride, iPixel.x, iPixel.y);
-        break;
-    case RS_KIND_PIXEL_LA:
-        result.zw = getElementAt2(p, stride, iPixel.x, iPixel.y);
-        result.xy = result.z;
-        break;
-    case RS_KIND_PIXEL_RGB:
-        if (dt == RS_TYPE_UNSIGNED_5_6_5) {
-            result.xyz = getElementAt565(p, stride, iPixel.x, iPixel.y);
-        } else {
-            result.xyz = getElementAt3(p, stride, iPixel.x, iPixel.y);
-        }
-        break;
-
-    default:
-        //__builtin_unreachable();
-        break;
-    }
-
-    return result * 0.003921569f;
-}
-
-static float4 __attribute__((overloadable))
-        sample_LOD_LinearPixel(const Allocation_t *alloc, const Type_t *type,
-                               rs_data_kind dk, rs_data_type dt,
-                               rs_sampler s,
-                               float uv, uint32_t lod) {
-
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
-
-    rs_sampler_value wrapS = rsSamplerGetWrapS(s);
-    int32_t sourceW = alloc->mHal.drvState.lod[lod].dimX;
-    float pixelUV = uv * (float)(sourceW);
-    int32_t iPixel = (int32_t)(pixelUV);
-    float frac = pixelUV - (float)iPixel;
-
-    if (frac < 0.5f) {
-        iPixel -= 1;
-        frac += 0.5f;
-    } else {
-        frac -= 0.5f;
-    }
-
-    float oneMinusFrac = 1.0f - frac;
-
-    float2 weights;
-    weights.x = oneMinusFrac;
-    weights.y = frac;
-
-    uint32_t next = wrapI(wrapS, iPixel + 1, sourceW);
-    uint32_t location = wrapI(wrapS, iPixel, sourceW);
-
-    return getBilinearSample1D(alloc, weights, location, next, dk, dt, lod);
-}
-
-static float4 __attribute__((overloadable))
-        sample_LOD_NearestPixel(const Allocation_t *alloc,
-                                rs_data_kind dk, rs_data_type dt,
-                                rs_sampler s,
-                                float uv, uint32_t lod) {
-
-    rs_sampler_value wrapS = rsSamplerGetWrapS(s);
-    int32_t sourceW = alloc->mHal.drvState.lod[lod].dimX;
-    int32_t iPixel = (int32_t)(uv * (float)(sourceW));
-    uint32_t location = wrapI(wrapS, iPixel, sourceW);
-
-    return getNearestSample(alloc, location, dk, dt, lod);
-}
-
-static float4 __attribute__((overloadable))
-        sample_LOD_LinearPixel(const Allocation_t *alloc,
-                               rs_data_kind dk, rs_data_type dt,
-                               rs_sampler s,
-                               float2 uv, uint32_t lod) {
-
-    const uint8_t *p = (const uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
-
-    rs_sampler_value wrapS = rsSamplerGetWrapS(s);
-    rs_sampler_value wrapT = rsSamplerGetWrapT(s);
-
-    int sourceW = alloc->mHal.drvState.lod[lod].dimX;
-    int sourceH = alloc->mHal.drvState.lod[lod].dimY;
-
-    float pixelU = uv.x * sourceW;
-    float pixelV = uv.y * sourceH;
-    int iPixelU = pixelU;
-    int iPixelV = pixelV;
-    float fracU = pixelU - iPixelU;
-    float fracV = pixelV - iPixelV;
-
-    if (fracU < 0.5f) {
-        iPixelU -= 1;
-        fracU += 0.5f;
-    } else {
-        fracU -= 0.5f;
-    }
-    if (fracV < 0.5f) {
-        iPixelV -= 1;
-        fracV += 0.5f;
-    } else {
-        fracV -= 0.5f;
-    }
-    float oneMinusFracU = 1.0f - fracU;
-    float oneMinusFracV = 1.0f - fracV;
-
-    float w0 = oneMinusFracU * oneMinusFracV;
-    float w1 = fracU * oneMinusFracV;
-    float w2 = oneMinusFracU * fracV;
-    float w3 = fracU * fracV;
-
-    int nx = wrapI(wrapS, iPixelU + 1, sourceW);
-    int ny = wrapI(wrapT, iPixelV + 1, sourceH);
-    int lx = wrapI(wrapS, iPixelU, sourceW);
-    int ly = wrapI(wrapT, iPixelV, sourceH);
-
-    return getBilinearSample2D(alloc, w0, w1, w2, w3, lx, ly, nx, ny, dk, dt, lod);
-
-}
-
-static float4 __attribute__((overloadable))
-        sample_LOD_NearestPixel(const Allocation_t *alloc,
-                                rs_data_kind dk, rs_data_type dt,
-                                rs_sampler s,
-                                float2 uv, uint32_t lod) {
-    rs_sampler_value wrapS = rsSamplerGetWrapS(s);
-    rs_sampler_value wrapT = rsSamplerGetWrapT(s);
-
-    int sourceW = alloc->mHal.drvState.lod[lod].dimX;
-    int sourceH = alloc->mHal.drvState.lod[lod].dimY;
-
-    float2 dimF;
-    dimF.x = (float)(sourceW);
-    dimF.y = (float)(sourceH);
-    int2 iPixel = convert_int2(uv * dimF);
-
-    uint2 location;
-    location.x = wrapI(wrapS, iPixel.x, sourceW);
-    location.y = wrapI(wrapT, iPixel.y, sourceH);
-    return getNearestSample(alloc, location, dk, dt, lod);
-}
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float uv, float lod) {
-    rs_element elem = rsAllocationGetElement(a);
-    rs_data_kind dk = rsElementGetDataKind(elem);
-    rs_data_type dt = rsElementGetDataType(elem);
-
-    if (dk == RS_KIND_USER || (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5)) {
-        return 0.f;
-    }
-
-    const Allocation_t *alloc = (const Allocation_t *)a.p;
-    const Type_t *type = (const Type_t*)alloc->mHal.state.type;
-
-    rs_sampler_value sampleMin = rsSamplerGetMinification(s);
-    rs_sampler_value sampleMag = rsSamplerGetMagnification(s);
-
-    if (lod <= 0.0f) {
-        if (sampleMag == RS_SAMPLER_NEAREST) {
-            return sample_LOD_NearestPixel(alloc, dk, dt, s, uv, 0);
-        }
-        return sample_LOD_LinearPixel(alloc, dk, dt, s, uv, 0);
-    }
-
-    if (sampleMin == RS_SAMPLER_LINEAR_MIP_NEAREST) {
-        uint32_t maxLOD = type->mHal.state.lodCount - 1;
-        lod = min(lod, (float)maxLOD);
-        uint32_t nearestLOD = (uint32_t)round(lod);
-        return sample_LOD_LinearPixel(alloc, dk, dt, s, uv, nearestLOD);
-    }
-
-    if (sampleMin == RS_SAMPLER_LINEAR_MIP_LINEAR) {
-        uint32_t lod0 = (uint32_t)floor(lod);
-        uint32_t lod1 = (uint32_t)ceil(lod);
-        uint32_t maxLOD = type->mHal.state.lodCount - 1;
-        lod0 = min(lod0, maxLOD);
-        lod1 = min(lod1, maxLOD);
-        float4 sample0 = sample_LOD_LinearPixel(alloc, dk, dt, s, uv, lod0);
-        float4 sample1 = sample_LOD_LinearPixel(alloc, dk, dt, s, uv, lod1);
-        float frac = lod - (float)lod0;
-        return sample0 * (1.0f - frac) + sample1 * frac;
-    }
-
-    return sample_LOD_NearestPixel(alloc, dk, dt, s, uv, 0);
-}
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float location) {
-    return rsSample(a, s, location, 0);
-}
-
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float2 uv, float lod) {
-
-    const Allocation_t *alloc = (const Allocation_t *)a.p;
-
-    rs_element elem = rsAllocationGetElement(a);
-    rs_data_kind dk = rsElementGetDataKind(elem);
-    rs_data_type dt = rsElementGetDataType(elem);
-
-    if (dk == RS_KIND_USER ||
-        (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5) ||
-        !(alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE)) {
-        return 0.f;
-    }
-
-    rs_sampler_value sampleMin = rsSamplerGetMinification(s);
-    rs_sampler_value sampleMag = rsSamplerGetMagnification(s);
-
-    if (lod <= 0.0f) {
-        if (sampleMag == RS_SAMPLER_NEAREST) {
-            return sample_LOD_NearestPixel(alloc, dk, dt, s, uv, 0);
-        }
-        return sample_LOD_LinearPixel(alloc, dk, dt, s, uv, 0);
-    }
-
-    if (sampleMin == RS_SAMPLER_LINEAR_MIP_NEAREST) {
-        const Type_t *type = (const Type_t*)alloc->mHal.state.type;
-        uint32_t maxLOD = type->mHal.state.lodCount - 1;
-        lod = min(lod, (float)maxLOD);
-        uint32_t nearestLOD = (uint32_t)round(lod);
-        return sample_LOD_LinearPixel(alloc, dk, dt, s, uv, nearestLOD);
-    }
-
-    if (sampleMin == RS_SAMPLER_LINEAR_MIP_LINEAR) {
-        const Type_t *type = (const Type_t*)alloc->mHal.state.type;
-        uint32_t lod0 = (uint32_t)floor(lod);
-        uint32_t lod1 = (uint32_t)ceil(lod);
-        uint32_t maxLOD = type->mHal.state.lodCount - 1;
-        lod0 = min(lod0, maxLOD);
-        lod1 = min(lod1, maxLOD);
-        float4 sample0 = sample_LOD_LinearPixel(alloc, dk, dt, s, uv, lod0);
-        float4 sample1 = sample_LOD_LinearPixel(alloc, dk, dt, s, uv, lod1);
-        float frac = lod - (float)lod0;
-        return sample0 * (1.0f - frac) + sample1 * frac;
-    }
-
-    return sample_LOD_NearestPixel(alloc, dk, dt, s, uv, 0);
-}
-
-extern const float4 __attribute__((overloadable))
-        rsSample(rs_allocation a, rs_sampler s, float2 uv) {
-
-    const Allocation_t *alloc = (const Allocation_t *)a.p;
-
-    rs_element elem = rsAllocationGetElement(a);
-    rs_data_kind dk = rsElementGetDataKind(elem);
-    rs_data_type dt = rsElementGetDataType(elem);
-
-    if (dk == RS_KIND_USER ||
-        (dt != RS_TYPE_UNSIGNED_8 && dt != RS_TYPE_UNSIGNED_5_6_5) ||
-        !(alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE)) {
-        return 0.f;
-    }
-
-    if (rsSamplerGetMagnification(s) == RS_SAMPLER_NEAREST) {
-        return sample_LOD_NearestPixel(alloc, dk, dt, s, uv, 0);
-    }
-    return sample_LOD_LinearPixel(alloc, dk, dt, s, uv, 0);
-}
-
diff --git a/lib/Renderscript/runtime/rs_sampler.c b/lib/Renderscript/runtime/rs_sampler.c
deleted file mode 100644
index 39782de..0000000
--- a/lib/Renderscript/runtime/rs_sampler.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "rs_core.rsh"
-#include "rs_graphics.rsh"
-#include "rs_structs.h"
-
-/**
-* Sampler
-*/
-extern rs_sampler_value __attribute__((overloadable))
-        rsSamplerGetMinification(rs_sampler s) {
-    Sampler_t *prog = (Sampler_t *)s.p;
-    if (prog == NULL) {
-        return RS_SAMPLER_INVALID;
-    }
-    return prog->mHal.state.minFilter;
-}
-
-extern rs_sampler_value __attribute__((overloadable))
-        rsSamplerGetMagnification(rs_sampler s) {
-    Sampler_t *prog = (Sampler_t *)s.p;
-    if (prog == NULL) {
-        return RS_SAMPLER_INVALID;
-    }
-    return prog->mHal.state.magFilter;
-}
-
-extern rs_sampler_value __attribute__((overloadable))
-        rsSamplerGetWrapS(rs_sampler s) {
-    Sampler_t *prog = (Sampler_t *)s.p;
-    if (prog == NULL) {
-        return RS_SAMPLER_INVALID;
-    }
-    return prog->mHal.state.wrapS;
-}
-
-extern rs_sampler_value __attribute__((overloadable))
-        rsSamplerGetWrapT(rs_sampler s) {
-    Sampler_t *prog = (Sampler_t *)s.p;
-    if (prog == NULL) {
-        return RS_SAMPLER_INVALID;
-    }
-    return prog->mHal.state.wrapT;
-}
-
-extern float __attribute__((overloadable))
-        rsSamplerGetAnisotropy(rs_sampler s) {
-    Sampler_t *prog = (Sampler_t *)s.p;
-    if (prog == NULL) {
-        return 0.0f;
-    }
-    return prog->mHal.state.aniso;
-}
diff --git a/lib/Renderscript/runtime/rs_structs.h b/lib/Renderscript/runtime/rs_structs.h
deleted file mode 100644
index 6db4279..0000000
--- a/lib/Renderscript/runtime/rs_structs.h
+++ /dev/null
@@ -1,262 +0,0 @@
-#ifndef _RS_STRUCTS_H_
-#define _RS_STRUCTS_H_
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Allocation owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsAllocation.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsAllocationGetDimX(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * allocations.
- *
- *****************************************************************************/
-typedef enum {
-    RS_ALLOCATION_MIPMAP_NONE = 0,
-    RS_ALLOCATION_MIPMAP_FULL = 1,
-    RS_ALLOCATION_MIPMAP_ON_SYNC_TO_TEXTURE = 2
-} rs_allocation_mipmap_control;
-
-typedef struct Allocation {
-    char __pad[32];
-    struct {
-        void * drv;
-        struct {
-            const void *type;
-            uint32_t usageFlags;
-            rs_allocation_mipmap_control mipmapControl;
-            uint32_t yuv;
-            uint32_t elementSizeBytes;
-            bool hasMipmaps;
-            bool hasFaces;
-            bool hasReferences;
-            void * usrPtr;
-            int32_t surfaceTextureID;
-            void * wndSurface;
-            void * surfaceTexture;
-        } state;
-
-        struct DrvState {
-            struct LodState {
-                void * mallocPtr;
-                size_t stride;
-                uint32_t dimX;
-                uint32_t dimY;
-                uint32_t dimZ;
-            } lod[16/*android::renderscript::Allocation::MAX_LOD*/];
-            size_t faceOffset;
-            uint32_t lodCount;
-            uint32_t faceCount;
-        } drvState;
-    } mHal;
-} Allocation_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class ProgramStore owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsProgramStore.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsgProgramStoreGetDepthFunc(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * program store.
- *
- *****************************************************************************/
-typedef struct ProgramStore {
-    char __pad[40];
-    struct {
-        struct {
-            bool ditherEnable;
-            bool colorRWriteEnable;
-            bool colorGWriteEnable;
-            bool colorBWriteEnable;
-            bool colorAWriteEnable;
-            rs_blend_src_func blendSrc;
-            rs_blend_dst_func blendDst;
-            bool depthWriteEnable;
-            rs_depth_func depthFunc;
-        } state;
-    } mHal;
-} ProgramStore_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class ProgramRaster owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsProgramRaster.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsgProgramRasterGetCullMode(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * program raster.
- *
- *****************************************************************************/
-typedef struct ProgramRaster {
-    char __pad[36];
-    struct {
-        void * drv;
-        struct {
-            bool pointSprite;
-            rs_cull_mode cull;
-        } state;
-    } mHal;
-} ProgramRaster_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Sampler owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsSampler.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsgProgramRasterGetMagFilter(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * samplers.
- *
- *****************************************************************************/
-typedef struct Sampler {
-    char __pad[32];
-    struct {
-        void *drv;
-        struct {
-            rs_sampler_value magFilter;
-            rs_sampler_value minFilter;
-            rs_sampler_value wrapS;
-            rs_sampler_value wrapT;
-            rs_sampler_value wrapR;
-            float aniso;
-        } state;
-    } mHal;
-} Sampler_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Element owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsElement.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsElementGetSubElementCount(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * elements.
- *
- *****************************************************************************/
-typedef struct Element {
-    char __pad[32];
-    struct {
-        void *drv;
-        struct {
-            rs_data_type dataType;
-            rs_data_kind dataKind;
-            uint32_t vectorSize;
-            uint32_t elementSizeBytes;
-
-            // Subelements
-            const void **fields;
-            uint32_t *fieldArraySizes;
-            const char **fieldNames;
-            uint32_t *fieldNameLengths;
-            uint32_t *fieldOffsetBytes;
-            uint32_t fieldsCount;
-        } state;
-    } mHal;
-} Element_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Type owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsType.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsAllocationGetElement(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * types.
- *
- *****************************************************************************/
-typedef struct Type {
-    char __pad[32];
-    struct {
-        void *drv;
-        struct {
-            const void * element;
-            uint32_t dimX;
-            uint32_t dimY;
-            uint32_t dimZ;
-            uint32_t *lodDimX;
-            uint32_t *lodDimY;
-            uint32_t *lodDimZ;
-            uint32_t *lodOffset;
-            uint32_t lodCount;
-            bool faces;
-        } state;
-    } mHal;
-} Type_t;
-
-/*****************************************************************************
- * CAUTION
- *
- * The following structure layout provides a more efficient way to access
- * internal members of the C++ class Mesh owned by librs. Unfortunately,
- * since this class has virtual members, we can't simply use offsetof() or any
- * other compiler trickery to dynamically get the appropriate values at
- * build-time. This layout may need to be updated whenever
- * frameworks/base/libs/rs/rsMesh.h is modified.
- *
- * Having the layout information available in this file allows us to
- * accelerate functionality like rsMeshGetVertexAllocationCount(). Without this
- * information, we would not be able to inline the bitcode, thus resulting in
- * potential runtime performance penalties for tight loops operating on
- * meshes.
- *
- *****************************************************************************/
-typedef struct Mesh {
-    char __pad[32];
-    struct {
-        void *drv;
-        struct {
-            void **vertexBuffers;
-            uint32_t vertexBuffersCount;
-
-            // indexBuffers[i] could be NULL, in which case only primitives[i] is used
-            void **indexBuffers;
-            uint32_t indexBuffersCount;
-            rs_primitive *primitives;
-            uint32_t primitivesCount;
-        } state;
-    } mHal;
-} Mesh_t;
-#endif // _RS_CORE_H_
diff --git a/lib/Support/Initialization.cpp b/lib/Support/Initialization.cpp
index b41962c..97e4834 100644
--- a/lib/Support/Initialization.cpp
+++ b/lib/Support/Initialization.cpp
@@ -26,7 +26,8 @@
 
 namespace {
 
-void llvm_error_handler(void *pUserData, const std::string &pMessage) {
+void llvm_error_handler(void *pUserData, const std::string &pMessage,
+                        bool pGenCrashDiag) {
   ALOGE("%s", pMessage.c_str());
   ::exit(1);
 }
@@ -46,6 +47,7 @@
 
 #if defined(PROVIDE_ARM_CODEGEN)
   LLVMInitializeARMAsmPrinter();
+  LLVMInitializeARMAsmParser();
 # if USE_DISASSEMBLER
   LLVMInitializeARMDisassembler();
 # endif
@@ -56,6 +58,7 @@
 
 #if defined(PROVIDE_MIPS_CODEGEN)
   LLVMInitializeMipsAsmPrinter();
+  LLVMInitializeMipsAsmParser();
 # if USE_DISASSEMBLER
   LLVMInitializeMipsDisassembler();
 # endif
@@ -66,6 +69,7 @@
 
 #if defined(PROVIDE_X86_CODEGEN)
   LLVMInitializeX86AsmPrinter();
+  LLVMInitializeX86AsmParser();
 # if USE_DISASSEMBLER
   LLVMInitializeX86Disassembler();
 # endif
diff --git a/lib/Support/TargetCompilerConfigs.cpp b/lib/Support/TargetCompilerConfigs.cpp
index 7d3de45..948e836 100644
--- a/lib/Support/TargetCompilerConfigs.cpp
+++ b/lib/Support/TargetCompilerConfigs.cpp
@@ -14,8 +14,12 @@
  * limitations under the License.
  */
 
+#include "bcc/Support/Properties.h"
 #include "bcc/Support/TargetCompilerConfigs.h"
 
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Host.h"
+
 // Get ARM version number (i.e., __ARM_ARCH__)
 #ifdef __arm__
 #include <machine/cpu-features.h>
@@ -45,6 +49,9 @@
 void
 ARMBaseCompilerConfig::GetFeatureVector(std::vector<std::string> &pAttributes,
                                         bool pInThumbMode, bool pEnableNEON) {
+  llvm::StringMap<bool> Features;
+  llvm::sys::getHostCPUFeatures(Features);
+
 #if defined(ARCH_ARM_HAVE_VFP)
   pAttributes.push_back("+vfp3");
 #  if !defined(ARCH_ARM_HAVE_VFP_D32)
@@ -60,17 +67,20 @@
     }
   }
 
-#if defined(ARCH_ARM_HAVE_NEON)
-  if (pEnableNEON) {
+  if (pEnableNEON && Features.count("neon") && Features["neon"]) {
     pAttributes.push_back("+neon");
   } else {
     pAttributes.push_back("-neon");
     pAttributes.push_back("-neonfp");
   }
-#else
-  pAttributes.push_back("-neon");
-  pAttributes.push_back("-neonfp");
-#endif
+
+  if (!getProperty("debug.rs.arm-no-hwdiv")) {
+    if (Features.count("hwdiv-arm") && Features["hwdiv-arm"])
+      pAttributes.push_back("+hwdiv-arm");
+
+    if (Features.count("hwdiv") && Features["hwdiv"])
+      pAttributes.push_back("+hwdiv");
+  }
 
   return;
 }
@@ -82,6 +92,9 @@
   // Enable NEON by default.
   mEnableNEON = true;
 
+  if (!getProperty("debug.rs.arm-no-tune-for-cpu"))
+    setCPU(llvm::sys::getHostCPUName());
+
   std::vector<std::string> attributes;
   GetFeatureVector(attributes, mInThumbMode, mEnableNEON);
   setFeatureString(attributes);
diff --git a/libbcc-device-build.mk b/libbcc-device-build.mk
index 48fa35f..54340c4 100644
--- a/libbcc-device-build.mk
+++ b/libbcc-device-build.mk
@@ -43,10 +43,7 @@
     endif
   endif
   ifeq ($(ARCH_ARM_HAVE_NEON),true)
-    # Disable NEON on cortex-a15 temporarily
-    ifneq ($(strip $(TARGET_CPU_VARIANT)), cortex-a15)
-      LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
-    endif
+    LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
   endif
 else
   ifeq ($(TARGET_ARCH),mips)
diff --git a/libbcc.mk b/libbcc.mk
index 3b2463f..adea004 100644
--- a/libbcc.mk
+++ b/libbcc.mk
@@ -24,7 +24,7 @@
 
 LLVM_ROOT_PATH          := external/llvm
 MCLD_ROOT_PATH          := frameworks/compile/mclinker
-RSLOADER_ROOT_PATH      := frameworks/rs/driver/linkloader
+RSLOADER_ROOT_PATH      := frameworks/rs/cpu_ref/linkloader
 
 #=====================================================================
 # Related Makefile Paths of libbcc
diff --git a/tests/bccarm b/tests/bccarm
deleted file mode 100755
index 0914e10..0000000
--- a/tests/bccarm
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/python
-#
-# Run a test on the ARM version of bcc.
-
-import unittest
-import subprocess
-import os
-import sys
-
-def compile(args):
-    proc = subprocess.Popen(["bcc"] + args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    result = proc.communicate()
-    return result
-
-def runCmd(args):
-    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    result = proc.communicate()
-    return result[0].strip()
-
-def uname():
-    return runCmd(["uname"])
-
-def unameM():
-    return runCmd(["uname", "-m"])
-
-def which(item):
-    return runCmd(["which", item])
-
-def adb(args):
-    return runCmd(["adb"] + args)
-
-def setupArm(file):
-    print "Setting up arm"
-    adb(["remount"])
-    adb(["shell", "rm", "/system/bin/bcc"])
-    adb(["shell", "mkdir", "/system/bin/bccdata"])
-    adb(["shell", "mkdir", "/system/bin/bccdata/data"])
-
-    remoteFileName = os.path.join("/system/bin/bccdata", file)
-    adb(["push", file, remoteFileName])
-
-    # Copy over compiler
-    adb(["sync"])
-    return remoteFileName
-
-def compileArm(args):
-    remoteArgs = []
-    fileName = ""
-    for arg in sys.argv[1:]:
-        if arg.startswith('-'):
-            remoteArgs.append(arg)
-        else:
-            fileName = arg
-
-    remoteFileName = setupArm(fileName)
-    remoteArgs.append(remoteFileName)
-    remoteCmdLine = ["adb", "shell", "/system/bin/bcc"] + remoteArgs
-    proc = subprocess.Popen(remoteCmdLine, stdout=subprocess.PIPE)
-    result = proc.communicate()
-    return result[0].replace("\r","")
-
-
-def main():
-    print compileArm(sys.argv[1:])
-
-if __name__ == '__main__':
-    main()
-
-
diff --git a/tests/data/addressOf.bc b/tests/data/addressOf.bc
deleted file mode 100644
index 8b54383..0000000
--- a/tests/data/addressOf.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/array.bc b/tests/data/array.bc
deleted file mode 100644
index 0d389fc..0000000
--- a/tests/data/array.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/assignment.bc b/tests/data/assignment.bc
deleted file mode 100644
index 6cb36ea..0000000
--- a/tests/data/assignment.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/assignmentop.bc b/tests/data/assignmentop.bc
deleted file mode 100644
index f4131f8..0000000
--- a/tests/data/assignmentop.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/b2071670.bc b/tests/data/b2071670.bc
deleted file mode 100644
index e9b495e..0000000
--- a/tests/data/b2071670.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/brackets.bc b/tests/data/brackets.bc
deleted file mode 100644
index da5fc92..0000000
--- a/tests/data/brackets.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/casts.bc b/tests/data/casts.bc
deleted file mode 100644
index d21e54a..0000000
--- a/tests/data/casts.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/char.bc b/tests/data/char.bc
deleted file mode 100644
index 8ba6f9d..0000000
--- a/tests/data/char.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/comma.bc b/tests/data/comma.bc
deleted file mode 100644
index 0e159c1..0000000
--- a/tests/data/comma.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/constants.bc b/tests/data/constants.bc
deleted file mode 100644
index c0699e1..0000000
--- a/tests/data/constants.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/defines.bc b/tests/data/defines.bc
deleted file mode 100644
index 4457f46..0000000
--- a/tests/data/defines.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/double.bc b/tests/data/double.bc
deleted file mode 100644
index 3dc204e..0000000
--- a/tests/data/double.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/expr-ansi.bc b/tests/data/expr-ansi.bc
deleted file mode 100644
index 9b1ddc9..0000000
--- a/tests/data/expr-ansi.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/expr2.bc b/tests/data/expr2.bc
deleted file mode 100644
index 4b6ce5f..0000000
--- a/tests/data/expr2.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/film.bc b/tests/data/film.bc
deleted file mode 100644
index 5be0267..0000000
--- a/tests/data/film.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/float.bc b/tests/data/float.bc
deleted file mode 100644
index 8ec27f3..0000000
--- a/tests/data/float.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/floatdouble.bc b/tests/data/floatdouble.bc
deleted file mode 100644
index 8d30e62..0000000
--- a/tests/data/floatdouble.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/flops.bc b/tests/data/flops.bc
deleted file mode 100644
index 856b222..0000000
--- a/tests/data/flops.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/hello.bc b/tests/data/hello.bc
deleted file mode 100644
index 74c75e7..0000000
--- a/tests/data/hello.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/inc.bc b/tests/data/inc.bc
deleted file mode 100644
index 6763032..0000000
--- a/tests/data/inc.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/iops.bc b/tests/data/iops.bc
deleted file mode 100644
index 7edaca6..0000000
--- a/tests/data/iops.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/missing-main.bc b/tests/data/missing-main.bc
deleted file mode 100644
index 6eacb7f..0000000
--- a/tests/data/missing-main.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/otcc-ansi.bc b/tests/data/otcc-ansi.bc
deleted file mode 100644
index 377fb47..0000000
--- a/tests/data/otcc-ansi.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/otcc.c b/tests/data/otcc.c
deleted file mode 100644
index 433ae2e..0000000
--- a/tests/data/otcc.c
+++ /dev/null
@@ -1,448 +0,0 @@
-#include <stdio.h>
-#define k *(int*)
-#define a if(
-#define c ad()
-#define i else
-#define p while(
-#define x *(char*)
-#define b ==
-#define V =calloc(1,99999)
-#define f ()
-#define J return
-#define l ae(
-#define n e)
-#define u d!=
-#define F int 
-#define y (j)
-#define r m=
-#define t +4
-F d,z,C,h,P,K,ac,q,G,v,Q,R,D,L,W,M;
-E(n{
-x D++=e;
-}
-o f{
-a L){
-h=x L++;
-a h b 2){
-L=0;
-h=W;
-}
-}
-i h=fgetc(Q);
-}
-X f{
-J isalnum(h)|h b 95;
-}
-Y f{
-a h b 92){
-o f;
-a h b 110)h=10;
-}
-}
-c{
-F e,j,m;
-p isspace(h)|h b 35){
-a h b 35){
-o f;
-c;
-a d b 536){
-c;
-E(32);
-k d=1;
-k(d t)=D;
-}
-p h!=10){
-E(h);
-o f;
-}
-E(h);
-E(2);
-}
-o f;
-}
-C=0;
-d=h;
-a X f){
-E(32);
-M=D;
-p X f){
-E(h);
-o f;
-}
-a isdigit(d)){
-z=strtol(M,0,0);
-d=2;
-}
-i{
-x D=32;
-d=strstr(R,M-1)-R;
-x D=0;
-d=d*8+256;
-a d>536){
-d=P+d;
-a k d b 1){
-L=k(d t);
-W=h;
-o f;
-c;
-}
-}
-}
-}
-i{
-o f;
-a d b 39){
-d=2;
-Y f;
-z=h;
-o f;
-o f;
-}
-i a d b 47&h b 42){
-o f;
-p h){
-p h!=42)o f;
-o f;
-a h b 47)h=0;
-}
-o f;
-c;
-}
-i{
-e="++#m--%am*@R<^1c/@%[_[H3c%@%[_[H3c+@.B#d-@%:_^BKd<<Z/03e>>`/03e<=0f>=/f<@.f>@1f==&g!='g&&k||#l&@.BCh^@.BSi|@.B+j~@/%Yd!@&d*@b";
-p j=x e++){
-r x e++;
-z=0;
-p(C=x e++-98)<0)z=z*64+C+64;
-a j b d&(m b h|m b 64)){
-a m b h){
-o f;
-d=1;
-}
-break;
-}
-}
-}
-}
-}
-l g){
-p g&&g!=-1){
-x q++=g;
-g=g>>8;
-}
-}
-A(n{
-F g;
-p n{
-g=k e;
-k e=q-e-4;
-e=g;
-}
-}
-s(g,n{
-l g);
-k q=e;
-e=q;
-q=q t;
-J e;
-}
-H(n{
-s(184,n;
-}
-B(n{
-J s(233,n;
-}
-S(j,n{
-l 1032325);
-J s(132+j,n;
-}
-Z(n{
-l 49465);
-H(0);
-l 15);
-l e+144);
-l 192);
-}
-N(j,n{
-l j+131);
-s((e<512)<<7|5,n;
-}
-T y{
-F g,e,m,aa;
-g=1;
-a d b 34){
-H(v);
-p h!=34){
-Y f;
-x v++=h;
-o f;
-}
-x v=0;
-v=v t&-4;
-o f;
-c;
-}
-i{
-aa=C;
-r z;
-e=d;
-c;
-a e b 2){
-H(m);
-}
-i a aa b 2){
-T(0);
-s(185,0);
-a e b 33)Z(m);
-i l m);
-}
-i a e b 40){
-w f;
-c;
-}
-i a e b 42){
-c;
-e=d;
-c;
-c;
-a d b 42){
-c;
-c;
-c;
-c;
-e=0;
-}
-c;
-T(0);
-a d b 61){
-c;
-l 80);
-w f;
-l 89);
-l 392+(e b 256));
-}
-i a n{
-a e b 256)l 139);
-i l 48655);
-q++;
-}
-}
-i a e b 38){
-N(10,k d);
-c;
-}
-i{
-g=k e;
-a!g)g=dlsym(0,M);
-a d b 61&j){
-c;
-w f;
-N(6,g);
-}
-i a u 40){
-N(8,g);
-a C b 11){
-N(0,g);
-l z);
-c;
-}
-}
-}
-}
-a d b 40){
-a g b 1)l 80);
-r s(60545,0);
-c;
-j=0;
-p u 41){
-w f;
-s(2393225,j);
-a d b 44)c;
-j=j t;
-}
-k r j;
-c;
-a!g){
-e=e t;
-k e=s(232,k n;
-}
-i a g b 1){
-s(2397439,j);
-j=j t;
-}
-i{
-s(232,g-q-5);
-}
-a j)s(50305,j);
-}
-}
-O y{
-F e,g,m;
-a j--b 1)T(1);
-i{
-O y;
-r 0;
-p j b C){
-g=d;
-e=z;
-c;
-a j>8){
-r S(e,m);
-O y;
-}
-i{
-l 80);
-O y;
-l 89);
-a j b 4|j b 5){
-Z(n;
-}
-i{
-l n;
-a g b 37)l 146);
-}
-}
-}
-a m&&j>8){
-r S(e,m);
-H(e^1);
-B(5);
-A(m);
-H(n;
-}
-}
-}
-w f{
-O(11);
-}
-U f{
-w f;
-J S(0,0);
-}
-I y{
-F m,g,e;
-a d b 288){
-c;
-c;
-r U f;
-c;
-I y;
-a d b 312){
-c;
-g=B(0);
-A(m);
-I y;
-A(g);
-}
-i{
-A(m);
-}
-}
-i a d b 352|d b 504){
-e=d;
-c;
-c;
-a e b 352){
-g=q;
-r U f;
-}
-i{
-a u 59)w f;
-c;
-g=q;
-r 0;
-a u 59)r U f;
-c;
-a u 41){
-e=B(0);
-w f;
-B(g-q-5);
-A(n;
-g=e t;
-}
-}
-c;
-I(&m);
-B(g-q-5);
-A(m);
-}
-i a d b 123){
-c;
-ab(1);
-p u 125)I y;
-c;
-}
-i{
-a d b 448){
-c;
-a u 59)w f;
-K=B(K);
-}
-i a d b 400){
-c;
-k j=B(k j);
-}
-i a u 59)w f;
-c;
-}
-}
-ab y{
-F m;
-p d b 256|u-1&!j){
-a d b 256){
-c;
-p u 59){
-a j){
-G=G t;
-k d=-G;
-}
-i{
-k d=v;
-v=v t;
-}
-c;
-a d b 44)c;
-}
-c;
-}
-i{
-A(k(d t));
-k d=q;
-c;
-c;
-r 8;
-p u 41){
-k d=m;
-r m t;
-c;
-a d b 44)c;
-}
-c;
-K=G=0;
-l 15042901);
-r s(60545,0);
-I(0);
-A(K);
-l 50121);
-k r G;
-}
-}
-}
-main(g,n{
-Q=stdin;
-a g-->1){
-e=e t;
-Q=fopen(k e,"r");
-}
-D=strcpy(R V," int if else while break return for define main ")+48;
-v V;
-q=ac V;
-P V;
-o f;
-c;
-ab(0);
-mprotect(ac & (~ 4095), (99999 + 4095) & (~ 4095), 7);
-fprintf(stderr, "otcc.c: about to execute compiled code.\n");
-J(*(int(*)f)k(P+592))(g,n;
-}
-
diff --git a/tests/data/pointers.bc b/tests/data/pointers.bc
deleted file mode 100644
index d4e88e4..0000000
--- a/tests/data/pointers.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/pointers2.bc b/tests/data/pointers2.bc
deleted file mode 100644
index b6d1f91..0000000
--- a/tests/data/pointers2.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/returnval-ansi.bc b/tests/data/returnval-ansi.bc
deleted file mode 100644
index ac14ab1..0000000
--- a/tests/data/returnval-ansi.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/returnval.c b/tests/data/returnval.c
deleted file mode 100644
index 1cf5bae..0000000
--- a/tests/data/returnval.c
+++ /dev/null
@@ -1,4 +0,0 @@
-main() {
-  return 42;
-}
-
diff --git a/tests/data/short.bc b/tests/data/short.bc
deleted file mode 100644
index eae1d26..0000000
--- a/tests/data/short.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/simplest.bc b/tests/data/simplest.bc
deleted file mode 100644
index 4b6ce5f..0000000
--- a/tests/data/simplest.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/src/addressOf.c b/tests/data/src/addressOf.c
deleted file mode 100644
index e7acde5..0000000
--- a/tests/data/src/addressOf.c
+++ /dev/null
@@ -1,31 +0,0 @@
-void testStruct() {
-    struct str {
-        float x;
-        float y;
-    };
-
-    struct str base;
-    int index = 0;
-
-    base.x = 10.0;
-    struct str *s = &base;
-
-    float *v = &(*s).x;
-    float *v2 = &s[index].x;
-    printf("testStruct: %g %g %g\n",base.x, *v, *v2);
-}
-
-void testArray() {
-    int a[2];
-    a[0] = 1;
-    a[1] = 2;
-    int* p = &a[0];
-    int* p2 = a;
-    printf("testArray: %d %d %d\n", a[0], *p, *p2);
-}
-
-int main() {
-    testStruct();
-    testArray();
-    return 0;
-}
diff --git a/tests/data/src/array.c b/tests/data/src/array.c
deleted file mode 100644
index ca4a728..0000000
--- a/tests/data/src/array.c
+++ /dev/null
@@ -1,107 +0,0 @@
-// Array allocation tests
-
-void testLocalInt()
-{
-    int a[3];
-    a[0] = 1;
-    a[1] = 2;
-    a[2] = a[0] + a[1];
-    printf("localInt: %d\n", a[2]);
-}
-
-char a[3];
-double d[3];
-
-void testGlobalChar()
-{
-    a[0] = 1;
-    a[1] = 2;
-    a[2] = a[0] + a[1];
-    printf("globalChar: %d\n", a[2]);
-}
-
-void testGlobalDouble()
-{
-    d[0] = 1;
-    d[1] = 2;
-    d[2] = d[0] + d[1];
-    printf("globalDouble: %g\n", d[2]);
-}
-
-void testLocalDouble()
-{
-    double d[3];
-    float  m[12];
-    m[0] = 1.0f;
-    m[1] = 2.0f;
-    d[0] = 1.0;
-    d[1] = 2.0;
-    d[2] = d[0] + d[1];
-    m[2] = m[0] + m[1];
-    printf("localDouble: %g %g\n", d[2], m[2]);
-}
-
-void vectorAdd(int* a, int* b, float* c, int len) {
-    int i;
-    for(i = 0; i < len; i++) {
-        c[i] = a[i] + b[i];
-    }
-}
-
-void testArgs() {
-    int a[3], b[3];
-    float c[3];
-    int i;
-    int len = 3;
-    for(i = 0; i < len; i++) {
-        a[i] = i;
-        b[i] = i;
-        c[i] = 0;
-    }
-    vectorAdd(a,b,c, len);
-    printf("testArgs:");
-    for(i = 0; i < len; i++) {
-        printf(" %g", c[i]);
-    }
-    printf("\n");
-}
-
-void testDecay() {
-    char c[4];
-    c[0] = 'H';
-    c[1] = 'i';
-    c[2] = '!';
-    c[3] = 0;
-    printf("testDecay: %s\n", c);
-}
-
-void test2D() {
-    char c[10][20];
-    int x;
-    int y;
-    printf("test2D:\n");
-    for(y = 0; y < 10; y++) {
-        for(x = 0; x < 20; x++) {
-            c[y][x] = 'a' + (15 & (y * 19 + x));
-        }
-    }
-    for(y = 0; y < 10; y++) {
-        for(x = 0; x < 20; x++) {
-            printf("%c", c[y][x]);
-        }
-        printf("\n");
-    }
-
-}
-
-int main()
-{
-    testLocalInt();
-    testLocalDouble();
-    testGlobalChar();
-    testGlobalDouble();
-    testArgs();
-    testDecay();
-    test2D();
-    return 0;
-}
diff --git a/tests/data/src/assignment.c b/tests/data/src/assignment.c
deleted file mode 100644
index 4fc7801..0000000
--- a/tests/data/src/assignment.c
+++ /dev/null
@@ -1,9 +0,0 @@
-int main() {
-    int a = 0;
-    int b = 1;
-    a = b = 2; // Test that "b = 2" generates an rvalue.
-    if (a = 7) { // Test that a = 7 generates an rvalue.
-        b = 3;
-    }
-    return a;
-}
diff --git a/tests/data/src/assignmentop.c b/tests/data/src/assignmentop.c
deleted file mode 100644
index 649edf9..0000000
--- a/tests/data/src/assignmentop.c
+++ /dev/null
@@ -1,62 +0,0 @@
-// Test assignment operations
-
-void testAssignment() {
-    int a = 2;
-    a *= 5;
-    printf("2 *= 5  %d\n", a);
-    a = 20;
-    a /= 5;
-    printf("20 /= 5  %d\n", a);
-    a = 17;
-    a %= 5;
-    printf("17 %%= 5  %d\n", a);
-    a = 17;
-    a += 5;
-    printf("17 += 5  %d\n", a);
-    a = 17;
-    a-=5;
-    printf("17 -= 5  %d\n", a);
-    a = 17;
-    a<<=1;
-    printf("17<<= 1  %d\n", a);
-    a = 17;
-    a>>=1;
-    printf("17>>= 1  %d\n", a);
-    a = 17;
-    a&=1;
-    printf("17&= 1  %d\n", a);
-    a = 17;
-    a^=1;
-    printf("17^= 1  %d\n", a);
-    a = 16;
-    a^=1;
-    printf("16|= 1  %d\n", a);
-}
-
-int a;
-
-int* f() {
-    printf("f()\n");
-    return &a;
-}
-
-void testEval() {
-    a = 0;
-    printf("*f() = *f() + 10;\n");
-    *f() = *f() + 10;
-    printf("a = %d\n", a);
-}
-
-void testOpEval() {
-    a = 0;
-    printf("*f() += 10;\n");
-    *f() += 10;
-    printf("a = %d\n", a);
-}
-
-int main() {
-    testAssignment();
-    testEval();
-    testOpEval();
-    return 0;
-}
diff --git a/tests/data/src/b2071670.c b/tests/data/src/b2071670.c
deleted file mode 100644
index 311bc4f..0000000
--- a/tests/data/src/b2071670.c
+++ /dev/null
@@ -1,9 +0,0 @@
-// See http://b/2071670
-
-int main() {
-    float f = 10.0f;
-    float* floatPointer = &f;
-    // The following line used to incorrectly error: "Incompatible pointer or array types"
-    int* buffer = (int*) floatPointer;
-    return *buffer;
-}
diff --git a/tests/data/src/bellard.otccex.c b/tests/data/src/bellard.otccex.c
deleted file mode 100644
index e8f0989..0000000
--- a/tests/data/src/bellard.otccex.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/* #!/usr/local/bin/otcc */
-/*
- * Sample OTCC C example. You can uncomment the first line and install
- * otcc in /usr/local/bin to make otcc scripts !  
- */
-
-/* Any preprocessor directive except #define are ignored. We put this
-   include so that a standard C compiler can compile this code too. */
-#include <stdio.h>
-
-/* defines are handled, but macro arguments cannot be given. No
-   recursive defines are tolerated */
-#define DEFAULT_BASE 10
-
-/*
- * Only old style K&R prototypes are parsed. Only int arguments are
- * allowed (implicit types).
- * 
- * By benchmarking the execution time of this function (for example
- * for fib(35)), you'll notice that OTCC is quite fast because it
- * generates native i386 machine code.  
- */
-fib(n)
-{
-    if (n <= 2)
-        return 1;
-    else
-        return fib(n-1) + fib(n-2);
-}
-
-/* Identifiers are parsed the same way as C: begins with letter or
-   '_', and then letters, '_' or digits */
-fact(n)
-{
-    /* local variables can be declared. Only 'int' type is supported */
-    int i, r;
-    r = 1;
-    /* 'while' and 'for' loops are supported */
-    for(i=2;i<=n;i++)
-        r = r * i;
-    return r;
-}
-
-/* Well, we could use printf, but it would be too easy */
-print_num(n, b)
-{
-    int tab, p, c;
-    /* Numbers can be entered in decimal, hexadecimal ('0x' prefix) and
-       octal ('0' prefix) */
-    /* more complex programs use malloc */
-    tab = malloc(0x100); 
-    p = tab;
-    while (1) {
-        c = n % b;
-        /* Character constants can be used */
-        if (c >= 10)
-            c = c + 'a' - 10;
-        else
-            c = c + '0';
-        *(char *)p = c;
-        p++;
-        n = n / b;
-        /* 'break' is supported */
-        if (n == 0)
-            break;
-    }
-    while (p != tab) {
-        p--;
-        printf("%c", *(char *)p);
-    }
-    free(tab);
-}
-
-/* 'main' takes standard 'argc' and 'argv' parameters */
-main(argc, argv)
-{
-    /* no local name space is supported, but local variables ARE
-       supported. As long as you do not use a globally defined
-       variable name as local variable (which is a bad habbit), you
-       won't have any problem */
-    int s, n, f, base;
-    
-    /* && and || operator have the same semantics as C (left to right
-       evaluation and early exit) */
-    if (argc != 2 && argc != 3) {
-        /* '*' operator is supported with explicit casting to 'int *',
-           'char *' or 'int (*)()' (function pointer). Of course, 'int'
-           are supposed to be used as pointers too. */
-        s = *(int *)argv;
-        help(s);
-        return 1;
-    }
-    /* Any libc function can be used because OTCC uses dynamic linking */
-    n = atoi(*(int *)(argv + 4));
-    base = DEFAULT_BASE;
-    if (argc >= 3) {
-        base = atoi(*(int *)(argv + 8));
-        if (base < 2 || base > 36) {
-            /* external variables can be used too (here: 'stderr') */
-            fprintf(stderr, "Invalid base\n");
-            return 1;
-        }
-    }
-    printf("fib(%d) = ", n);
-    print_num(fib(n), base);
-    printf("\n");
-
-    printf("fact(%d) = ", n);
-    if (n > 12) {
-        printf("Overflow");
-    } else {
-        /* why not using a function pointer ? */
-        f = &fact;
-        print_num((*(int (*)())f)(n), base);
-    }
-    printf("\n");
-    return 0;
-}
-
-/* functions can be used before being defined */
-help(name)
-{
-    printf("usage: %s n [base]\n", name);
-    printf("Compute fib(n) and fact(n) and output the result in base 'base'\n");
-}
-
diff --git a/tests/data/src/brackets.c b/tests/data/src/brackets.c
deleted file mode 100644
index bab88a2..0000000
--- a/tests/data/src/brackets.c
+++ /dev/null
@@ -1,61 +0,0 @@
-void testBrackets(int* ar, int len) {
-    int i;
-    int errors = 0;
-    for (i = 0; i < len; i++) {
-        ar[i] = i;
-    }
-    for (i = 0; i < len; i++) {
-        if (ar[i] != i) {
-            printf("error: [%d] %d != %d\n", i, ar[i], i);
-            errors++;
-        }
-    }
-    printf("Errors: %d\n", errors);
-}
-
-void testBrackets2D(int** ar2D, int lenX, int lenY) {
-    int x, y;
-    int errors = 0;
-    for (x = 0; x < lenX; x++) {
-        for (y = 0; y < lenY; y++) {
-            ar2D[x][y] = x * lenY + y;
-        }
-    }
-    for (x = 0; x < lenX; x++) {
-        for (y = 0; y < lenY; y++) {
-            int expected = x * lenY + y;
-            int val = ar2D[x][y];
-            if (val != expected) {
-                printf("error: [%d][%d] %d != %d\n", x, y, val, expected);
-                errors++;
-            }
-        }
-    }
-    printf("2D Errors: %d\n", errors);
-}
-
-void testHeap() {
-    int* ar = (int*) malloc(100);
-    testBrackets(ar, 25);
-    free(ar);
-}
-
-void testHeap2D() {
-    int lenX = 10;
-    int lenY = 5;
-    int* ar = (int*) malloc(lenX * lenY * 4);
-    int** ar2D = (int**) malloc(lenX * 4);
-    int i;
-    for(i = 0; i < lenX; i++) {
-        ar2D[i] = ar + lenY * i;
-    }
-    testBrackets2D(ar2D, lenX, lenY);
-    free(ar);
-    free(ar2D);
-}
-
-int main() {
-    testHeap();
-    testHeap2D();
-    return 0;
-}
diff --git a/tests/data/src/casts.c b/tests/data/src/casts.c
deleted file mode 100644
index d3a49b4..0000000
--- a/tests/data/src/casts.c
+++ /dev/null
@@ -1,85 +0,0 @@
-void test1() {
-    int a = 3;
-    int* pb = &a;
-    int c = *pb;
-    printf("Reading from a pointer: %d %d\n", a, c);
-    *pb = 4;
-    printf("Writing to a pointer: %d\n", a);
-    printf("Testing casts: %d %g %g %d\n", 3, (float) 3, 4.5, (int) 4.5);
-}
-
-void test2() {
-    int x = 4;
-    int px = &x;
-    // int z = * px; // An error, expected a pointer type
-    int y = * (int*) px;
-    printf("Testing reading (int*): %d\n", y);
-}
-
-void test3() {
-    int px = (int) malloc(120);
-    * (int*) px = 8;
-    * (int*) (px + 4) = 9;
-    printf("Testing writing (int*): %d %d\n", * (int*) px, * (int*) (px + 4));
-    free((void*) px);
-}
-
-void test4() {
-    int x = 0x12345678;
-    int px = &x;
-    int a = * (char*) px;
-    int b = * (char*) (px + 1);
-    int c = * (char*) (px + 2);
-    int d = * (char*) (px + 3);
-    printf("Testing reading (char*): 0x%02x 0x%02x 0x%02x 0x%02x\n", a, b, c, d);
-}
-
-void test5() {
-    int x = 0xFFFFFFFF;
-    int px = &x;
-    * (char*) px = 0x21;
-    * (char*) (px + 1) = 0x43;
-    * (char*) (px + 2) = 0x65;
-    * (char*) (px + 3) = 0x87;
-    printf("Testing writing (char*): 0x%08x\n", x);
-}
-
-int f(int b) {
-    printf("f(%d)\n", b);
-    return 7 * b;
-}
-
-void test6() {
-    int fp = &f;
-    int x = (*(int(*)()) fp)(10);
-    printf("Function pointer result: %d\n", x);
-}
-
-void test7() {
-    int px = (int) malloc(120);
-    * (float*) px = 8.8f;
-    * (float*) (px + 4) = 9.9f;
-    printf("Testing read/write (float*): %g %g\n", * (float*) px, * (float*) (px + 4));
-    free((void*) px);
-}
-
-void test8() {
-    int px = (int) malloc(120);
-    * (double*) px = 8.8;
-    * (double*) (px + 8) = 9.9;
-    printf("Testing read/write (double*): %g %g\n", * (double*) px, * (double*) (px + 8));
-    free((void*) px);
-}
-
-
-int main() {
-    test1();
-    test2();
-    test3();
-    test4();
-    test5();
-    test6();
-    test7();
-    test8();
-    return 0;
-}
diff --git a/tests/data/src/char.c b/tests/data/src/char.c
deleted file mode 100644
index 8c63ba2..0000000
--- a/tests/data/src/char.c
+++ /dev/null
@@ -1,13 +0,0 @@
-char ga;
-char gb;
-
-int main() {
-    char a = 'c';
-    char b = a * 3;
-    printf("a = %d, b = %d\n", a, b);
-    ga = 'd';
-    gb = ga * 3;
-    printf("ga = %d, gb = %d\n", ga, gb);
-    return 0;
-}
-
diff --git a/tests/data/src/comma.c b/tests/data/src/comma.c
deleted file mode 100644
index 496944c..0000000
--- a/tests/data/src/comma.c
+++ /dev/null
@@ -1,35 +0,0 @@
-int testReturn() {
-    return 10, 20, 30;
-}
-
-int testArg(int a) {
-    return a;
-}
-
-void testComma() {
-    int a;
-    0, a = 10,20;
-    printf("statement: %d\n", a);
-    a = 1;
-    if (a = 0, 1) {
-        printf("if: a = %d\n", a);
-    }
-    int b = 0;
-    a = 10;
-    while(b++,a--) {}
-    printf("while: b = %d\n", b);
-    b = 0;
-    for(b++,a = 0;b++, a < 10; b++, a++) {}
-    printf("for: b = %d\n", b);
-    b = testReturn();
-    printf("return: %d\n", b);
-    b = testArg((a,12));
-    printf("arg: %d\n", b);
-}
-
-
-
-int main() {
-    testComma();
-    return 0;
-}
diff --git a/tests/data/src/constants.c b/tests/data/src/constants.c
deleted file mode 100644
index 230109a..0000000
--- a/tests/data/src/constants.c
+++ /dev/null
@@ -1,30 +0,0 @@
-#define FOO 0x10
-
-int main() {
-    printf("0 = %d\n", 0);
-    printf("010 = %d\n", 010);
-    printf("0x10 = %d\n", FOO);
-    printf("'\\a' = %d\n", '\a');
-    printf("'\\b' = %d\n", '\b');
-    printf("'\\f' = %d\n", '\f');
-    printf("'\\n' = %d\n", '\n');
-    printf("'\\r' = %d\n", '\r');
-    printf("'\\t' = %d\n", '\t');
-    printf("'\\v' = %d\n", '\v');
-    // Undefined
-    // printf("'\\z' = %d\n", '\z');
-    printf("'\\\\' = %d\n", '\\');
-    printf("'\\'' = %d\n", '\'');
-    printf("'\\\"' = %d\n", '\"');
-    printf("'\\?' = %d\n", '\?');
-    printf("'\\0' = %d\n", '\0');
-    printf("'\\1' = %d\n", '\1');
-    printf("'\\12' = %d\n", '\12');
-    printf("'\\123' = %d\n", '\123');
-    printf("'\\x0' = %d\n", '\x0');
-    printf("'\\x1' = %d\n", '\x1');
-    printf("'\\x12' = %d\n", '\x12');
-    printf("'\\x123' = %d\n", '\x123');
-    printf("'\\x1f' = %d\n", '\x1f');
-    printf("'\\x1F' = %d\n", '\x1F');
-}
diff --git a/tests/data/src/defines.c b/tests/data/src/defines.c
deleted file mode 100644
index 6cb6f7e..0000000
--- a/tests/data/src/defines.c
+++ /dev/null
@@ -1,9 +0,0 @@
-// Simple tests of the C preprocessor
-
-#define A 1
-#define A (4 / 2)
-#define B 1 // This is a comment. With a / in it.
-
-int main() {
-    return A + B;
-}
diff --git a/tests/data/src/double.c b/tests/data/src/double.c
deleted file mode 100644
index 5bc20a3..0000000
--- a/tests/data/src/double.c
+++ /dev/null
@@ -1,7 +0,0 @@
-double atof(char *nptr);
-
-int main() {
-    printf("Value = %g\n", atof("10.42"));
-    return 0;
-}
-
diff --git a/tests/data/src/error.c b/tests/data/src/error.c
deleted file mode 100644
index 2e08dcc..0000000
--- a/tests/data/src/error.c
+++ /dev/null
@@ -1,2 +0,0 @@
-void foo;
-
diff --git a/tests/data/src/expr-ansi.c b/tests/data/src/expr-ansi.c
deleted file mode 100644
index d463659..0000000
--- a/tests/data/src/expr-ansi.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Test operators */
-
-void testInc() { int a, b; a = 3; b = a++; printf("3++ = %d %d\n", b, a); }
-void testDec() { int a, b; a = 3; b = a--; printf("3-- = %d %d\n", b, a); }
-void testTimes(){ printf("%d * %d = %d\n", 10, 4, 10 * 4); }
-void testDiv(){ printf("%d / %d = %d\n", 11, 4, 11 / 4); }
-void testMod(){ printf("%d %% %d = %d\n", 11, 4, 11 % 4); }
-void testPlus(){ printf("%d + %d = %d\n", 10, 4, 10 + 4); }
-void testMinus(){ printf("%d - %d = %d\n", 10, 4, 10 - 4); }
-void testShiftLeft(){ printf("%d << %d = %d\n", 10, 4, 10 << 4); }
-void testShiftRight(){ printf("%d >> %d = %d\n", 100, 4, 100 >> 4); }
-void testLess(){ printf("%d < %d = %d\n", 10, 4, 10 < 4); }
-void testLesEqual(){ printf("%d <= %d = %d\n", 10, 4, 10 <= 4); }
-void testGreater(){ printf("%d > %d = %d\n", 10, 4, 10 > 4); }
-void testGreaterEqual(){ printf("%d >= %d = %d\n", 10, 4, 10 >= 4); }
-void testEqualTo(){ printf("%d == %d = %d\n", 10, 4, 10 == 4); }
-void testNotEqualTo(){ printf("%d != %d = %d\n", 10, 4, 10 != 4); }
-void testBitAnd(){ printf("%d & %d = %d\n", 10, 7, 10 & 7); }
-void testBitXor(){ printf("%d ^ %d = %d\n", 10, 7, 10 ^ 7); }
-void testBitOr(){ printf("%d | %d = %d\n", 10, 4, 10 | 4); }
-void testAssignment(){ int a, b; a = 3; b = a; printf("b == %d\n", b); }
-void testLogicalAnd(){ printf("%d && %d = %d\n", 10, 4, 10 && 4); }
-void testLogicalOr(){ printf("%d || %d = %d\n", 10, 4, 10 || 4); }
-void testAddressOf(){ int a; printf("&a is %d\n", &a); }
-void testPointerIndirection(){ int a, b; a = &b; b = 17; printf("*%d  = %d =?= %d\n", a, * (int*) a, b); }
-void testNegation(){ printf("-%d = %d\n", 10, -10); }
-void testUnaryPlus(){ printf("+%d = %d\n", 10, +10); }
-void testUnaryNot(){ printf("!%d = %d\n", 10, !10); }
-void testBitNot(){ printf("~%d = %d\n", 10, ~10); }
-
-int main(int a, char** b) {
-    testInc();
-    testDec();
-    testTimes();
-    testDiv();
-    testMod();
-    testPlus();
-    testMinus();
-    testShiftLeft();
-    testShiftRight();
-    testLess();
-    testLesEqual();
-    testGreater();
-    testGreaterEqual();
-    testEqualTo();
-    testNotEqualTo();
-    testBitAnd();
-    testBinXor();
-    testBitOr();
-    testAssignment();
-    testLogicalAnd();
-    testLogicalOr();
-    testAddressOf();
-    testPointerIndirection();
-    testNegation();
-    testUnaryPlus();
-    testUnaryNot();
-    testBitNot();
-    return 0;
-}
diff --git a/tests/data/src/expr.c b/tests/data/src/expr.c
deleted file mode 100644
index 4f2d2e7..0000000
--- a/tests/data/src/expr.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Test operators */
-
-testInc() { int a, b; a = 3; b = a++; printf("3++ = %d %d\n", b, a); }
-testDec() { int a, b; a = 3; b = a--; printf("3-- = %d %d\n", b, a); }
-testTimes(){ printf("%d * %d = %d\n", 10, 4, 10 * 4); }
-testDiv(){ printf("%d / %d = %d\n", 11, 4, 11 / 4); }
-testMod(){ printf("%d %% %d = %d\n", 11, 4, 11 % 4); }
-testPlus(){ printf("%d + %d = %d\n", 10, 4, 10 + 4); }
-testMinus(){ printf("%d - %d = %d\n", 10, 4, 10 - 4); }
-testShiftLeft(){ printf("%d << %d = %d\n", 10, 4, 10 << 4); }
-testShiftRight(){ printf("%d >> %d = %d\n", 100, 4, 100 >> 4); }
-testLess(){ printf("%d < %d = %d\n", 10, 4, 10 < 4); }
-testLesEqual(){ printf("%d <= %d = %d\n", 10, 4, 10 <= 4); }
-testGreater(){ printf("%d > %d = %d\n", 10, 4, 10 > 4); }
-testGreaterEqual(){ printf("%d >= %d = %d\n", 10, 4, 10 >= 4); }
-testEqualTo(){ printf("%d == %d = %d\n", 10, 4, 10 == 4); }
-testNotEqualTo(){ printf("%d != %d = %d\n", 10, 4, 10 != 4); }
-testBitAnd(){ printf("%d & %d = %d\n", 10, 7, 10 & 7); }
-testBitXor(){ printf("%d ^ %d = %d\n", 10, 7, 10 ^ 7); }
-testBitOr(){ printf("%d | %d = %d\n", 10, 4, 10 | 4); }
-testAssignment(){ int a, b; a = 3; b = a; printf("b == %d\n", b); }
-testLogicalAnd(){ printf("%d && %d = %d\n", 10, 4, 10 && 4); }
-testLogicalOr(){ printf("%d || %d = %d\n", 10, 4, 10 || 4); }
-testAddressOf(){ int a; printf("&a is %d\n", &a); }
-testPointerIndirection(){ int a, b; a = &b; b = 17; printf("*%d  = %d =?= %d\n", a, * (int*) a, b); }
-testNegation(){ printf("-%d = %d\n", 10, -10); }
-testUnaryPlus(){ printf("+%d = %d\n", 10, +10); }
-testUnaryNot(){ printf("!%d = %d\n", 10, !10); }
-testBitNot(){ printf("~%d = %d\n", 10, ~10); }
-
-main(a,b) {
-    testInc();
-    testDec();
-    testTimes();
-    testDiv();
-    testMod();
-    testPlus();
-    testMinus();
-    testShiftLeft();
-    testShiftRight();
-    testLess();
-    testLesEqual();
-    testGreater();
-    testGreaterEqual();
-    testEqualTo();
-    testNotEqualTo();
-    testBitAnd();
-    testBinXor();
-    testBitOr();
-    testAssignment();
-    testLogicalAnd();
-    testLogicalOr();
-    testAddressOf();
-    testPointerIndirection();
-    testNegation();
-    testUnaryPlus();
-    testUnaryNot();
-    testBitNot();
-    return 0;
-}
\ No newline at end of file
diff --git a/tests/data/src/expr2.c b/tests/data/src/expr2.c
deleted file mode 100644
index 04b6a38..0000000
--- a/tests/data/src/expr2.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/* Test operators */
-
-main() {
-    int a;
-    a = a++;
-}
diff --git a/tests/data/src/film.c b/tests/data/src/film.c
deleted file mode 100644
index 00c2d36..0000000
--- a/tests/data/src/film.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// Test logical and bitwise AND and OR
-
-int test(int x, int y) {
-    int v = x || y;
-    return v;
-}
-
-int test2(int x, int y) {
-    if(x | y) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-int test3(int x, int y) {
-    int v = x && y;
-    return v;
-}
-
-int test4(int x, int y) {
-    if(x & y) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-int main(int index)
-{
-    int x,y;
-    printf("testing...\n");
-    int totalBad = 0;
-    for(y = 0; y < 2; y++) {
-        for(x = 0; x < 2; x++) {
-            int a = test(x,y);
-            int b = test2(x,y);
-            if (a != b) {
-                printf("Results differ: OR x=%d y=%d a=%d b=%d\n", x, y, a, b);
-                totalBad++;
-            }
-            a = test3(x,y);
-            b = test4(x,y);
-            if (a != b) {
-                printf("Results differ: AND x=%d y=%d a=%d b=%d\n", x, y, a, b);
-                totalBad++;
-            }
-        }
-    }
-    printf("Total bad: %d\n", totalBad);
-    return 0;
-}
-
diff --git a/tests/data/src/float.c b/tests/data/src/float.c
deleted file mode 100644
index f48b3d1..0000000
--- a/tests/data/src/float.c
+++ /dev/null
@@ -1,57 +0,0 @@
-int ftoi(float f) {
-    return f;
-}
-
-int dtoi(double d) {
-    return d;
-}
-
-float itof(int i) {
-    return i;
-}
-
-double itod(int i) {
-    return i;
-}
-
-float f0, f1;
-double d0, d1;
-
-void testParseConsts() {
-    printf("Constants: %g %g %g %g %g %g %g %g %g\n", 0e1, 0E1, 0.f, .01f,
-          .01e0f, 1.0e-1, 1.0e1, 1.0e+1,
-          .1f);
-}
-void testVars(float arg0, float arg1, double arg2, double arg3) {
-    float local0, local1;
-    double local2, local3;
-    f0 = arg0;
-    f1 = arg1;
-    d0 = arg2;
-    d1 = arg3;
-    local0 = arg0;
-    local1 = arg1;
-    local2 = arg2;
-    local3 = arg3;
-    printf("globals: %g %g %g %g\n", f0, f1, d0, d1);
-    printf("args: %g %g %g %g\n", arg0, arg1, arg2, arg3);
-    printf("locals: %g %g %g %g\n", local0, local1, local2, local3);
-
-
-    printf("cast rval: %g %g\n", * (float*) & f1, * (double*) & d1);
-
-    * (float*) & f0 = 1.1f;
-    * (double*) & d0 = 3.3;
-    printf("cast lval: %g %g %g %g\n", f0, f1, d0, d1);
-}
-
-int main() {
-    testParseConsts();
-    printf("int: %d float: %g double: %g\n", 1, 2.2f, 3.3);
-    printf(" ftoi(1.4f)=%d\n", ftoi(1.4f));
-    printf(" dtoi(2.4)=%d\n", dtoi(2.4));
-    printf(" itof(3)=%g\n", itof(3));
-    printf(" itod(4)=%g\n", itod(4));
-    testVars(1.0f, 2.0f, 3.0, 4.0);
-    return 0;
-}
diff --git a/tests/data/src/floatdouble.c b/tests/data/src/floatdouble.c
deleted file mode 100644
index 264c641..0000000
--- a/tests/data/src/floatdouble.c
+++ /dev/null
@@ -1,9 +0,0 @@
-int main()
-{
-    // Test coercing values when storing.
-    float a = 0.002;
-    double b = 0.1f;
-    int c = 10.002;
-    printf("%g %g %d\n", a, b, c);
-    return 0;
-}
diff --git a/tests/data/src/flops.c b/tests/data/src/flops.c
deleted file mode 100644
index 40b1b28..0000000
--- a/tests/data/src/flops.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// Test floating point operations.
-
-void unaryOps() {
-    // Unary ops
-    printf("-%g = %g\n", 1.1, -1.1);
-    printf("!%g = %d\n", 1.2, !1.2);
-    printf("!%g = %d\n", 0.0, !0.0);
-}
-
-void binaryOps() {
-    printf("double op double:\n");
-    printf("%g + %g = %g\n", 1.0, 2.0, 1.0 + 2.0);
-    printf("%g - %g = %g\n", 1.0, 2.0, 1.0 - 2.0);
-    printf("%g * %g = %g\n", 1.0, 2.0, 1.0 * 2.0);
-    printf("%g / %g = %g\n", 1.0, 2.0, 1.0 / 2.0);
-
-    printf("float op float:\n");
-    printf("%g + %g = %g\n", 1.0f, 2.0f, 1.0f + 2.0f);
-    printf("%g - %g = %g\n", 1.0f, 2.0f, 1.0f - 2.0f);
-    printf("%g * %g = %g\n", 1.0f, 2.0f, 1.0f * 2.0f);
-    printf("%g / %g = %g\n", 1.0f, 2.0f, 1.0f / 2.0f);
-
-    printf("double op float:\n");
-    printf("%g + %g = %g\n", 1.0, 2.0f, 1.0 + 2.0f);
-    printf("%g - %g = %g\n", 1.0, 2.0f, 1.0 - 2.0f);
-    printf("%g * %g = %g\n", 1.0, 2.0f, 1.0 * 2.0f);
-    printf("%g / %g = %g\n", 1.0, 2.0f, 1.0 / 2.0f);
-
-    printf("double op int:\n");
-    printf("%g + %d = %g\n", 1.0, 2, 1.0 + 2);
-    printf("%g - %d = %g\n", 1.0, 2, 1.0 - 2);
-    printf("%g * %d = %g\n", 1.0, 2, 1.0 * 2);
-    printf("%g / %d = %g\n", 1.0, 2, 1.0 / 2);
-
-    printf("int op double:\n");
-    printf("%d + %g = %g\n", 1, 2.0, 1 + 2.0);
-    printf("%d - %g = %g\n", 1, 2.0, 1 - 2.0);
-    printf("%d * %g = %g\n", 1, 2.0, 1 * 2.0);
-    printf("%d / %g = %g\n", 1, 2.0, 1 / 2.0);
-}
-
-void comparisonTestdd(double a, double b) {
-    printf("%g op %g: < %d   <= %d   == %d   >= %d   > %d   != %d\n",
-           a, b, a < b, a <= b, a == b, a >= b, a > b, a != b);
-}
-
-void comparisonOpsdd() {
-    printf("double op double:\n");
-    comparisonTestdd(1.0, 2.0);
-    comparisonTestdd(1.0, 1.0);
-    comparisonTestdd(2.0, 1.0);
-}
-
-
-void comparisonTestdf(double a, float b) {
-    printf("%g op %g: < %d   <= %d   == %d   >= %d   > %d   != %d\n",
-           a, b, a < b, a <= b, a == b, a >= b, a > b, a != b);
-}
-
-void comparisonOpsdf() {
-    printf("double op float:\n");
-    comparisonTestdf(1.0, 2.0f);
-    comparisonTestdf(1.0, 1.0f);
-    comparisonTestdf(2.0, 1.0f);
-}
-
-void comparisonTestff(float a, float b) {
-    printf("%g op %g: < %d   <= %d   == %d   >= %d   > %d   != %d\n",
-           a, b, a < b, a <= b, a == b, a >= b, a > b, a != b);
-}
-
-void comparisonOpsff() {
-    printf("float op float:\n");
-    comparisonTestff(1.0f, 2.0f);
-    comparisonTestff(1.0f, 1.0f);
-    comparisonTestff(2.0f, 1.0f);
-}
-
-void comparisonTestid(int a, double b) {
-    printf("%d op %g: < %d   <= %d   == %d   >= %d   > %d   != %d\n",
-           a, b, a < b, a <= b, a == b, a >= b, a > b, a != b);
-}
-
-void comparisonOpsid() {
-    printf("int op double:\n");
-    comparisonTestid(1, 2.0);
-    comparisonTestid(1, 1.0);
-    comparisonTestid(2, 1.0);
-}
-void comparisonTestdi(double a, int b) {
-    printf("%g op %d: < %d   <= %d   == %d   >= %d   > %d   != %d\n",
-           a, b, a < b, a <= b, a == b, a >= b, a > b, a != b);
-}
-
-void comparisonOpsdi() {
-    printf("double op int:\n");
-    comparisonTestdi(1.0f, 2);
-    comparisonTestdi(1.0f, 1);
-    comparisonTestdi(2.0f, 1);
-}
-
-void comparisonOps() {
-    comparisonOpsdd();
-    comparisonOpsdf();
-    comparisonOpsff();
-    comparisonOpsid();
-    comparisonOpsdi();
-}
-
-int branch(double d) {
-    if (d) {
-        return 1;
-    }
-    return 0;
-}
-
-void testBranching() {
-    printf("branching: %d %d %d\n", branch(-1.0), branch(0.0), branch(1.0));
-}
-
-void testpassi(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, int l) {
-    printf("testpassi: %d %d %d %d %d %d %d %d %d %d %d %d\n", a, b, c, d, e, f, g, h, i, j, k, l);
-}
-
-void testpassf(float a, float b, float c, float d, float e, float f, float g, float h, float i, float j, float k, float l) {
-    printf("testpassf: %g %g %g %g %g %g %g %g %g %g %g %g\n", a, b, c, d, e, f, g, h, i, j, k, l);
-}
-
-void testpassd(double a, double b, double c, double d, double e, double f, double g, double h, double i, double j, double k, double l) {
-    printf("testpassd: %g %g %g %g %g %g %g %g %g %g %g %g\n", a, b, c, d, e, f, g, h, i, j, k, l);
-}
-
-void testpassidf(int i, double d, float f) {
-    printf("testpassidf: %d %g %g\n", i, d, f);
-}
-
-void testParameterPassing() {
-    float x;
-    testpassi(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-    testpassf(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-    testpassd(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
-    testpassi(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f);
-    testpassf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f);
-    testpassd(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f);
-    testpassi(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
-    testpassf(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
-    testpassd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
-    testpassidf(1, 2.0, 3.0f);
-}
-
-int main() {
-    unaryOps();
-    binaryOps();
-    comparisonOps();
-    testBranching();
-    testParameterPassing();
-    return 0;
-}
diff --git a/tests/data/src/funcargs.c b/tests/data/src/funcargs.c
deleted file mode 100644
index 1dce226..0000000
--- a/tests/data/src/funcargs.c
+++ /dev/null
@@ -1,8 +0,0 @@
-int f(int a,int, int c) {
-    return a + c;
-}
-
-int main() {
-    return f(1,2,3);
-}
-
diff --git a/tests/data/src/hello.c b/tests/data/src/hello.c
deleted file mode 100644
index 06c9d03..0000000
--- a/tests/data/src/hello.c
+++ /dev/null
@@ -1,4 +0,0 @@
-int main() {
-    printf("Hello, world\n");
-    return 0;
-}
diff --git a/tests/data/src/inc.c b/tests/data/src/inc.c
deleted file mode 100644
index 14c09d1..0000000
--- a/tests/data/src/inc.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// Check integer operations
-
-int main() {
-    int a = 0;
-    printf("%d\n", a++);
-    printf("%d\n", a++);
-    printf("%d\n", a--);
-    printf("%d\n", a--);
-    printf("%d\n", ++a);
-    printf("%d\n", ++a);
-    printf("%d\n", --a);
-    printf("%d\n", --a);
-    return a;
-}
diff --git a/tests/data/src/iops.c b/tests/data/src/iops.c
deleted file mode 100644
index 780e95d..0000000
--- a/tests/data/src/iops.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// Check integer operations
-
-void loops() {
-    int y;
-    printf("++\n");
-    for(y = 0; y < 10; y++) {
-        printf("%d\n", y);
-    }
-    printf("--\n");
-    for(y = 10; y >= 0; y--) {
-        printf("%d\n", y);
-    }
-}
-
-void checkLiterals() {
-    printf("Literals: %d %d\n", 1, -1);
-}
-
-int main() {
-    checkLiterals();
-    loops();
-    return 0;
-}
diff --git a/tests/data/src/locals.c b/tests/data/src/locals.c
deleted file mode 100644
index f1ef363..0000000
--- a/tests/data/src/locals.c
+++ /dev/null
@@ -1,71 +0,0 @@
-int a;
-
-int f() {
-    int a;
-    // Undefined variable b
-    // printf("f 0: a = %d b = %d\n", a, b);
-    printf("f 0: a = %d\n", a);
-    a = 2;
-    printf("f 1: a = %d\n", a);
-}
-
-int g(int a) {
-    printf("g 0: a = %d\n", a);
-    a = 3;
-    printf("g 1: a = %d\n", a);
-}
-
-int h(int a) {
-    // int a; // gcc 4.3 says error: 'a' redeclared as different kind of symbol
-
-    printf("h 0: a = %d\n", a);
-    a = 4;
-    printf("h 1: a = %d\n", a);
-}
-
-// Already defined global 
-// int h() {}
-int globCheck() {
-    fprintf(stdout, "globCheck()\n");
-}
-
-int fwdCheck() {
-    b();
-    // Undefined forward reference
-    // c();
-}
-
-int b() {
-    printf("b()\n");
-}
-
-int nested() {
-    int a;
-    printf("nested 0: a = %d\n", a);
-    a = 50;
-    printf("nested 1: a = %d\n", a);
-    {
-        int a;
-        printf("nested 2: a = %d\n", a);
-        a = 51;
-        printf("nested 3: a = %d\n", a);
-    }
-    printf("nested 4: a = %d\n", a);
-}
-
-int main() {
-    globCheck();
-    fwdCheck();
-    printf("main 0: a = %d\n", a);
-    a = 5;
-    printf("main 1: a = %d\n", a);
-    f();
-    printf("main 2: a = %d\n", a);
-    g(77);
-    printf("main 3: a = %d\n", a);
-    h(30);
-    printf("main 4: a = %d\n", a);
-    nested();
-    printf("main 5: a = %d\n", a);
-    return 0;
-}
diff --git a/tests/data/src/missing-main.c b/tests/data/src/missing-main.c
deleted file mode 100644
index e73eec4..0000000
--- a/tests/data/src/missing-main.c
+++ /dev/null
@@ -1,4 +0,0 @@
-/* No main. */
-
-a() {
-}
\ No newline at end of file
diff --git a/tests/data/src/otcc-ansi.c b/tests/data/src/otcc-ansi.c
deleted file mode 100644
index e1534bd..0000000
--- a/tests/data/src/otcc-ansi.c
+++ /dev/null
@@ -1,469 +0,0 @@
-// #include <stdio.h>
-extern int stdin, stderr, errno;
-int d, z, C, h, P, K, ac, q, G, v, Q, R, D, L, W, M;
-
-void w();
-void ab(int);
-
-void E(int e) {
-    *(char*) D++ = e;
-}
-
-void o() {
-    if (L) {
-        h = *(char*) L++;
-        if (h == 2) {
-            L = 0;
-            h = W;
-        }
-    } else
-        h = fgetc(Q);
-}
-
-int X() {
-    return isalnum(h) | h == 95;
-}
-
-void Y() {
-    if (h == 92) {
-        o();
-        if (h == 110)
-            h = 10;
-    }
-}
-
-void ad() {
-    int e, j, m;
-    while (isspace(h) | h == 35) {
-        if (h == 35) {
-            o();
-            ad();
-            if (d == 536) {
-                ad();
-                E(32);
-                *(int*) d = 1;
-                *(int*) (d + 4) = D;
-            }
-            while (h != 10) {
-                E(h);
-                o();
-            }
-            E(h);
-            E(2);
-        }
-        o();
-    }
-    C = 0;
-    d = h;	
-    if (X()) {
-        E(32);
-        M = D;
-        while (X()) {
-            E(h);
-            o();
-        }
-        if (isdigit(d)) {
-            z = strtol(M, 0, 0);
-            d = 2;
-        } else {
-            *(char*) D = 32;
-            d = strstr(R, M - 1) - R;
-            *(char*) D = 0;
-            d = d * 8 + 256;
-            if (d > 536) {
-                d = P + d;
-                if (*(int*) d == 1) {
-                    L = *(int*) (d + 4);
-                    W = h;
-                    o();
-                    ad();
-                }
-            }
-        }
-    } else {
-        o();
-        if (d == 39) {
-            d = 2;
-            Y();
-            z = h;
-            o();
-            o();
-        } else if (d == 47 & h == 42) {
-            o();
-            while (h) {
-                while (h != 42)
-                    o();
-                o();
-                if (h == 47)
-                    h = 0;
-            }
-            o();
-            ad();
-        } else {
-            e
-                    = "++#m--%am*@R<^1c/@%[_[H3c%@%[_[H3c+@.B#d-@%:_^BKd<<Z/03e>>`/03e<=0f>=/f<@.f>@1f==&g!='g&&k||#l&@.BCh^@.BSi|@.B+j~@/%Yd!@&d*@b";
-            while (j = *(char*) e++) {
-                m = *(char*) e++;
-                z = 0;
-                while ((C = *(char*) e++ - 98) < 0)
-                    z = z * 64 + C + 64;
-                if (j == d & (m == h | m == 64)) {
-                    if (m == h) {
-                        o();
-                        d = 1;
-                    }
-                    break;
-                }
-            }
-        }
-    }
-}
-
-void ae(int g) {
-    while( g&&g!=-1) {
-        *(char*) q++=g;
-        g=g>>8;
-    }
-}
-
-void A(int e) {
-    int g;
-    while( e) {
-        g=*(int*) e;
-        *(int*) e=q-e-4;
-        e=g;
-    }
-}
-
-int s(int g, int e) {
-    ae(g);
-    *(int*) q = e;
-    e = q;
-    q = q + 4;
-    return e;
-}
-
-void H(int e) {
-    s(184,e);
-}
-
-int B(int e) {
-    return s(233,e);
-}
-
-int S(int j, int e) {
-    ae(1032325);
-    return s(132 + j, e);
-}
-
-void Z(int e) {
-    ae( 49465);
-    H(0);
-    ae( 15);
-    ae( e+144);
-    ae( 192);
-}
-
-void N(int j, int e) {
-    ae(j + 131);
-    s((e > -512 && e < 512) << 7 | 5, e);
-}
-
-void T (int j) {
-    int g,e,m,aa;
-    g=1;
-    if( d == 34) {
-        H(v);
-        while( h!=34) {
-            Y ();
-            *(char*) v++=h;
-            o ();
-        }
-        *(char*) v=0;
-        v=v +4&-4;
-        o ();
-        ad();
-    }
-    else {
-        aa=C;
-        m= z;
-        e=d;
-        ad();
-        if( e == 2) {
-            H(m);
-        }
-        else if( aa == 2) {
-            T(0);
-            s(185,0);
-            if( e == 33)Z(m);
-            else ae( m);
-        }
-        else if( e == 40) {
-            w ();
-            ad();
-        }
-        else if( e == 42) {
-            ad();
-            e=d;
-            ad();
-            ad();
-            if( d == 42) {
-                ad();
-                ad();
-                ad();
-                ad();
-                e=0;
-            }
-            ad();
-            T(0);
-            if( d == 61) {
-                ad();
-                ae( 80);
-                w ();
-                ae( 89);
-                ae( 392+(e == 256));
-            }
-            else if( e) {
-                if( e == 256)ae( 139);
-                else ae( 48655);
-                q++;
-            }
-        }
-        else if( e == 38) {
-            N(10,*(int*) d);
-            ad();
-        }
-        else {
-            g=*(int*) e;
-            if(!g)g=dlsym(0,M);
-            if( d == 61&j) {
-                ad();
-                w ();
-                N(6,g);
-            }
-            else if( d!= 40) {
-                N(8,g);
-                if( C == 11) {
-                    N(0,g);
-                    ae( z);
-                    ad();
-                }
-            }
-        }
-    }
-    if( d == 40) {
-        if( g == 1)ae( 80);
-        m= s(60545,0);
-        ad();
-        j=0;
-        while( d!= 41) {
-            w ();
-            s(2393225,j);
-            if( d == 44)ad();
-            j=j +4;
-        }
-        *(int*) m= j;
-        ad();
-        if(!g) {
-            e=e +4;
-            *(int*) e=s(232,*(int*) e);
-        }
-        else if( g == 1) {
-            s(2397439,j);
-            j=j +4;
-        }
-        else {
-            s(232,g-q-5);
-        }
-        if( j)s(50305,j);
-    }
-}
-
-void O (int j) {
-    int e,g,m;
-    if( j--== 1)T(1);
-    else {
-        O (j);
-        m= 0;
-        while( j == C) {
-            g=d;
-            e=z;
-            ad();
-            if( j>8) {
-                m= S(e,m);
-                O (j);
-            }
-            else {
-                ae( 80);
-                O (j);
-                ae( 89);
-                if( j == 4|j == 5) {
-                    Z(e);
-                }
-                else {
-                    ae( e);
-                    if( g == 37)ae( 146);
-                }
-            }
-        }
-        if( m&&j>8) {
-            m= S(e,m);
-            H(e^1);
-            B(5);
-            A(m);
-            H(e);
-        }
-    }
-}
-
-void w() {
-    O(11);
-}
-
-int U() {
-    w();
-    return S(0, 0);
-}
-
-void I (int j) {
-    int m,g,e;
-    if( d == 288) {
-        ad();
-        ad();
-        m= U ();
-        ad();
-        I (j);
-        if( d == 312) {
-            ad();
-            g=B(0);
-            A(m);
-            I (j);
-            A(g);
-        }
-        else {
-            A(m);
-        }
-    }
-    else if( d == 352|d == 504) {
-        e=d;
-        ad();
-        ad();
-        if( e == 352) {
-            g=q;
-            m= U ();
-        }
-        else {
-            if( d!= 59)w ();
-            ad();
-            g=q;
-            m= 0;
-            if( d!= 59)m= U ();
-            ad();
-            if( d!= 41) {
-                e=B(0);
-                w ();
-                B(g-q-5);
-                A(e);
-                g=e +4;
-            }
-        }
-        ad();
-        I(&m);
-        B(g-q-5);
-        A(m);
-    }
-    else if( d == 123) {
-        ad();
-        ab(1);
-        while( d!= 125)I (j);
-        ad();
-    }
-    else {
-        if( d == 448) {
-            ad();
-            if( d!= 59)w ();
-            K=B(K);
-        }
-        else if( d == 400) {
-            ad();
-            *(int*) j=B(*(int*) j);
-        }
-        else if( d!= 59)w ();
-        ad();
-    }
-}
-
-void ab (int j) {
-    int m;
-    while( d == 256|d!=-1&!j) {
-        if( d == 256) {
-            ad();
-            while( d!= 59) {
-                if( j) {
-                    G=G +4;
-                    *(int*) d=-G;
-                }
-                else {
-                    *(int*) d=v;
-                    v=v +4;
-                }
-                ad();
-                if( d == 44)ad()	;
-            }
-            ad();
-        }
-        else {
-            A(*(int*)(d +4));
-            *(int*) d=q;
-            ad();
-            ad();
-            m= 8;
-            while( d!= 41) {
-                *(int*) d=m;
-                m= m +4;
-                ad();
-                if( d == 44)ad();
-            }
-            ad();
-            K=G=0;
-            ae( 15042901);
-            m= s(60545,0);
-            I(0);
-            A(K);
-            ae( 50121);
-            *(int*) m= G;
-        }
-    }
-}
-
-int run(int g, int e) {
-    return (*(int(*)()) *(int*) (P + 592))(g, e);
-}
-
-int main(int g, char** e) {
-    int result;
-    Q = stdin;
-    if (g-- > 1) {
-        Q = fopen(e[1], "r");
-        if (!Q) {
-            fprintf(stderr, "otcc-ansi.c: could not open file %s\n", *(int*) e);
-            return -2;
-        }
-    }
-    D = strcpy(R = calloc(1, 99999), " int if else while break return for define main ") + 48;
-    v = calloc(1, 99999);
-    q = ac = calloc(1, 99999);
-    P = calloc(1, 99999);
-    o();
-    ad();
-    ab(0);
-    if (mprotect(ac & (~ 4095), (99999 + 4095) & (~ 4095), 7)) {
-        printf("Mprotect failed. %d\n", errno);
-        return -1;
-    }
-    fprintf(stderr, "otcc-ansi.c: About to execute compiled code:\n");
-    result = run(g, e);
-    fprintf(stderr, "atcc-ansi.c: result: %d\n", result);
-    return result;
-}
-
diff --git a/tests/data/src/otcc-noinclude.c b/tests/data/src/otcc-noinclude.c
deleted file mode 100644
index 530f9e2..0000000
--- a/tests/data/src/otcc-noinclude.c
+++ /dev/null
@@ -1,446 +0,0 @@
-// #include <stdio.h>
-#define k *(int*)
-#define a if(
-#define c ad()
-#define i else
-#define p while(
-#define x *(char*)
-#define b ==
-#define V =calloc(1,99999)
-#define f ()
-#define J return
-#define l ae(
-#define n e)
-#define u d!=
-#define F int 
-#define y (j)
-#define r m=
-#define t +4
-F d,z,C,h,P,K,ac,q,G,v,Q,R,D,L,W,M;
-E(n{
-x D++=e;
-}
-o f{
-a L){
-h=x L++;
-a h b 2){
-L=0;
-h=W;
-}
-}
-i h=fgetc(Q);
-}
-X f{
-J isalnum(h)|h b 95;
-}
-Y f{
-a h b 92){
-o f;
-a h b 110)h=10;
-}
-}
-c{
-F e,j,m;
-p isspace(h)|h b 35){
-a h b 35){
-o f;
-c;
-a d b 536){
-c;
-E(32);
-k d=1;
-k(d t)=D;
-}
-p h!=10){
-E(h);
-o f;
-}
-E(h);
-E(2);
-}
-o f;
-}
-C=0;
-d=h;
-a X f){
-E(32);
-M=D;
-p X f){
-E(h);
-o f;
-}
-a isdigit(d)){
-z=strtol(M,0,0);
-d=2;
-}
-i{
-x D=32;
-d=strstr(R,M-1)-R;
-x D=0;
-d=d*8+256;
-a d>536){
-d=P+d;
-a k d b 1){
-L=k(d t);
-W=h;
-o f;
-c;
-}
-}
-}
-}
-i{
-o f;
-a d b 39){
-d=2;
-Y f;
-z=h;
-o f;
-o f;
-}
-i a d b 47&h b 42){
-o f;
-p h){
-p h!=42)o f;
-o f;
-a h b 47)h=0;
-}
-o f;
-c;
-}
-i{
-e="++#m--%am*@R<^1c/@%[_[H3c%@%[_[H3c+@.B#d-@%:_^BKd<<Z/03e>>`/03e<=0f>=/f<@.f>@1f==&g!='g&&k||#l&@.BCh^@.BSi|@.B+j~@/%Yd!@&d*@b";
-p j=x e++){
-r x e++;
-z=0;
-p(C=x e++-98)<0)z=z*64+C+64;
-a j b d&(m b h|m b 64)){
-a m b h){
-o f;
-d=1;
-}
-break;
-}
-}
-}
-}
-}
-l g){
-p g&&g!=-1){
-x q++=g;
-g=g>>8;
-}
-}
-A(n{
-F g;
-p n{
-g=k e;
-k e=q-e-4;
-e=g;
-}
-}
-s(g,n{
-l g);
-k q=e;
-e=q;
-q=q t;
-J e;
-}
-H(n{
-s(184,n;
-}
-B(n{
-J s(233,n;
-}
-S(j,n{
-l 1032325);
-J s(132+j,n;
-}
-Z(n{
-l 49465);
-H(0);
-l 15);
-l e+144);
-l 192);
-}
-N(j,n{
-l j+131);
-s((e<512)<<7|5,n;
-}
-T y{
-F g,e,m,aa;
-g=1;
-a d b 34){
-H(v);
-p h!=34){
-Y f;
-x v++=h;
-o f;
-}
-x v=0;
-v=v t&-4;
-o f;
-c;
-}
-i{
-aa=C;
-r z;
-e=d;
-c;
-a e b 2){
-H(m);
-}
-i a aa b 2){
-T(0);
-s(185,0);
-a e b 33)Z(m);
-i l m);
-}
-i a e b 40){
-w f;
-c;
-}
-i a e b 42){
-c;
-e=d;
-c;
-c;
-a d b 42){
-c;
-c;
-c;
-c;
-e=0;
-}
-c;
-T(0);
-a d b 61){
-c;
-l 80);
-w f;
-l 89);
-l 392+(e b 256));
-}
-i a n{
-a e b 256)l 139);
-i l 48655);
-q++;
-}
-}
-i a e b 38){
-N(10,k d);
-c;
-}
-i{
-g=k e;
-a!g)g=dlsym(0,M);
-a d b 61&j){
-c;
-w f;
-N(6,g);
-}
-i a u 40){
-N(8,g);
-a C b 11){
-N(0,g);
-l z);
-c;
-}
-}
-}
-}
-a d b 40){
-a g b 1)l 80);
-r s(60545,0);
-c;
-j=0;
-p u 41){
-w f;
-s(2393225,j);
-a d b 44)c;
-j=j t;
-}
-k r j;
-c;
-a!g){
-e=e t;
-k e=s(232,k n;
-}
-i a g b 1){
-s(2397439,j);
-j=j t;
-}
-i{
-s(232,g-q-5);
-}
-a j)s(50305,j);
-}
-}
-O y{
-F e,g,m;
-a j--b 1)T(1);
-i{
-O y;
-r 0;
-p j b C){
-g=d;
-e=z;
-c;
-a j>8){
-r S(e,m);
-O y;
-}
-i{
-l 80);
-O y;
-l 89);
-a j b 4|j b 5){
-Z(n;
-}
-i{
-l n;
-a g b 37)l 146);
-}
-}
-}
-a m&&j>8){
-r S(e,m);
-H(e^1);
-B(5);
-A(m);
-H(n;
-}
-}
-}
-w f{
-O(11);
-}
-U f{
-w f;
-J S(0,0);
-}
-I y{
-F m,g,e;
-a d b 288){
-c;
-c;
-r U f;
-c;
-I y;
-a d b 312){
-c;
-g=B(0);
-A(m);
-I y;
-A(g);
-}
-i{
-A(m);
-}
-}
-i a d b 352|d b 504){
-e=d;
-c;
-c;
-a e b 352){
-g=q;
-r U f;
-}
-i{
-a u 59)w f;
-c;
-g=q;
-r 0;
-a u 59)r U f;
-c;
-a u 41){
-e=B(0);
-w f;
-B(g-q-5);
-A(n;
-g=e t;
-}
-}
-c;
-I(&m);
-B(g-q-5);
-A(m);
-}
-i a d b 123){
-c;
-ab(1);
-p u 125)I y;
-c;
-}
-i{
-a d b 448){
-c;
-a u 59)w f;
-K=B(K);
-}
-i a d b 400){
-c;
-k j=B(k j);
-}
-i a u 59)w f;
-c;
-}
-}
-ab y{
-F m;
-p d b 256|u-1&!j){
-a d b 256){
-c;
-p u 59){
-a j){
-G=G t;
-k d=-G;
-}
-i{
-k d=v;
-v=v t;
-}
-c;
-a d b 44)c;
-}
-c;
-}
-i{
-A(k(d t));
-k d=q;
-c;
-c;
-r 8;
-p u 41){
-k d=m;
-r m t;
-c;
-a d b 44)c;
-}
-c;
-K=G=0;
-l 15042901);
-r s(60545,0);
-I(0);
-A(K);
-l 50121);
-k r G;
-}
-}
-}
-main(g,n{
-Q=stdin;
-a g-->1){
-e=e t;
-Q=fopen(k e,"r");
-}
-D=strcpy(R V," int if else while break return for define main ")+48;
-v V;
-q=ac V;
-P V;
-o f;
-c;
-ab(0);
-J(*(int(*)f)k(P+592))(g,n;
-}
-
diff --git a/tests/data/src/pointers.c b/tests/data/src/pointers.c
deleted file mode 100644
index 461ebeb..0000000
--- a/tests/data/src/pointers.c
+++ /dev/null
@@ -1,15 +0,0 @@
-int main() {
-    int* pa = (int*) malloc(100);
-    int* pb = pa + 1;
-    int* pc = (int*) 0;
-    *pa = 1;
-    *pb = 2;
-    printf("Pointer difference: %d %d\n", pb - pa, ((int) pb) - ((int) pa));
-    int c = * (pa + 1);
-    printf("Pointer addition: %d\n", c);
-    printf("Pointer comparison to zero: %d %d %d\n", pa == 0, pb == 0, pc == 0);
-    printf("Pointer comparison: %d %d %d %d %d\n", pa < pb, pa == pb, pa > pb, ! pb, ! pc);
-    free(pa);
-    return 0;
-}
-
diff --git a/tests/data/src/pointers2.c b/tests/data/src/pointers2.c
deleted file mode 100644
index 69e402f..0000000
--- a/tests/data/src/pointers2.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// Test multiple levels of indirection
-
-void testsingle() {
-    int a = 0;
-    int* pa = &a;
-    printf("a = %d, *pa = %d\n", a, *pa);
-    *pa = 2;
-    printf("a = %d, *pa = %d\n", a, *pa);
-}
-
-void testdouble() {
-    int a = 0;
-    int* pa = &a;
-    int** ppa = &pa;
-    printf("a = %d, *pa = %d **ppa = %d\n", a, *pa, **ppa);
-    **ppa = 2;
-    printf("a = %d, *pa = %d **ppa = %d\n", a, *pa, **ppa);
-}
-
-void testtripple() {
-    int a = 0;
-    int* pa = &a;
-    int** ppa = &pa;
-    int*** pppa = &ppa;
-    printf("a = %d, *pa = %d **ppa = %d\n ***pppa = %d", a, *pa, **ppa, ***pppa);
-    ***pppa = 2;
-    printf("a = %d, *pa = %d **ppa = %d\n ***pppa = %d", a, *pa, **ppa, ***pppa);
-}
-
-int main() {
-    testsingle();
-    testdouble();
-    testdouble();
-    return 0;
-}
diff --git a/tests/data/src/returnval-ansi.c b/tests/data/src/returnval-ansi.c
deleted file mode 100644
index 6b53fd5..0000000
--- a/tests/data/src/returnval-ansi.c
+++ /dev/null
@@ -1,8 +0,0 @@
-
-int main(int argc, char** argv) {
-  return f();
-}
-
-int f() {
-    return 42;
-}
diff --git a/tests/data/src/rollo3.c b/tests/data/src/rollo3.c
deleted file mode 100644
index b21c12f..0000000
--- a/tests/data/src/rollo3.c
+++ /dev/null
@@ -1,9 +0,0 @@
-
-float fabsf(float);
-
-int main(void* con, int ft, int launchID)
-{
-   float f =  fabsf(-10.0f);
-   return f;
-}
-
diff --git a/tests/data/src/short.c b/tests/data/src/short.c
deleted file mode 100644
index 5e222f3..0000000
--- a/tests/data/src/short.c
+++ /dev/null
@@ -1,6 +0,0 @@
-short a = 3;
-int main() {
-    short* b = &a;
-    *b = *b - 5;
-    return a;
-}
diff --git a/tests/data/src/simplest.c b/tests/data/src/simplest.c
deleted file mode 100644
index bae895a..0000000
--- a/tests/data/src/simplest.c
+++ /dev/null
@@ -1 +0,0 @@
-main() {}
diff --git a/tests/data/src/structs.c b/tests/data/src/structs.c
deleted file mode 100644
index e824a3e..0000000
--- a/tests/data/src/structs.c
+++ /dev/null
@@ -1,90 +0,0 @@
-// struct definition and declaration
-struct a {
-    int a;
-    int b;
-} c;
-
-// Useful anonymous struct declaration
-struct {
-    int y;
-} anon1, anon2;
-
-// forward declarations
-struct a;
-struct b;
-struct c;
-
-struct b {int a; int b; };
-
-// struct c {b g; }; // syntax error.
-
-// struct s {float c,a,b,c;} s; // duplicate struct member
-
-struct c {struct b g; };
-
-// struct a { int w; }; // error
-
-void testCopying() {
-    struct a {int a[10]; char c;} a, b;
-    a.c = 37;
-    b.c = 38;
-    b = a;
-    printf("testCopying: %d == %d\n", a.c, b.c);
-}
-
-void testUnion() {
-    union u;
-    union u {float f;int i;} u;
-    u.f = 1.0f;
-    printf("testUnion: %g == 0x%08x\n", u.f, u.i);
-}
-
-struct v {float x, y, z, w; };
-
-void add(struct v* result, struct v* a, struct v* b) {
-    result->x = a->x + b->x;
-    result->y = a->y + b->y;
-    result->z = a->z + b->z;
-    result->w = a->w + b->w;
-}
-
-void set(struct v* v, float x, float y, float z, float w) {
-    v->x = x;
-    v->y = y;
-    v->z = z;
-    v->w = w;
-}
-
-void print(struct v* v) {
-    printf("(%g, %g, %g, %g)\n", v->x, v->y, v->z, v->w);
-}
-
-void testArgs() {
-    struct v a, b, c;
-    set(&a, 1.0f, 2.0f, 3.0f, 4.0f);
-    set(&b, 5.0f, 6.0f, 7.0f, 8.0f);
-    add(&c, &a, &b);
-    printf("testArgs: ");
-    print(&c);
-}
-
-int main() {
-    anon1.y = 3;
-    anon2.y = anon1.y;
-
-    testCopying();
-    testUnion();
-    testArgs();
-
-    struct c cc;
-    cc.g.a = 3;
-    c.a = 1;
-    c.b = 3;
-    struct a {int x, y; } z;
-    // struct a {int x, y; } z2;
-    z.x = c.a;
-    struct a *pA;
-    pA = &z;
-    pA->x += 5;
-    return pA->x;
-}
diff --git a/tests/data/src/testStringConcat.c b/tests/data/src/testStringConcat.c
deleted file mode 100644
index bf06ae1..0000000
--- a/tests/data/src/testStringConcat.c
+++ /dev/null
@@ -1,4 +0,0 @@
-int main() {
-    return printf("Hello" "," " world\n");
-}
-
diff --git a/tests/data/structs.bc b/tests/data/structs.bc
deleted file mode 100644
index 1534ee2..0000000
--- a/tests/data/structs.bc
+++ /dev/null
Binary files differ
diff --git a/tests/data/testStringConcat.bc b/tests/data/testStringConcat.bc
deleted file mode 100644
index 73385b1..0000000
--- a/tests/data/testStringConcat.bc
+++ /dev/null
Binary files differ
diff --git a/tests/test b/tests/test
deleted file mode 100755
index 8fd6916..0000000
--- a/tests/test
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-SCRIPT_DIR=`dirname $BASH_SOURCE`
-cd $SCRIPT_DIR
-python test.py "$@"
-
diff --git a/tests/test.py b/tests/test.py
deleted file mode 100644
index 92aeec1..0000000
--- a/tests/test.py
+++ /dev/null
@@ -1,510 +0,0 @@
-#
-# Copyright (C) 2010 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Test the bcc compiler
-
-import unittest
-import subprocess
-import os
-import sys
-
-gArmInitialized = False
-gUseArm = True
-gUseX86 = True
-gRunOTCCOutput = True
-
-
-def parseArgv():
-    global gUseArm
-    global gUseX86
-    global gRunOTCCOutput
-    for arg in sys.argv[1:]:
-        if arg == "--noarm":
-            print "--noarm: not testing ARM"
-            gUseArm = False
-        elif arg == "--nox86":
-            print "--nox86: not testing x86"
-            gUseX86 = False
-        elif arg == "--norunotcc":
-            print "--norunotcc detected, not running OTCC output"
-            gRunOTCCOutput = False
-        else:
-            print "Unknown parameter: ", arg
-            raise "Unknown parameter"
-    sys.argv = sys.argv[0:1]
-
-def compile(args):
-    proc = subprocess.Popen(["../libbcc_driver"] + args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    result = proc.communicate()
-    return result
-
-def runCmd(args):
-    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    result = proc.communicate()
-    return result[0].strip()
-
-def uname():
-    return runCmd(["uname"])
-
-def unameM():
-    return runCmd(["uname", "-m"])
-
-def which(item):
-    return runCmd(["which", item])
-
-def fileType(item):
-    return runCmd(["file", item])
-
-def outputCanRun():
-    ft = fileType(which("bcc"))
-    return ft.find("ELF 32-bit LSB executable, Intel 80386") >= 0
-
-def checkEnvironment():
-    global gRunOTCCOutput
-    gRunOTCCOutput = uname() == "Linux" and unameM() != "x86_64" and outputCanRun()
-
-def adb(args):
-    return runCmd(["adb"] + args)
-
-def setupArm():
-    global gArmInitialized
-    if gArmInitialized:
-        return
-    print "Setting up arm"
-    adb(["remount"])
-    adb(["shell", "rm", "/system/bin/bcc"])
-    adb(["shell", "mkdir", "/system/bin/bccdata"])
-    adb(["shell", "mkdir", "/system/bin/bccdata/data"])
-    # Clear out old data TODO: handle recursion
-    adb(["shell", "rm", "/system/bin/bccdata/data/*"])
-    # Copy over data
-    for root, dirs, files in os.walk("data"):
-        for d in dirs:
-            adb(["shell", "mkdir", os.path.join(root, d)])
-        for f in files:
-            adb(["push", os.path.join(root, f), os.path.join("/system/bin/bccdata", root, f)])
-    # Copy over compiler
-    adb(["sync"])
-    gArmInitialized = True
-
-def compileArm(args):
-    setupArm()
-    proc = subprocess.Popen(["adb", "shell", "/system/bin/bcc"] + args, stdout=subprocess.PIPE)
-    result = proc.communicate()
-    return result[0].replace("\r","")
-
-def compare(a, b):
-    if a != b:
-        firstDiff = firstDifference(a, b)
-        print "Strings differ at character %d. Common: %s. Difference '%s' != '%s'" % (
-            firstDiff, a[0:firstDiff], safeAccess(a, firstDiff), safeAccess(b, firstDiff))
-
-def safeAccess(s, i):
-    if 0 <= i < len(s):
-        return s[i]
-    else:
-        return '?'
-
-def firstDifference(a, b):
-    commonLen = min(len(a), len(b))
-    for i in xrange(0, commonLen):
-        if a[i] != b[i]:
-            return i
-    return commonLen
-
-# a1 and a2 are the expected stdout and stderr.
-# b1 and b2 are the actual stdout and stderr.
-# Compare the two, sets. Allow any individual line
-# to appear in either stdout or stderr. This is because
-# the way we obtain output on the ARM combines both
-# streams into one sequence.
-
-def compareOuput(a1,a2,b1,b2):
-    while True:
-        totalLen = len(a1) + len(a2) + len(b1) + len(b2)
-        a1, b1 = matchCommon(a1, b1)
-        a1, b2 = matchCommon(a1, b2)
-        a2, b1 = matchCommon(a2, b1)
-        a2, b2 = matchCommon(a2, b2)
-        newTotalLen = len(a1) + len(a2) + len(b1) + len(b2)
-        if newTotalLen == 0:
-            return True
-        if newTotalLen == totalLen:
-            print "Failed at %d %d %d %d" % (len(a1), len(a2), len(b1), len(b2))
-            print "a1", a1
-            print "a2", a2
-            print "b1", b1
-            print "b2", b2
-            return False
-
-def matchCommon(a, b):
-    """Remove common items from the beginning of a and b,
-       return just the tails that are different."""
-    while len(a) > 0 and len(b) > 0 and a[0] == b[0]:
-        a = a[1:]
-        b = b[1:]
-    return a, b
-
-def rewritePaths(args):
-    return [rewritePath(x) for x in args]
-
-def rewritePath(p):
-    """Take a path that's correct on the x86 and convert to a path
-       that's correct on ARM."""
-    if p.startswith("data/"):
-        p = "/system/bin/bccdata/" + p
-    return p
-
-class TestACC(unittest.TestCase):
-
-    def checkResult(self, out, err, stdErrResult, stdOutResult=""):
-        a1 = out.splitlines()
-        a2 = err.splitlines()
-        b2 = stdErrResult.splitlines()
-        b1 = stdOutResult.splitlines()
-        self.assertEqual(True, compareOuput(a1,a2,b1,b2))
-
-    def compileCheck(self, args, stdErrResult, stdOutResult="",
-                     targets=['arm', 'x86']):
-        global gUseArm
-        global gUseX86
-        targetSet = frozenset(targets)
-        if gUseX86 and 'x86' in targetSet:
-            print args
-            out, err = compile(args)
-            self.checkResult(out, err, stdErrResult, stdOutResult)
-        if gUseArm and 'arm' in targetSet:
-            out = compileArm(rewritePaths(args))
-            self.checkResult(out, "", stdErrResult, stdOutResult)
-
-    def compileCheckArm(self, args, result):
-        self.assertEqual(compileArm(args), result)
-
-    def testCompileReturnVal(self):
-        self.compileCheck(["data/returnval-ansi.bc"], "")
-
-    def testCompileOTCCANSII(self):
-        self.compileCheck(["data/otcc-ansi.bc"], "", "", ['x86'])
-
-    def testRunReturnVal(self):
-        self.compileCheck(["-c -R", "data/returnval-ansi.bc"],
-        "Executing compiled code:\nresult: 42\n")
-
-    def testStringLiteralConcatenation(self):
-        self.compileCheck(["-c -R", "data/testStringConcat.bc"],
-        "Executing compiled code:\nresult: 13\n", "Hello, world\n")
-
-    def testRunOTCCANSI(self):
-        global gRunOTCCOutput
-        if gRunOTCCOutput:
-            self.compileCheck(["-c -R", "data/otcc-ansi.bc", "data/returnval.c"],
-                "Executing compiled code:\notcc-ansi.c: About to execute compiled code:\natcc-ansi.c: result: 42\nresult: 42\n", "",
-                 ['x86'])
-
-    def testRunOTCCANSI2(self):
-        global gRunOTCCOutput
-        if gRunOTCCOutput:
-            self.compileCheck(["-c -R", "data/otcc-ansi.bc", "data/otcc.c", "data/returnval.c"],
-                "Executing compiled code:\notcc-ansi.c: About to execute compiled code:\notcc.c: about to execute compiled code.\natcc-ansi.c: result: 42\nresult: 42\n", "",['x86'])
-
-    def testRunConstants(self):
-        self.compileCheck(["-c -R", "data/constants.bc"],
-            "Executing compiled code:\nresult: 0\n",
-            "0 = 0\n010 = 8\n0x10 = 16\n'\\a' = 7\n'\\b' = 8\n'\\f' = 12\n'\\n' = 10\n'\\r' = 13\n'\\t' = 9\n'\\v' = 11\n'\\\\' = 92\n'\\'' = 39\n" +
-            "'\\\"' = 34\n'\\?' = 63\n'\\0' = 0\n'\\1' = 1\n'\\12' = 10\n'\\123' = 83\n'\\x0' = 0\n'\\x1' = 1\n'\\x12' = 18\n'\\x123' = 35\n'\\x1f' = 31\n'\\x1F' = 31\n")
-
-    def testRunFloat(self):
-        self.compileCheck(["-c -R", "data/float.bc"],
-            "Executing compiled code:\nresult: 0\n",
-            """Constants: 0 0 0 0.01 0.01 0.1 10 10 0.1
-int: 1 float: 2.2 double: 3.3
- ftoi(1.4f)=1
- dtoi(2.4)=2
- itof(3)=3
- itod(4)=4
-globals: 1 2 3 4
-args: 1 2 3 4
-locals: 1 2 3 4
-cast rval: 2 4
-cast lval: 1.1 2 3.3 4
-""")
-
-    def testRunFlops(self):
-        self.compileCheck(["-c -R", "data/flops.bc"],
-            """Executing compiled code:
-result: 0""",
-"""-1.1 = -1.1
-!1.2 = 0
-!0 = 1
-double op double:
-1 + 2 = 3
-1 - 2 = -1
-1 * 2 = 2
-1 / 2 = 0.5
-float op float:
-1 + 2 = 3
-1 - 2 = -1
-1 * 2 = 2
-1 / 2 = 0.5
-double op float:
-1 + 2 = 3
-1 - 2 = -1
-1 * 2 = 2
-1 / 2 = 0.5
-double op int:
-1 + 2 = 3
-1 - 2 = -1
-1 * 2 = 2
-1 / 2 = 0.5
-int op double:
-1 + 2 = 3
-1 - 2 = -1
-1 * 2 = 2
-1 / 2 = 0.5
-double op double:
-1 op 2: < 1   <= 1   == 0   >= 0   > 0   != 1
-1 op 1: < 0   <= 1   == 1   >= 1   > 0   != 0
-2 op 1: < 0   <= 0   == 0   >= 1   > 1   != 1
-double op float:
-1 op 2: < 1   <= 1   == 0   >= 0   > 0   != 1
-1 op 1: < 0   <= 1   == 1   >= 1   > 0   != 0
-2 op 1: < 0   <= 0   == 0   >= 1   > 1   != 1
-float op float:
-1 op 2: < 1   <= 1   == 0   >= 0   > 0   != 1
-1 op 1: < 0   <= 1   == 1   >= 1   > 0   != 0
-2 op 1: < 0   <= 0   == 0   >= 1   > 1   != 1
-int op double:
-1 op 2: < 1   <= 1   == 0   >= 0   > 0   != 1
-1 op 1: < 0   <= 1   == 1   >= 1   > 0   != 0
-2 op 1: < 0   <= 0   == 0   >= 1   > 1   != 1
-double op int:
-1 op 2: < 1   <= 1   == 0   >= 0   > 0   != 1
-1 op 1: < 0   <= 1   == 1   >= 1   > 0   != 0
-2 op 1: < 0   <= 0   == 0   >= 1   > 1   != 1
-branching: 1 0 1
-testpassi: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassf: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassd: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassi: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassf: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassd: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassi: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassf: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassd: 1 2 3 4 5 6 7 8 9 10 11 12
-testpassidf: 1 2 3
-""")
-    def testCasts(self):
-        self.compileCheck(["-c -R", "data/casts.bc"],
-            """Executing compiled code:
-result: 0""", """Reading from a pointer: 3 3
-Writing to a pointer: 4
-Testing casts: 3 3 4.5 4
-Testing reading (int*): 4
-Testing writing (int*): 8 9
-Testing reading (char*): 0x78 0x56 0x34 0x12
-Testing writing (char*): 0x87654321
-f(10)
-Function pointer result: 70
-Testing read/write (float*): 8.8 9.9
-Testing read/write (double*): 8.8 9.9
-""")
-
-    def testChar(self):
-        self.compileCheck(["-c -R", "data/char.bc"], """Executing compiled code:
-result: 0""", """a = 99, b = 41
-ga = 100, gb = 44""")
-
-    def testPointerArithmetic(self):
-        self.compileCheck(["-c -R", "data/pointers.bc"], """Executing compiled code:
-result: 0""", """Pointer difference: 1 4
-Pointer addition: 2
-Pointer comparison to zero: 0 0 1
-Pointer comparison: 1 0 0 0 1
-""")
-    def testRollo3(self):
-        self.compileCheck(["-c -R", "data/rollo3.bc"], """Executing compiled code:
-result: 10""", """""")
-
-    def testFloatDouble(self):
-        self.compileCheck(["-c -R", "data/floatdouble.bc"], """Executing compiled code:
-result: 0""", """0.002 0.1 10""")
-
-    def testIncDec(self):
-        self.compileCheck(["-c -R", "data/inc.bc"], """Executing compiled code:
-0
-1
-2
-1
-1
-2
-1
-0
-result: 0
-""","""""")
-
-    def testIops(self):
-        self.compileCheck(["-c -R", "data/iops.bc"], """Executing compiled code:
-result: 0""", """Literals: 1 -1
-++
-0
-1
-2
-3
-4
-5
-6
-7
-8
-9
---
-10
-9
-8
-7
-6
-5
-4
-3
-2
-1
-0
-""")
-
-    def testFilm(self):
-        self.compileCheck(["-c -R", "data/film.bc"], """Executing compiled code:
-result: 0""", """testing...
-Total bad: 0
-""")
-
-    def testpointers2(self):
-        self.compileCheck(["-c -R", "data/pointers2.bc"], """Executing compiled code:
-result: 0""", """a = 0, *pa = 0
-a = 2, *pa = 2
-a = 0, *pa = 0 **ppa = 0
-a = 2, *pa = 2 **ppa = 2
-a = 0, *pa = 0 **ppa = 0
-a = 2, *pa = 2 **ppa = 2
-""")
-
-    def testassignmentop(self):
-        self.compileCheck(["-c -R", "data/assignmentop.bc"], """Executing compiled code:
-result: 0""", """2 *= 5  10
-20 /= 5  4
-17 %= 5  2
-17 += 5  22
-17 -= 5  12
-17<<= 1  34
-17>>= 1  8
-17&= 1  1
-17^= 1  16
-16|= 1  17
-*f() = *f() + 10;
-f()
-f()
-a = 10
-*f() += 10;
-f()
-a = 10
-""")
-
-    def testcomma(self):
-        self.compileCheck(["-c -R", "data/comma.bc"], """Executing compiled code:
-result: 0""", """statement: 10
-if: a = 0
-while: b = 11
-for: b = 22
-return: 30
-arg: 12
-""")
-
-    def testBrackets(self):
-        self.compileCheck(["-c -R", "data/brackets.bc"], """Executing compiled code:
-Errors: 0
-2D Errors: 0
-result: 0
-""","""""")
-
-    def testShort(self):
-        self.compileCheck(["-c -R", "data/short.bc"], """Executing compiled code:
-result: -2
-""","""""")
-
-    def testAssignment(self):
-        self.compileCheck(["-c -R", "data/assignment.bc"], """Executing compiled code:
-result: 7
-""","""""")
-
-    def testArray(self):
-        self.compileCheck(["-c -R", "data/array.bc"], """Executing compiled code:
-localInt: 3
-localDouble: 3 3
-globalChar: 3
-globalDouble: 3
-testArgs: 0 2 4
-testDecay: Hi!
-test2D:
-abcdefghijklmnopabcd
-defghijklmnopabcdefg
-ghijklmnopabcdefghij
-jklmnopabcdefghijklm
-mnopabcdefghijklmnop
-pabcdefghijklmnopabc
-cdefghijklmnopabcdef
-fghijklmnopabcdefghi
-ijklmnopabcdefghijkl
-lmnopabcdefghijklmno
-result: 0
-""","""""")
-
-    def testDefines(self):
-        self.compileCheck(["-c -R", "data/defines.bc"], """Executing compiled code:
-result: 3
-""","""""")
-
-    def testFuncArgs(self):
-        self.compileCheck(["-c -R", "data/funcargs.bc"], """Executing compiled code:
-result: 4
-""","""""")
-
-    def testB2071670(self):
-        self.compileCheck(["-c -R", "data/b2071670.bc"], """Executing compiled code:
-result: 1092616192
-""","""""")
-
-    def testStructs(self):
-        self.compileCheck(["-c -R", "data/structs.bc"], """Executing compiled code:
-testCopying: 37 == 37
-testUnion: 1 == 0x3f800000
-testArgs: (6, 8, 10, 12)
-result: 6
-""","""""")
-
-    def testAddressOf(self):
-        self.compileCheck(["-c -R", "data/addressOf.bc"], """Executing compiled code:
-testStruct: 10 10 10
-testArray: 1 1 1
-result: 0
-""","""""")
-
-def main():
-    checkEnvironment()
-    parseArgv()
-    unittest.main()
-
-if __name__ == '__main__':
-    main()
-
diff --git a/tools/bcc/Android.mk b/tools/bcc/Android.mk
index e005ffc..4d6d462 100644
--- a/tools/bcc/Android.mk
+++ b/tools/bcc/Android.mk
@@ -23,8 +23,7 @@
 LOCAL_MODULE := bcc
 LOCAL_MODULE_CLASS := EXECUTABLES
 
-LOCAL_SRC_FILES := \
-  main.cpp
+LOCAL_SRC_FILES := Main.cpp
 
 LOCAL_SHARED_LIBRARIES := \
   libbcc \
@@ -35,7 +34,6 @@
   $(LOCAL_PATH)/../../include
 
 LOCAL_LDLIBS = -ldl
-LOCAL_SRC_FILES := Main.cpp
 
 include $(LIBBCC_HOST_BUILD_MK)
 include $(LIBBCC_GEN_CONFIG_MK)
diff --git a/tools/bcc/Main.cpp b/tools/bcc/Main.cpp
index 0d2cff4..0a79338 100644
--- a/tools/bcc/Main.cpp
+++ b/tools/bcc/Main.cpp
@@ -24,7 +24,7 @@
 #include <llvm/Config/config.h>
 #include <llvm/Support/CommandLine.h>
 #include <llvm/Support/FileSystem.h>
-#include <llvm/Support/Path.h>
+#include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Support/system_error.h>
 
@@ -36,6 +36,7 @@
 #include <bcc/ExecutionEngine/ObjectLoader.h>
 #include <bcc/ExecutionEngine/SymbolResolverProxy.h>
 #include <bcc/ExecutionEngine/SymbolResolvers.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
 #include <bcc/Script.h>
 #include <bcc/Source.h>
 #include <bcc/Support/CompilerConfig.h>
@@ -51,13 +52,27 @@
 //===----------------------------------------------------------------------===//
 namespace {
 
-llvm::cl::list<std::string>
-OptInputFilenames(llvm::cl::Positional, llvm::cl::OneOrMore,
-                  llvm::cl::desc("<input bitcode files>"));
+llvm::cl::opt<std::string>
+OptInputFilename(llvm::cl::Positional, llvm::cl::ValueRequired,
+                 llvm::cl::desc("<input bitcode file>"));
 
 llvm::cl::opt<std::string>
 OptOutputFilename("o", llvm::cl::desc("Specify the output filename"),
-                  llvm::cl::value_desc("filename"));
+                  llvm::cl::value_desc("filename"),
+                  llvm::cl::init("bcc_output"));
+
+llvm::cl::opt<std::string>
+OptBCLibFilename("bclib", llvm::cl::desc("Specify the bclib filename"),
+                 llvm::cl::value_desc("bclib"));
+
+llvm::cl::opt<std::string>
+OptOutputPath("output_path", llvm::cl::desc("Specify the output path"),
+              llvm::cl::value_desc("output path"),
+              llvm::cl::init("."));
+
+llvm::cl::opt<bool>
+OptEmitLLVM("emit-llvm",
+            llvm::cl::desc("Emit an LLVM-IR version of the generated program"));
 
 #ifdef TARGET_BUILD
 const std::string OptTargetTriple(DEFAULT_TARGET_TRIPLE_STRING);
@@ -77,56 +92,12 @@
 //===----------------------------------------------------------------------===//
 // Compiler Options
 //===----------------------------------------------------------------------===//
-llvm::cl::opt<bool>
-OptPIC("fPIC", llvm::cl::desc("Generate fully relocatable, position independent"
-                              " code"));
 
+// RenderScript uses -O3 by default
 llvm::cl::opt<char>
 OptOptLevel("O", llvm::cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
-                                "(default: -O2)"),
-            llvm::cl::Prefix, llvm::cl::ZeroOrMore, llvm::cl::init('2'));
-
-llvm::cl::opt<bool>
-OptC("c", llvm::cl::desc("Compile and assemble, but do not link."));
-
-//===----------------------------------------------------------------------===//
-// Linker Options
-//===----------------------------------------------------------------------===//
-// FIXME: this option will be removed in the future when MCLinker is capable
-//        of generating shared library directly from given bitcode. It only
-//        takes effect when -shared is supplied.
-llvm::cl::opt<std::string>
-OptImmObjectOutput("or", llvm::cl::desc("Specify the filename for output the "
-                                        "intermediate relocatable when linking "
-                                        "the input bitcode to the shared "
-                                        "library"), llvm::cl::ValueRequired);
-
-llvm::cl::opt<bool>
-OptShared("shared", llvm::cl::desc("Create a shared library from input bitcode "
-                                   "files"));
-
-
-//===----------------------------------------------------------------------===//
-// Loader Options
-//===----------------------------------------------------------------------===//
-llvm::cl::opt<bool>
-OptRunEntry("R", llvm::cl::desc("Run the entry method after successfully load "
-                                "and compile."));
-
-llvm::cl::opt<std::string>
-OptEntryFunction("entry-function", llvm::cl::desc("Specify the entry function "
-                                                  "for -R (default: main)"),
-                 llvm::cl::value_desc("function"), llvm::cl::init("main"));
-
-llvm::cl::opt<bool>
-OptEnableGDB("enable-gdb", llvm::cl::desc("Enable GDB JIT debugging when "
-                                          "runs the entry method"));
-
-llvm::cl::list<std::string>
-OptRuntimeLibs("load", llvm::cl::desc("Specify the shared libraries for "
-                                      "execution (e.g., -load=c will search "
-                                      "and load libc.so for execution)"),
-               llvm::cl::ZeroOrMore, llvm::cl::value_desc("namespec"));
+                                "(default: -O3)"),
+            llvm::cl::Prefix, llvm::cl::ZeroOrMore, llvm::cl::init('3'));
 
 // Override "bcc -version" since the LLVM version information is not correct on
 // Android build.
@@ -148,42 +119,8 @@
 } // end anonymous namespace
 
 static inline
-Script *PrepareScript(BCCContext &pContext,
-                      const llvm::cl::list<std::string> &pBitcodeFiles) {
-  Script *result = NULL;
-
-  for (unsigned i = 0; i < pBitcodeFiles.size(); i++) {
-    const std::string &input_bitcode = pBitcodeFiles[i];
-    Source *source = Source::CreateFromFile(pContext, input_bitcode);
-    if (source == NULL) {
-      llvm::errs() << "Failed to load llvm module from file `" << input_bitcode
-                   << "'!\n";
-      return NULL;
-    }
-
-    if (result != NULL) {
-      if (!result->mergeSource(*source, /* pPreserveSource */false)) {
-        llvm::errs() << "Failed to merge the llvm module `" << input_bitcode
-                     << "' to compile!\n";
-        delete source;
-        return NULL;
-      }
-    } else {
-      result = new (std::nothrow) Script(*source);
-      if (result == NULL) {
-        llvm::errs() << "Out of memory when create script for file `"
-                     << input_bitcode << "'!\n";
-        delete source;
-        return NULL;
-      }
-    }
-  }
-
-  return result;
-}
-
-static inline
-bool ConfigCompiler(Compiler &pCompiler) {
+bool ConfigCompiler(RSCompilerDriver &pRSCD) {
+  RSCompiler *RSC = pRSCD.getCompiler();
   CompilerConfig *config = NULL;
 
 #ifdef TARGET_BUILD
@@ -196,24 +133,19 @@
     return false;
   }
 
-  // Setup the config according to the valud of command line option.
-  if (OptPIC) {
-    config->setRelocationModel(llvm::Reloc::PIC_);
-  }
   switch (OptOptLevel) {
     case '0': config->setOptimizationLevel(llvm::CodeGenOpt::None); break;
     case '1': config->setOptimizationLevel(llvm::CodeGenOpt::Less); break;
-    case '3': config->setOptimizationLevel(llvm::CodeGenOpt::Aggressive); break;
-    case '2':
+    case '2': config->setOptimizationLevel(llvm::CodeGenOpt::Default); break;
+    case '3':
     default: {
-      config->setOptimizationLevel(llvm::CodeGenOpt::Default);
+      config->setOptimizationLevel(llvm::CodeGenOpt::Aggressive);
       break;
     }
   }
 
-  Compiler::ErrorCode result = pCompiler.config(*config);
-
-  delete config;
+  pRSCD.setConfig(config);
+  Compiler::ErrorCode result = RSC->config(*config);
 
   if (result != Compiler::kSuccess) {
     llvm::errs() << "Failed to configure the compiler! (detail: "
@@ -224,43 +156,6 @@
   return true;
 }
 
-#define DEFAULT_OUTPUT_PATH   "/sdcard/a.out"
-static inline
-std::string DetermineOutputFilename(const std::string &pOutputPath) {
-  if (!pOutputPath.empty()) {
-    return pOutputPath;
-  }
-
-  // User doesn't specify the value to -o.
-  if (OptInputFilenames.size() > 1) {
-    llvm::errs() << "Use " DEFAULT_OUTPUT_PATH " for output file!\n";
-    return DEFAULT_OUTPUT_PATH;
-  }
-
-  // There's only one input bitcode file.
-  const std::string &input_path = OptInputFilenames[0];
-  llvm::SmallString<200> output_path(input_path);
-
-  llvm::error_code err = llvm::sys::fs::make_absolute(output_path);
-  if (err != llvm::errc::success) {
-    llvm::errs() << "Failed to determine the absolute path of `" << input_path
-                 << "'! (detail: " << err.message() << ")\n";
-    return "";
-  }
-
-  if (OptC) {
-    // -c was specified. Replace the extension to .o.
-    llvm::sys::path::replace_extension(output_path, "o");
-  } else {
-    // Use a.out under current working directory when compile executable or
-    // shared library.
-    llvm::sys::path::remove_filename(output_path);
-    llvm::sys::path::append(output_path, "a.out");
-  }
-
-  return output_path.c_str();
-}
-
 static inline
 bool CompileScript(Compiler &pCompiler, Script &pScript,
                    const std::string &pOutputPath) {
@@ -284,150 +179,38 @@
   return true;
 }
 
-static inline
-bool PrepareRuntimes(std::vector<SymbolResolverInterface *> &pRuntimes) {
-  llvm::SmallVector<const char *, 2> search_paths;
-
-#ifdef TARGET_BUILD
-  search_paths.push_back("/system/lib/");
-#else
-  search_paths.push_back("/lib/");
-  search_paths.push_back("/usr/lib/");
-#endif
-
-  // Most of the following lines comes from llvm/tools/llvm-ld.cpp.
-  for (unsigned i = 0; i < OptRuntimeLibs.size(); i++) {
-    const std::string &lib = OptRuntimeLibs[i];
-    llvm::sys::Path lib_path;
-    for (llvm::SmallVectorImpl<const char *>::const_iterator
-             search_path_iter = search_paths.begin(),
-             search_path_end = search_paths.end();
-         search_path_iter != search_path_end; search_path_iter++) {
-
-      lib_path = *search_path_iter;
-      lib_path.appendComponent("lib" + lib);
-      lib_path.appendSuffix(llvm::sys::Path::GetDLLSuffix());
-
-      if (lib_path.isEmpty()) {
-        if (!lib_path.isDynamicLibrary()) {
-          lib_path = llvm::sys::Path();
-        } else {
-          break;
-        }
-      }
-    } // for each search_paths
-    if (lib_path.isEmpty()) {
-      // FIXME: llvm::sys::Path::FindLibrary(...) is able to consume
-      //        'const std::string &' instead of 'std::string &'.
-      std::string lib_tmp = lib;
-      lib_path = llvm::sys::Path::FindLibrary(lib_tmp);
-    }
-    if (lib_path.isEmpty()) {
-      llvm::errs() << "Unable to find `lib" << lib << "' for execution!\n";
-      llvm::DeleteContainerPointers(pRuntimes);
-      return false;
-    } else {
-      DyldSymbolResolver *dyld_resolver =
-          new (std::nothrow) DyldSymbolResolver(lib_path.str().c_str());
-
-      if (dyld_resolver != NULL) {
-        pRuntimes.push_back(dyld_resolver);
-      } else {
-        llvm::errs() << "Out of memory when load `" << lib_path.str() << "'!\n";
-        llvm::DeleteContainerPointers(pRuntimes);
-        return false;
-      }
-    }
-  } // for each OptRuntimeLibs
-
-  return true;
-}
-
-static inline
-bool LoadAndRun(const std::string &pOutputExecutable) {
-  SymbolResolverProxy runtime_resolver;
-
-  // Include compiler runtime.
-  CompilerRTSymbolResolver compiler_runtimes;
-  runtime_resolver.chainResolver(compiler_runtimes);
-
-  // Open the output file for execution.
-  InputFile input_exec(pOutputExecutable);
-  if (input_exec.hasError()) {
-    llvm::errs() << "Failed to open the executable `" << pOutputExecutable
-                 << "'! (detail: " << input_exec.getErrorMessage() << ")\n";
-    return false;
-  }
-
-  // Load the runtime libraries given in command line.
-  std::vector<SymbolResolverInterface *> lib_runtimes;
-  if (!PrepareRuntimes(lib_runtimes)) {
-    return false;
-  }
-
-  for (std::vector<SymbolResolverInterface *>::const_iterator
-           librt_iter = lib_runtimes.begin(), librt_end = lib_runtimes.end();
-       librt_iter != librt_end; librt_iter++) {
-    runtime_resolver.chainResolver(*(*librt_iter));
-  }
-
-  // Load the output file.
-  ObjectLoader *loader = ObjectLoader::Load(input_exec, runtime_resolver,
-                                            OptEnableGDB);
-  if (loader == NULL) {
-    llvm::errs() << "Failed to load `" << pOutputExecutable << "'!\n";
-    llvm::DeleteContainerPointers(lib_runtimes);
-    return false;
-  }
-
-  // Retrieve the address of entry function.
-  void *entry = loader->getSymbolAddress(OptEntryFunction.c_str());
-  if (entry == NULL) {
-    llvm::errs() << "Couldn't find entry method `" << OptEntryFunction
-                 << "' in " << pOutputExecutable << "' for execution!\n";
-    delete loader;
-    llvm::DeleteContainerPointers(lib_runtimes);
-    return false;
-  }
-
-  // Execute the entry function.
-  int run_result = reinterpret_cast<int (*)()>(entry)();
-  llvm::errs() << "result: " << run_result << "\n";
-
-  // Clean up.
-  delete loader;
-  llvm::DeleteContainerPointers(lib_runtimes);
-
-  return true;
-}
-
 int main(int argc, char **argv) {
   llvm::cl::SetVersionPrinter(BCCVersionPrinter);
   llvm::cl::ParseCommandLineOptions(argc, argv);
   init::Initialize();
 
   BCCContext context;
-  Compiler compiler;
+  RSCompilerDriver RSCD;
 
-  Script *script = PrepareScript(context, OptInputFilenames);
-  if (script == NULL) {
+  llvm::OwningPtr<llvm::MemoryBuffer> input_data;
+
+  llvm::error_code ec =
+      llvm::MemoryBuffer::getFile(OptInputFilename.c_str(), input_data);
+  if (ec != llvm::error_code::success()) {
+    ALOGE("Failed to load bitcode from path %s! (%s)",
+          OptInputFilename.c_str(), ec.message().c_str());
     return EXIT_FAILURE;
   }
 
-  if (!ConfigCompiler(compiler)) {
+  llvm::MemoryBuffer *input_memory = input_data.take();
+
+  const char *bitcode = input_memory->getBufferStart();
+  size_t bitcodeSize = input_memory->getBufferSize();
+
+  if (!ConfigCompiler(RSCD)) {
+    ALOGE("Failed to configure compiler");
     return EXIT_FAILURE;
   }
+  bool built = RSCD.build(context, OptOutputPath.c_str(),
+      OptOutputFilename.c_str(), bitcode, bitcodeSize,
+      OptBCLibFilename.c_str(), NULL, OptEmitLLVM);
 
-  std::string OutputFilename = DetermineOutputFilename(OptOutputFilename);
-  if (OutputFilename.empty()) {
-    return EXIT_FAILURE;
-  }
-
-  if (!CompileScript(compiler, *script, OutputFilename)) {
-    return EXIT_FAILURE;
-  }
-
-  if (OptRunEntry && !LoadAndRun(OutputFilename)) {
+  if (!built) {
     return EXIT_FAILURE;
   }
 
diff --git a/tools/bcc_compat/Main.cpp b/tools/bcc_compat/Main.cpp
index 4399893..1391de8 100644
--- a/tools/bcc_compat/Main.cpp
+++ b/tools/bcc_compat/Main.cpp
@@ -279,7 +279,10 @@
 
   RSScript *s = NULL;
   s = PrepareRSScript(context, OptInputFilenames);
-  rscd.build(*s, OutputFilename.c_str(), OptRuntimePath.c_str());
+  if (!rscd.build(*s, OutputFilename.c_str(), OptRuntimePath.c_str())) {
+    fprintf(stderr, "Failed to compile script!");
+    return EXIT_FAILURE;
+  }
 
   return EXIT_SUCCESS;
 }
diff --git a/tools/bcc_strip_attr/bcc_strip_attr.cpp b/tools/bcc_strip_attr/bcc_strip_attr.cpp
index 0c1d9cb..19fa9d1 100644
--- a/tools/bcc_strip_attr/bcc_strip_attr.cpp
+++ b/tools/bcc_strip_attr/bcc_strip_attr.cpp
@@ -18,13 +18,14 @@
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/IRReader.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 using namespace llvm;
@@ -90,18 +91,11 @@
 static inline std::auto_ptr<Module> LoadFile(const char *argv0,
                                              const std::string &FN,
                                              LLVMContext& Context) {
-  sys::Path Filename;
-  if (!Filename.set(FN)) {
-    errs() << "Invalid file name: '" << FN << "'\n";
-    return std::auto_ptr<Module>();
-  }
-
   SMDiagnostic Err;
-  Module* Result = 0;
-
-  const std::string &FNStr = Filename.str();
-  Result = ParseIRFile(FNStr, Err, Context);
-  if (Result) return std::auto_ptr<Module>(Result);   // Load successful!
+  Module* Result = ParseIRFile(FN, Err, Context);
+  if (Result) {
+    return std::auto_ptr<Module>(Result);   // Load successful!
+  }
 
   Err.print(argv0, errs());
   return std::auto_ptr<Module>();
@@ -133,7 +127,7 @@
 
   std::string ErrorInfo;
   tool_output_file Out(OutputFilename.c_str(), ErrorInfo,
-                       raw_fd_ostream::F_Binary);
+                       sys::fs::F_Binary);
   if (!ErrorInfo.empty()) {
     errs() << ErrorInfo << '\n';
     return 1;
diff --git a/tools/build/gen-sha1-stamp.py b/tools/build/gen-sha1-stamp.py
index 239d040..012d522 100755
--- a/tools/build/gen-sha1-stamp.py
+++ b/tools/build/gen-sha1-stamp.py
@@ -76,7 +76,6 @@
 
 def print_asm_data(data, size):
     col = 0
-    sys.stdout.write(".align 8\n")
     for i in xrange(size):
         c = data[i]
         if col == 0:
@@ -95,6 +94,7 @@
 
 def print_asm_symbol_data(sym, h):
     sys.stdout.write("""
+.align 8
 #ifdef __APPLE_CC__
 _%s:\n\
 #else\n\
